|
|
@ -40,6 +40,7 @@ class MysqlDB: |
|
|
|
finally: |
|
|
|
finally: |
|
|
|
connection.close() |
|
|
|
connection.close() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 美团美食爬虫 |
|
|
|
# 美团美食爬虫 |
|
|
|
class MeiTuanCrawler: |
|
|
|
class MeiTuanCrawler: |
|
|
|
# 城市列表网址 |
|
|
|
# 城市列表网址 |
|
|
@ -71,53 +72,63 @@ class MeiTuanCrawler: |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("无法爬取城市链接,响应码:{0}".format(res.status_code)) |
|
|
|
writeError("无法爬取城市链接,响应码:{0}".format(res.status_code)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def meishiInputCity(self):
    """Interactively ask the user for a city name and crawl its food data.

    Keeps re-prompting until the entered name is a key of ``self.cityMap``,
    then delegates the actual crawling to :meth:`meishiWithCity`.
    """
    prompt = "请输入要爬取的城市:\n"
    city = input(prompt)
    # Re-prompt until the user supplies a known city.
    while city not in self.cityMap:
        writeInfo("城市:{}不合法".format(city))
        city = input(prompt)
    self.meishiWithCity(city)
|
def meishiWithCity(self, city):
    """Crawl the paginated food (meishi) listing of one city and persist it.

    :param city: city name; must be a key of ``self.cityMap``.
        (``self.cityMap[city]`` is presumably a dict holding the ``en``/``zh``
        city identifiers used below — confirm against the class initializer.)

    Pages through the ``getPoiList`` API, logs every response, and inserts
    each shop record into the ``meishi`` MySQL table. Sleeps 1s between
    pages to rate-limit requests.
    """
    # 获取请求参数
    c = CityParam(self.cityMap[city])
    GET_PARAM = c.getParam()
    # Total page count; computed lazily from the first valid response.
    totalPage = None
    # 当前页数
    GET_PARAM['page'] = 1
    # FIX: open one DB connection per call instead of one per shop record
    # (the original constructed MysqlDB() inside the innermost loop).
    db = MysqlDB()
    # 爬取美食列表数据
    while totalPage is None or GET_PARAM['page'] <= totalPage:
        # 添加token参数 (the API requires a freshly encrypted _token per request)
        GET_PARAM["_token"] = c.encrypt_token()
        writeInfo("param:{}".format(GET_PARAM))
        # 拼接城市美食接口地址
        base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
        url = base_url + urlencode(GET_PARAM, encoding='utf-8')
        writeInfo("接口地址:{}".format(url))
        writeInfo("解析{}美食列表第{}页数据".format(self.cityMap[city][zh], GET_PARAM['page']))
        # 发送请求获取美食数据
        res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
        # 解析接口响应的json数据
        # FIX: headers.get() avoids a KeyError when Content-Type is missing.
        if res.status_code == 200 and 'json' in res.headers.get('Content-Type', ''):
            jsonres = json.loads(res.content)
            writeInfo("响应json{}".format(json.dumps(jsonres, indent=1)))
            # 判断数据结构合法性
            # FIX: also require a non-empty poiInfos list, otherwise the
            # page-count division below raises ZeroDivisionError.
            if (data in jsonres and totalCounts in jsonres[data]
                    and poiInfos in jsonres[data] and jsonres[data][poiInfos]):
                # 获取分页数 (only on the first successful response)
                if totalPage is None:
                    totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
                    writeInfo('{}美食数据总共有{}'.format(self.cityMap[city][zh], totalPage))
                # 轮询每一家店的数据
                for poiInfo in jsonres[data][poiInfos]:
                    writeInfo('美食店数据:{}'.format(json.dumps(poiInfo, indent=1)))
                    # 保存到mysql数据库
                    db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                              'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                              'value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                              (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                               str(poiInfo['avgScore']),
                               str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                               poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
            else:
                writeError("从json{}解析美食店数据失败".format(json.dumps(jsonres, indent=1)))
        else:
            writeError("从接口地址{}解析json数据{}失败".format(url, res.content))
        GET_PARAM['page'] = GET_PARAM['page'] + 1
        # 限制请求速率
        time.sleep(1)
|
# 爬取城市美食
def meishi(self):
    """Crawl the food listings of every city in ``self.cityMap``.

    Simply delegates each city to :meth:`meishiWithCity`.
    """
    for city in self.cityMap:
        self.meishiWithCity(city)