master
pan 4 years ago
parent ebe2382d4a
commit ae0f735438
  1. 105
      main.py
  2. 8
      run.py

@ -40,6 +40,7 @@ class MysqlDB:
finally:
connection.close()
# 美团美食爬虫
class MeiTuanCrawler:
# 城市列表网址
@ -71,53 +72,63 @@ class MeiTuanCrawler:
else:
writeError("无法爬取城市链接,响应码:{0}".format(res.status_code))
def meishiInputCity(self):
    """Prompt on stdin for a city name until it is a key of ``self.cityMap``, then crawl it.

    Every rejected name is logged through ``writeInfo`` before the prompt
    is shown again. The accepted name is handed to ``meishiWithCity``.
    """
    while True:
        chosen = input("请输入要爬取的城市:\n")
        if chosen in self.cityMap:
            break
        writeInfo("城市:{}不合法".format(chosen))
    self.meishiWithCity(chosen)
def meishiWithCity(self, city):
    """Crawl every page of one city's food (meishi) listing and store each shop in MySQL.

    The page count is derived from the first successfully parsed page as
    ``ceil(totalCounts / number of shops on that page)``. One request is
    sent per page, followed by a one-second pause.

    Args:
        city: Key into ``self.cityMap``. The mapped entry is passed to
            ``CityParam`` and indexed with ``en`` (URL sub-domain) and
            ``zh`` (name used in log messages).

    Notes:
        The crawl is abandoned for this city when ``maxFailedPages``
        consecutive pages fail before the page count is known; otherwise
        the loop condition could never become false.
    """
    # Consecutive failed pages tolerated while the page count is unknown.
    maxFailedPages = 5

    cityInfo = self.cityMap[city]
    # Build the request parameters for this city.
    c = CityParam(cityInfo)
    GET_PARAM = c.getParam()
    # The endpoint only depends on the city, so build it once.
    base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(cityInfo[en])
    totalPage = None
    failedPages = 0
    # Current page number.
    GET_PARAM['page'] = 1
    # Walk the listing page by page.
    while totalPage is None or GET_PARAM['page'] <= totalPage:
        # The token is regenerated for every request.
        GET_PARAM["_token"] = c.encrypt_token()
        writeInfo("param:{}".format(GET_PARAM))
        url = base_url + urlencode(GET_PARAM, encoding='utf-8')
        writeInfo("接口地址:{}".format(url))
        writeInfo("解析{}美食列表第{}页数据".format(cityInfo[zh], GET_PARAM['page']))
        # Fetch the page.
        res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)

        # Decode the body only when the server claims to have sent JSON.
        # A missing Content-Type header or an undecodable body counts as a
        # failed page instead of raising.
        jsonres = None
        if res.status_code == 200 and 'json' in res.headers.get('Content-Type', ''):
            try:
                jsonres = json.loads(res.content)
            except ValueError:
                jsonres = None

        pageOk = False
        if jsonres is None:
            writeError("从接口地址{}解析json数据{}失败".format(url, res.content))
        else:
            writeInfo("响应json{}".format(json.dumps(jsonres, indent=1)))
            # Validate the structure before touching it.
            if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
                pois = jsonres[data][poiInfos]
                # Derive the page count once, from the first good page.
                if totalPage is None:
                    # An empty first page means there is nothing to crawl;
                    # using 0 pages also avoids dividing by zero.
                    totalPage = math.ceil(jsonres[data][totalCounts] / len(pois)) if pois else 0
                    writeInfo('{}美食数据总共有{}'.format(cityInfo[zh], totalPage))
                # Store every shop on this page.
                for poiInfo in pois:
                    writeInfo('美食店数据:{}'.format(json.dumps(poiInfo, indent=1)))
                    # A fresh MysqlDB per row, as before: its connection
                    # lifetime is not visible from here, so it is not shared.
                    db = MysqlDB()
                    db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                              'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                              'value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                              (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                               str(poiInfo['avgScore']),
                               str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                               poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
                pageOk = True
            else:
                writeError("从json{}解析美食店数据失败".format(json.dumps(jsonres, indent=1)))

        if pageOk:
            failedPages = 0
        else:
            failedPages += 1
            # Without a known page count nothing else bounds the loop.
            if totalPage is None and failedPages >= maxFailedPages:
                writeError("连续{}次请求失败,停止爬取{}美食数据".format(failedPages, cityInfo[zh]))
                break

        GET_PARAM['page'] = GET_PARAM['page'] + 1
        # Throttle the request rate.
        time.sleep(1)
# 爬取城市美食
def meishi(self):
    """Crawl the food (meishi) listing of every city in ``self.cityMap``.

    Each city is handled by ``meishiWithCity``. The per-city crawl logic
    is not repeated here, so every city is requested and stored once.
    """
    for city in self.cityMap:
        self.meishiWithCity(city)

@ -1,5 +1,11 @@
from config.log import writeInfo, writeError
from main import MeiTuanCrawler, MysqlDB
# Build the crawler and load the city list before offering any choice.
c = MeiTuanCrawler()
c.getCity()

# Menu option -> crawler method to run. Nothing is crawled until the user
# has picked a valid option.
methods = {'1': c.meishi, '2': c.meishiInputCity}
prompt = '选择爬虫选项:\n1.爬取所有城市\n2.爬取指定城市\n'

option = input(prompt)
while option not in methods:
    writeError('选项{}不合法!!!'.format(option))
    option = input(prompt)
methods[option]()

Loading…
Cancel
Save