diff --git a/main.py b/main.py
index 1bd1d41..e1284c2 100644
--- a/main.py
+++ b/main.py
@@ -40,6 +40,7 @@ class MysqlDB:
         finally:
             connection.close()
 
+
 # Meituan food crawler
 class MeiTuanCrawler:
     # City list URL
@@ -71,53 +72,63 @@ class MeiTuanCrawler:
         else:
             writeError("Failed to crawl city links, status code: {0}".format(res.status_code))
 
+    def meishiInputCity(self):
+        city = input("Enter the city to crawl:\n")
+        while city not in self.cityMap:
+            writeInfo("City {} is invalid".format(city))
+            city = input("Enter the city to crawl:\n")
+        self.meishiWithCity(city)
+
+    def meishiWithCity(self, city):
+        # Build the request parameters
+        c = CityParam(self.cityMap[city])
+        GET_PARAM = c.getParam()
+        totalPage = None
+        # Current page number
+        GET_PARAM['page'] = 1
+        # Crawl the food list page by page
+        while totalPage is None or GET_PARAM['page'] <= totalPage:
+            # Add the token parameter
+            GET_PARAM["_token"] = c.encrypt_token()
+            writeInfo("param:{}".format(GET_PARAM))
+            # Build the city food API URL
+            base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
+            url = base_url + urlencode(GET_PARAM, encoding='utf-8')
+            writeInfo("API URL: {}".format(url))
+            writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
+            # Send the request to fetch the food data
+            res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
+            # Parse the JSON response
+            if res.status_code == 200 and 'json' in res.headers.get('Content-Type', ''):
+                jsonres = json.loads(res.content)
+                writeInfo("Response JSON: {}".format(json.dumps(jsonres, indent=1)))
+                # Validate the response structure
+                if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
+                    # Derive the total page count from the first page
+                    if totalPage is None:
+                        totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
+                        writeInfo('{} has {} pages of food data in total'.format(self.cityMap[city][zh], totalPage))
+                    # Iterate over every restaurant on this page
+                    for poiInfo in jsonres[data][poiInfos]:
+                        writeInfo('Restaurant data: {}'.format(json.dumps(poiInfo, indent=1)))
+                        db = MysqlDB()
+                        # Save to the MySQL database
+                        db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
+                                  'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
+                                  'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+                                  (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
+                                   str(poiInfo['avgScore']),
+                                   str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
+                                   poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
+                else:
+                    writeError("Failed to parse restaurant data from JSON: {}".format(json.dumps(jsonres, indent=1)))
+            else:
+                writeError("Failed to parse JSON from API URL {}, response: {}".format(url, res.content))
+            GET_PARAM['page'] = GET_PARAM['page'] + 1
+            # Throttle the request rate
+            time.sleep(1)
+
     # Crawl food data for every city
     def meishi(self):
         for city in self.cityMap.keys():
-            # Build the request parameters
-            c = CityParam(self.cityMap[city])
-            GET_PARAM = c.getParam()
-            totalPage = None
-            # Current page number
-            GET_PARAM['page'] = 1
-            # Crawl the food list page by page
-            while totalPage is None or GET_PARAM['page'] <= totalPage:
-                # Add the token parameter
-                GET_PARAM["_token"] = c.encrypt_token()
-                writeInfo("param:{}".format(GET_PARAM))
-                # Build the city food API URL
-                base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
-                url = base_url + urlencode(GET_PARAM, encoding='utf-8')
-                writeInfo("API URL: {}".format(url))
-                writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
-                # Send the request to fetch the food data
-                res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
-                # Parse the JSON response
-                if res.status_code == 200 and 'json' in res.headers['Content-Type']:
-                    jsonres = json.loads(res.content)
-                    writeInfo("Response JSON: {}".format(json.dumps(jsonres, indent=1)))
-                    # Validate the response structure
-                    if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
-                        # Derive the total page count
-                        if totalPage is None:
-                            totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
-                            writeInfo('{} has {} pages of food data in total'.format(self.cityMap[city][zh], totalPage))
-                        # Iterate over every restaurant on this page
-                        for poiInfo in jsonres[data][poiInfos]:
-                            writeInfo('Restaurant data: {}'.format(json.dumps(poiInfo, indent=1)))
-                            db = MysqlDB()
-                            # Save to the MySQL database
-                            db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
-                                      'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
-                                      'value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
-                                      (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
-                                       str(poiInfo['avgScore']),
-                                       str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
-                                       poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
-                    else:
-                        writeError("Failed to parse restaurant data from JSON: {}".format(json.dumps(jsonres, indent=1)))
-                else:
-                    writeError("Failed to parse JSON from API URL {}, response: {}".format(url, res.content))
-                GET_PARAM['page'] = GET_PARAM['page'] + 1
-                # Throttle the request rate
-                time.sleep(1)
+            self.meishiWithCity(city)
diff --git a/run.py b/run.py
index 4f32558..07778ca 100644
--- a/run.py
+++ b/run.py
@@ -1,5 +1,11 @@
+from config.log import writeError
 from main import MeiTuanCrawler, MysqlDB
 
 c = MeiTuanCrawler()
 c.getCity()
-c.meishi()
+option = input('Choose a crawler option:\n1. Crawl all cities\n2. Crawl a specific city\n')
+methods = {'1': c.meishi, '2': c.meishiInputCity}
+while option not in methods:
+    writeError('Option {} is invalid!'.format(option))
+    option = input('Choose a crawler option:\n1. Crawl all cities\n2. Crawl a specific city\n')
+methods[option]()
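
A note on the paging logic moved into meishiWithCity: the total page count is derived once, from the first response, by dividing totalCounts by the size of the first page's poiInfos list and rounding up with math.ceil, so a trailing partial page is still fetched. A minimal sketch of that arithmetic with made-up numbers (327 results and 15 items per page are assumptions, not real API values):

import math

# Hypothetical first-page values standing in for
# jsonres[data][totalCounts] and len(jsonres[data][poiInfos]):
total_counts = 327
page_size = 15

# ceil(327 / 15) = ceil(21.8) = 22, so the final 12-item page is kept.
total_page = math.ceil(total_counts / page_size)
print(total_page)  # 22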
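The API URL is assembled by appending the urlencode-d GET_PARAM to the city's subdomain. urlencode percent-escapes non-ASCII values, which matters here because the parameters can carry Chinese city names. A standalone illustration; the 'bj' subdomain and the parameter names are stand-ins, not the actual output of CityParam.getParam:

from urllib.parse import urlencode

# Stand-in parameters; the real GET_PARAM also carries a _token
# value produced by CityParam.encrypt_token().
params = {'cityName': '北京', 'page': 1}
base_url = 'https://bj.meituan.com/meishi/api/poi/getPoiList?'
url = base_url + urlencode(params, encoding='utf-8')
print(url)
# https://bj.meituan.com/meishi/api/poi/getPoiList?cityName=%E5%8C%97%E4%BA%AC&page=1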
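MysqlDB.insert itself is outside this diff; assuming it wraps a pymysql cursor, the %s placeholders in the insert statement are bound by the driver rather than spliced into the SQL string, so restaurant titles or addresses containing quotes cannot break (or inject into) the query. A minimal sketch of that pattern under those assumptions, with placeholder connection settings:

import pymysql

# Placeholder credentials; assumes a local `meituan` schema containing
# the `meishi` table targeted in the diff.
connection = pymysql.connect(host='localhost', user='root',
                             password='secret', database='meituan',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # The driver escapes each bound value before substitution.
        cursor.execute(
            'insert into meishi (poiId, title, address) values (%s, %s, %s)',
            ('123456', "Grandma's Kitchen", 'No. 1 Example Road'))
    connection.commit()
finally:
    connection.close()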