|
|
@ -40,6 +40,7 @@ class MysqlDB: |
|
|
|
finally: |
|
|
|
finally: |
|
|
|
connection.close() |
|
|
|
connection.close() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 美团美食爬虫 |
|
|
|
# 美团美食爬虫 |
|
|
|
class MeiTuanCrawler: |
|
|
|
class MeiTuanCrawler: |
|
|
|
# 城市列表网址 |
|
|
|
# 城市列表网址 |
|
|
@ -71,53 +72,63 @@ class MeiTuanCrawler: |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("无法爬取城市链接,响应码:{0}".format(res.status_code)) |
|
|
|
writeError("无法爬取城市链接,响应码:{0}".format(res.status_code)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def meishiInputCity(self):
    """Interactively ask the user for a city name and crawl its food data.

    Keeps re-prompting until the entered name is a key of ``self.cityMap``,
    then delegates the actual crawling to :meth:`meishiWithCity`.
    """
    prompt = "请输入要爬取的城市:\n"
    city = input(prompt)
    # Re-prompt until the user supplies a known city.
    while city not in self.cityMap:
        writeInfo("城市:{}不合法".format(city))
        city = input(prompt)
    self.meishiWithCity(city)
|
def meishiWithCity(self, city):
    """Crawl the paginated food (meishi) listing of one city and persist it.

    :param city: city name; must be a key of ``self.cityMap``.
        (``self.cityMap[city]`` is presumably a dict holding the ``en``/``zh``
        city identifiers used below — confirm against the class initializer.)

    Pages through the ``getPoiList`` API, logs every response, and inserts
    each shop record into the ``meishi`` MySQL table. Sleeps 1s between
    pages to rate-limit requests.
    """
    # 获取请求参数
    c = CityParam(self.cityMap[city])
    GET_PARAM = c.getParam()
    # Total page count; computed lazily from the first valid response.
    totalPage = None
    # 当前页数
    GET_PARAM['page'] = 1
    # FIX: open one DB connection per call instead of one per shop record
    # (the original constructed MysqlDB() inside the innermost loop).
    db = MysqlDB()
    # 爬取美食列表数据
    while totalPage is None or GET_PARAM['page'] <= totalPage:
        # 添加token参数 (the API requires a freshly encrypted _token per request)
        GET_PARAM["_token"] = c.encrypt_token()
        writeInfo("param:{}".format(GET_PARAM))
        # 拼接城市美食接口地址
        base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
        url = base_url + urlencode(GET_PARAM, encoding='utf-8')
        writeInfo("接口地址:{}".format(url))
        writeInfo("解析{}美食列表第{}页数据".format(self.cityMap[city][zh], GET_PARAM['page']))
        # 发送请求获取美食数据
        res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
        # 解析接口响应的json数据
        # FIX: headers.get() avoids a KeyError when Content-Type is missing.
        if res.status_code == 200 and 'json' in res.headers.get('Content-Type', ''):
            jsonres = json.loads(res.content)
            writeInfo("响应json{}".format(json.dumps(jsonres, indent=1)))
            # 判断数据结构合法性
            # FIX: also require a non-empty poiInfos list, otherwise the
            # page-count division below raises ZeroDivisionError.
            if (data in jsonres and totalCounts in jsonres[data]
                    and poiInfos in jsonres[data] and jsonres[data][poiInfos]):
                # 获取分页数 (only on the first successful response)
                if totalPage is None:
                    totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
                    writeInfo('{}美食数据总共有{}'.format(self.cityMap[city][zh], totalPage))
                # 轮询每一家店的数据
                for poiInfo in jsonres[data][poiInfos]:
                    writeInfo('美食店数据:{}'.format(json.dumps(poiInfo, indent=1)))
                    # 保存到mysql数据库
                    db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                              'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                              'value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                              (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                               str(poiInfo['avgScore']),
                               str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                               poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
            else:
                writeError("从json{}解析美食店数据失败".format(json.dumps(jsonres, indent=1)))
        else:
            writeError("从接口地址{}解析json数据{}失败".format(url, res.content))
        GET_PARAM['page'] = GET_PARAM['page'] + 1
        # 限制请求速率
        time.sleep(1)
|
# 爬取城市美食
def meishi(self):
    """Crawl the food listings of every city in ``self.cityMap``.

    Simply delegates each city to :meth:`meishiWithCity`.
    """
    for city in self.cityMap:
        self.meishiWithCity(city)