|
|
|
import json
|
|
|
|
import math
|
|
|
|
import time
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup, Tag
|
|
|
|
|
|
|
|
from common import getSession, CityParam, zh, en, href, TIMEOUT, totalCounts, poiInfos, data
|
|
|
|
from config.config import cf
|
|
|
|
from config.log import writeInfo, writeError
|
|
|
|
import pymysql
|
|
|
|
|
|
|
|
|
|
|
|
# mysql数据库
|
|
|
|
class MysqlDB:
    """Minimal MySQL helper: opens a fresh connection per statement.

    Connection settings come from the ``[mysql]`` section of the project
    config (``cf``). Errors are logged via ``writeError`` rather than raised,
    matching the crawler's best-effort persistence style.
    """

    # 建立连接
    def connect(self):
        """Open and return a new pymysql connection from the [mysql] config section."""
        section = 'mysql'
        return pymysql.connect(
            host=cf.get(section, 'host'),
            user=cf.get(section, 'user'),
            passwd=cf.get(section, 'passwd'),
            db=cf.get(section, 'db'),
            port=int(cf.get(section, 'port')),
            charset=cf.get(section, 'charset'))

    # 执行insert语句
    def insert(self, sql, params):
        """Execute one parameterized INSERT and commit it.

        :param sql: INSERT statement with ``%s`` placeholders.
        :param params: sequence of values bound to the placeholders.

        On any error the transaction is rolled back and the exception is
        logged (not re-raised); the connection is always closed.
        """
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create a new record
                cursor.execute(sql, params)
            # connection is not autocommit by default, so an explicit
            # commit is required to persist the row. 提交事务
            connection.commit()
        except Exception as e:
            # Fix: explicitly undo any partial work before closing instead of
            # relying on the server's implicit rollback-on-disconnect.
            connection.rollback()
            writeError(e)
        finally:
            connection.close()
|
|
|
|
|
|
|
|
|
|
|
|
# 美团美食爬虫
|
|
|
|
class MeiTuanCrawler:
    """美团美食爬虫 — crawls Meituan's city list and each city's food API.

    Typical flow: ``getCity()`` populates ``self.cityMap``, then ``meishi()``
    (all cities) or ``meishiWithCity()`` (one city) pages through the
    ``getPoiList`` JSON API and stores each shop row via :class:`MysqlDB`.
    """

    # 城市列表网址
    CITY_URL = 'https://www.meituan.com/changecity/'

    # BeautifulSoup parser backend used for the city list page.
    features = 'lxml'

    def __init__(self) -> None:
        super().__init__()
        # 请求会话 — shared HTTP session from common.getSession()
        self.session = getSession()
        # 美团城市列表: display name -> {href: meishi url, zh: name, en: subdomain}
        self.cityMap = {}

    # 爬取城市列表
    def getCity(self):
        """Fetch the city-change page and fill ``self.cityMap`` from its links."""
        res = self.session.get(MeiTuanCrawler.CITY_URL)
        if res.status_code == 200:
            writeInfo("开始解析城市列表信息")
            # 解析DOM
            html = BeautifulSoup(res.content, MeiTuanCrawler.features)
            # 获取城市超链接
            city_a = html.select('a[class="link city"]')
            for city in city_a:
                # Fix: isinstance (not type() ==) so Tag subclasses also pass.
                if isinstance(city, Tag):
                    # 拼接城市美食网址
                    self.cityMap[city.text] = {
                        href: 'https:{0}/meishi'.format(city[href]),
                        zh: city.text,
                        en: str(city[href]).split('.')[0].replace('//', '')}
            writeInfo('总共获取了{0}个美团城市超链接'.format(len(self.cityMap)))
        else:
            writeError("无法爬取城市链接,响应码:{0}".format(res.status_code))

    def meishiInputCity(self):
        """Prompt interactively until a city present in ``cityMap`` is entered, then crawl it."""
        city = input("请输入要爬取的城市:\n")
        while city not in self.cityMap:
            writeInfo("城市:{}不合法".format(city))
            city = input("请输入要爬取的城市:\n")
        self.meishiWithCity(city)

    def meishiWithCity(self, city):
        """Page through the meishi API for *city*, persisting every shop row.

        :param city: a key of ``self.cityMap`` (as produced by ``getCity()``).
        """
        # 获取请求参数
        c = CityParam(self.cityMap[city])
        GET_PARAM = c.getParam()
        # Total page count — unknown until the first successful response.
        totalPage = None
        # 当前页数
        GET_PARAM['page'] = 1
        # Fix: one DB helper for the whole crawl instead of a new instance
        # (and a new connection cycle) per shop row.
        db = MysqlDB()
        # 爬取美食列表数据
        while totalPage is None or GET_PARAM['page'] <= totalPage:
            # 添加token参数
            GET_PARAM["_token"] = c.encrypt_token()
            writeInfo("param:{}".format(GET_PARAM))
            # 拼接城市美食接口地址
            base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
            url = base_url + urlencode(GET_PARAM, encoding='utf-8')
            writeInfo("接口地址:{}".format(url))
            writeInfo("解析{}美食列表第{}页数据".format(self.cityMap[city][zh], GET_PARAM['page']))
            # 发送请求获取美食数据
            res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
            # 解析接口响应的json数据
            if res.status_code == 200 and 'json' in res.headers['Content-Type']:
                jsonres = json.loads(res.content)
                writeInfo("响应json{}".format(json.dumps(jsonres, indent=1)))
                # 判断数据结构合法性
                if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
                    poiList = jsonres[data][poiInfos]
                    # 获取分页数
                    if totalPage is None:
                        if poiList:
                            totalPage = math.ceil(jsonres[data][totalCounts] / len(poiList))
                            writeInfo('{}美食数据总共有{}'.format(self.cityMap[city][zh], totalPage))
                        else:
                            # Fix: an empty first page previously raised
                            # ZeroDivisionError; treat it as the last page.
                            totalPage = GET_PARAM['page']
                    # 轮询每一家店的数据
                    for poiInfo in poiList:
                        writeInfo('美食店数据:{}'.format(json.dumps(poiInfo, indent=1)))
                        # 保存到mysql数据库
                        db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                                  'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                                  'value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                  (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                                   str(poiInfo['avgScore']),
                                   str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                                   poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
                else:
                    writeError("从json{}解析美食店数据失败".format(json.dumps(jsonres, indent=1)))
            else:
                writeError("从接口地址{}解析json数据{}失败".format(url, res.content))
            GET_PARAM['page'] = GET_PARAM['page'] + 1
            # 限制请求速率 — one request per second.
            time.sleep(1)

    # 爬取城市美食
    def meishi(self):
        """Crawl every city currently present in ``self.cityMap``."""
        for city in self.cityMap.keys():
            self.meishiWithCity(city)
|