meituan/main.py

import json
import math
import time
from urllib.parse import urlencode
from bs4 import BeautifulSoup, Tag
from common import getSession, CityParam, zh, en, href, TIMEOUT, totalCounts, poiInfos, data
from config.config import cf
from config.log import writeInfo, writeError
import pymysql


# MySQL database helper
class MysqlDB:

    # Open a database connection using the settings from the [mysql] config section
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)
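
    # A minimal sketch of the [mysql] section that connect() expects the config file to
    # provide. The file name and location are defined in config/config.py; every value
    # below is only an illustrative assumption, not the project's actual settings:
    #
    #   [mysql]
    #   host = 127.0.0.1
    #   port = 3306
    #   user = root
    #   passwd = secret
    #   db = meituan
    #   charset = utf8mb4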

    # Execute an INSERT statement and commit it
    def insert(self, sql, params):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create a new record
                cursor.execute(sql, params)
            # The connection is not autocommit by default, so the transaction must be
            # committed explicitly to save the changes.
            connection.commit()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()
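
# The meishi table written by MysqlDB.insert (used in MeiTuanCrawler.meishi below) is
# not defined in this file. A possible schema inferred from the insert statement; the
# column types and lengths are assumptions only:
#
#   CREATE TABLE meishi (
#       poiId         VARCHAR(32),
#       frontImg      VARCHAR(255),
#       title         VARCHAR(255),
#       avgScore      VARCHAR(16),
#       allCommentNum VARCHAR(16),
#       address       VARCHAR(255),
#       avgPrice      VARCHAR(16),
#       hasAds        TINYINT(1),
#       adsClickUrl   VARCHAR(512),
#       adsShowUrl    VARCHAR(512)
#   ) DEFAULT CHARSET = utf8mb4;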


# Meituan food crawler
class MeiTuanCrawler:
    # City list URL
    CITY_URL = 'https://www.meituan.com/changecity/'
    # BeautifulSoup parser
    features = 'lxml'

    def __init__(self) -> None:
        super().__init__()
        # Request session
        self.session = getSession()
        # Map of Meituan cities
        self.cityMap = {}

    # Crawl the city list
    def getCity(self):
        res = self.session.get(MeiTuanCrawler.CITY_URL)
        if res.status_code == 200:
            writeInfo("Start parsing the city list")
            # Parse the DOM
            html = BeautifulSoup(res.content, MeiTuanCrawler.features)
            # Collect the city links
            city_a = html.select('a[class="link city"]')
            for city in city_a:
                if isinstance(city, Tag):
                    # Build the food page URL for each city
                    self.cityMap[city.text] = {href: 'https:{0}/meishi'.format(city[href]), zh: city.text,
                                               en: str(city[href]).split('.')[0].replace('//', '')}
            writeInfo('Collected {0} Meituan city links in total'.format(len(self.cityMap)))
        else:
            writeError("Failed to crawl city links, status code: {0}".format(res.status_code))

    # Crawl the food listings for every city collected by getCity
    def meishi(self):
        for city in self.cityMap.keys():
            # Build the request parameters for this city
            c = CityParam(self.cityMap[city])
            GET_PARAM = c.getParam()
            totalPage = None
            # Current page number
            GET_PARAM['page'] = 1
            # Crawl the food list page by page
            while totalPage is None or GET_PARAM['page'] <= totalPage:
                # Add the signed token parameter
                GET_PARAM["_token"] = c.encrypt_token()
                writeInfo("param:{}".format(GET_PARAM))
                # Build the food list API URL for this city
                base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
                url = base_url + urlencode(GET_PARAM, encoding='utf-8')
                writeInfo("API url: {}".format(url))
                writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
                # Send the request for the food list data
                res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
                # Parse the JSON returned by the API
                if res.status_code == 200 and 'json' in res.headers['Content-Type']:
                    jsonres = json.loads(res.content)
                    writeInfo("Response JSON: {}".format(json.dumps(jsonres, indent=1)))
                    # Check that the response has the expected structure
                    if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
                        # Compute the total number of pages from the first response
                        if totalPage is None:
                            totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
                            writeInfo('{} food data has {} pages in total'.format(self.cityMap[city][zh], totalPage))
                        # Iterate over every restaurant entry on this page
                        for poiInfo in jsonres[data][poiInfos]:
                            writeInfo('Restaurant data: {}'.format(json.dumps(poiInfo, indent=1)))
                            db = MysqlDB()
                            # Save the record to the MySQL database
                            db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                                      'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                                      'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                      (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                                       str(poiInfo['avgScore']),
                                       str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                                       poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
                    else:
                        writeError("Failed to parse restaurant data from JSON: {}".format(json.dumps(jsonres, indent=1)))
                else:
                    writeError("Failed to parse JSON data from API url {}, response: {}".format(url, res.content))
                GET_PARAM['page'] = GET_PARAM['page'] + 1
                # Throttle the request rate
                time.sleep(1)
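

# Example entry point: a minimal usage sketch (not part of the original module). It
# assumes the config file and the meishi table described above already exist.
if __name__ == '__main__':
    crawler = MeiTuanCrawler()
    # First collect the city links, then crawl the food listings for each city
    crawler.getCity()
    crawler.meishi()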