commit
ebe2382d4a
@@ -0,0 +1,144 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
.idea

/log/
@@ -0,0 +1,160 @@
# Common helpers: request session, request headers, and Meituan sign/_token generation
import base64
import json
import os
import random
import re
import time
import zlib

import pandas
import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry


# Get a request session with automatic connection retries
def getSession():
    session = requests.session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


href = 'href'
zh = 'zh'
en = 'en'


# City parameter class
class CityParam:

    def __init__(self, city) -> None:
        super().__init__()
        self.city = city
        self.uuid = self.get_uuid()
        self.data = self.getData()
        self.param = self.getParam()

    # Get request headers
    def getHeaders(self):
        return {
            "Accept": "application/json",
            "Referer": "https://{}.meituan.com/".format(self.city[en]),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
        }

    # Get the uuid request parameter
    def get_uuid(self):
        """Fetch the uuid embedded in the city page"""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
        }
        res = requests.get(self.city[href], headers=headers).text
        return re.findall(r'"uuid":"(.*?)"', res, re.S)[0]

    # Initialize the parameter dictionary
    def getData(self):
        return {
            "cityName": self.city[zh],
            "cateId": '0',
            "areaId": "0",
            "sort": "",
            "dinnerCountAttrId": "",
            "page": "1",
            "userId": "",
            "uuid": self.uuid,
            "platform": "1",
            "partner": "126",
            "originUrl": self.city[href],  # the page URL this request originates from
            "riskLevel": "1",
            "optimusCode": "1"
        }

    # Initialize request parameters
    def getParam(self):
        return {
            "cityName": self.data["cityName"],
            "cateId": self.data["cateId"],
            "areaId": self.data["areaId"],
            "sort": self.data["sort"],
            "dinnerCountAttrId": self.data["dinnerCountAttrId"],
            "page": self.data["page"],
            "userId": self.data["userId"],
            "uuid": self.data["uuid"],
            "platform": self.data["platform"],
            "partner": self.data["partner"],
            "originUrl": self.data["originUrl"],
            "riskLevel": self.data["riskLevel"],
            "optimusCode": self.data["optimusCode"],
        }

    # Generate the sign request parameter
    def sign(self):
        """Build the sign parameter: the key=value query string, zlib-compressed and base64-encoded"""
        SIGN_PARAM = "areaId={}&cateId={}&cityName={}&dinnerCountAttrId={}&optimusCode={}&originUrl={}&page={}&partner={}&platform={}&riskLevel={}&sort={}&userId={}&uuid={}".format(
            self.data["areaId"],
            self.data["cateId"],
            self.data["cityName"],
            self.data["dinnerCountAttrId"],
            self.data["optimusCode"],
            self.data["originUrl"],
            self.data["page"],
            self.data["partner"],
            self.data["platform"],
            self.data["riskLevel"],
            self.data["sort"],
            self.data["userId"],
            self.data["uuid"]
        )
        # zlib-compress the parameter string
        binary_data = zlib.compress(SIGN_PARAM.encode())
        # base64-encode
        base64_data = base64.b64encode(binary_data)
        # return a utf-8 string
        return base64_data.decode()

    # Generate the _token request parameter
    def encrypt_token(self):
        """Build the _token parameter"""
        ts = int(time.time() * 1000)  # current Unix time in milliseconds
        # Fake a browser fingerprint: pick a random row from config/br.json
        json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config', 'br.json')
        df = pandas.read_json(json_path)
        # columns are unpacked positionally (note the JSON key is spelled 'barVD')
        brVD, brR_one, brR_two = df.iloc[random.randint(0, len(df) - 1)]
        token_data = {
            "rId": 100900,
            "ver": "1.0.6",
            "ts": ts,
            "cts": ts + random.randint(100, 120),  # measured: cts - ts is roughly between 90 and 130
            "brVD": json.loads(brVD),
            "brR": [json.loads(brR_one), json.loads(brR_two), 24, 24],
            "bI": [self.city[href], ""],
            "mT": [],
            "kT": [],
            "aT": [],
            "tT": [],
            "aM": "",
            "sign": self.sign()
        }
        # zlib-compress the token dict's string form
        binary_data = zlib.compress(str(token_data).encode())
        # base64-encode
        base64_data = base64.b64encode(binary_data)
        return base64_data.decode()


# Seconds to wait for the server to send data
TIMEOUT = 5
# Key for the payload in the response json
data = 'data'
# Key for the total record count
totalCounts = 'totalCounts'
# Key for the merchant (POI) list
poiInfos = 'poiInfos'
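Since sign and _token are plain zlib+base64, they can be decoded back for debugging. A minimal round-trip sketch, assuming the module above is importable as common; the city dict here is a hypothetical example in the shape the crawler builds, and constructing CityParam fires a real request to fetch the uuid:

import base64
import zlib

from common import CityParam, href, zh, en

# hypothetical city entry, shaped like MeiTuanCrawler.getCity() output
city = {href: 'https://bj.meituan.com/meishi', zh: '北京', en: 'bj'}
token = CityParam(city).encrypt_token()
# reverse the encoding: base64-decode, then zlib-decompress
print(zlib.decompress(base64.b64decode(token)).decode())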
@@ -0,0 +1,18 @@
;Logging configuration
[file]
;Log file name
logFile = log.txt
;MySQL database configuration
[mysql]
#Database server address
host=localhost
#User
user=sukura
#Password
passwd=123456
#Database name
db=meishi
#Port
port=3306
#Connection charset
charset=utf8
@@ -0,0 +1,87 @@
[
  {
    "barVD": "[150,625]",
    "brR_one": "[1366, 768]",
    "brR_two": "[1366, 728]"
  },
  {
    "barVD": "[886,635]",
    "brR_one": "[1366,768]",
    "brR_two": "[1366,738]"
  },
  {
    "barVD": "[1560,219]",
    "brR_one": "[1600,900]",
    "brR_two": "[1600,860]"
  },
  {
    "barVD": "[1366,225]",
    "brR_one": "[1366,768]",
    "brR_two": "[1366,768]"
  },
  {
    "barVD": "[1366,209]",
    "brR_one": "[1366,768]",
    "brR_two": "[1366,768]"
  },
  {
    "barVD": "[265,689]",
    "brR_one": "[1280,800]",
    "brR_two": "[1280,760]"
  },
  {
    "barVD": "[1440,264]",
    "brR_one": "[1440,900]",
    "brR_two": "[1440,877]"
  },
  {
    "barVD": "[800,150]",
    "brR_one": "[800,600]",
    "brR_two": "[800,560]"
  },
  {
    "barVD": "[1024,318]",
    "brR_one": "[1024,768]",
    "brR_two": "[1024,728]"
  },
  {
    "barVD": "[1280,150]",
    "brR_one": "[1280,600]",
    "brR_two": "[1280,560]"
  },
  {
    "barVD": "[1280,150]",
    "brR_one": "[1280,600]",
    "brR_two": "[1280,600]"
  },
  {
    "barVD": "[1280,270]",
    "brR_one": "[1280,720]",
    "brR_two": "[1280,680]"
  },
  {
    "barVD": "[1280,161]",
    "brR_one": "[1280,720]",
    "brR_two": "[1280,720]"
  },
  {
    "barVD": "[1280,198]",
    "brR_one": "[1280,768]",
    "brR_two": "[1280,728]"
  },
  {
    "barVD": "[1280,209]",
    "brR_one": "[1280,768]",
    "brR_two": "[1280,768]"
  },
  {
    "barVD": "[1360,198]",
    "brR_one": "[1360,768]",
    "brR_two": "[1360,728]"
  },
  {
    "barVD": "[1360,209]",
    "brR_one": "[1360,768]",
    "brR_two": "[1360,768]"
  }
]
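Each row holds stringified arrays (a viewport size and two screen resolutions) that encrypt_token() parses back into lists. A quick sanity-check sketch, assuming the file lives at config/br.json relative to the working directory:

import json

# verify every fingerprint row parses into two-element integer lists
with open('config/br.json', encoding='utf-8') as f:
    rows = json.load(f)
for row in rows:
    for key in ('barVD', 'brR_one', 'brR_two'):
        value = json.loads(row[key])
        assert len(value) == 2 and all(isinstance(v, int) for v in value)
print('%d fingerprint rows OK' % len(rows))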
@@ -0,0 +1,34 @@
import configparser
import logging
from logging.handlers import TimedRotatingFileHandler
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(BASE_DIR)
cf = configparser.ConfigParser()
config_path = BASE_DIR + "/config.ini"
if not os.path.exists(config_path):
    raise Exception("Config file %s does not exist" % config_path)
cf.read(config_path, encoding='utf-8')
# Log file name from config.ini (not used by init(), which writes log/all.log)
logFile = cf.get('file', 'logFile')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def init():
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
    # Also log to the console
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(log_format)
    logger.addHandler(streamHandler)

    logpath = BASE_DIR + "/log/"
    print(logpath)
    if not os.path.exists(logpath):
        os.mkdir(logpath)

    # Rotate the log file every hour
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1,
                                                        encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)

    logger.addHandler(timedRotatingFileHandler)
@@ -0,0 +1,21 @@
import time

from config.config import init
from config.config import logger

start = int(time.time())
init()


def getRunTimeInt():
    return int(time.time()) - start


def getRunTime():
    return 'Program has been running for %d seconds' % (int(time.time()) - start)


def writeInfo(msg):
    logger.info('%s\t(%s)' % (msg, getRunTime()))


def writeError(msg):
    logger.error('%s\t(%s)' % (msg, getRunTime()))
@@ -0,0 +1,14 @@
create table meishi(
    poiId int not null,
    frontImg varchar(128) not null,
    title varchar(128) not null,
    avgScore float not null,
    allCommentNum int not null,
    address varchar(128) not null,
    avgPrice int not null,
    hasAds tinyint not null,
    adsClickUrl varchar(2048),
    adsShowUrl varchar(2048),
    constraint meishi_pk
        primary key (poiId)
);
@@ -0,0 +1,123 @@
import json
import math
import time
from urllib.parse import urlencode

import pymysql
from bs4 import BeautifulSoup, Tag

from common import getSession, CityParam, zh, en, href, TIMEOUT, totalCounts, poiInfos, data
from config.config import cf
from config.log import writeInfo, writeError


# MySQL database access
class MysqlDB:
    # Open a connection
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute an insert statement
    def insert(self, sql, params):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create a new record
                cursor.execute(sql, params)
            # connection is not autocommit by default, so commit to save the change
            connection.commit()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()


# Meituan food crawler
class MeiTuanCrawler:
    # City list url
    CITY_URL = 'https://www.meituan.com/changecity/'
    features = 'lxml'

    def __init__(self) -> None:
        super().__init__()
        # Request session
        self.session = getSession()
        # Meituan city list
        self.cityMap = {}

    # Crawl the city list
    def getCity(self):
        res = self.session.get(MeiTuanCrawler.CITY_URL)
        if res.status_code == 200:
            writeInfo("Start parsing the city list")
            # Parse the DOM
            html = BeautifulSoup(res.content, MeiTuanCrawler.features)
            # Collect the city links
            city_a = html.select('a[class="link city"]')
            for city in city_a:
                if isinstance(city, Tag):
                    # Build the city food url
                    self.cityMap[city.text] = {href: 'https:{0}/meishi'.format(city[href]), zh: city.text,
                                               en: str(city[href]).split('.')[0].replace('//', '')}
            writeInfo('Fetched {0} Meituan city links in total'.format(len(self.cityMap)))
        else:
            writeError("Failed to crawl city links, status code: {0}".format(res.status_code))

    # Crawl every city's food list
    def meishi(self):
        for city in self.cityMap.keys():
            # Build the request parameters
            c = CityParam(self.cityMap[city])
            GET_PARAM = c.getParam()
            totalPage = None
            # Current page number
            GET_PARAM['page'] = 1
            # Crawl the food list page by page
            while totalPage is None or GET_PARAM['page'] <= totalPage:
                # Add the token parameter
                GET_PARAM["_token"] = c.encrypt_token()
                writeInfo("param:{}".format(GET_PARAM))
                # Build the city food API url
                base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
                url = base_url + urlencode(GET_PARAM, encoding='utf-8')
                writeInfo("API url: {}".format(url))
                writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
                # Send the request to fetch the food data
                res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
                # Parse the json response
                if res.status_code == 200 and 'json' in res.headers['Content-Type']:
                    jsonres = json.loads(res.content)
                    writeInfo("Response json: {}".format(json.dumps(jsonres, indent=1)))
                    # Validate the response structure (non-empty poiInfos also guards the division below)
                    if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data] \
                            and len(jsonres[data][poiInfos]) > 0:
                        # Derive the page count from the first page
                        if totalPage is None:
                            totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
                            writeInfo('{} has {} pages of food data in total'.format(self.cityMap[city][zh], totalPage))
                        # Walk through every restaurant on the page
                        for poiInfo in jsonres[data][poiInfos]:
                            writeInfo('Restaurant data: {}'.format(json.dumps(poiInfo, indent=1)))
                            db = MysqlDB()
                            # Save into the mysql database
                            db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                                      'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                                      'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                      (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                                       str(poiInfo['avgScore']),
                                       str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                                       poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
                    else:
                        writeError("Failed to parse restaurant data from json {}".format(json.dumps(jsonres, indent=1)))
                else:
                    writeError("Failed to parse json from url {}, response: {}".format(url, res.content))
                GET_PARAM['page'] += 1
                # Rate-limit requests
                time.sleep(1)
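The commit wires up no entry point. Appended to the crawler module above, a minimal runner might look like this; a sketch, not part of the commit:

if __name__ == '__main__':
    crawler = MeiTuanCrawler()
    # fetch the city list first; meishi() iterates over it
    crawler.getCity()
    crawler.meishi()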