master
pan 4 years ago
commit ebe2382d4a
  1. .gitignore (+144)
  2. common.py (+160)
  3. config.ini (+18)
  4. config/br.json (+87)
  5. config/config.py (+34)
  6. config/log.py (+21)
  7. db.sql (+14)
  8. main.py (+123)
  9. run.py (+5)

.gitignore vendored (+144)

@@ -0,0 +1,144 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.idea
/log/

common.py (+160)

@@ -0,0 +1,160 @@
# Shared helpers: request session, city request parameters, sign/_token generation
import base64
import os
import random
import re
import time
import zlib

import pandas
import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry


# Build a requests session with retry support
def getSession():
    session = requests.session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


href = 'href'
zh = 'zh'
en = 'en'


# Per-city request parameter builder
class CityParam:

    def __init__(self, city) -> None:
        super().__init__()
        self.city = city
        self.uuid = self.get_uuid()
        self.data = self.getData()
        self.param = self.getParam()

    # Build the request headers
    def getHeaders(self):
        return {
            "Accept": "application/json",
            "Referer": "https://{}.meituan.com/".format(self.city[en]),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
        }

    # Fetch the uuid request parameter
    def get_uuid(self):
        """Extract the uuid from the city's food page."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
        }
        res = requests.get(self.city[href], headers=headers).text
        return re.findall(r'"uuid":"(.*?)"', res, re.S)[0]

    # Initialise the parameter dict
    def getData(self):
        return {
            "cityName": self.city[zh],
            "cateId": '0',
            "areaId": "0",
            "sort": "",
            "dinnerCountAttrId": "",
            "page": "1",
            "userId": "",
            "uuid": self.uuid,
            "platform": "1",
            "partner": "126",
            "originUrl": href,
            "riskLevel": "1",
            "optimusCode": "1"
        }

    # Initialise the request parameters
    def getParam(self):
        return {
            "cityName": self.data["cityName"],
            "cateId": self.data["cateId"],
            "areaId": self.data["areaId"],
            "sort": self.data["sort"],
            "dinnerCountAttrId": self.data["dinnerCountAttrId"],
            "page": self.data["page"],
            "userId": self.data["userId"],
            "uuid": self.data["uuid"],
            "platform": self.data["platform"],
            "partner": self.data["partner"],
            "originUrl": self.data["originUrl"],
            "riskLevel": self.data["riskLevel"],
            "optimusCode": self.data["optimusCode"],
        }

    # Generate the sign request parameter
    def sign(self):
        """Generate the sign parameter."""
        # default encoding
        # coding = sys.getdefaultencoding()
        SIGN_PARAM = "areaId={}&cateId={}&cityName={}&dinnerCountAttrId={}&optimusCode={}&originUrl={}&page={}&partner={}&platform={}&riskLevel={}&sort={}&userId={}&uuid={}".format(
            self.data["areaId"],
            self.data["cateId"],
            self.data["cityName"],
            self.data["dinnerCountAttrId"],
            self.data["optimusCode"],
            self.data["originUrl"],
            self.data["page"],
            self.data["partner"],
            self.data["platform"],
            self.data["riskLevel"],
            self.data["sort"],
            self.data["userId"],
            self.data["uuid"]
        )
        # zlib-compress the parameter string
        binary_data = zlib.compress(SIGN_PARAM.encode())
        # base64-encode
        base64_data = base64.b64encode(binary_data)
        # return a utf-8 string
        return base64_data.decode()

    # Generate the _token request parameter
    def encrypt_token(self):
        """Generate the _token parameter."""
        ts = int(time.time() * 1000)  # time.time() returns seconds since the epoch
        # fake a browser window/screen fingerprint
        json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config', 'br.json')
        df = pandas.read_json(json_path)
        brVD, brR_one, brR_two = df.iloc[random.randint(0, len(df) - 1)]
        token_data = {
            "rId": 100900,
            "ver": "1.0.6",
            "ts": ts,
            "cts": ts + random.randint(100, 120),  # empirically, cts - ts is roughly 90-130
            # "cts": ts + 100,
            "brVD": eval(brVD),
            "brR": [eval(brR_one), eval(brR_two), 24, 24],
            "bI": [self.city[href], ""],
            "mT": [],
            "kT": [],
            "aT": [],
            "tT": [],
            "aM": "",
            "sign": self.sign()
        }
        # zlib-compress the token dict
        binary_data = zlib.compress(str(token_data).encode())
        # base64-encode
        base64_data = base64.b64encode(binary_data)
        return base64_data.decode()


# seconds to wait for the server to send data
TIMEOUT = 5
data = 'data'
# key for the total record count
totalCounts = 'totalCounts'
# key for the merchant list
poiInfos = 'poiInfos'
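The _token above is simply the Python repr of token_data, zlib-compressed and base64-encoded. A minimal sketch for inspecting a generated token by reversing those two steps (decode_token is a hypothetical helper, not part of this commit):

# Hypothetical helper: reverse the base64 + zlib steps used by
# CityParam.sign() / encrypt_token() to inspect a generated _token.
import ast
import base64
import zlib

def decode_token(token):
    raw = zlib.decompress(base64.b64decode(token))
    # str(token_data) produced a Python dict literal, so literal_eval restores it
    return ast.literal_eval(raw.decode())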

config.ini (+18)

@@ -0,0 +1,18 @@
; logging configuration
[file]
; log file name
logFile = log.txt
; MySQL database configuration
[mysql]
# database server host
host=localhost
# user name
user=sukura
# password
passwd=123456
# database name
db=meishi
# port
port=3306
# connection charset
charset=utf8

config/br.json (+87)

@@ -0,0 +1,87 @@
[
{
"barVD": "[150,625]",
"brR_one": "[1366, 768]",
"brR_two": "[1366, 728]"
},
{
"barVD": "[886,635]",
"brR_one": "[1366,768]",
"brR_two": "[1366,738]"
},
{
"barVD": "[1560,219]",
"brR_one": "[1600,900]",
"brR_two": "[1600,860]"
},
{
"barVD": "[1366,225]",
"brR_one": "[1366,768]",
"brR_two": "[1366,768]"
},
{
"barVD": "[1366,209]",
"brR_one": "[1366,768]",
"brR_two": "[1366,768]"
},
{
"barVD": "[265,689]",
"brR_one": "[1280,800]",
"brR_two": "[1280,760]"
},
{
"barVD": "[1440,264]",
"brR_one": "[1440,900]",
"brR_two": "[1440,877]"
},
{
"barVD": "[800,150]",
"brR_one": "[800,600]",
"brR_two": "[800,560]"
},
{
"barVD": "[1024,318]",
"brR_one": "[1024,768]",
"brR_two": "[1024,728]"
},
{
"barVD": "[1280,150]",
"brR_one": "[1280,600]",
"brR_two": "[1280,560]"
},
{
"barVD": "[1280,150]",
"brR_one": "[1280,600]",
"brR_two": "[1280,600]"
},
{
"barVD": "[1280,270]",
"brR_one": "[1280,720]",
"brR_two": "[1280,680]"
},
{
"barVD": "[1280,161]",
"brR_one": "[1280,720]",
"brR_two": "[1280,720]"
},
{
"barVD": "[1280,198]",
"brR_one": "[1280,768]",
"brR_two": "[1280,728]"
},
{
"barVD": "[1280,209]",
"brR_one": "[1280,768]",
"brR_two": "[1280,768]"
},
{
"barVD": "[1360,198]",
"brR_one": "[1360,768]",
"brR_two": "[1360,728]"
},
{
"barVD": "[1360,209]",
"brR_one": "[1360,768]",
"brR_two": "[1360,768]"
}
]
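Each entry stores three viewport/screen arrays as strings; common.py picks one row at random via pandas and eval. A lighter-weight sketch that does the same with only the standard library (the relative path is an assumption about the working directory, and note the key in the file is spelled "barVD"):

# Sketch: pick a random fingerprint entry from config/br.json without pandas or eval.
import json
import os
import random

def random_fingerprint(path=os.path.join('config', 'br.json')):
    with open(path, encoding='utf-8') as f:
        entries = json.load(f)
    entry = random.choice(entries)
    # each field is itself a JSON array stored as a string, e.g. "[1366,768]"
    return (json.loads(entry['barVD']),
            json.loads(entry['brR_one']),
            json.loads(entry['brR_two']))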

config/config.py (+34)

@@ -0,0 +1,34 @@
import configparser
import logging
from logging.handlers import TimedRotatingFileHandler
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(BASE_DIR)
cf = configparser.ConfigParser()
config_path = BASE_DIR + "/config.ini"
if not os.path.exists(config_path):
    raise Exception("Config file %s does not exist" % config_path)
cf.read(config_path, encoding='utf-8')
logFile = cf.get('file', 'logFile')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# Attach a console handler and an hourly rotating file handler to the root logger
def init():
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
    # log to the console
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(log_format)
    logger.addHandler(streamHandler)
    logpath = BASE_DIR + "/log/"
    print(logpath)
    if not os.path.exists(logpath):
        os.mkdir(logpath)
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1,
                                                        encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)
    logger.addHandler(timedRotatingFileHandler)

config/log.py (+21)

@@ -0,0 +1,21 @@
import time

from config.config import init
from config.config import logger

start = int(time.time())
init()


def getRunTimeInt():
    return (int(time.time()) - start)


def getRunTime():
    return 'The program has been running for %ds' % (int(time.time()) - start)


def writeInfo(msg):
    logger.info('%s\t(%s)' % (msg, getRunTime()))


def writeError(msg):
    logger.error('%s\t(%s)' % (msg, getRunTime()))

db.sql (+14)

@@ -0,0 +1,14 @@
create table meishi(
    poiId int not null,
    frontImg varchar(128) not null,
    title varchar(128) not null,
    avgScore float not null,
    allCommentNum int not null,
    address varchar(128) not null,
    avgPrice int not null,
    hasAds tinyint not null,
    adsClickUrl varchar(2048),
    adsShowUrl varchar(2048),
    constraint meishi_pk
        primary key (poiId)
);
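The crawler assumes this table already exists in the meishi database named in config.ini. A one-off sketch that creates it by running db.sql through the MysqlDB class added in main.py below (create_table is hypothetical and assumes the meishi database itself has already been created):

# Hypothetical setup helper: create the meishi table from db.sql before the first crawl.
from main import MysqlDB

def create_table(sql_path='db.sql'):
    with open(sql_path, encoding='utf-8') as f:
        ddl = f.read()
    connection = MysqlDB().connect()
    try:
        with connection.cursor() as cursor:
            cursor.execute(ddl)
        connection.commit()
    finally:
        connection.close()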

main.py (+123)

@@ -0,0 +1,123 @@
import json
import math
import time
from urllib.parse import urlencode

from bs4 import BeautifulSoup, Tag

from common import getSession, CityParam, zh, en, href, TIMEOUT, totalCounts, poiInfos, data
from config.config import cf
from config.log import writeInfo, writeError
import pymysql


# MySQL database access
class MysqlDB:

    # Open a connection using the [mysql] section of config.ini
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute an insert statement
    def insert(self, sql, params):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create a new record
                cursor.execute(sql, params)
            # connection is not autocommit by default, so you must commit to save your changes.
            connection.commit()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()


# Meituan food crawler
class MeiTuanCrawler:
    # URL of the city list page
    CITY_URL = 'https://www.meituan.com/changecity/'
    features = 'lxml'

    def __init__(self) -> None:
        super().__init__()
        # request session
        self.session = getSession()
        # Meituan city map
        self.cityMap = {}

    # Crawl the city list
    def getCity(self):
        res = self.session.get(MeiTuanCrawler.CITY_URL)
        if res.status_code == 200:
            writeInfo("Parsing the city list")
            # parse the DOM
            html = BeautifulSoup(res.content, MeiTuanCrawler.features)
            # collect the city links
            city_a = html.select('a[class="link city"]')
            for city in city_a:
                if type(city) == Tag:
                    # build the city's food page URL
                    self.cityMap[city.text] = {href: 'https:{0}/meishi'.format(city[href]), zh: city.text,
                                               en: str(city[href]).split('.')[0].replace('//', '')}
            writeInfo('Collected {0} Meituan city links in total'.format(len(self.cityMap)))
        else:
            writeError("Failed to crawl the city list, status code: {0}".format(res.status_code))

    # Crawl the food listings of every city
    def meishi(self):
        for city in self.cityMap.keys():
            # build the request parameters
            c = CityParam(self.cityMap[city])
            GET_PARAM = c.getParam()
            totalPage = None
            # current page number
            GET_PARAM['page'] = 1
            # crawl the food list pages
            while totalPage is None or GET_PARAM['page'] <= totalPage:
                # add the _token parameter
                GET_PARAM["_token"] = c.encrypt_token()
                writeInfo("param:{}".format(GET_PARAM))
                # build the food API URL for this city
                base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
                url = base_url + urlencode(GET_PARAM, encoding='utf-8')
                writeInfo("API URL: {}".format(url))
                writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
                # request the food data
                res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
                # parse the JSON response
                if res.status_code == 200 and 'json' in res.headers['Content-Type']:
                    jsonres = json.loads(res.content)
                    writeInfo("Response JSON: {}".format(json.dumps(jsonres, indent=1)))
                    # validate the response structure
                    if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
                        # work out the total number of pages
                        if totalPage is None:
                            totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
                            writeInfo('{} has {} pages of food data in total'.format(self.cityMap[city][zh], totalPage))
                        # iterate over every merchant on this page
                        for poiInfo in jsonres[data][poiInfos]:
                            writeInfo('Merchant data: {}'.format(json.dumps(poiInfo, indent=1)))
                            db = MysqlDB()
                            # save to the MySQL database
                            db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                                      'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                                      'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                      (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                                       str(poiInfo['avgScore']),
                                       str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                                       poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
                    else:
                        writeError("Failed to parse merchant data from JSON: {}".format(json.dumps(jsonres, indent=1)))
                else:
                    writeError("Failed to parse JSON from {}: {}".format(url, res.content))
                GET_PARAM['page'] = GET_PARAM['page'] + 1
                # throttle the request rate
                time.sleep(1)

run.py (+5)

@@ -0,0 +1,5 @@
from main import MeiTuanCrawler, MysqlDB
c = MeiTuanCrawler()
c.getCity()
c.meishi()
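run.py crawls every city returned by getCity(). For a quick smoke test it can help to trim cityMap to a single entry first; a sketch under the assumption that '北京' is one of the keys getCity() collects (keys are the Chinese city names):

# Sketch: limit the crawl to one city before attempting a full run.
from main import MeiTuanCrawler

c = MeiTuanCrawler()
c.getCity()
c.cityMap = {k: v for k, v in c.cityMap.items() if k == '北京'}
c.meishi()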