From ebe2382d4ab3ac1d0ddd9a25df110b2530957710 Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Wed, 1 Jul 2020 16:18:33 +0800
Subject: [PATCH] init

---
 .gitignore       | 144 ++++++++++++++++++++++++++++++++++++++++++
 common.py        | 160 +++++++++++++++++++++++++++++++++++++++++++++++
 config.ini       |  18 ++++++
 config/br.json   |  87 ++++++++++++++++++++++++++
 config/config.py |  34 ++++++++++
 config/log.py    |  21 +++++++
 db.sql           |  14 +++++
 main.py          | 123 ++++++++++++++++++++++++++++++++++++
 run.py           |   5 ++
 9 files changed, 606 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 common.py
 create mode 100644 config.ini
 create mode 100644 config/br.json
 create mode 100644 config/config.py
 create mode 100644 config/log.py
 create mode 100644 db.sql
 create mode 100644 main.py
 create mode 100644 run.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..47c4f6e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,144 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+.idea
+
+/log/
diff --git a/common.py b/common.py
new file mode 100644
index 0000000..3790171
--- /dev/null
+++ b/common.py
@@ -0,0 +1,160 @@
+# Helpers for building Meituan API requests
+import base64
+import json
+import os
+import random
+import re
+import time
+import zlib
+
+import pandas
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3 import Retry
+
+
+# Build a request session with automatic connection retries
+def getSession():
+    session = requests.session()
+    retry = Retry(connect=3, backoff_factor=0.5)
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+href = 'href'
+zh = 'zh'
+en = 'en'
+
+
+# Per-city request parameters
+class CityParam:
+
+    def __init__(self, city) -> None:
+        super().__init__()
+        self.city = city
+        self.uuid = self.get_uuid()
+        self.data = self.getData()
+        self.param = self.getParam()
+
+    # Build the request headers
+    def getHeaders(self):
+        return {
+            "Accept": "application/json",
+            "Referer": "https://{}.meituan.com/".format(self.city[en]),
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
+        }
+
+    # Fetch the uuid request parameter
+    def get_uuid(self):
+        """Scrape the uuid embedded in the city homepage"""
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
+        }
+        res = requests.get(self.city[href], headers=headers).text
+        return re.findall(r'"uuid":"(.*?)"', res, re.S)[0]
+
+    # Initialise the parameter dict
+    def getData(self):
+        return {
+            "cityName": self.city[zh],
+            "cateId": '0',
+            "areaId": "0",
+            "sort": "",
+            "dinnerCountAttrId": "",
+            "page": "1",
+            "userId": "",
+            "uuid": self.uuid,
+            "platform": "1",
+            "partner": "126",
+            "originUrl": href,
+            "riskLevel": "1",
+            "optimusCode": "1"
+        }
+
+    # Build the request parameters (a shallow copy of the parameter dict)
+    def getParam(self):
+        return dict(self.data)
+
+    # Generate the sign request parameter
+    def sign(self):
+        """Generate the sign parameter"""
+        # Canonical query string with keys in alphabetical order
+        SIGN_PARAM = "areaId={}&cateId={}&cityName={}&dinnerCountAttrId={}&optimusCode={}&originUrl={}&page={}&partner={}&platform={}&riskLevel={}&sort={}&userId={}&uuid={}".format(
+            self.data["areaId"],
+            self.data["cateId"],
+            self.data["cityName"],
+            self.data["dinnerCountAttrId"],
+            self.data["optimusCode"],
+            self.data["originUrl"],
+            self.data["page"],
+            self.data["partner"],
+            self.data["platform"],
+            self.data["riskLevel"],
+            self.data["sort"],
+            self.data["userId"],
+            self.data["uuid"]
+        )
+        # zlib-compress the query string
+        binary_data = zlib.compress(SIGN_PARAM.encode())
+        # base64-encode the compressed bytes
+        base64_data = base64.b64encode(binary_data)
+        # Return a utf-8 string
+        return base64_data.decode()
+
+    # Generate the _token request parameter
+    def encrypt_token(self):
+        """Generate the _token parameter"""
+        ts = int(time.time() * 1000)  # time.time() returns seconds since the epoch
+        # Fake a browser fingerprint; build the path portably instead of hardcoding '\\'
+        json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config', 'br.json')
+        df = pandas.read_json(json_path)
+        # Pick a random fingerprint row; the fields are JSON-encoded arrays
+        brVD, brR_one, brR_two = df.iloc[random.randint(0, len(df) - 1)]
+        token_data = {
+            "rId": 100900,
+            "ver": "1.0.6",
+            "ts": ts,
+            "cts": ts + random.randint(100, 120),  # observed cts - ts deltas fall roughly between 90 and 130
+            "brVD": json.loads(brVD),
+            "brR": [json.loads(brR_one), json.loads(brR_two), 24, 24],
+            "bI": [self.city[href], ""],
+            "mT": [],
+            "kT": [],
+            "aT": [],
+            "tT": [],
+            "aM": "",
+            "sign": self.sign()
+        }
+        # zlib-compress, then base64-encode, like sign()
+        binary_data = zlib.compress(str(token_data).encode())
+        base64_data = base64.b64encode(binary_data)
+        return base64_data.decode()
+
+
+# Seconds to wait for the server to send data
+TIMEOUT = 5
+data = 'data'
+# Key for the total record count
+totalCounts = 'totalCounts'
+# Key for the merchant list
+poiInfos = 'poiInfos'
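
Note: the sign and _token values built above are nothing more than zlib-compressed,
base64-encoded text, so the encoding is reversible. A minimal standard-library
round-trip sketch (the payload string is an arbitrary example):

    import base64
    import zlib

    payload = "areaId=0&cateId=0&cityName=北京&page=1"  # any canonical query string
    token = base64.b64encode(zlib.compress(payload.encode())).decode()
    # decoding reverses both steps and recovers the original text
    assert zlib.decompress(base64.b64decode(token)).decode() == payload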
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000..4aedba6
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,18 @@
+;Logging configuration
+[file]
+;Log file name
+logFile = log.txt
+;MySQL database configuration
+[mysql]
+;Database server address
+host=localhost
+;User
+user=sukura
+;Password
+passwd=123456
+;Database name
+db=meishi
+;Port
+port=3306
+;Connection charset
+charset=utf8
\ No newline at end of file
diff --git a/config/br.json b/config/br.json
new file mode 100644
index 0000000..1ecccc5
--- /dev/null
+++ b/config/br.json
@@ -0,0 +1,87 @@
+[
+  {
+    "brVD": "[150,625]",
+    "brR_one": "[1366,768]",
+    "brR_two": "[1366,728]"
+  },
+  {
+    "brVD": "[886,635]",
+    "brR_one": "[1366,768]",
+    "brR_two": "[1366,738]"
+  },
+  {
+    "brVD": "[1560,219]",
+    "brR_one": "[1600,900]",
+    "brR_two": "[1600,860]"
+  },
+  {
+    "brVD": "[1366,225]",
+    "brR_one": "[1366,768]",
+    "brR_two": "[1366,768]"
+  },
+  {
+    "brVD": "[1366,209]",
+    "brR_one": "[1366,768]",
+    "brR_two": "[1366,768]"
+  },
+  {
+    "brVD": "[265,689]",
+    "brR_one": "[1280,800]",
+    "brR_two": "[1280,760]"
+  },
+  {
+    "brVD": "[1440,264]",
+    "brR_one": "[1440,900]",
+    "brR_two": "[1440,877]"
+  },
+  {
+    "brVD": "[800,150]",
+    "brR_one": "[800,600]",
+    "brR_two": "[800,560]"
+  },
+  {
+    "brVD": "[1024,318]",
+    "brR_one": "[1024,768]",
+    "brR_two": "[1024,728]"
+  },
+  {
+    "brVD": "[1280,150]",
+    "brR_one": "[1280,600]",
+    "brR_two": "[1280,560]"
+  },
+  {
+    "brVD": "[1280,150]",
+    "brR_one": "[1280,600]",
+    "brR_two": "[1280,600]"
+  },
+  {
+    "brVD": "[1280,270]",
+    "brR_one": "[1280,720]",
+    "brR_two": "[1280,680]"
+  },
+  {
+    "brVD": "[1280,161]",
+    "brR_one": "[1280,720]",
+    "brR_two": "[1280,720]"
+  },
+  {
+    "brVD": "[1280,198]",
+    "brR_one": "[1280,768]",
+    "brR_two": "[1280,728]"
+  },
+  {
+    "brVD": "[1280,209]",
+    "brR_one": "[1280,768]",
+    "brR_two": "[1280,768]"
+  },
+  {
+    "brVD": "[1360,198]",
+    "brR_one": "[1360,768]",
+    "brR_two": "[1360,728]"
+  },
+  {
+    "brVD": "[1360,209]",
+    "brR_one": "[1360,768]",
+    "brR_two": "[1360,768]"
+  }
+]
\ No newline at end of file
diff --git a/config/config.py b/config/config.py
new file mode 100644
index 0000000..93fdc06
--- /dev/null
+++ b/config/config.py
@@ -0,0 +1,34 @@
+import configparser
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+cf = configparser.ConfigParser()
+config_path = BASE_DIR + "/config.ini"
+if not os.path.exists(config_path):
+    raise Exception("Config file %s does not exist" % config_path)
+cf.read(config_path, encoding='utf-8')
+logFile = cf.get('file', 'logFile')
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+
+def init():
+    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
+    # Log to the console
+    streamHandler = logging.StreamHandler()
+    streamHandler.setFormatter(log_format)
+    logger.addHandler(streamHandler)
+
+    # Log to a file under log/, rotated every hour
+    logpath = BASE_DIR + "/log/"
+    if not os.path.exists(logpath):
+        os.mkdir(logpath)
+
+    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1,
+                                                        encoding='utf-8')
+    timedRotatingFileHandler.setFormatter(log_format)
+
+    logger.addHandler(timedRotatingFileHandler)
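
Note: a minimal usage sketch of the configuration module above (the call
sequence is an assumption; config/log.py below calls init() on import):

    from config.config import cf, init, logger

    init()                          # attach console + hourly-rotating file handlers
    logger.info("handlers ready")   # written to stdout and to log/all.log
    print(cf.get('mysql', 'host'))  # reads config.ini -> "localhost"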
diff --git a/config/log.py b/config/log.py
new file mode 100644
index 0000000..d9929a9
--- /dev/null
+++ b/config/log.py
@@ -0,0 +1,21 @@
+import time
+
+from config.config import init
+from config.config import logger
+
+start = int(time.time())
+init()
+
+
+def getRunTimeInt():
+    return int(time.time()) - start
+
+
+def getRunTime():
+    return 'The program has been running for %d seconds' % (int(time.time()) - start)
+
+
+def writeInfo(msg):
+    logger.info('%s\t(%s)' % (msg, getRunTime()))
+
+
+def writeError(msg):
+    logger.error('%s\t(%s)' % (msg, getRunTime()))
diff --git a/db.sql b/db.sql
new file mode 100644
index 0000000..e2f47c7
--- /dev/null
+++ b/db.sql
@@ -0,0 +1,14 @@
+create table meishi(
+    poiId int not null,
+    frontImg varchar(128) not null,
+    title varchar(128) not null,
+    avgScore float not null,
+    allCommentNum int not null,
+    address varchar(128) not null,
+    avgPrice int not null,
+    hasAds tinyint not null,
+    adsClickUrl varchar(2048),
+    adsShowUrl varchar(2048),
+    constraint meishi_pk
+        primary key (poiId)
+)
\ No newline at end of file
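
Note: because poiId is the primary key above, re-crawling a city raises
duplicate-key errors that MysqlDB.insert (below) only logs. One possible
hardening, not part of this patch, is an upsert statement in its place:

    # hypothetical variant of the insert used in main.py
    UPSERT_SQL = (
        'insert into meishi (poiId, frontImg, title, avgScore, allCommentNum, '
        'address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
        'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) '
        'on duplicate key update avgScore=values(avgScore), '
        'allCommentNum=values(allCommentNum), avgPrice=values(avgPrice)'
    )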
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..1bd1d41
--- /dev/null
+++ b/main.py
@@ -0,0 +1,123 @@
+import json
+import math
+import time
+from urllib.parse import urlencode
+
+import pymysql
+from bs4 import BeautifulSoup, Tag
+
+from common import getSession, CityParam, zh, en, href, TIMEOUT, totalCounts, poiInfos, data
+from config.config import cf
+from config.log import writeInfo, writeError
+
+
+# MySQL database helper
+class MysqlDB:
+    # Open a connection
+    def connect(self):
+        mysql = 'mysql'
+        host = cf.get(mysql, 'host')
+        user = cf.get(mysql, 'user')
+        passwd = cf.get(mysql, 'passwd')
+        db = cf.get(mysql, 'db')
+        port = int(cf.get(mysql, 'port'))
+        charset = cf.get(mysql, 'charset')
+        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)
+
+    # Execute an insert statement
+    def insert(self, sql, params):
+        connection = self.connect()
+        try:
+            with connection.cursor() as cursor:
+                # Create a new record
+                cursor.execute(sql, params)
+            # The connection is not autocommit by default, so commit to persist the change
+            connection.commit()
+        except Exception as e:
+            writeError(e)
+        finally:
+            connection.close()
+
+
+# Meituan food crawler
+class MeiTuanCrawler:
+    # City list URL
+    CITY_URL = 'https://www.meituan.com/changecity/'
+    features = 'lxml'
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Request session
+        self.session = getSession()
+        # Map of Meituan cities
+        self.cityMap = {}
+
+    # Crawl the city list
+    def getCity(self):
+        res = self.session.get(MeiTuanCrawler.CITY_URL)
+        if res.status_code == 200:
+            writeInfo("Parsing the city list")
+            # Parse the DOM
+            html = BeautifulSoup(res.content, MeiTuanCrawler.features)
+            # Collect the city links
+            city_a = html.select('a[class="link city"]')
+            for city in city_a:
+                if type(city) == Tag:
+                    # Build the city food URL
+                    self.cityMap[city.text] = {href: 'https:{0}/meishi'.format(city[href]), zh: city.text,
+                                               en: str(city[href]).split('.')[0].replace('//', '')}
+            writeInfo('Collected {0} Meituan city links in total'.format(len(self.cityMap)))
+        else:
+            writeError("Failed to crawl city links, status code: {0}".format(res.status_code))
+
+    # Crawl the food listings of every city
+    def meishi(self):
+        for city in self.cityMap.keys():
+            # Build the request parameters
+            c = CityParam(self.cityMap[city])
+            GET_PARAM = c.getParam()
+            totalPage = None
+            # Current page number
+            GET_PARAM['page'] = 1
+            # Crawl the food list page by page
+            while totalPage is None or GET_PARAM['page'] <= totalPage:
+                # Add the _token parameter
+                GET_PARAM["_token"] = c.encrypt_token()
+                writeInfo("param:{}".format(GET_PARAM))
+                # Build the city food API URL
+                base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
+                url = base_url + urlencode(GET_PARAM, encoding='utf-8')
+                writeInfo("API URL: {}".format(url))
+                writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
+                # Request the food data
+                res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
+                # Parse the JSON response
+                if res.status_code == 200 and 'json' in res.headers['Content-Type']:
+                    jsonres = json.loads(res.content)
+                    writeInfo("Response JSON: {}".format(json.dumps(jsonres, indent=1)))
+                    # Validate the response structure
+                    if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
+                        # Work out the page count from the first response
+                        if totalPage is None:
+                            totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
+                            writeInfo('The {} food list has {} pages in total'.format(self.cityMap[city][zh], totalPage))
+                        # Walk through every merchant on this page
+                        for poiInfo in jsonres[data][poiInfos]:
+                            writeInfo('Merchant data: {}'.format(json.dumps(poiInfo, indent=1)))
+                            db = MysqlDB()
+                            # Save to the MySQL database
+                            db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
+                                      'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
+                                      'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+                                      (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
+                                       str(poiInfo['avgScore']),
+                                       str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
+                                       poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
+                    else:
+                        writeError("Failed to parse merchant data from JSON: {}".format(json.dumps(jsonres, indent=1)))
+                else:
+                    writeError("Failed to parse JSON from {}: {}".format(url, res.content))
+                GET_PARAM['page'] = GET_PARAM['page'] + 1
+                # Throttle the request rate
+                time.sleep(1)
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..4f32558
--- /dev/null
+++ b/run.py
@@ -0,0 +1,5 @@
+from main import MeiTuanCrawler
+
+c = MeiTuanCrawler()
+c.getCity()
+c.meishi()
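
Note: run.py crawls every city returned by getCity(). A hypothetical variation
(not in this patch) that limits the crawl to a single city by pruning cityMap:

    from main import MeiTuanCrawler

    c = MeiTuanCrawler()
    c.getCity()
    # cityMap is keyed by each city's Chinese name; keep a single entry
    c.cityMap = {k: v for k, v in c.cityMap.items() if k == '北京'}
    c.meishi()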