master
pan 4 years ago
commit ebe2382d4a
  1. .gitignore (+144)
  2. common.py (+160)
  3. config.ini (+18)
  4. config/br.json (+87)
  5. config/config.py (+34)
  6. config/log.py (+21)
  7. db.sql (+14)
  8. main.py (+123)
  9. run.py (+5)

.gitignore vendored (+144)

@@ -0,0 +1,144 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.idea
/log/

common.py (+160)

@@ -0,0 +1,160 @@
# Shared helpers: request session, city request parameters, sign/_token generation
import base64
import os
import random
import re
import time
import zlib

import pandas
import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry


# Build a requests session with retry support
def getSession():
    session = requests.session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


href = 'href'
zh = 'zh'
en = 'en'


# Per-city request parameter builder
class CityParam:

    def __init__(self, city) -> None:
        super().__init__()
        self.city = city
        self.uuid = self.get_uuid()
        self.data = self.getData()
        self.param = self.getParam()

    # Build the request headers
    def getHeaders(self):
        return {
            "Accept": "application/json",
            "Referer": "https://{}.meituan.com/".format(self.city[en]),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
        }

    # Fetch the uuid request parameter
    def get_uuid(self):
        """Extract the uuid from the city's food page."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
        }
        res = requests.get(self.city[href], headers=headers).text
        return re.findall(r'"uuid":"(.*?)"', res, re.S)[0]

    # Initialise the parameter dict
    def getData(self):
        return {
            "cityName": self.city[zh],
            "cateId": '0',
            "areaId": "0",
            "sort": "",
            "dinnerCountAttrId": "",
            "page": "1",
            "userId": "",
            "uuid": self.uuid,
            "platform": "1",
            "partner": "126",
            "originUrl": href,
            "riskLevel": "1",
            "optimusCode": "1"
        }

    # Initialise the request parameters
    def getParam(self):
        return {
            "cityName": self.data["cityName"],
            "cateId": self.data["cateId"],
            "areaId": self.data["areaId"],
            "sort": self.data["sort"],
            "dinnerCountAttrId": self.data["dinnerCountAttrId"],
            "page": self.data["page"],
            "userId": self.data["userId"],
            "uuid": self.data["uuid"],
            "platform": self.data["platform"],
            "partner": self.data["partner"],
            "originUrl": self.data["originUrl"],
            "riskLevel": self.data["riskLevel"],
            "optimusCode": self.data["optimusCode"],
        }

    # Generate the sign request parameter
    def sign(self):
        """Generate the sign parameter."""
        # default encoding
        # coding = sys.getdefaultencoding()
        SIGN_PARAM = "areaId={}&cateId={}&cityName={}&dinnerCountAttrId={}&optimusCode={}&originUrl={}&page={}&partner={}&platform={}&riskLevel={}&sort={}&userId={}&uuid={}".format(
            self.data["areaId"],
            self.data["cateId"],
            self.data["cityName"],
            self.data["dinnerCountAttrId"],
            self.data["optimusCode"],
            self.data["originUrl"],
            self.data["page"],
            self.data["partner"],
            self.data["platform"],
            self.data["riskLevel"],
            self.data["sort"],
            self.data["userId"],
            self.data["uuid"]
        )
        # zlib-compress the parameter string
        binary_data = zlib.compress(SIGN_PARAM.encode())
        # base64-encode
        base64_data = base64.b64encode(binary_data)
        # return a utf-8 string
        return base64_data.decode()

    # Generate the _token request parameter
    def encrypt_token(self):
        """Generate the _token parameter."""
        ts = int(time.time() * 1000)  # time.time() returns seconds since the epoch
        # fake a browser window/screen fingerprint
        json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config', 'br.json')
        df = pandas.read_json(json_path)
        brVD, brR_one, brR_two = df.iloc[random.randint(0, len(df) - 1)]
        token_data = {
            "rId": 100900,
            "ver": "1.0.6",
            "ts": ts,
            "cts": ts + random.randint(100, 120),  # empirically, cts - ts is roughly 90-130
            # "cts": ts + 100,
            "brVD": eval(brVD),
            "brR": [eval(brR_one), eval(brR_two), 24, 24],
            "bI": [self.city[href], ""],
            "mT": [],
            "kT": [],
            "aT": [],
            "tT": [],
            "aM": "",
            "sign": self.sign()
        }
        # zlib-compress the token dict
        binary_data = zlib.compress(str(token_data).encode())
        # base64-encode
        base64_data = base64.b64encode(binary_data)
        return base64_data.decode()


# seconds to wait for the server to send data
TIMEOUT = 5
data = 'data'
# key for the total record count
totalCounts = 'totalCounts'
# key for the merchant list
poiInfos = 'poiInfos'
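The _token above is simply the Python repr of token_data, zlib-compressed and base64-encoded. A minimal sketch for inspecting a generated token by reversing those two steps (decode_token is a hypothetical helper, not part of this commit):

# Hypothetical helper: reverse the base64 + zlib steps used by
# CityParam.sign() / encrypt_token() to inspect a generated _token.
import ast
import base64
import zlib

def decode_token(token):
    raw = zlib.decompress(base64.b64decode(token))
    # str(token_data) produced a Python dict literal, so literal_eval restores it
    return ast.literal_eval(raw.decode())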

config.ini (+18)

@@ -0,0 +1,18 @@
; logging configuration
[file]
; log file name
logFile = log.txt
; MySQL database configuration
[mysql]
# database server host
host=localhost
# user name
user=sukura
# password
passwd=123456
# database name
db=meishi
# port
port=3306
# connection charset
charset=utf8

config/br.json (+87)

@@ -0,0 +1,87 @@
[
{
"barVD": "[150,625]",
"brR_one": "[1366, 768]",
"brR_two": "[1366, 728]"
},
{
"barVD": "[886,635]",
"brR_one": "[1366,768]",
"brR_two": "[1366,738]"
},
{
"barVD": "[1560,219]",
"brR_one": "[1600,900]",
"brR_two": "[1600,860]"
},
{
"barVD": "[1366,225]",
"brR_one": "[1366,768]",
"brR_two": "[1366,768]"
},
{
"barVD": "[1366,209]",
"brR_one": "[1366,768]",
"brR_two": "[1366,768]"
},
{
"barVD": "[265,689]",
"brR_one": "[1280,800]",
"brR_two": "[1280,760]"
},
{
"barVD": "[1440,264]",
"brR_one": "[1440,900]",
"brR_two": "[1440,877]"
},
{
"barVD": "[800,150]",
"brR_one": "[800,600]",
"brR_two": "[800,560]"
},
{
"barVD": "[1024,318]",
"brR_one": "[1024,768]",
"brR_two": "[1024,728]"
},
{
"barVD": "[1280,150]",
"brR_one": "[1280,600]",
"brR_two": "[1280,560]"
},
{
"barVD": "[1280,150]",
"brR_one": "[1280,600]",
"brR_two": "[1280,600]"
},
{
"barVD": "[1280,270]",
"brR_one": "[1280,720]",
"brR_two": "[1280,680]"
},
{
"barVD": "[1280,161]",
"brR_one": "[1280,720]",
"brR_two": "[1280,720]"
},
{
"barVD": "[1280,198]",
"brR_one": "[1280,768]",
"brR_two": "[1280,728]"
},
{
"barVD": "[1280,209]",
"brR_one": "[1280,768]",
"brR_two": "[1280,768]"
},
{
"barVD": "[1360,198]",
"brR_one": "[1360,768]",
"brR_two": "[1360,728]"
},
{
"barVD": "[1360,209]",
"brR_one": "[1360,768]",
"brR_two": "[1360,768]"
}
]
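Each entry stores three viewport/screen arrays as strings; common.py picks one row at random via pandas and eval. A lighter-weight sketch that does the same with only the standard library (the relative path is an assumption about the working directory, and note the key in the file is spelled "barVD"):

# Sketch: pick a random fingerprint entry from config/br.json without pandas or eval.
import json
import os
import random

def random_fingerprint(path=os.path.join('config', 'br.json')):
    with open(path, encoding='utf-8') as f:
        entries = json.load(f)
    entry = random.choice(entries)
    # each field is itself a JSON array stored as a string, e.g. "[1366,768]"
    return (json.loads(entry['barVD']),
            json.loads(entry['brR_one']),
            json.loads(entry['brR_two']))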

config/config.py (+34)

@@ -0,0 +1,34 @@
import configparser
import logging
from logging.handlers import TimedRotatingFileHandler
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(BASE_DIR)
cf = configparser.ConfigParser()
config_path = BASE_DIR + "/config.ini"
if not os.path.exists(config_path):
    raise Exception("Config file %s does not exist" % config_path)
cf.read(config_path, encoding='utf-8')
logFile = cf.get('file', 'logFile')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# Attach a console handler and an hourly rotating file handler to the root logger
def init():
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
    # log to the console
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(log_format)
    logger.addHandler(streamHandler)
    logpath = BASE_DIR + "/log/"
    print(logpath)
    if not os.path.exists(logpath):
        os.mkdir(logpath)
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1,
                                                        encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)
    logger.addHandler(timedRotatingFileHandler)

config/log.py (+21)

@@ -0,0 +1,21 @@
import time

from config.config import init
from config.config import logger

start = int(time.time())
init()


def getRunTimeInt():
    return (int(time.time()) - start)


def getRunTime():
    return 'The program has been running for %ds' % (int(time.time()) - start)


def writeInfo(msg):
    logger.info('%s\t(%s)' % (msg, getRunTime()))


def writeError(msg):
    logger.error('%s\t(%s)' % (msg, getRunTime()))

db.sql (+14)

@@ -0,0 +1,14 @@
create table meishi(
    poiId int not null,
    frontImg varchar(128) not null,
    title varchar(128) not null,
    avgScore float not null,
    allCommentNum int not null,
    address varchar(128) not null,
    avgPrice int not null,
    hasAds tinyint not null,
    adsClickUrl varchar(2048),
    adsShowUrl varchar(2048),
    constraint meishi_pk
        primary key (poiId)
);
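The crawler assumes this table already exists in the meishi database named in config.ini. A one-off sketch that creates it by running db.sql through the MysqlDB class added in main.py below (create_table is hypothetical and assumes the meishi database itself has already been created):

# Hypothetical setup helper: create the meishi table from db.sql before the first crawl.
from main import MysqlDB

def create_table(sql_path='db.sql'):
    with open(sql_path, encoding='utf-8') as f:
        ddl = f.read()
    connection = MysqlDB().connect()
    try:
        with connection.cursor() as cursor:
            cursor.execute(ddl)
        connection.commit()
    finally:
        connection.close()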

main.py (+123)

@@ -0,0 +1,123 @@
import json
import math
import time
from urllib.parse import urlencode

from bs4 import BeautifulSoup, Tag

from common import getSession, CityParam, zh, en, href, TIMEOUT, totalCounts, poiInfos, data
from config.config import cf
from config.log import writeInfo, writeError
import pymysql


# MySQL database access
class MysqlDB:

    # Open a connection using the [mysql] section of config.ini
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute an insert statement
    def insert(self, sql, params):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create a new record
                cursor.execute(sql, params)
            # connection is not autocommit by default, so you must commit to save your changes.
            connection.commit()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()


# Meituan food crawler
class MeiTuanCrawler:
    # URL of the city list page
    CITY_URL = 'https://www.meituan.com/changecity/'
    features = 'lxml'

    def __init__(self) -> None:
        super().__init__()
        # request session
        self.session = getSession()
        # Meituan city map
        self.cityMap = {}

    # Crawl the city list
    def getCity(self):
        res = self.session.get(MeiTuanCrawler.CITY_URL)
        if res.status_code == 200:
            writeInfo("Parsing the city list")
            # parse the DOM
            html = BeautifulSoup(res.content, MeiTuanCrawler.features)
            # collect the city links
            city_a = html.select('a[class="link city"]')
            for city in city_a:
                if type(city) == Tag:
                    # build the city's food page URL
                    self.cityMap[city.text] = {href: 'https:{0}/meishi'.format(city[href]), zh: city.text,
                                               en: str(city[href]).split('.')[0].replace('//', '')}
            writeInfo('Collected {0} Meituan city links in total'.format(len(self.cityMap)))
        else:
            writeError("Failed to crawl the city list, status code: {0}".format(res.status_code))

    # Crawl the food listings of every city
    def meishi(self):
        for city in self.cityMap.keys():
            # build the request parameters
            c = CityParam(self.cityMap[city])
            GET_PARAM = c.getParam()
            totalPage = None
            # current page number
            GET_PARAM['page'] = 1
            # crawl the food list pages
            while totalPage is None or GET_PARAM['page'] <= totalPage:
                # add the _token parameter
                GET_PARAM["_token"] = c.encrypt_token()
                writeInfo("param:{}".format(GET_PARAM))
                # build the food API URL for this city
                base_url = "https://{}.meituan.com/meishi/api/poi/getPoiList?".format(self.cityMap[city][en])
                url = base_url + urlencode(GET_PARAM, encoding='utf-8')
                writeInfo("API URL: {}".format(url))
                writeInfo("Parsing {} food list, page {}".format(self.cityMap[city][zh], GET_PARAM['page']))
                # request the food data
                res = self.session.get(url, headers=c.getHeaders(), timeout=TIMEOUT)
                # parse the JSON response
                if res.status_code == 200 and 'json' in res.headers['Content-Type']:
                    jsonres = json.loads(res.content)
                    writeInfo("Response JSON: {}".format(json.dumps(jsonres, indent=1)))
                    # validate the response structure
                    if data in jsonres and totalCounts in jsonres[data] and poiInfos in jsonres[data]:
                        # work out the total number of pages
                        if totalPage is None:
                            totalPage = math.ceil(jsonres[data][totalCounts] / len(jsonres[data][poiInfos]))
                            writeInfo('{} has {} pages of food data in total'.format(self.cityMap[city][zh], totalPage))
                        # iterate over every merchant on this page
                        for poiInfo in jsonres[data][poiInfos]:
                            writeInfo('Merchant data: {}'.format(json.dumps(poiInfo, indent=1)))
                            db = MysqlDB()
                            # save to the MySQL database
                            db.insert('insert into meishi (poiId, frontImg, title, avgScore, '
                                      'allCommentNum, address, avgPrice, hasAds, adsClickUrl, adsShowUrl) '
                                      'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                      (str(poiInfo['poiId']), poiInfo['frontImg'], poiInfo['title'],
                                       str(poiInfo['avgScore']),
                                       str(poiInfo['allCommentNum']), poiInfo['address'], str(poiInfo['avgPrice']),
                                       poiInfo['hasAds'], poiInfo['adsClickUrl'], poiInfo['adsShowUrl']))
                    else:
                        writeError("Failed to parse merchant data from JSON: {}".format(json.dumps(jsonres, indent=1)))
                else:
                    writeError("Failed to parse JSON from {}: {}".format(url, res.content))
                GET_PARAM['page'] = GET_PARAM['page'] + 1
                # throttle the request rate
                time.sleep(1)

run.py (+5)

@@ -0,0 +1,5 @@
from main import MeiTuanCrawler, MysqlDB
c = MeiTuanCrawler()
c.getCity()
c.meishi()
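run.py crawls every city returned by getCity(). For a quick smoke test it can help to trim cityMap to a single entry first; a sketch under the assumption that '北京' is one of the keys getCity() collects (keys are the Chinese city names):

# Sketch: limit the crawl to one city before attempting a full run.
from main import MeiTuanCrawler

c = MeiTuanCrawler()
c.getCity()
c.cityMap = {k: v for k, v in c.cityMap.items() if k == '北京'}
c.meishi()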