commit c70bbf113d21f2b3af87a0e71299eaa31a9e9060
Author: 潘啟华 <1029559041@qq.com>
Date:   Wed Sep 11 02:28:45 2019 +0800

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d2ed069
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,131 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+/.idea/
+/log/
+/login.png
+*.xlsx
diff --git a/Crawler.py b/Crawler.py
new file mode 100644
index 0000000..4a17481
--- /dev/null
+++ b/Crawler.py
@@ -0,0 +1,48 @@
+# mobile phone crawler base class
+import gzip
+import zlib
+
+
+class MobilePhoneCrawler:
+    def __init__(self) -> None:
+        super().__init__()
+        # cap the crawl at 5000 records by default
+        self._max_count = 5000
+        # collected phone records
+        self._mobile_list = []
+
+    @property
+    def max_count(self):
+        return self._max_count
+
+    @max_count.setter
+    def max_count(self, value):
+        self._max_count = value
+
+    @property
+    def mobile_list(self):
+        return self._mobile_list
+
+    # fetch a listing page
+    def get_page(self):
+        pass
+
+    # fetch the detail data of one phone
+    def get_mobile(self, base_url, param_url, **kwargs):
+        pass
+
+    # persist one phone record
+    def save_mobile(self, mobile):
+        pass
+
+    def get_req(self, url, **kwargs):
+        pass
+
+    # decompress a response body by sniffing its leading magic bytes
+    def uzipData(self, data):
+        if data.startswith(b'\x1f\x8b'):
+            # gzip stream
+            return gzip.decompress(data)
+        elif data.startswith(b'\xec\xbd'):
+            # raw deflate stream (no zlib header)
+            return zlib.decompress(data, -zlib.MAX_WBITS)
+        else:
+            return data
\ No newline at end of file
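A quick round-trip check of the magic-byte sniffing in uzipData (illustration only, not part of the commit; the payload is made up):

import gzip

from Crawler import MobilePhoneCrawler

crawler = MobilePhoneCrawler()
payload = b'hello'                              # made-up sample data
compressed = gzip.compress(payload)
assert compressed.startswith(b'\x1f\x8b')       # gzip magic bytes
assert crawler.uzipData(compressed) == payload  # gzip branch decompresses
assert crawler.uzipData(payload) == payload     # unknown prefix passes through unchanged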
diff --git a/chromedriver.exe b/chromedriver.exe
new file mode 100644
index 0000000..0eedbe0
Binary files /dev/null and b/chromedriver.exe differ
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000..9f34528
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,23 @@
+[file]
+;log file name
+logFile = log.txt
+[selenium]
+;path to the Chrome executable
+binary_location = I:\ChromeUpdater\chrome.exe
+;path to the Chrome user data directory
+user_data_dir=I:\ChromeUpdater\User Data
+[excel]
+;workbook for the raw crawled data
+file1=excel1.xlsx
+;workbook for the cleaned data
+file2=excel2.xlsx
+;column (parameter) names; kept in Chinese because they must match the parameter names on the crawled site
+param_name=手机名称,参考价格,电商报价,上市时间,网友综合评分,屏幕尺寸,机身容量,屏幕色数,运营商支持,网络模式,SIM卡类型,WiFi,蓝牙,手机类型,机身结构,电池类型,电池更换,屏幕材质,屏幕分辨率,像素密度,触控方式,触摸特性,操作系统,CPU型号,核心数,CPU制程,运行内存,容量扩展,传感器类型,后置相机,前置相机,变焦,闪光灯,视频拍摄,拍照特性,视频格式,视频播放,音乐格式,图片格式,文档格式,GPS,感应器,USB接口,耳机接口,无线连接,日常功能,键盘类型,输入方式,输入法,包装清单
+;number of required (non-empty) parameters, counted from the left
+param_required_index=8
+;flush the buffer to the workbook once this many records accumulate
+data_size=10
+;thread pool size
+thread_count=5
+;number of records to crawl
+max_count=30
\ No newline at end of file
diff --git a/config/config.py b/config/config.py
new file mode 100644
index 0000000..d7534fe
--- /dev/null
+++ b/config/config.py
@@ -0,0 +1,31 @@
+import configparser
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+cf = configparser.ConfigParser()
+config_path = os.path.join(BASE_DIR, "config.ini")
+if not os.path.exists(config_path):
+    raise Exception("config file %s does not exist" % config_path)
+cf.read(config_path, encoding='utf-8')
+logFile = cf.get('file', 'logFile')
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+
+def init():
+    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
+    # echo log records to the console
+    streamHandler = logging.StreamHandler()
+    streamHandler.setFormatter(log_format)
+    logger.addHandler(streamHandler)
+
+    logpath = os.path.join(BASE_DIR, "log")
+    if not os.path.exists(logpath):
+        os.mkdir(logpath)
+
+    # rotate the log file every hour
+    timedRotatingFileHandler = TimedRotatingFileHandler(filename=os.path.join(logpath, "all.log"), when='H',
+                                                        interval=1, encoding='utf-8')
+    timedRotatingFileHandler.setFormatter(log_format)
+    logger.addHandler(timedRotatingFileHandler)
diff --git a/config/log.py b/config/log.py
new file mode 100644
index 0000000..d9929a9
--- /dev/null
+++ b/config/log.py
@@ -0,0 +1,21 @@
+import time
+
+from config.config import init
+from config.config import logger
+
+start = int(time.time())
+init()
+
+
+def getRunTimeInt():
+    return int(time.time()) - start
+
+
+def getRunTime():
+    return 'the program has been running for %d seconds' % (int(time.time()) - start)
+
+
+def writeInfo(msg):
+    logger.info('%s\t(%s)' % (msg, getRunTime()))
+
+
+def writeError(msg):
+    logger.error('%s\t(%s)' % (msg, getRunTime()))
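A minimal usage sketch for these logging helpers (illustration only; config.ini is resolved relative to the package, so this works from any working directory):

from config.log import writeInfo, writeError, getRunTimeInt

writeInfo('crawler started')   # INFO record on the console and in log/all.log
writeError('request failed')   # ERROR record with the same elapsed-time suffix
print(getRunTimeInt())         # whole seconds since config.log was imported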
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..90f9f06
--- /dev/null
+++ b/main.py
@@ -0,0 +1,501 @@
+import json
+import os
+import random
+import re
+import threading
+import time
+
+import requests
+import win32api
+import win32con
+from PIL import Image
+from bs4 import BeautifulSoup
+from openpyxl import load_workbook, Workbook
+from pynput.mouse import Controller, Button
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.chrome import webdriver
+from selenium.webdriver.chrome.options import Options
+
+from Crawler import MobilePhoneCrawler
+from config.config import cf, config_path
+from config.log import writeInfo, writeError
+
+headers = {
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
+    'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; lid=tb84443556; enc=Di1rYCRWDQZC0UbccH38rIzuBg8LLFGKPSeQNu0fJ6Atw1lfF%2BtBE6Jm3vKtkZ%2FcJwoY%2FA2OAFq1CCgzrB0Wmg%3D%3D; t=2ce95276273d2f0fec4b735114efb9f0; uc3=id2=UonSf2s8K7H57A%3D%3D&nk2=F5RNYQezF9ZVJA%3D%3D&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dByuPZuePp%2FK4exO4%3D; tracknick=tb84443556; uc4=nk4=0%40FY4Gtg6GE3gLVPH74U0sgDg9VVYt&id4=0%40UOE4tAnGHWIKt7PI5bS6f4noV%2Bbp; lgc=tb84443556; _tb_token_=e83ebbe73eeff; cookie2=170434fc3c4cae5b8d05c6ca3e035a7d; cna=3uL0FRGpQX4CAdoUCaGR7Dw2; UM_distinctid=16d0a0bd4f67e9-0657eeb9eeb382-5373e62-384000-16d0a0bd4f7aff; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _med=dw:2560&dh:1440&pw:2560&ph:1440&ist:0; cq=ccp%3D1; swfstore=175036; res=scroll%3A1613*5900-client%3A1613*924-offset%3A1613*5900-screen%3A2560*1440; CNZZDATA1256793290=2086205104-1567833369-%7C1567844169; pnm_cku822=098%23E1hvKpvUvbpvUpCkvvvvvjiPRFS96jtjn2MWsjD2PmPp1jr8RLqwtjY8RFdUsjtbRpGCvvLMMQvvmphvLhbyi9mFecXPjV5vsEe4jVDQpGoHbdiQpznCwaZOecXPjV5vsEe4jVDQpYLhbdiQpzwBwaZOecEvjV5vsCkwjVDQpGFvbdiQpzvxwa7ivpvUphvhrpcsXvmEvpvVpyUUCE%2BfKphv8hCvCbyvvh89phvOJpvvpYYvpm9vvvCHtZCv8vvvvhcDphvOKpvvBHvCvpvVphhvvvvvRphvChCvvvm5vpvhphvhHv%3D%3D; isg=BHBwqfKM2QBI7oUELIuAZ-OhQT7CuVQDVfnti2rBAUuPJRHPEs91k6SXeW3gtQzb; l=cBO-RSocq17jyErCBOCZlurza77TvIRAguPzaNbMi_5QN1TQITQOkrDBxe96cjWdtj8B4JuaUMv9-etuigILNzGHtBUV.'
+}
+
+# turn the raw cookie header into a list of name/value dicts
+cookies = headers['cookie'].split(';')
+cookie_list = []
+for cookie in cookies:
+    # split on the first '=' only: cookie values may themselves contain '='
+    name, value = cookie.split('=', 1)
+    cookie_list.append({'name': name.strip(), 'value': value})
+
+
+# Tmall phone crawler
+# https://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&click_id=%CA%D6%BB%FA&from=mallfp..pc_1.7_hq&spm=875.7931836%2FB.a1z5h.8.66144265MD9ShM
+class TmallCrawler(MobilePhoneCrawler):
+
+    def __init__(self) -> None:
+        super().__init__()
+        # one HTTP session shared across all requests
+        self.session = requests.Session()
+        # log in (QR code flow, currently disabled)
+        # self.login()
+        self.get_page()
+
+    '''
+    log in through the Taobao QR code flow
+    '''
+
+    def login(self):
+        # fetch the QR code image
+        login_url = 'https://qrlogin.taobao.com/qrcodelogin/generateQRCode4Login.do?from=tmall&appkey=00000000&umid_token=T4C16243DC287A311CA928E0D5EA177D443B864009178BBAA55A4CB86A4'
+        writeInfo(login_url)
+        login_res = self.session.get(login_url)
+        res_content = login_res.content.decode()
+        res_json = json.loads(res_content[res_content.index("{"):res_content.index("}") + 1])
+        writeInfo(json.dumps(res_json, indent=1))
+        img_url = res_json["url"]
+        img_res = self.session.get("http:%s" % img_url)
+        if img_res.status_code == 200:
+            img_name = 'login.png'
+            # save the QR code image
+            with open(img_name, 'wb') as file:
+                file.write(img_res.content)
+            # open the QR code image
+            Image.open(img_name).show()
+            win32api.MessageBox(0, "Please scan the QR code with the Taobao mobile app", "Notice",
+                                win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
+            while True:
+                login_url = "https://qrlogin.taobao.com/qrcodelogin/qrcodeLoginCheck.do?lgToken={0}&defaulturl=https%3A%2F%2Fwww.tmall.com".format(
+                    res_json['lgToken'])
+                writeInfo("login_url:{0}".format(login_url))
+                check_login_res = self.session.get(login_url)
+                # check the scan result
+                if check_login_res.status_code == 200:
+                    check_login_res_json = json.loads(check_login_res.content.decode())
+                    writeInfo(json.dumps(check_login_res_json, indent=1))
+                    if check_login_res_json['code'] == '10006':
+                        # the QR code was scanned successfully
+                        check_login_url = check_login_res_json['url']
+                        writeInfo("check_login_url={0}".format(check_login_url))
+                        login_res = self.session.get(check_login_url)
+                        if login_res.status_code == 200:
+                            # follow the redirect to the identity check
+                            login_res_html = BeautifulSoup(login_res.content, 'html.parser')
+                            check_url = login_res_html.select_one("iframe")["src"]
+                            writeInfo("check_url={0}".format(check_url))
+                            # identity verification
+                            check_login_res = self.session.get(check_url)
+                            if check_login_res.status_code == 200:
+                                check_login_res_content = check_login_res.content.decode()
+                                # Alibaba Group | identity verification
+                                verify_modes_url = re.search("http.*verify_modes.*=",
+                                                             check_login_res_content).group() + '1'
+                                verify_modes_res = self.session.get(verify_modes_url)
+                                if verify_modes_res.status_code == 200:
+                                    verify_modes_res_content = verify_modes_res.content.decode()
+                                    # a page asking 你最近购买过什么商品 ("what did you buy recently") is an image captcha
+                                    if '你最近购买过什么商品' in verify_modes_res_content:
+                                        raise Exception("image captcha triggered, simulated request failed")
+                                    else:
+                                        win32api.MessageBox(0, "Please tap the confirm button in the Taobao mobile app",
+                                                            "Notice",
+                                                            win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
+                                        # poll the confirmation status of the mobile app
+                                        htoken = re.search("htoken\".*[a-zA-Z]", verify_modes_res_content).group()
+                                        htoken = htoken[htoken.index(":") + 2:]
+                                        while True:
+                                            time.sleep(1)
+                                            check_status_res = self.session.get(
+                                                "https://passport.taobao.com/iv/onekey/check_status.do?htoken={0}".format(
+                                                    htoken))
+                                            if check_status_res.status_code == 200:
+                                                check_status_res_json = json.loads(check_status_res.content.decode())
+                                                if check_status_res_json['content']['code'] == '1':
+                                                    login_safe_res = self.session.get(
+                                                        check_status_res_json['content']['url'])
+                                                    if login_safe_res.status_code == 200:
+                                                        # login_safe_res_content=login_safe_res.content.decode(login_safe_res.apparent_encoding)
+                                                        # login_safe_href=re.search("https.*pass.tmall.com.*\w",login_safe_res_content).group()
+                                                        # index_res = self.session.get(login_safe_href)
+                                                        writeInfo("login succeeded")
+                                                        break
+                                                    else:
+                                                        raise Exception("simulated login request failed!!!")
+                                                else:
+                                                    writeInfo(json.dumps(check_status_res_json, indent=1))
+                                            else:
+                                                raise Exception("simulated login request failed!!!")
+                                        break
+                                else:
+                                    raise Exception("simulated login request failed!!!")
+                            else:
+                                raise Exception("simulated login request failed!!!")
+                        else:
+                            raise Exception("simulated login request failed!!!")
+                    elif check_login_res_json['code'] == '10004':
+                        # the QR code expired: restart the whole flow
+                        return self.login()
+                time.sleep(1)
+        else:
+            raise Exception("failed to fetch the login QR code image")
+
+    '''
+    fetch the paginated product listing
+    '''
+
+    def get_page(self):
+        # product listing page
+        domain = "https://list.tmall.com/search_product.htm"
+        url = '{0}?q=%CA%D6%BB%FA&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&xl=shouji_1&from=mallfp..pc_1_suggest'.format(
+            domain)
+        while True:
+            # fetch one listing page
+            res = self.session.get(url, headers=headers)
+            # only handle HTTP 200 responses
+            if res.status_code == 200:
+                try:
+                    # parse the html with BeautifulSoup
+                    res_html = BeautifulSoup(res.content, 'html.parser')
+                    # captcha detection
+                    if 'security-X5' == res_html.select_one("title").text:
+                        self.clickCaptcha(url)
+                        continue
+                    # current page marker
+                    current_page = res_html.select_one("b[class=ui-page-cur]")
+                    writeInfo("parsing page {0}, url: {1}".format(current_page.text, url))
+                    # every product link on the listing page
+                    product_hrefs = res_html.select("#J_ItemList .productTitle>a")
+                    for product_href in product_hrefs:
+                        # follow each link to the product detail page
+                        self.get_mobile("https:{0}".format(product_href['href']))
+                        # stop once the configured amount of data is collected
+                        if len(self.mobile_list) == self.max_count:
+                            return
+                    # move on to the next page (kept inside the try block so
+                    # current_page is guaranteed to be bound)
+                    url = "{0}{1}".format(domain, current_page.find_next_siblings()[0]['href'])
+                except Exception as e:
+                    writeError(e)
+            else:
+                writeError("failed to fetch listing page, url: %s, status code: %d" % (url, res.status_code))
+
+    '''
+    slider captcha verification
+    url: url of the page hosting the slider captcha
+    '''
+
+    def clickCaptcha(self, url):
+        try:
+            chrome_options = Options()
+            chrome_options.binary_location = cf.get('selenium', 'binary_location')
+            # run as root
+            chrome_options.add_argument('--no-sandbox')
+            chrome_options.add_argument('--disable-dev-shm-usage')
+            # chrome_options.add_argument('--headless')
+            # user data directory
+            chrome_options.add_argument('--user-data-dir={0}'.format(cf.get('selenium', 'user_data_dir')))
+            # skip image loading
+            chrome_options.add_argument('blink-settings=imagesEnabled=false')
+            # disable gpu acceleration
+            chrome_options.add_argument('--disable-gpu')
+            # maximised window
+            chrome_options.add_argument('--start-maximized')
+            # full-screen mode
+            chrome_options.add_argument('start-fullscreen')
+            # developer mode, so the site cannot tell Selenium drives the browser
+            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
+            driver = webdriver.WebDriver(options=chrome_options, service_log_path=r"I:\ChromeUpdater\selenium.log")
+            # driver.set_window_rect(0,0,1024,768)
+            # open the slider captcha page
+            driver.get(url)
+            try:
+                # slider handle
+                nc_1_n1z = driver.find_element_by_css_selector("#nc_1_n1z")
+                # slider track
+                nc_1__scale_text = driver.find_element_by_css_selector("#nc_1__scale_text")
+                mouse = Controller()
+                # centre of the slider handle
+                x = nc_1_n1z.rect['x'] + nc_1_n1z.rect['width'] / 2
+                y = nc_1_n1z.rect['y'] + nc_1_n1z.rect['height'] / 2
+                # place the pointer on the handle (position is absolute)
+                mouse.position = (x, y)
+                time.sleep(0.5)
+                mouse.press(Button.left)
+                time.sleep(0.5)
+                # pynput's move() is relative, so drag by the remaining track length
+                mouse.move(nc_1__scale_text.rect['width'] - nc_1_n1z.rect['width'], 0)
+                time.sleep(0.5)
+                mouse.release(Button.left)
+                while True:
+                    if len(driver.find_elements_by_css_selector(".errloading")) > 0:
+                        # the slide was rejected: restart with a fresh browser
+                        driver.quit()
+                        self.clickCaptcha(url)
+                        break
+                    # sub_slide_width = random.randint(30, 50)
+                    # action.move_by_offset(sub_slide_width, 0).perform()  # move the slider
+                    # start += sub_slide_width
+                    time.sleep(random.randint(1, 10) / 10)
+                cookie_list = driver.get_cookies()
+                # close the browser
+                driver.quit()
+            except NoSuchElementException as e:
+                writeError(e)
+                driver.quit()
+                self.clickCaptcha(url)
+        except Exception as e:
+            writeError(e)
+            raise Exception("simulated slider verification failed")
+
+    '''
+    fetch the spec data of one phone
+    url: product detail link
+    '''
+
+    def get_mobile(self, url, param_url=None, **kwargs):
+        res = self.session.get(url)
+        if res.status_code == 200:
+            res_html = BeautifulSoup(res.content, 'html.parser')
+            # captcha detection
+            if 'security-X5' == res_html.select_one("title").text:
+                self.clickCaptcha(url)
+            # check whether the product page carries a spec table at all
+            if res_html.select_one("#J_Attrs") is None:
+                writeInfo("product detail url %s has no spec table" % url)
+            else:
+                try:
+                    # the spec table is titled 规格参数 ("specifications")
+                    ths = res_html.select("table:contains('规格参数') tbody>tr:not([class='tm-tableAttrSub']) th")
+                    # walk every parameter row of the spec table
+                    mobile_dict = {}
+                    for th in ths:
+                        if 'colspan' in th.attrs:
+                            continue
+                        # store the parameter as a key/value pair
+                        key = str(th.text).strip()
+                        value = str(th.next_sibling.text).strip()
+                        mobile_dict[key] = value
+                    # append the record to the result list
+                    self.mobile_list.append(mobile_dict)
+                    writeInfo("added phone: {0}".format(
+                        str(res_html.select_one("div[class=tb-detail-hd]>h1").text).strip()))
+                except Exception as e:
+                    writeError(e)
+        else:
+            writeError("product url %s returned status code %d" % (url, res.status_code))
+
+    # persist one phone record
+    def save_mobile(self, mobile):
+        self.mobile_list.append(mobile)
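The press-move-release drag above, reduced to a standalone sketch (illustration only; the coordinates and distance are made up, and note that pynput's move() takes a relative offset while position is absolute):

import time

from pynput.mouse import Button, Controller

mouse = Controller()


def drag_right(x, y, distance, pause=0.5):
    # place the pointer on the slider handle (absolute position)
    mouse.position = (x, y)
    time.sleep(pause)
    mouse.press(Button.left)
    time.sleep(pause)
    # move() is relative to the current pointer position
    mouse.move(distance, 0)
    time.sleep(pause)
    mouse.release(Button.left)


drag_right(300, 400, 260)  # made-up slider geometry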
+
+
+# phone crawler for the cnmo review site
+# http://product.cnmo.com/all/product.html
+class CnmoCrawler(MobilePhoneCrawler):
+    def __init__(self) -> None:
+        super().__init__()
+        self.threads = []
+        self.threadLock = threading.Lock()
+        try:
+            # thread pool size
+            self.thread_count = int(cf.get('excel', 'thread_count'))
+            # flush to excel every data_size records
+            self.data_size = int(cf.get('excel', 'data_size'))
+            # output workbook paths
+            self.file1 = cf.get('excel', 'file1')
+            self.file2 = cf.get('excel', 'file2')
+            # column (parameter) names to keep
+            self.param_name_list = cf.get('excel', 'param_name').split(',')
+            # number of required (non-empty) parameters
+            self.param_required_index = int(cf.get('excel', 'param_required_index'))
+            # number of records to crawl
+            self.max_count = int(cf.get('excel', 'max_count'))
+        except Exception as e:
+            writeError("failed to initialise parameters, error: {0}, please check the configuration in {1}".format(
+                e, config_path))
+            raise
+        # drop the output of any previous run
+        if os.path.exists(self.file1):
+            os.remove(self.file1)
+        if os.path.exists(self.file2):
+            os.remove(self.file2)
+
+    def get_page(self):
+        # first listing page
+        start_url = 'http://product.cnmo.com/all/product.html'
+        # next page link
+        next_page_url = None
+        while True:
+            current_page_url = start_url if next_page_url is None else next_page_url
+            writeInfo("parsing listing page: {0}".format(current_page_url))
+            # fetch the page body
+            res = self.get_req(current_page_url)
+            # only handle HTTP 200 responses
+            if res is not None and res.status_code == 200:
+                try:
+                    writeInfo("listing page {0} fetched".format(current_page_url))
+                    res_html = BeautifulSoup(self.uzipData(res.content), 'html.parser')
+                    # parse the listing entries
+                    li_s = res_html.select("ul.all-con-con-ul.cf>li")
+                    for li in li_s:
+                        if len(self.mobile_list) > self.max_count:
+                            return
+                        p = li.select_one('p.red')
+                        # the launch date on the page looks like 2019年09月 (year/month)
+                        time_to_market = re.search(r'\d{4}年\d{2}月', p.text)
+                        # fetch the detail parameters on a worker thread; the keyword
+                        # 上市时间 ("launch date") must match the column name in config.ini
+                        thread = myThread(self, 'http:{0}'.format(li.select_one('a.name')['href']),
+                                          'http:{0}'.format(li.select_one('div.info>a:contains(参数)')['href']),
+                                          上市时间=None if time_to_market is None else time_to_market.group())
+                        thread.start()
+                        self.threads.append(thread)
+                        # once the pool is full, wait for every worker to finish
+                        if len(self.threads) == self.thread_count:
+                            for t in self.threads:
+                                t.join()
+                            writeInfo("thread pool drained")
+                            self.threads.clear()
+
+                    # next page link
+                    next_page_url = 'http:{0}'.format(res_html.select_one(".pnext")["href"])
+                except Exception as e:
+                    writeError("error while parsing the listing page: {0}".format(e))
+            else:
+                raise Exception("failed to fetch listing page: {0}".format(current_page_url))
+
+    def run(self):
+        self.get_page()
+        writeInfo('data collection finished, starting to clean dirty data')
+        self.clear_data()
+        writeInfo('dirty data cleaning finished')
+
+    def get_mobile(self, base_url, param_url, **kwargs):
+        # phone parameters keyed by the column names from config.ini
+        param_dict = {}
+        writeInfo("parsing phone detail parameter page {0}".format(param_url))
+
+        # overall user score (网友综合评分)
+        score_res = self.get_req(base_url)
+        if score_res is not None and score_res.status_code == 200:
+            score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser')
+            param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span',
+                                                                                                   {'class': 'red'}).text
+        mobile_res = self.get_req(param_url)
+
+        # only handle HTTP 200 responses
+        if mobile_res is not None and mobile_res.status_code == 200:
+            try:
+                mobile_res_html = BeautifulSoup(self.uzipData(mobile_res.content), 'html.parser')
+                phone_name = mobile_res_html.select_one('#proName>a').text
+                param_dict['手机名称'] = phone_name
+                writeInfo("parsing detailed parameters of {0}".format(phone_name))
+                # reference price (参考价格)
+                param_dict['参考价格'] = mobile_res_html.select_one('span:contains(参考价格)').find_next().text
+                # e-commerce price (电商报价)
+                param_dict['电商报价'] = mobile_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
+                # parameter names and values are carried in paramname/paramvalue attributes
+                param_name_list = mobile_res_html.select('div.right>p')
+                for param_name in param_name_list:
+                    param_dict[param_name['paramname']] = param_name['paramvalue']
+                # serialise writers: the lock guards mobile_list and the workbook,
+                # and the with-block releases it even if save_mobile raises
+                with self.threadLock:
+                    self.save_mobile(dict(param_dict, **kwargs))
+            except Exception as e:
+                writeError("error while parsing the phone: {0}".format(e))
+        else:
+            writeError("failed to fetch phone detail parameter page {0}".format(param_url))
+
+    def save_mobile(self, mobile, ignore=False):
+        self.mobile_list.append(mobile)
+        writeInfo("{0} phones crawled so far".format(len(self.mobile_list)))
+        if not ignore and len(self.mobile_list) % self.data_size == 0:
+            self.save_excel(self.mobile_list[-self.data_size:])
+        elif ignore and len(self.mobile_list) % self.data_size != 0:
+            self.save_excel(self.mobile_list[-(len(self.mobile_list) % self.data_size):])
+        else:
+            writeInfo('fewer than {0} buffered records and nothing left over, skipping the write'.format(
+                self.data_size))
+
+    def init_excel(self, file, max_index=None):
+        # create the workbook and write the header row
+        wb = Workbook()
+        ws = wb.active
+        for index, param_name in enumerate(self.param_name_list):
+            if max_index is None or index < max_index:
+                ws.cell(row=1, column=index + 1, value=param_name)
+        wb.save(file)
+        wb.close()
+
+    # append records to the excel workbook
+    def save_excel(self, data_list):
+        # first write: initialise the header row
+        if not os.path.exists(self.file1):
+            self.init_excel(self.file1)
+        wb = load_workbook(self.file1)
+        ws = wb.active
+        # append below the current last row
+        max_row = ws.max_row
+        for row_index, data in enumerate(data_list):
+            for column_index, param_name in enumerate(self.param_name_list):
+                ws.cell(row=max_row + row_index + 1, column=column_index + 1,
+                        value=data[param_name] if param_name in data else None)
+        wb.save(self.file1)
+        wb.close()
+
+    # clean dirty data
+    def clear_data(self):
+        # source workbook
+        source_wb = load_workbook(self.file1)
+        source_ws = source_wb.active
+        # target workbook for the cleaned rows
+        self.init_excel(self.file2, max_index=self.param_required_index)
+        target_wb = load_workbook(self.file2)
+        target_ws = target_wb.active
+        write_row = 2
+        for current_row in range(2, source_ws.max_row + 1):
+            for current_column in range(1, self.param_required_index + 1):
+                val = source_ws.cell(row=current_row, column=current_column).value
+                # drop the row when a required cell is empty, or when the price
+                # column reads 曝光 ("rumoured") or 即将上市 ("coming soon")
+                if val is None or len(str(val)) == 0 or (
+                        current_column == 2 and (val == '曝光' or val == '即将上市')):
+                    for i in range(1, self.param_required_index + 1):
+                        target_ws.cell(row=write_row, column=i, value='')
+                    break
+                else:
+                    target_ws.cell(row=write_row, column=current_column, value=val)
+                    if current_column == self.param_required_index:
+                        write_row += 1
+        # save the cleaned result
+        target_wb.save(self.file2)
+
+    def get_req(self, url, max_retries=3, **kwargs):
+        try:
+            return requests.get(url, headers=dict({
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
+            },
+                **kwargs))
+        except Exception as e:
+            writeError(e)
+            time.sleep(10)
+            max_retries -= 1
+            if max_retries > 0:
+                # return the retried response instead of dropping it
+                return self.get_req(url, max_retries, **kwargs)
+            else:
+                return None
+
+
+# worker thread: fetches the detail parameters of a single phone
+class myThread(threading.Thread):
+    def __init__(self, crawler, base_url, param_url, **kwargs):
+        threading.Thread.__init__(self)
+        self.crawler = crawler
+        self.base_url = base_url
+        self.param_url = param_url
+        self.kwargs = kwargs
+
+    def run(self) -> None:
+        self.crawler.get_mobile(self.base_url, self.param_url, **self.kwargs)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..59316f6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+beautifulsoup4==4.8.0
+bs4==0.0.1
+certifi==2019.6.16
+chardet==3.0.4
+idna==2.8
+Pillow==6.1.0
+pywin32==224
+requests==2.22.0
+soupsieve==1.9.3
+urllib3==1.25.3
+# imported by main.py but missing from the original freeze (versions unpinned):
+openpyxl
+pynput
+selenium
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..fbd571a
--- /dev/null
+++ b/test.py
@@ -0,0 +1,14 @@
+from main import TmallCrawler, CnmoCrawler
+
+
+def abc():
+    pass
+
+
+if __name__ == '__main__':
+    # Tmall crawler test
+    # TmallCrawler().get_page()
+    # cnmo review-site crawler test
+    CnmoCrawler().run()
+    # print(int(300/100))
\ No newline at end of file
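A minimal end-to-end sketch of how this initial commit is meant to be driven (illustration only; it assumes a valid config.ini, the Chrome paths configured under [selenium], and network access to product.cnmo.com):

from main import CnmoCrawler

crawler = CnmoCrawler()  # reads config.ini and deletes workbooks left over from a previous run
crawler.run()            # crawls into excel1.xlsx, then writes the cleaned rows to excel2.xlsx
print('collected %d phones' % len(crawler.mobile_list))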