commit
c70bbf113d
@@ -0,0 +1,131 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

/.idea/
/log/
/login.png
*.xlsx
@@ -0,0 +1,48 @@
# Mobile phone crawler base class
import gzip
import zlib


class MobilePhoneCrawler:
    def __init__(self) -> None:
        super().__init__()
        # cap the crawl at 5000 records
        self._max_count = 5000
        # collected phone records
        self._mobile_list = []

    @property
    def max_count(self):
        return self._max_count

    @max_count.setter
    def max_count(self, value):
        self._max_count = value

    @property
    def mobile_list(self):
        return self._mobile_list

    # fetch a listing page
    def get_page(self):
        pass

    # fetch the detail data for one phone
    def get_mobile(self, base_url, param_url, **kwargs):
        pass

    # persist one phone record
    def save_mobile(self, mobile):
        pass

    def get_req(self, url, **kwargs):
        pass

    # decompress a response body based on its magic bytes
    def uzipData(self, data):
        if data.startswith(b'\x1f\x8b'):
            # gzip stream
            return gzip.decompress(data)
        elif data.startswith(b'\xec\xbd'):
            # raw deflate stream (negative wbits disables the zlib header)
            return zlib.decompress(data, -zlib.MAX_WBITS)
        else:
            return data
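
# A minimal sanity check of the magic-byte dispatch above (a sketch; assumes
# this module is importable as Crawler):
#
#     import gzip
#     from Crawler import MobilePhoneCrawler
#
#     crawler = MobilePhoneCrawler()
#     raw = gzip.compress(b'hello')
#     assert raw.startswith(b'\x1f\x8b')             # gzip magic bytes
#     assert crawler.uzipData(raw) == b'hello'       # transparently inflated
#     assert crawler.uzipData(b'plain') == b'plain'  # uncompressed bytes pass through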
Binary file not shown.
@@ -0,0 +1,23 @@
[file]
;log file name
logFile = log.txt
[selenium]
;path to the Chrome executable
binary_location = I:\ChromeUpdater\chrome.exe
;path to the browser user-data directory
user_data_dir=I:\ChromeUpdater\User Data
[excel]
;raw crawled-data report
file1=excel1.xlsx
;cleaned-data report
file2=excel2.xlsx
;parameter list (column headers; kept in Chinese because they must match the parameter names scraped from the site)
param_name=手机名称,参考价格,电商报价,上市时间,网友综合评分,屏幕尺寸,机身容量,屏幕色数,运营商支持,网络模式,SIM卡类型,WiFi,蓝牙,手机类型,机身结构,电池类型,电池更换,屏幕材质,屏幕分辨率,像素密度,触控方式,触摸特性,操作系统,CPU型号,核心数,CPU制程,运行内存,容量扩展,传感器类型,后置相机,前置相机,变焦,闪光灯,视频拍摄,拍照特性,视频格式,视频播放,音乐格式,图片格式,文档格式,GPS,感应器,USB接口,耳机接口,无线连接,日常功能,键盘类型,输入方式,输入法,包装清单
;number of required (non-empty) parameters, counted from the left
param_required_index=8
;buffer size: write to the report once this many records accumulate
data_size=10
;thread pool size
thread_count=5
;number of records to crawl
max_count=30
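;Worked example of the cleaning rule (see clear_data in the crawler below):
;with param_required_index=8, the cleaned report file2 keeps only the first 8
;columns (手机名称 through 屏幕色数) and drops every row in which one of them is
;empty, or whose 参考价格 column reads 曝光 or 即将上市.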
@@ -0,0 +1,31 @@
import configparser
import logging
import os
from logging.handlers import TimedRotatingFileHandler

# project root: this file lives in config/, so go up one directory
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

cf = configparser.ConfigParser()
config_path = os.path.join(BASE_DIR, "config.ini")
if not os.path.exists(config_path):
    raise Exception("Config file %s does not exist" % config_path)
cf.read(config_path, encoding='utf-8')
logFile = cf.get('file', 'logFile')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def init():
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
    # echo log records to the console
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(log_format)
    logger.addHandler(streamHandler)

    logpath = os.path.join(BASE_DIR, "log")
    if not os.path.exists(logpath):
        os.mkdir(logpath)

    # rotate the log file every hour
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=os.path.join(logpath, "all.log"), when='H',
                                                        interval=1, encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)

    logger.addHandler(timedRotatingFileHandler)
@@ -0,0 +1,21 @@
import time

from config.config import init
from config.config import logger

start = int(time.time())
init()


def getRunTimeInt():
    return int(time.time()) - start


def getRunTime():
    return 'the program has been running for %d seconds' % (int(time.time()) - start)


def writeInfo(msg):
    logger.info('%s\t(%s)' % (msg, getRunTime()))


def writeError(msg):
    logger.error('%s\t(%s)' % (msg, getRunTime()))
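
# With the Formatter configured in config/config.py, a call such as
# writeInfo('crawler started') emits a line of the form:
#   2019-09-08 12:00:00 INFO : crawler started	(the program has been running for 3 seconds)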
@@ -0,0 +1,501 @@
import json
import os
import random
import re
import threading
import time

import requests
import win32api
import win32con
from PIL import Image
from bs4 import BeautifulSoup
from openpyxl import load_workbook, Workbook
from pynput.mouse import Controller, Button
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome import webdriver
from selenium.webdriver.chrome.options import Options

from Crawler import MobilePhoneCrawler
from config.config import cf, config_path
from config.log import writeInfo, writeError

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; lid=tb84443556; enc=Di1rYCRWDQZC0UbccH38rIzuBg8LLFGKPSeQNu0fJ6Atw1lfF%2BtBE6Jm3vKtkZ%2FcJwoY%2FA2OAFq1CCgzrB0Wmg%3D%3D; t=2ce95276273d2f0fec4b735114efb9f0; uc3=id2=UonSf2s8K7H57A%3D%3D&nk2=F5RNYQezF9ZVJA%3D%3D&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dByuPZuePp%2FK4exO4%3D; tracknick=tb84443556; uc4=nk4=0%40FY4Gtg6GE3gLVPH74U0sgDg9VVYt&id4=0%40UOE4tAnGHWIKt7PI5bS6f4noV%2Bbp; lgc=tb84443556; _tb_token_=e83ebbe73eeff; cookie2=170434fc3c4cae5b8d05c6ca3e035a7d; cna=3uL0FRGpQX4CAdoUCaGR7Dw2; UM_distinctid=16d0a0bd4f67e9-0657eeb9eeb382-5373e62-384000-16d0a0bd4f7aff; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _med=dw:2560&dh:1440&pw:2560&ph:1440&ist:0; cq=ccp%3D1; swfstore=175036; res=scroll%3A1613*5900-client%3A1613*924-offset%3A1613*5900-screen%3A2560*1440; CNZZDATA1256793290=2086205104-1567833369-%7C1567844169; pnm_cku822=098%23E1hvKpvUvbpvUpCkvvvvvjiPRFS96jtjn2MWsjD2PmPp1jr8RLqwtjY8RFdUsjtbRpGCvvLMMQvvmphvLhbyi9mFecXPjV5vsEe4jVDQpGoHbdiQpznCwaZOecXPjV5vsEe4jVDQpYLhbdiQpzwBwaZOecEvjV5vsCkwjVDQpGFvbdiQpzvxwa7ivpvUphvhrpcsXvmEvpvVpyUUCE%2BfKphv8hCvCbyvvh89phvOJpvvpYYvpm9vvvCHtZCv8vvvvhcDphvOKpvvBHvCvpvVphhvvvvvRphvChCvvvm5vpvhphvhHv%3D%3D; isg=BHBwqfKM2QBI7oUELIuAZ-OhQT7CuVQDVfnti2rBAUuPJRHPEs91k6SXeW3gtQzb; l=cBO-RSocq17jyErCBOCZlurza77TvIRAguPzaNbMi_5QN1TQITQOkrDBxe96cjWdtj8B4JuaUMv9-etuigILNzGHtBUV.'
}

# parse the raw cookie header into a list of name/value dicts
cookies = headers['cookie'].split(';')
cookie_list = []
for cookie in cookies:
    # split on the first '=' only, since cookie values may contain '='
    name, value = cookie.strip().split('=', 1)
    cookie_list.append({'name': name, 'value': value})
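# Why maxsplit=1 matters here: several cookie values themselves contain '=',
# e.g. the uc3 cookie above ('uc3=id2=UonSf2s8K7H57A%3D%3D&nk2=...'):
#   cookie.split('=')[1]     -> 'id2'                          (value truncated)
#   cookie.split('=', 1)[1]  -> 'id2=UonSf2s8K7H57A%3D%3D&...' (full value kept)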


# Tmall mobile phone crawler
# https://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&click_id=%CA%D6%BB%FA&from=mallfp..pc_1.7_hq&spm=875.7931836%2FB.a1z5h.8.66144265MD9ShM
class TmallCrawler(MobilePhoneCrawler):

    def __init__(self) -> None:
        super().__init__()
        # http session shared across requests
        self.session = requests.Session()
        # log in
        # self.login()
        self.get_page()

    '''
    Log in via QR code
    '''

    def login(self):
        # fetch the login QR code
        login_url = 'https://qrlogin.taobao.com/qrcodelogin/generateQRCode4Login.do?from=tmall&appkey=00000000&umid_token=T4C16243DC287A311CA928E0D5EA177D443B864009178BBAA55A4CB86A4'
        writeInfo(login_url)
        login_res = self.session.get(login_url)
        res_content = login_res.content.decode()
        res_json = json.loads(res_content[res_content.index("{"):res_content.index("}") + 1])
        writeInfo(json.dumps(res_json, indent=1))
        img_url = res_json["url"]
        img_res = self.session.get("http:%s" % img_url)
        if img_res.status_code == 200:
            img_name = 'login.png'
            # save the QR code image
            with open(img_name, 'wb') as file:
                file.write(img_res.content)
            # display the QR code image
            Image.open(img_name).show()
            win32api.MessageBox(0, "Please scan the QR code with the Taobao mobile app", "Notice",
                                win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
            while True:
                login_url = "https://qrlogin.taobao.com/qrcodelogin/qrcodeLoginCheck.do?lgToken={0}&defaulturl=https%3A%2F%2Fwww.tmall.com".format(
                    res_json['lgToken'])
                writeInfo("login_url:{0}".format(login_url))
                check_login_res = self.session.get(login_url)
                # check the scan status
                if check_login_res.status_code == 200:
                    check_login_res_json = json.loads(check_login_res.content.decode())
                    writeInfo(json.dumps(check_login_res_json, indent=1))
                    if check_login_res_json['code'] == '10006':
                        # scan succeeded
                        check_login_url = check_login_res_json['url']
                        writeInfo("check_login_url={0}".format(check_login_url))
                        login_res = self.session.get(check_login_url)
                        if login_res.status_code == 200:
                            # follow the redirect to identity verification
                            login_res_html = BeautifulSoup(login_res.content, 'html.parser')
                            check_url = login_res_html.select_one("iframe")["src"]
                            writeInfo("check_url={0}".format(check_url))
                            # identity verification
                            check_login_res = self.session.get(check_url)
                            if check_login_res.status_code == 200:
                                check_login_res_content = check_login_res.content.decode()
                                # Alibaba Group | identity verification
                                verify_modes_url = re.search("http.*verify_modes.*=",
                                                             check_login_res_content).group() + '1'
                                verify_modes_res = self.session.get(verify_modes_url)
                                if verify_modes_res.status_code == 200:
                                    verify_modes_res_content = verify_modes_res.content.decode()
                                    # '你最近购买过什么商品' marks the image-captcha page
                                    if '你最近购买过什么商品' in verify_modes_res_content:
                                        raise Exception("Image captcha triggered; simulated request failed")
                                    else:
                                        win32api.MessageBox(0, "Please tap the confirm button in the Taobao mobile app",
                                                            "Notice",
                                                            win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
                                        # poll the confirmation status from the mobile app
                                        htoken = re.search(r'htoken".*[a-zA-Z]', verify_modes_res_content).group()
                                        htoken = htoken[htoken.index(":") + 2:]
                                        while True:
                                            time.sleep(1)
                                            check_status_res = self.session.get(
                                                "https://passport.taobao.com/iv/onekey/check_status.do?htoken={0}".format(
                                                    htoken))
                                            if check_status_res.status_code == 200:
                                                check_status_res_json = json.loads(check_status_res.content.decode())
                                                if check_status_res_json['content']['code'] == '1':
                                                    login_safe_res = self.session.get(
                                                        check_status_res_json['content']['url'])
                                                    if login_safe_res.status_code == 200:
                                                        # login_safe_res_content=login_safe_res.content.decode(login_safe_res.apparent_encoding)
                                                        # login_safe_href=re.search("https.*pass.tmall.com.*\w",login_safe_res_content).group()
                                                        # index_res = self.session.get(login_safe_href)
                                                        writeInfo("Login succeeded")
                                                        break
                                                    else:
                                                        raise Exception("Simulated login request failed!!!")
                                                else:
                                                    writeInfo(json.dumps(check_status_res_json, indent=1))
                                            else:
                                                raise Exception("Simulated login request failed!!!")
                                        break
                                else:
                                    raise Exception("Simulated login request failed!!!")
                            else:
                                raise Exception("Simulated login request failed!!!")
                        else:
                            raise Exception("Simulated login request failed!!!")
                    elif check_login_res_json['code'] == '10004':
                        # QR code expired: generate a new one
                        self.login()
                    time.sleep(1)
        else:
            raise Exception("Failed to fetch the login QR code image")

    '''
    Fetch the paginated product listing
    '''

    def get_page(self):
        # product listing page
        domain = "https://list.tmall.com/search_product.htm"
        url = '{0}?q=%CA%D6%BB%FA&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&xl=shouji_1&from=mallfp..pc_1_suggest'.format(
            domain)
        while True:
            # fetch one listing page
            res = self.session.get(url, headers=headers)
            # only handle responses with status code 200
            if res.status_code == 200:
                try:
                    # parse the html with BeautifulSoup
                    res_html = BeautifulSoup(res.content, 'html.parser')
                    # captcha detection
                    if 'security-X5' == res_html.select_one("title").text:
                        self.clickCaptcha(url)
                    # current page number
                    current_page = res_html.select_one("b[class=ui-page-cur]")
                    writeInfo("Start parsing page {0}, url:{1}".format(current_page.text, url))
                    # every product link in the listing
                    product_hrefs = res_html.select("#J_ItemList .productTitle>a")
                    for product_href in product_hrefs:
                        # follow each link to the product detail page
                        self.get_mobile("https:{0}".format(product_href['href']))
                        # stop once the configured number of records is reached
                        if len(self.mobile_list) >= self.max_count:
                            return
                    # follow the "next page" link
                    url = "{0}{1}".format(domain, current_page.find_next_siblings()[0]['href'])
                except Exception as e:
                    writeError(e)
            else:
                writeError("Failed to fetch listing page, url:%s status code:%d" % (url, res.status_code))

    '''
    Slider verification
    url: url of the slider-captcha page
    '''

    def clickCaptcha(self, url):
        try:
            chrome_options = Options()
            chrome_options.binary_location = cf.get('selenium', 'binary_location')
            # run with root privileges
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            # chrome_options.add_argument('--headless')
            # set the user-data directory
            chrome_options.add_argument('--user-data-dir={0}'.format(cf.get('selenium', 'user_data_dir')))
            # do not load images
            chrome_options.add_argument('blink-settings=imagesEnabled=false')
            # disable gpu acceleration
            chrome_options.add_argument('--disable-gpu')
            # maximize the window
            chrome_options.add_argument('--start-maximized')
            # fullscreen mode
            chrome_options.add_argument('start-fullscreen')
            # developer mode, to avoid being detected as Selenium
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            driver = webdriver.WebDriver(options=chrome_options, service_log_path=r"I:\ChromeUpdater\selenium.log")
            # driver.set_window_rect(0,0,1024,768)
            # open the slider-verification page
            driver.get(url)
            try:
                # the slider knob
                nc_1_n1z = driver.find_element_by_css_selector("#nc_1_n1z")
                # the slider track
                nc_1__scale_text = driver.find_element_by_css_selector("#nc_1__scale_text")
                mouse = Controller()
                # move the mouse to the center of the slider knob
                x = nc_1_n1z.rect['x'] + nc_1_n1z.rect['width'] / 2
                y = nc_1_n1z.rect['y'] + nc_1_n1z.rect['height'] / 2
                mouse.position = (x, y)
                time.sleep(0.5)
                mouse.press(Button.left)
                time.sleep(0.5)
                # pynput's move() is relative: drag right by (track width - knob width)
                mouse.move(nc_1__scale_text.rect['width'] - nc_1_n1z.rect['width'], 0)
                time.sleep(0.5)
                mouse.release(Button.left)
                while True:
                    # the page reports a slide failure: retry with a fresh browser
                    if len(driver.find_elements_by_css_selector(".errloading")) > 0:
                        driver.quit()
                        self.clickCaptcha(url)
                        break
                    # alternative: move the slider in small random steps
                    # sub_slide_width = random.randint(30, 50)
                    # action.move_by_offset(sub_slide_width, 0).perform()
                    # start += sub_slide_width
                    time.sleep(random.randint(1, 10) / 10)
                    # no error element: assume the slide succeeded, collect cookies and stop
                    cookie_list = driver.get_cookies()
                    # close the browser
                    driver.quit()
                    break
            except NoSuchElementException as e:
                writeError(e)
                driver.quit()
                self.clickCaptcha(url)
        except Exception as e:
            writeError(e)
            raise Exception("Simulated slider verification failed")

    '''
    Fetch the detail data for one phone
    url: product page link
    '''

    def get_mobile(self, url, param_url=None, **kwargs):
        res = self.session.get(url)
        if res.status_code == 200:
            res_html = BeautifulSoup(res.content, 'html.parser')
            # captcha detection
            if 'security-X5' == res_html.select_one("title").text:
                self.clickCaptcha(url)
            # check whether the page has a specification table ('规格参数') at all
            if res_html.select_one("#J_Attrs") is None:
                writeInfo("Product detail url:%s has no specification table" % url)
            else:
                try:
                    ths = res_html.select("table:contains('规格参数') tbody>tr:not([class='tm-tableAttrSub']) th")
                    # walk every row of the specification table
                    mobile_dict = {}
                    for th in ths:
                        if 'colspan' in th.attrs:
                            continue
                        # store each spec as a key/value pair
                        key = str(th.text).strip()
                        value = str(th.next_sibling.text).strip()
                        mobile_dict[key] = value
                    # append the record to the list
                    self.mobile_list.append(mobile_dict)
                    writeInfo("Added phone: {0}".format(
                        str(res_html.select_one("div[class=tb-detail-hd]>h1").text).strip()))
                except Exception as e:
                    writeError(e)
        else:
            writeError("Product url:%s status code:%d" % (url, res.status_code))

    # persist one phone record
    def save_mobile(self, mobile):
        self.mobile_list.append(mobile)


# CNMO review-site mobile phone crawler
# http://product.cnmo.com/all/product.html
class CnmoCrawler(MobilePhoneCrawler):
    def __init__(self) -> None:
        super().__init__()
        self.threads = []
        self.threadLock = threading.Lock()
        try:
            # thread pool size
            self.thread_count = int(cf.get('excel', 'thread_count'))
            # flush the buffer to excel once this many records accumulate
            self.data_size = int(cf.get('excel', 'data_size'))
            # report file paths
            self.file1 = cf.get('excel', 'file1')
            self.file2 = cf.get('excel', 'file2')
            # parameter (column) names to save
            self.param_name_list = cf.get('excel', 'param_name').split(',')
            # number of required (non-empty) parameters
            self.param_required_index = int(cf.get('excel', 'param_required_index'))
            # number of records to crawl
            self.max_count = int(cf.get('excel', 'max_count'))
        except Exception as e:
            writeError("Failed to initialise parameters, error: {0}; please check the configuration in {1}".format(
                e, config_path))
            raise
        # remove the reports from the previous run
        if os.path.exists(self.file1):
            os.remove(self.file1)
        if os.path.exists(self.file2):
            os.remove(self.file2)

    def get_page(self):
        # first listing page
        start_url = 'http://product.cnmo.com/all/product.html'
        # link to the next page
        next_page_url = None
        while True:
            current_page_url = start_url if next_page_url is None else next_page_url
            writeInfo("Start parsing listing page: {0}".format(current_page_url))
            # fetch the listing page
            res = self.get_req(current_page_url)
            # only handle responses with status code 200
            if res is not None and res.status_code == 200:
                try:
                    writeInfo("Listing page: {0} fetched".format(current_page_url))
                    res_html = BeautifulSoup(self.uzipData(res.content), 'html.parser')
                    # parse the product list
                    li_s = res_html.select("ul.all-con-con-ul.cf>li")
                    for li in li_s:
                        # stop once the configured number of records is reached
                        if len(self.mobile_list) >= self.max_count:
                            return
                        p = li.select_one('p.red')
                        # fetch phone details on a worker thread;
                        # the kwarg name 上市时间 (time to market) must match the column header in config.ini
                        time_to_market = re.search(r'\d{4}年\d{2}月', p.text)
                        thread = myThread(self, 'http:{0}'.format(li.select_one('a.name')['href']),
                                          'http:{0}'.format(li.select_one('div.info>a:contains(参数)')['href']),
                                          上市时间=None if time_to_market is None else time_to_market.group())
                        thread.start()
                        # pool full: wait for all workers, then reset the pool
                        if len(self.threads) == self.thread_count:
                            for t in self.threads:
                                t.join()
                            writeInfo("Thread pool cleared")
                            self.threads.clear()

                        self.threads.append(thread)

                    # link to the next page
                    next_page_url = 'http:{0}'.format(res_html.select_one(".pnext")["href"])
                except Exception as e:
                    writeError("Error while parsing listing page: {0}".format(e))
            else:
                raise Exception("Failed to fetch listing page: {0}".format(current_page_url))

    def run(self):
        self.get_page()
        writeInfo('Crawling finished, start cleaning dirty data')
        self.clear_data()
        writeInfo('Dirty-data cleaning finished')

    def get_mobile(self, base_url, param_url, **kwargs):
        # phone parameters, keyed by parameter name
        param_dict = {}
        writeInfo("Start parsing phone parameter page {0}".format(param_url))

        # fetch the user rating (网友综合评分)
        score_res = self.get_req(base_url)
        if score_res is not None and score_res.status_code == 200:
            score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser')
            param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span',
                                                                                             {'class': 'red'}).text
        mobile_res = self.get_req(param_url)

        # only handle responses with status code 200
        if mobile_res is not None and mobile_res.status_code == 200:
            try:
                mobile_res_html = BeautifulSoup(self.uzipData(mobile_res.content), 'html.parser')
                phone_name = mobile_res_html.select_one('#proName>a').text
                # 手机名称 = phone name
                param_dict['手机名称'] = phone_name
                writeInfo("Start parsing detailed parameters of phone {0}".format(phone_name))
                # 参考价格 = reference price
                param_dict['参考价格'] = mobile_res_html.select_one('span:contains(参考价格)').find_next().text
                # 电商报价 = e-commerce price
                param_dict['电商报价'] = mobile_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
                # remaining parameter names/values are carried in element attributes
                param_name_list = mobile_res_html.select('div.right>p')
                for param_name in param_name_list:
                    param_dict[param_name['paramname']] = param_name['paramvalue']
                # acquire the lock so only one worker writes at a time
                self.threadLock.acquire()
                self.save_mobile(dict(param_dict, **kwargs))
                # release the lock for the next worker
                self.threadLock.release()
            except Exception as e:
                writeError("Error while parsing phone: {0}".format(e))
        else:
            writeError("Failed to fetch phone parameter page {0}".format(param_url))

    def save_mobile(self, mobile, ignore=False):
        self.mobile_list.append(mobile)
        writeInfo("{0} phones crawled so far".format(len(self.mobile_list)))
        if not ignore and len(self.mobile_list) % self.data_size == 0:
            # buffer full: flush the newest data_size records to the report
            self.save_excel(self.mobile_list[-self.data_size:])
        elif ignore and len(self.mobile_list) % self.data_size != 0:
            # final flush: write whatever is left in the buffer
            self.save_excel(self.mobile_list[-(len(self.mobile_list) % self.data_size):])
        else:
            writeInfo('Fewer than {0} records buffered or nothing left over, skipping the write'.format(self.data_size))
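
    # Flush arithmetic with the defaults from config.ini (data_size=10):
    # when record 10 arrives, 10 % 10 == 0, so records 1-10 are written; if the
    # crawl ended after, say, 37 records, a final save_mobile(..., ignore=True)
    # call would flush the remaining 37 % 10 == 7 records (nothing in this file
    # takes that path yet).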

    def init_excel(self, file, max_index=None):
        # create the workbook and write the header row
        wb = Workbook()
        ws = wb.active
        for index, param_name in enumerate(self.param_name_list):
            if max_index is None or index < max_index:
                ws.cell(row=1, column=index + 1, value=param_name)
        wb.save(file)
        wb.close()

    # append the buffered records to the excel report
    def save_excel(self, data_list):
        # initialise the header row if the file does not exist yet
        if not os.path.exists(self.file1):
            self.init_excel(self.file1)
        wb = load_workbook(self.file1)
        ws = wb.active
        # append the data after the current last row
        max_row = ws.max_row
        for row_index, data in enumerate(data_list):
            for column_index, param_name in enumerate(self.param_name_list):
                ws.cell(row=max_row + row_index + 1, column=column_index + 1,
                        value=data[param_name] if param_name in data else None)
        wb.save(self.file1)
        wb.close()

    # clean dirty data out of the raw report
    def clear_data(self):
        # source report
        source_wb = load_workbook(self.file1)
        source_ws = source_wb.active
        # fresh target report for the cleaned data
        self.init_excel(self.file2, max_index=self.param_required_index)
        target_wb = load_workbook(self.file2)
        target_ws = target_wb.active
        write_row = 2
        for current_row in range(2, source_ws.max_row + 1):
            for current_column in range(1, self.param_required_index + 1):
                val = source_ws.cell(row=current_row, column=current_column).value
                # a row is dirty if a required column is empty, or if the price
                # column (column 2) reads 曝光 (rumoured) or 即将上市 (coming soon)
                if val is None or len(val) == 0 or (
                        current_column == 2 and val in ('曝光', '即将上市')):
                    # blank out the partially written row and skip to the next source row
                    for i in range(1, self.param_required_index + 1):
                        target_ws.cell(row=write_row, column=i, value='')
                    break
                else:
                    target_ws.cell(row=write_row, column=current_column, value=val)
                    # the whole row passed: keep it and move to the next target row
                    if current_column == self.param_required_index:
                        write_row += 1
        # save the cleaned report
        target_wb.save(self.file2)

    # fetch a url with up to max_retries attempts
    def get_req(self, url, max_retries=3, **kwargs):
        try:
            return requests.get(url, headers=dict({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            },
                **kwargs))
        except Exception as e:
            writeError(e)
            time.sleep(10)
            max_retries -= 1
            if max_retries > 0:
                # propagate the retried response back to the caller
                return self.get_req(url, max_retries, **kwargs)
            else:
                return None
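
    # Retry flow: a transient failure sleeps 10 s and recurses with max_retries
    # decremented, so a url is attempted at most three times; after that None is
    # returned, which the callers above guard against with `res is not None`.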


class myThread(threading.Thread):
    # worker thread: fetches one phone's detail pages via the shared crawler
    def __init__(self, crawler, base_url, param_url, **kwargs):
        threading.Thread.__init__(self)
        self.crawler = crawler
        self.base_url = base_url
        self.param_url = param_url
        self.kwargs = kwargs

    def run(self) -> None:
        self.crawler.get_mobile(self.base_url, self.param_url, **self.kwargs)
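

# A minimal entry-point sketch (hypothetical; the commit defines no __main__
# guard). CnmoCrawler drives a full crawl via run(); TmallCrawler starts
# crawling from its constructor:
#
#     if __name__ == '__main__':
#         CnmoCrawler().run()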
@@ -0,0 +1,10 @@
beautifulsoup4==4.8.0
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
idna==2.8
Pillow==6.1.0
pywin32==224
requests==2.22.0
soupsieve==1.9.3
urllib3==1.25.3
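# Note: the code above also imports openpyxl, selenium and pynput, which are
# not pinned here; their versions are not recorded in this commit.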