commit
c70bbf113d
@ -0,0 +1,131 @@ |
|||||||
|
# Created by .ignore support plugin (hsz.mobi) |
||||||
|
### Python template |
||||||
|
# Byte-compiled / optimized / DLL files |
||||||
|
__pycache__/ |
||||||
|
*.py[cod] |
||||||
|
*$py.class |
||||||
|
|
||||||
|
# C extensions |
||||||
|
*.so |
||||||
|
|
||||||
|
# Distribution / packaging |
||||||
|
.Python |
||||||
|
build/ |
||||||
|
develop-eggs/ |
||||||
|
dist/ |
||||||
|
downloads/ |
||||||
|
eggs/ |
||||||
|
.eggs/ |
||||||
|
lib/ |
||||||
|
lib64/ |
||||||
|
parts/ |
||||||
|
sdist/ |
||||||
|
var/ |
||||||
|
wheels/ |
||||||
|
pip-wheel-metadata/ |
||||||
|
share/python-wheels/ |
||||||
|
*.egg-info/ |
||||||
|
.installed.cfg |
||||||
|
*.egg |
||||||
|
MANIFEST |
||||||
|
|
||||||
|
# PyInstaller |
||||||
|
# Usually these files are written by a python script from a template |
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||||
|
*.manifest |
||||||
|
*.spec |
||||||
|
|
||||||
|
# Installer logs |
||||||
|
pip-log.txt |
||||||
|
pip-delete-this-directory.txt |
||||||
|
|
||||||
|
# Unit test / coverage reports |
||||||
|
htmlcov/ |
||||||
|
.tox/ |
||||||
|
.nox/ |
||||||
|
.coverage |
||||||
|
.coverage.* |
||||||
|
.cache |
||||||
|
nosetests.xml |
||||||
|
coverage.xml |
||||||
|
*.cover |
||||||
|
.hypothesis/ |
||||||
|
.pytest_cache/ |
||||||
|
|
||||||
|
# Translations |
||||||
|
*.mo |
||||||
|
*.pot |
||||||
|
|
||||||
|
# Django stuff: |
||||||
|
*.log |
||||||
|
local_settings.py |
||||||
|
db.sqlite3 |
||||||
|
db.sqlite3-journal |
||||||
|
|
||||||
|
# Flask stuff: |
||||||
|
instance/ |
||||||
|
.webassets-cache |
||||||
|
|
||||||
|
# Scrapy stuff: |
||||||
|
.scrapy |
||||||
|
|
||||||
|
# Sphinx documentation |
||||||
|
docs/_build/ |
||||||
|
|
||||||
|
# PyBuilder |
||||||
|
target/ |
||||||
|
|
||||||
|
# Jupyter Notebook |
||||||
|
.ipynb_checkpoints |
||||||
|
|
||||||
|
# IPython |
||||||
|
profile_default/ |
||||||
|
ipython_config.py |
||||||
|
|
||||||
|
# pyenv |
||||||
|
.python-version |
||||||
|
|
||||||
|
# pipenv |
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||||
|
# install all needed dependencies. |
||||||
|
#Pipfile.lock |
||||||
|
|
||||||
|
# celery beat schedule file |
||||||
|
celerybeat-schedule |
||||||
|
|
||||||
|
# SageMath parsed files |
||||||
|
*.sage.py |
||||||
|
|
||||||
|
# Environments |
||||||
|
.env |
||||||
|
.venv |
||||||
|
env/ |
||||||
|
venv/ |
||||||
|
ENV/ |
||||||
|
env.bak/ |
||||||
|
venv.bak/ |
||||||
|
|
||||||
|
# Spyder project settings |
||||||
|
.spyderproject |
||||||
|
.spyproject |
||||||
|
|
||||||
|
# Rope project settings |
||||||
|
.ropeproject |
||||||
|
|
||||||
|
# mkdocs documentation |
||||||
|
/site |
||||||
|
|
||||||
|
# mypy |
||||||
|
.mypy_cache/ |
||||||
|
.dmypy.json |
||||||
|
dmypy.json |
||||||
|
|
||||||
|
# Pyre type checker |
||||||
|
.pyre/ |
||||||
|
|
||||||
|
/.idea/ |
||||||
|
/log/ |
||||||
|
/login.png |
||||||
|
*.xlsx |
@ -0,0 +1,48 @@ |
|||||||
|
# 手机爬虫 |
||||||
|
import gzip |
||||||
|
import zlib |
||||||
|
|
||||||
|
|
||||||
|
class MobilePhoneCrawler():
    """Base class for the phone crawlers.

    Holds the collected phone records, an upper bound on how many records
    to collect, and a helper for decompressing raw HTTP response bodies.
    Subclasses implement the actual fetching/saving hooks.
    """

    def __init__(self) -> None:
        super().__init__()
        # Default cap: collect at most 5000 records.
        self._max_count = 5000
        # Accumulated phone records (one dict per phone).
        self._mobile_list = []

    @property
    def max_count(self):
        """Maximum number of phones to collect."""
        return self._max_count

    @max_count.setter
    def max_count(self, value):
        self._max_count = value

    @property
    def mobile_list(self):
        """List of phone records collected so far."""
        return self._mobile_list

    def get_page(self):
        """Fetch list-page data (hook; implemented by subclasses)."""
        pass

    def get_mobile(self, base_url, param_url, **kwargs):
        """Fetch one phone's detail data (hook; implemented by subclasses)."""
        pass

    def save_mobile(self, mobile):
        """Persist one phone record (hook; implemented by subclasses)."""
        pass

    def get_req(self, url, **kwargs):
        """Issue an HTTP request (hook; implemented by subclasses)."""
        pass

    def uzipData(self, data):
        """Decompress *data* when it looks compressed, else return it as-is.

        Gzip streams are recognized by their 1f 8b magic prefix; the
        ec bd prefix is treated as a raw-deflate payload (as observed on
        the crawled pages) and inflated without a zlib header.
        """
        if data.startswith(b'\x1f\x8b'):
            return gzip.decompress(data)
        if data.startswith(b'\xec\xbd'):
            return zlib.decompress(data, -zlib.MAX_WBITS)
        return data
Binary file not shown.
@ -0,0 +1,23 @@ |
|||||||
|
[file] |
||||||
|
;日志文件名 |
||||||
|
logFile = log.txt |
||||||
|
[selenium] |
||||||
|
;chrome浏览器执行路径 |
||||||
|
binary_location = I:\ChromeUpdater\chrome.exe |
||||||
|
;浏览器用户数据路径 |
||||||
|
user_data_dir=I:\ChromeUpdater\User Data |
||||||
|
[excel] |
||||||
|
;采集数据报表 |
||||||
|
file1=excel1.xlsx |
||||||
|
;清洗脏数据报表 |
||||||
|
file2=excel2.xlsx |
||||||
|
;参数列表 |
||||||
|
param_name=手机名称,参考价格,电商报价,上市时间,网友综合评分,屏幕尺寸,机身容量,屏幕色数,运营商支持,网络模式,SIM卡类型,WiFi,蓝牙,手机类型,机身结构,电池类型,电池更换,屏幕材质,屏幕分辨率,像素密度,触控方式,触摸特性,操作系统,CPU型号,核心数,CPU制程,运行内存,容量扩展,传感器类型,后置相机,前置相机,变焦,闪光灯,视频拍摄,拍照特性,视频格式,视频播放,音乐格式,图片格式,文档格式,GPS,感应器,USB接口,耳机接口,无线连接,日常功能,键盘类型,输入方式,输入法,包装清单 |
||||||
|
;非空参数个数(从左到右保留指定个数的参数) |
||||||
|
param_required_index=8 |
||||||
|
;达到指定缓冲数据量写入一次报表 |
||||||
|
data_size=10 |
||||||
|
;线程池大小 |
||||||
|
thread_count=5 |
||||||
|
;采集数据量 |
||||||
|
max_count=30 |
@ -0,0 +1,31 @@ |
|||||||
|
import configparser |
||||||
|
import logging |
||||||
|
from logging.handlers import TimedRotatingFileHandler |
||||||
|
import os |
||||||
|
# Project root: parent directory of the package that contains this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Shared parser for config.ini; other modules import `cf` directly.
cf = configparser.ConfigParser()
# NOTE(review): "//" is tolerated as a separator on Windows, but
# os.path.join(BASE_DIR, "config.ini") would be the conventional form.
config_path=BASE_DIR+"//config.ini"
if not os.path.exists(config_path):
    # Fail fast at import time when the config file is missing.
    raise Exception("配置文件:%s不存在" % config_path)
cf.read(config_path,encoding='utf-8')
# Log file name taken from the [file] section of config.ini.
logFile = cf.get('file', 'logFile')
# Root logger shared by the whole application; handlers added by init().
logger=logging.getLogger()
logger.setLevel(logging.INFO)
||||||
|
def init():
    """Install the application's logging handlers on the root logger.

    Adds a console handler and an hourly-rotating file handler writing
    to <BASE_DIR>/log/all.log. Intended to be called once at startup.
    """
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
    # Echo log records to the console.
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(log_format)
    logger.addHandler(streamHandler)

    # Ensure the log directory exists. exist_ok avoids the
    # check-then-create race the original exists()/mkdir() pair had,
    # and the path expression is no longer duplicated.
    logpath = BASE_DIR + "\\log\\"
    os.makedirs(logpath, exist_ok=True)

    # Rotate the log file every hour.
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1,
                                                        encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)

    logger.addHandler(timedRotatingFileHandler)
||||||
|
|
||||||
|
|
@ -0,0 +1,21 @@ |
|||||||
|
import time |
||||||
|
|
||||||
|
from config.config import init |
||||||
|
from config.config import logger |
||||||
|
|
||||||
|
# Remember when the program started, then install the logging handlers.
start = int(time.time())
init()


def getRunTimeInt():
    """Return whole seconds elapsed since module import."""
    return int(time.time()) - start


def getRunTime():
    """Return a human-readable elapsed-runtime message."""
    elapsed = int(time.time()) - start
    return '程序已经执行%d秒' % elapsed
||||||
|
|
||||||
|
|
||||||
|
def writeInfo(msg):
    """Log *msg* at INFO level with the elapsed-runtime suffix appended."""
    text = '%s\t(%s)' % (msg, getRunTime())
    logger.info(text)
||||||
|
|
||||||
|
|
||||||
|
def writeError(msg):
    """Log *msg* at ERROR level with the elapsed-runtime suffix appended."""
    text = '%s\t(%s)' % (msg, getRunTime())
    logger.error(text)
@ -0,0 +1,501 @@ |
|||||||
|
import gzip |
||||||
|
import json |
||||||
|
import os |
||||||
|
import random |
||||||
|
import re |
||||||
|
import threading |
||||||
|
import time |
||||||
|
import zlib |
||||||
|
|
||||||
|
import io |
||||||
|
from typing import Optional, Callable, Any, Iterable, Mapping |
||||||
|
|
||||||
|
import requests |
||||||
|
import win32api |
||||||
|
import win32con |
||||||
|
from PIL import Image |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
# 手机实体类 |
||||||
|
from openpyxl import load_workbook, Workbook |
||||||
|
from pynput.mouse import Controller, Button |
||||||
|
from selenium.common.exceptions import NoSuchElementException |
||||||
|
from selenium.webdriver.chrome import webdriver |
||||||
|
from selenium.webdriver.chrome.options import Options |
||||||
|
from urllib3.exceptions import HeaderParsingError |
||||||
|
|
||||||
|
from Crawler import MobilePhoneCrawler |
||||||
|
from config.config import cf, config_path |
||||||
|
from config.log import writeInfo, writeError |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
import re |
||||||
|
|
||||||
|
# Static request headers, including a captured Tmall session cookie.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; lid=tb84443556; enc=Di1rYCRWDQZC0UbccH38rIzuBg8LLFGKPSeQNu0fJ6Atw1lfF%2BtBE6Jm3vKtkZ%2FcJwoY%2FA2OAFq1CCgzrB0Wmg%3D%3D; t=2ce95276273d2f0fec4b735114efb9f0; uc3=id2=UonSf2s8K7H57A%3D%3D&nk2=F5RNYQezF9ZVJA%3D%3D&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dByuPZuePp%2FK4exO4%3D; tracknick=tb84443556; uc4=nk4=0%40FY4Gtg6GE3gLVPH74U0sgDg9VVYt&id4=0%40UOE4tAnGHWIKt7PI5bS6f4noV%2Bbp; lgc=tb84443556; _tb_token_=e83ebbe73eeff; cookie2=170434fc3c4cae5b8d05c6ca3e035a7d; cna=3uL0FRGpQX4CAdoUCaGR7Dw2; UM_distinctid=16d0a0bd4f67e9-0657eeb9eeb382-5373e62-384000-16d0a0bd4f7aff; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _med=dw:2560&dh:1440&pw:2560&ph:1440&ist:0; cq=ccp%3D1; swfstore=175036; res=scroll%3A1613*5900-client%3A1613*924-offset%3A1613*5900-screen%3A2560*1440; CNZZDATA1256793290=2086205104-1567833369-%7C1567844169; pnm_cku822=098%23E1hvKpvUvbpvUpCkvvvvvjiPRFS96jtjn2MWsjD2PmPp1jr8RLqwtjY8RFdUsjtbRpGCvvLMMQvvmphvLhbyi9mFecXPjV5vsEe4jVDQpGoHbdiQpznCwaZOecXPjV5vsEe4jVDQpYLhbdiQpzwBwaZOecEvjV5vsCkwjVDQpGFvbdiQpzvxwa7ivpvUphvhrpcsXvmEvpvVpyUUCE%2BfKphv8hCvCbyvvh89phvOJpvvpYYvpm9vvvCHtZCv8vvvvhcDphvOKpvvBHvCvpvVphhvvvvvRphvChCvvvm5vpvhphvhHv%3D%3D; isg=BHBwqfKM2QBI7oUELIuAZ-OhQT7CuVQDVfnti2rBAUuPJRHPEs91k6SXeW3gtQzb; l=cBO-RSocq17jyErCBOCZlurza77TvIRAguPzaNbMi_5QN1TQITQOkrDBxe96cjWdtj8B4JuaUMv9-etuigILNzGHtBUV.'
}

# Explode the raw cookie header into name/value dicts (the shape
# selenium's add_cookie expects).
cookie_list = []
for cookie in headers['cookie'].split(';'):
    # FIX: split on the FIRST '=' only — several cookie values (e.g. uc3)
    # contain '=' themselves, so split('=')[1] truncated them. Also strip
    # the leading space left after each '; ' separator.
    name, _, value = cookie.strip().partition('=')
    cookie_list.append({'name': name, 'value': value})
||||||
|
|
||||||
|
|
||||||
|
# 天猫手机爬虫 |
||||||
|
# https://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&click_id=%CA%D6%BB%FA&from=mallfp..pc_1.7_hq&spm=875.7931836%2FB.a1z5h.8.66144265MD9ShM |
||||||
|
class TmallCrawler(MobilePhoneCrawler):
    """Tmall phone crawler: QR-code login plus list/detail page scraping.

    Entry list page:
    https://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&...
    """

    def __init__(self) -> None:
        super().__init__()
        # Dedicated HTTP session so cookies persist across requests.
        self.session = requests.Session()
        # QR-code login (currently disabled; the hard-coded cookie header is used).
        # self.login()
        self.get_page()

    '''
    Log in via the Taobao QR-code flow.
    '''

    def login(self):
        # Request a QR-code login token + image URL.
        login_url = 'https://qrlogin.taobao.com/qrcodelogin/generateQRCode4Login.do?from=tmall&appkey=00000000&umid_token=T4C16243DC287A311CA928E0D5EA177D443B864009178BBAA55A4CB86A4'
        writeInfo(login_url)
        login_res = self.session.get(login_url)
        res_content = login_res.content.decode()
        # The endpoint returns JSONP; cut out the JSON object between the braces.
        res_json = json.loads(res_content[res_content.index("{"):res_content.index("}") + 1])
        writeInfo(json.dumps(res_json, indent=1))
        img_url = res_json["url"]
        img_res = self.session.get("http:%s" % img_url)
        if img_res.status_code == 200:
            img_name = 'login.png'
            # Save the QR-code image locally.
            with open(img_name, 'wb') as file:
                file.write(img_res.content)
            # Show the QR code so the user can scan it.
            Image.open(img_name).show()
            win32api.MessageBox(0, "请打开手机淘宝扫描二维码", "提醒", win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
            # Poll the scan status until login succeeds (or raise on failure).
            while True:
                login_url = "https://qrlogin.taobao.com/qrcodelogin/qrcodeLoginCheck.do?lgToken={0}&defaulturl=https%3A%2F%2Fwww.tmall.com".format(
                    res_json['lgToken'])
                writeInfo("login_url:{0}".format(login_url))
                check_login_res = self.session.get(login_url)
                # Check the scan result.
                if check_login_res.status_code == 200:
                    check_login_res_json = json.loads(check_login_res.content.decode())
                    writeInfo(json.dumps(check_login_res_json, indent=1))
                    if check_login_res_json['code'] == '10006':
                        # Code 10006: QR code scanned and confirmed — follow the redirect chain.
                        check_login_url = check_login_res_json['url']
                        writeInfo("check_login_url={0}".format(check_login_url))
                        login_res = self.session.get(check_login_url)
                        if login_res.status_code == 200:
                            # The redirect page embeds the identity-check URL in an iframe.
                            login_res_html = BeautifulSoup(login_res.content, 'html.parser')
                            check_url = login_res_html.select_one("iframe")["src"]
                            writeInfo("check_url={0}".format(check_url))
                            # Perform the identity verification request.
                            check_login_res = self.session.get(check_url)
                            if check_login_res.status_code == 200:
                                check_login_res_content = check_login_res.content.decode()
                                # Alibaba identity-verification page: extract the verify_modes URL.
                                verify_modes_url = re.search("http.*verify_modes.*=",
                                                             check_login_res_content).group() + '1'
                                verify_modes_res = self.session.get(verify_modes_url)
                                if verify_modes_res.status_code == 200:
                                    verify_modes_res_content = verify_modes_res.content.decode()
                                    if '你最近购买过什么商品' in verify_modes_res_content:
                                        # Image-based verification challenge cannot be automated.
                                        raise Exception("触发图片验证,模拟请求失败")
                                    else:
                                        win32api.MessageBox(0, "请在手机淘宝上点击确认按钮登录", "提醒",
                                                            win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
                                        # Extract the htoken used to poll the mobile-app confirmation.
                                        htoken = re.search("htoken\".*[a-zA-Z]", verify_modes_res_content).group()
                                        htoken = htoken[htoken.index(":") + 2:]
                                        # Poll until the user confirms on the phone.
                                        while True:
                                            time.sleep(1)
                                            check_status_res = self.session.get(
                                                "https://passport.taobao.com/iv/onekey/check_status.do?htoken={0}".format(
                                                    htoken))
                                            if check_status_res.status_code == 200:
                                                check_status_res_json = json.loads(check_status_res.content.decode())
                                                if check_status_res_json['content']['code'] == '1':
                                                    # Confirmed: follow the final safe-login URL.
                                                    login_safe_res = self.session.get(
                                                        check_status_res_json['content']['url'])
                                                    if login_safe_res.status_code == 200:
                                                        # login_safe_res_content=login_safe_res.content.decode(login_safe_res.apparent_encoding)
                                                        # login_safe_href=re.search("https.*pass.tmall.com.*\w",login_safe_res_content).group()
                                                        # index_res = self.session.get(login_safe_href)
                                                        writeInfo("登录成功")
                                                        break
                                                    else:
                                                        raise Exception("模拟登陆请求失败!!!")
                                                else:
                                                    # Not confirmed yet; log and keep polling.
                                                    writeInfo(json.dumps(check_status_res_json, indent=1))
                                            else:
                                                raise Exception("模拟登陆请求失败!!!")
                                        # Login completed — leave the outer polling loop too.
                                        break
                                else:
                                    raise Exception("模拟登陆请求失败!!!")
                            else:
                                raise Exception("模拟登陆请求失败!!!")
                        else:
                            raise Exception("模拟登陆请求失败!!!")
                    elif check_login_res_json['code'] == '10004':
                        # Code 10004: QR code expired — restart the login flow.
                        self.login()
                time.sleep(1)
        else:
            raise Exception("获取登陆二维码图片失败")

    '''
    Walk the paged product list.
    '''

    def get_page(self):
        # Product list-page base address.
        domain = "https://list.tmall.com/search_product.htm"
        url = '{0}?q=%CA%D6%BB%FA&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&xl=shouji_1&from=mallfp..pc_1_suggest'.format(
            domain)
        while True:
            # Fetch the list-page response.
            res = self.session.get(url, headers=headers)
            # Only a 200 response is processed.
            if res.status_code == 200:
                try:
                    # Parse the HTML with BeautifulSoup.
                    res_html = BeautifulSoup(res.content, 'html.parser')
                    # Captcha detection: the slide-verification page titles itself 'security-X5'.
                    if 'security-X5' == res_html.select_one("title").text:
                        self.clickCaptcha(url)
                    # Current page marker (also used below to find the next-page link).
                    current_page = res_html.select_one("b[class=ui-page-cur]")
                    writeInfo("开始解析第{0}页的数据,url:{1}".format(current_page.text, url))
                    # Every product link on the list page.
                    product_hrefs = res_html.select("#J_ItemList .productTitle>a")
                    for product_href in product_hrefs:
                        # Visit each product link for its detail data.
                        self.get_mobile("https:{0}".format(product_href['href']))
                        # Stop once the configured record cap is reached.
                        if len(self.mobile_list) == self.max_count:
                            break
                except Exception as e:
                    writeError(e)
            else:
                writeError("获取分页信息失败,url:%s响应状态码:%d" % (url, res.status_code))
            # NOTE(review): if the very first response is not 200, current_page
            # is unbound here and this line raises NameError — confirm intended.
            url = "{0}{1}".format(domain, current_page.find_next_siblings()[0]['href'])

    '''
    Slide-captcha verification.
    url: URL of the page showing the slide captcha.
    '''

    def clickCaptcha(self, url):
        try:
            chrome_options = Options()
            chrome_options.binary_location = cf.get('selenium', 'binary_location')
            # Run without the sandbox (needed when running as root).
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            # chrome_options.add_argument('--headless')
            # Reuse the configured Chrome user-data directory.
            chrome_options.add_argument('--user-data-dir={0}'.format(cf.get('selenium', 'user_data_dir')))
            # Skip image loading for speed.
            chrome_options.add_argument('blink-settings=imagesEnabled=false')
            # Disable GPU acceleration.
            chrome_options.add_argument('--disable-gpu')
            # Maximize the window.
            chrome_options.add_argument('--start-maximized')
            # Full-screen mode.
            chrome_options.add_argument('start-fullscreen')
            # Developer mode: hide the fact that Selenium is driving the browser.
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            driver = webdriver.WebDriver(options=chrome_options, service_log_path="I:\ChromeUpdater\selenium.log")
            # driver.set_window_rect(0,0,1024,768)
            # Open the slide-verification page.
            driver.get(url)
            try:
                # The slider knob.
                nc_1_n1z = driver.find_element_by_css_selector("#nc_1_n1z")
                # The slider track.
                nc_1__scale_text = driver.find_element_by_css_selector("#nc_1__scale_text")
                # OS-level mouse controller (harder to detect than WebDriver actions).
                mouse = Controller()
                # Move the cursor to the knob's center, then drag it across the track.
                x = nc_1_n1z.rect['x'] + nc_1_n1z.rect['width'] / 2
                y = nc_1_n1z.rect['y'] + nc_1_n1z.rect['height'] / 2
                mouse.position = (x, y)
                time.sleep(0.5)
                mouse.press(Button.left)
                time.sleep(0.5)
                mouse.move(x + nc_1__scale_text.rect['width'] - nc_1_n1z.rect['width'], y)
                time.sleep(0.5)
                mouse.release(Button.left)
                # Poll for the failure marker; retry the whole captcha on failure.
                # NOTE(review): this loop only exits via the .errloading branch —
                # if the slide succeeds it spins forever; confirm intended.
                while True:
                    if len(driver.find_elements_by_css_selector(".errloading")) > 0:
                        driver.quit()
                        self.clickCaptcha(url)
                        break
                    else:
                        pass
                    # sub_slide_width = random.randint(30, 50)
                    # action.move_by_offset(sub_slide_width, 0).perform()
                    # start += sub_slide_width
                    time.sleep(random.randint(1, 10) / 10)
                cookie_list = driver.get_cookies()
                # Close the browser.
                driver.quit()
            except NoSuchElementException as e:
                writeError(e)
                driver.quit()
                self.clickCaptcha(url)
        except Exception as e:
            writeError(e)
            raise Exception("模拟滑动验证失败")

    '''
    Fetch one phone's detail data.
    url: the phone's product page.
    '''

    def get_mobile(self, url, param_url=None, **kwargs):
        res = self.session.get(url)
        if res.status_code == 200:
            res_html = BeautifulSoup(res.content, 'html.parser')
            # Captcha detection, same as on list pages.
            if 'security-X5' == res_html.select_one("title").text:
                self.clickCaptcha(url)
            # Spec table: skip products that don't have one.
            if res_html.select_one("#J_Attrs") is None:
                writeInfo("手机详情url:%s没有规格参数" % url)
            else:
                try:
                    # All parameter-name cells of the spec table, excluding subsection rows.
                    ths = res_html.select("table:contains('规格参数') tbody>tr:not([class='tm-tableAttrSub']) th")
                    # Collect name/value pairs for this phone.
                    mobile_dict = {}
                    for th in ths:
                        # colspan cells are section headers, not parameters.
                        if 'colspan' in th.attrs:
                            continue
                        key = str(th.text).strip()
                        value = str(th.next_sibling.text).strip()
                        mobile_dict[key] = value
                    # Store the record.
                    self.mobile_list.append(mobile_dict)
                    writeInfo("添加手机:{0}信息".format(str(res_html.select_one("div[class=tb-detail-hd]>h1").text).strip()))
                except Exception as e:
                    writeError(e)
        else:
            writeError("手机url:%s响应状态码:%d" % (url, res.status_code))

    # Save one phone record.
    def save_mobile(self, mobile):
        self.mobile_list.append(mobile)
||||||
|
|
||||||
|
|
||||||
|
# 评测中心手机爬虫 |
||||||
|
# http://product.cnmo.com/all/product.html |
||||||
|
class CnmoCrawler(MobilePhoneCrawler):
    """Crawler for http://product.cnmo.com/all/product.html.

    Fetches phone detail pages with a worker-thread pool, buffers records,
    writes them to an Excel report (file1), then produces a cleaned report
    (file2) containing only rows whose first `param_required_index` columns
    are all populated.
    """

    def __init__(self) -> None:
        super().__init__()
        # Worker threads currently in flight.
        self.threads = []
        # Serializes save_mobile() calls across worker threads.
        self.threadLock = threading.Lock()
        try:
            # Thread-pool size.
            self.thread_count = int(cf.get('excel', 'thread_count'))
            # Flush to Excel once this many records are buffered.
            self.data_size = int(cf.get('excel', 'data_size'))
            # Output report paths: raw data (file1) and cleaned data (file2).
            self.file1 = cf.get('excel', 'file1')
            self.file2 = cf.get('excel', 'file2')
            # Column headers / parameter names to persist.
            self.param_name_list = cf.get('excel', 'param_name').split(',')
            # Number of leading columns that must be non-empty after cleaning.
            self.param_required_index = int(cf.get('excel', 'param_required_index'))
            # Total number of records to collect.
            self.max_count = int(cf.get('excel', 'max_count'))
        except Exception as e:
            writeError("初始化参数失败,异常信息{0},请检查配置文件{1}的配置".format(e, config_path))
            raise
        # Remove output files from any previous run.
        if os.path.exists(self.file1):
            os.remove(self.file1)
        if os.path.exists(self.file2):
            os.remove(self.file2)

    def get_page(self):
        """Walk the paged product list, dispatching a worker per phone."""
        # First list page.
        start_url = 'http://product.cnmo.com/all/product.html'
        # Link to the next page, discovered while parsing the current one.
        next_page_url = None
        while True:
            current_page_url = start_url if next_page_url is None else next_page_url
            writeInfo("开始解析列表页:{0}".format(current_page_url))
            # Fetch the page (with retries).
            res = self.get_req(current_page_url)
            if res is not None and res.status_code == 200:
                try:
                    writeInfo("列表页:{0}解析成功".format(current_page_url))
                    res_html = BeautifulSoup(self.uzipData(res.content), 'html.parser')
                    # One <li> per product.
                    li_s = res_html.select("ul.all-con-con-ul.cf>li")
                    for li in li_s:
                        # Stop once enough records were collected.
                        if len(self.mobile_list) > self.max_count:
                            return
                        p = li.select_one('p.red')
                        # Launch date ("YYYY年MM月") shown on the list page, if any.
                        time_to_market = re.search('\d{4}年\d{2}月', p.text)
                        # Fetch the detail pages on a worker thread.
                        thread = myThread(self, 'http:{0}'.format(li.select_one('a.name')['href']),
                                          'http:{0}'.format(li.select_one('div.info>a:contains(参数)')['href']),
                                          上市时间=None if time_to_market is None else time_to_market.group())
                        thread.start()
                        # Pool full: wait for the batch to finish, then reset it.
                        if len(self.threads) == self.thread_count:
                            for t in self.threads:
                                t.join()
                            writeInfo("清空线程池")
                            self.threads.clear()

                        self.threads.append(thread)

                    # Follow the "next page" link.
                    next_page_url = 'http:{0}'.format(res_html.select_one(".pnext")["href"])
                except Exception as e:
                    writeError("解析列表页出现异常信息:{0}".format(e))
            else:
                raise Exception("列表页:{0}解析失败".format(current_page_url))

    def run(self):
        """Collect everything, then clean the raw report into file2."""
        self.get_page()
        writeInfo('采集数据完毕,开始清洗脏数据')
        self.clear_data()
        writeInfo('清洗脏数据完毕')

    def get_mobile(self, base_url, param_url, **kwargs):
        """Parse one phone's score page and spec page into a record.

        base_url: the phone's overview page (user rating).
        param_url: the phone's spec-parameter page.
        kwargs: extra fields merged into the record (e.g. 上市时间).
        """
        # Record for this phone.
        param_dict = {}
        writeInfo("开始解析手机详情参数页{0}".format(param_url))

        # Aggregate user rating from the overview page.
        score_res = self.get_req(base_url)
        if score_res is not None and score_res.status_code == 200:
            score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser')
            param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span',
                                                                                             {'class': 'red'}).text
        mobile_res = self.get_req(param_url)

        if mobile_res is not None and mobile_res.status_code == 200:
            try:
                mobile_res_html = BeautifulSoup(self.uzipData(mobile_res.content), 'html.parser')
                phone_name = mobile_res_html.select_one('#proName>a').text
                param_dict['手机名称'] = phone_name
                writeInfo("开始解析手机{0}详细参数".format(phone_name))
                # Reference price.
                param_dict['参考价格'] = mobile_res_html.select_one('span:contains(参考价格)').find_next().text
                # E-commerce quoted price.
                param_dict['电商报价'] = mobile_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
                # Each spec <p> carries the name/value as attributes.
                param_name_list = mobile_res_html.select('div.right>p')
                for param_name in param_name_list:
                    param_dict[param_name['paramname']] = param_name['paramvalue']
                # FIX: use a with-block so the lock is released even if
                # save_mobile raises; the original acquire/release pair would
                # leave the lock held forever and deadlock the other workers.
                with self.threadLock:
                    self.save_mobile(dict(param_dict, **kwargs))
            except Exception as e:
                writeError("解析手机出现异常信息:{0}".format(e))
        else:
            writeError("解析手机详情参数页{0}失败".format(param_url))

    def save_mobile(self, mobile, ingore=False):
        """Buffer a record; flush a batch to Excel when the buffer fills.

        ingore (sic, kept for caller compatibility): when True, flush
        whatever partial batch remains instead of waiting for a full one.
        """
        self.mobile_list.append(mobile)
        writeInfo("当前已爬取{0}台手机".format(len(self.mobile_list)))
        if not ingore and len(self.mobile_list) % self.data_size == 0:
            self.save_excel(self.mobile_list[-self.data_size:])
        elif ingore and len(self.mobile_list) % self.data_size != 0:
            self.save_excel(self.mobile_list[-(len(self.mobile_list) % self.data_size):])
        else:
            writeInfo('缓存数据不足{0}条或没有剩余数据,不需要写入'.format(self.data_size))

    def init_excel(self, file, max_index=None):
        """Create *file* with a header row of the first max_index (or all) param names."""
        wb = Workbook()
        ws = wb.active
        for index, param_name in enumerate(self.param_name_list):
            if max_index is None or index < max_index:
                ws.cell(row=1, column=index + 1, value=param_name)
        wb.save(file)
        wb.close()

    # Append a batch of records to the raw report (file1).
    def save_excel(self, data_list):
        # First write: create the file with its header row.
        if not os.path.exists(self.file1):
            self.init_excel(self.file1)
        wb = load_workbook(self.file1)
        ws = wb.active
        # Append one row per record, columns ordered by param_name_list.
        max_row = ws.max_row
        for row_index, data in enumerate(data_list):
            for column_index, param_name in enumerate(self.param_name_list):
                ws.cell(row=max_row + row_index + 1, column=column_index + 1,
                        value=data[param_name] if param_name in data else None)
        wb.save(self.file1)
        wb.close()

    # Clean dirty rows out of file1 into file2.
    def clear_data(self):
        # Raw report.
        source_wb = load_workbook(self.file1)
        source_ws = source_wb.active
        # Fresh cleaned report with only the required columns.
        self.init_excel(self.file2, max_index=self.param_required_index)
        target_wb = load_workbook(self.file2)
        target_ws = target_wb.active
        write_row = 2
        for current_row in range(2, source_ws.max_row + 1):
            for current_column in range(1, self.param_required_index + 1):
                val = source_ws.cell(row=current_row, column=current_column).value
                # NOTE(review): precedence here is (col==2 and val=='曝光') or
                # val=='即将上市'; if the intent was col==2 and val in
                # ('曝光','即将上市'), parentheses are needed. Behavior kept as-is.
                if val is None or len(val) == 0 or (
                        current_column == 2 and val == '曝光' or val == '即将上市'):
                    # Dirty row: blank it out and skip the rest of its columns.
                    for i in range(1, self.param_required_index + 1):
                        target_ws.cell(row=write_row, column=i, value='')
                    break
                else:
                    target_ws.cell(row=write_row, column=current_column, value=val)
                    if current_column == self.param_required_index:
                        write_row += 1
        # Persist the cleaned report.
        target_wb.save(self.file2)

    def get_req(self, url, max_retries=3, **kwargs):
        """GET *url* with a browser User-Agent; retry up to max_retries times.

        kwargs are merged into the request headers. Returns the response,
        or None after the retries are exhausted.
        """
        try:
            return requests.get(url, headers=dict({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            },
                **kwargs))
        except Exception as e:
            writeError(e)
            time.sleep(10)
            max_retries -= 1
            if max_retries > 0:
                # FIX: propagate the retried response — the original dropped
                # this return, so a successful retry still yielded None.
                return self.get_req(url, max_retries, **kwargs)
            else:
                return None
||||||
|
|
||||||
|
|
||||||
|
class myThread(threading.Thread):
    """Worker thread that delegates one phone's fetch to its crawler.

    Stores the crawler, the two detail-page URLs and any extra record
    fields, then forwards them to crawler.get_mobile() when run.
    """

    def __init__(self, crawler, base_url, param_url, **kwargs):
        super().__init__()
        self.crawler = crawler
        self.base_url = base_url
        self.param_url = param_url
        self.kwargs = kwargs

    def run(self) -> None:
        # Forward everything to the owning crawler.
        self.crawler.get_mobile(self.base_url, self.param_url, **self.kwargs)
@ -0,0 +1,10 @@ |
|||||||
|
beautifulsoup4==4.8.0 |
||||||
|
bs4==0.0.1 |
||||||
|
certifi==2019.6.16 |
||||||
|
chardet==3.0.4 |
||||||
|
idna==2.8 |
||||||
|
Pillow==6.1.0 |
||||||
|
pywin32==224 |
||||||
|
requests==2.22.0 |
||||||
|
soupsieve==1.9.3 |
||||||
|
urllib3==1.25.3 |
Loading…
Reference in new issue