|
|
@ -9,14 +9,14 @@ from bs4 import BeautifulSoup |
|
|
|
# 手机实体类 |
|
|
|
# 手机实体类 |
|
|
|
from openpyxl import load_workbook, Workbook |
|
|
|
from openpyxl import load_workbook, Workbook |
|
|
|
|
|
|
|
|
|
|
|
from Crawler import MobilePhoneCrawler |
|
|
|
from Crawler import MobiePhoneCrawler |
|
|
|
from config.config import cf, config_path |
|
|
|
from config.config import cf, config_path |
|
|
|
from config.log import writeInfo, writeError |
|
|
|
from config.log import writeInfo, writeError |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 评测中心手机爬虫 |
|
|
|
# 评测中心手机爬虫 |
|
|
|
# http://product.cnmo.com/all/product.html |
|
|
|
# http://product.cnmo.com/all/product.html |
|
|
|
class CnmoCrawler(MobilePhoneCrawler): |
|
|
|
class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
def __init__(self) -> None: |
|
|
|
def __init__(self) -> None: |
|
|
|
super().__init__() |
|
|
|
super().__init__() |
|
|
|
self.threads = [] |
|
|
|
self.threads = [] |
|
|
@ -62,7 +62,7 @@ class CnmoCrawler(MobilePhoneCrawler): |
|
|
|
# 解析列表数据 |
|
|
|
# 解析列表数据 |
|
|
|
li_s = res_html.select("ul.all-con-con-ul.cf>li") |
|
|
|
li_s = res_html.select("ul.all-con-con-ul.cf>li") |
|
|
|
for li in li_s: |
|
|
|
for li in li_s: |
|
|
|
if len(self.mobile_list) > self.max_count: |
|
|
|
if len(self.mobie_list) > self.max_count: |
|
|
|
return |
|
|
|
return |
|
|
|
p = li.select_one('p.red') |
|
|
|
p = li.select_one('p.red') |
|
|
|
# 多线程获取手机详情参数 |
|
|
|
# 多线程获取手机详情参数 |
|
|
@ -95,7 +95,7 @@ class CnmoCrawler(MobilePhoneCrawler): |
|
|
|
self.clear_data() |
|
|
|
self.clear_data() |
|
|
|
writeInfo('清洗脏数据完毕') |
|
|
|
writeInfo('清洗脏数据完毕') |
|
|
|
|
|
|
|
|
|
|
|
def get_mobile(self, base_url, param_url, **kwargs): |
|
|
|
def get_mobie(self, base_url, param_url, **kwargs): |
|
|
|
# 字典存储手机详细参数 |
|
|
|
# 字典存储手机详细参数 |
|
|
|
param_dict = {} |
|
|
|
param_dict = {} |
|
|
|
writeInfo("开始解析手机详情参数页{0}".format(param_url)) |
|
|
|
writeInfo("开始解析手机详情参数页{0}".format(param_url)) |
|
|
@ -106,28 +106,28 @@ class CnmoCrawler(MobilePhoneCrawler): |
|
|
|
score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser') |
|
|
|
score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser') |
|
|
|
param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span', |
|
|
|
param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span', |
|
|
|
{'class': 'red'}).text |
|
|
|
{'class': 'red'}).text |
|
|
|
mobile_res = self.get_req(param_url) |
|
|
|
mobie_res = self.get_req(param_url) |
|
|
|
|
|
|
|
|
|
|
|
# 判断响应状态码,200正常返回 |
|
|
|
# 判断响应状态码,200正常返回 |
|
|
|
if mobile_res is not None and mobile_res.status_code == 200: |
|
|
|
if mobie_res is not None and mobie_res.status_code == 200: |
|
|
|
# 调用解析器解析网页请求体 |
|
|
|
# 调用解析器解析网页请求体 |
|
|
|
try: |
|
|
|
try: |
|
|
|
mobile_res_html = BeautifulSoup(self.uzipData(mobile_res.content), 'html.parser') |
|
|
|
mobie_res_html = BeautifulSoup(self.uzipData(mobie_res.content), 'html.parser') |
|
|
|
phone_name = mobile_res_html.select_one('#proName>a').text |
|
|
|
phone_name = mobie_res_html.select_one('#proName>a').text |
|
|
|
param_dict['手机名称'] = phone_name |
|
|
|
param_dict['手机名称'] = phone_name |
|
|
|
writeInfo("开始解析手机{0}详细参数".format(phone_name)) |
|
|
|
writeInfo("开始解析手机{0}详细参数".format(phone_name)) |
|
|
|
# 参考价格 |
|
|
|
# 参考价格 |
|
|
|
param_dict['参考价格'] = mobile_res_html.select_one('span:contains(参考价格)').find_next().text |
|
|
|
param_dict['参考价格'] = mobie_res_html.select_one('span:contains(参考价格)').find_next().text |
|
|
|
# 电商报价 |
|
|
|
# 电商报价 |
|
|
|
param_dict['电商报价'] = mobile_res_html.select_one('span:contains(电商报价)').next_sibling.strip() |
|
|
|
param_dict['电商报价'] = mobie_res_html.select_one('span:contains(电商报价)').next_sibling.strip() |
|
|
|
# 获取参数名 |
|
|
|
# 获取参数名 |
|
|
|
param_name_list = mobile_res_html.select('div.right>p') |
|
|
|
param_name_list = mobie_res_html.select('div.right>p') |
|
|
|
for param_name in param_name_list: |
|
|
|
for param_name in param_name_list: |
|
|
|
# 获取参数值 |
|
|
|
# 获取参数值 |
|
|
|
param_dict[param_name['paramname']] = param_name['paramvalue'] |
|
|
|
param_dict[param_name['paramname']] = param_name['paramvalue'] |
|
|
|
# 获取锁,用于线程同步 |
|
|
|
# 获取锁,用于线程同步 |
|
|
|
self.threadLock.acquire() |
|
|
|
self.threadLock.acquire() |
|
|
|
self.save_mobile(dict(param_dict, **kwargs)) |
|
|
|
self.save_mobie(dict(param_dict, **kwargs)) |
|
|
|
# 释放锁,开启下一个线程 |
|
|
|
# 释放锁,开启下一个线程 |
|
|
|
self.threadLock.release() |
|
|
|
self.threadLock.release() |
|
|
|
except Exception as e: |
|
|
|
except Exception as e: |
|
|
@ -135,13 +135,13 @@ class CnmoCrawler(MobilePhoneCrawler): |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("解析手机详情参数页{0}失败".format(param_url)) |
|
|
|
writeError("解析手机详情参数页{0}失败".format(param_url)) |
|
|
|
|
|
|
|
|
|
|
|
def save_mobile(self, mobile, ingore=False): |
|
|
|
def save_mobie(self, mobie, ingore=False): |
|
|
|
self.mobile_list.append(mobile) |
|
|
|
self.mobie_list.append(mobie) |
|
|
|
writeInfo("当前已爬取{0}台手机".format(len(self.mobile_list))) |
|
|
|
writeInfo("当前已爬取{0}台手机".format(len(self.mobie_list))) |
|
|
|
if not ingore and len(self.mobile_list) % self.data_size == 0: |
|
|
|
if not ingore and len(self.mobie_list) % self.data_size == 0: |
|
|
|
self.save_excel(self.mobile_list[-self.data_size:]) |
|
|
|
self.save_excel(self.mobie_list[-self.data_size:]) |
|
|
|
elif ingore and len(self.mobile_list) % self.data_size != 0: |
|
|
|
elif ingore and len(self.mobie_list) % self.data_size != 0: |
|
|
|
self.save_excel(self.mobile_list[-(len(self.mobile_list) % self.data_size):]) |
|
|
|
self.save_excel(self.mobie_list[-(len(self.mobie_list) % self.data_size):]) |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeInfo('缓存数据不足{0}条或没有剩余数据,不需要写入'.format(self.data_size)) |
|
|
|
writeInfo('缓存数据不足{0}条或没有剩余数据,不需要写入'.format(self.data_size)) |
|
|
|
|
|
|
|
|
|
|
@ -221,4 +221,4 @@ class myThread(threading.Thread): |
|
|
|
self.kwargs = kwargs |
|
|
|
self.kwargs = kwargs |
|
|
|
|
|
|
|
|
|
|
|
def run(self) -> None: |
|
|
|
def run(self) -> None: |
|
|
|
self.crawler.get_mobile(self.base_url, self.param_url, **self.kwargs) |
|
|
|
self.crawler.get_mobie(self.base_url, self.param_url, **self.kwargs) |
|
|
|