master
潘啟华 5 years ago
parent f85ab74267
commit f4c1d435b1
  1. 12
      Crawler.py
  2. 40
      main.py

@ -3,13 +3,13 @@ import gzip
import zlib
class MobilePhoneCrawler():
class MobiePhoneCrawler():
def __init__(self) -> None:
super().__init__()
# 限制5000条数据
self._max_count = 5000
# 手机列表数据
self._mobile_list = []
self._mobie_list = []
@property
def max_count(self):
@ -20,19 +20,19 @@ class MobilePhoneCrawler():
self._max_count = value
@property
def mobile_list(self):
return self._mobile_list
def mobie_list(self):
return self._mobie_list
# 获取列表数据
def get_page(self):
pass
# 获取手机详情数据
def get_mobile(self, base_url,param_url,**kwargs):
def get_mobie(self, base_url, param_url, **kwargs):
pass
# 保存手机数据
def save_mobile(self, mobile):
def save_mobie(self, mobie):
pass
def get_req(self,url,**kwargs):

@ -9,14 +9,14 @@ from bs4 import BeautifulSoup
# 手机实体类
from openpyxl import load_workbook, Workbook
from Crawler import MobilePhoneCrawler
from Crawler import MobiePhoneCrawler
from config.config import cf, config_path
from config.log import writeInfo, writeError
# 评测中心手机爬虫
# http://product.cnmo.com/all/product.html
class CnmoCrawler(MobilePhoneCrawler):
class CnmoCrawler(MobiePhoneCrawler):
def __init__(self) -> None:
super().__init__()
self.threads = []
@ -62,7 +62,7 @@ class CnmoCrawler(MobilePhoneCrawler):
# 解析列表数据
li_s = res_html.select("ul.all-con-con-ul.cf>li")
for li in li_s:
if len(self.mobile_list) > self.max_count:
if len(self.mobie_list) > self.max_count:
return
p = li.select_one('p.red')
# 多线程获取手机详情参数
@ -95,7 +95,7 @@ class CnmoCrawler(MobilePhoneCrawler):
self.clear_data()
writeInfo('清洗脏数据完毕')
def get_mobile(self, base_url, param_url, **kwargs):
def get_mobie(self, base_url, param_url, **kwargs):
# 字典存储手机详细参数
param_dict = {}
writeInfo("开始解析手机详情参数页{0}".format(param_url))
@ -106,28 +106,28 @@ class CnmoCrawler(MobilePhoneCrawler):
score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser')
param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span',
{'class': 'red'}).text
mobile_res = self.get_req(param_url)
mobie_res = self.get_req(param_url)
# 判断响应状态码,200正常返回
if mobile_res is not None and mobile_res.status_code == 200:
if mobie_res is not None and mobie_res.status_code == 200:
# 调用解析器解析网页请求体
try:
mobile_res_html = BeautifulSoup(self.uzipData(mobile_res.content), 'html.parser')
phone_name = mobile_res_html.select_one('#proName>a').text
mobie_res_html = BeautifulSoup(self.uzipData(mobie_res.content), 'html.parser')
phone_name = mobie_res_html.select_one('#proName>a').text
param_dict['手机名称'] = phone_name
writeInfo("开始解析手机{0}详细参数".format(phone_name))
# 参考价格
param_dict['参考价格'] = mobile_res_html.select_one('span:contains(参考价格)').find_next().text
param_dict['参考价格'] = mobie_res_html.select_one('span:contains(参考价格)').find_next().text
# 电商报价
param_dict['电商报价'] = mobile_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
param_dict['电商报价'] = mobie_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
# 获取参数名
param_name_list = mobile_res_html.select('div.right>p')
param_name_list = mobie_res_html.select('div.right>p')
for param_name in param_name_list:
# 获取参数值
param_dict[param_name['paramname']] = param_name['paramvalue']
# 获取锁,用于线程同步
self.threadLock.acquire()
self.save_mobile(dict(param_dict, **kwargs))
self.save_mobie(dict(param_dict, **kwargs))
# 释放锁,开启下一个线程
self.threadLock.release()
except Exception as e:
@ -135,13 +135,13 @@ class CnmoCrawler(MobilePhoneCrawler):
else:
writeError("解析手机详情参数页{0}失败".format(param_url))
def save_mobile(self, mobile, ingore=False):
self.mobile_list.append(mobile)
writeInfo("当前已爬取{0}台手机".format(len(self.mobile_list)))
if not ingore and len(self.mobile_list) % self.data_size == 0:
self.save_excel(self.mobile_list[-self.data_size:])
elif ingore and len(self.mobile_list) % self.data_size != 0:
self.save_excel(self.mobile_list[-(len(self.mobile_list) % self.data_size):])
def save_mobie(self, mobie, ingore=False):
self.mobie_list.append(mobie)
writeInfo("当前已爬取{0}台手机".format(len(self.mobie_list)))
if not ingore and len(self.mobie_list) % self.data_size == 0:
self.save_excel(self.mobie_list[-self.data_size:])
elif ingore and len(self.mobie_list) % self.data_size != 0:
self.save_excel(self.mobie_list[-(len(self.mobie_list) % self.data_size):])
else:
writeInfo('缓存数据不足{0}条或没有剩余数据,不需要写入'.format(self.data_size))
@ -221,4 +221,4 @@ class myThread(threading.Thread):
self.kwargs = kwargs
def run(self) -> None:
self.crawler.get_mobile(self.base_url, self.param_url, **self.kwargs)
self.crawler.get_mobie(self.base_url, self.param_url, **self.kwargs)

Loading…
Cancel
Save