master
潘啟华 5 years ago
parent f85ab74267
commit f4c1d435b1
  1. 12
      Crawler.py
  2. 40
      main.py

@ -3,13 +3,13 @@ import gzip
import zlib import zlib
class MobilePhoneCrawler(): class MobiePhoneCrawler():
def __init__(self) -> None: def __init__(self) -> None:
super().__init__() super().__init__()
# 限制5000条数据 # 限制5000条数据
self._max_count = 5000 self._max_count = 5000
# 手机列表数据 # 手机列表数据
self._mobile_list = [] self._mobie_list = []
@property @property
def max_count(self): def max_count(self):
@ -20,19 +20,19 @@ class MobilePhoneCrawler():
self._max_count = value self._max_count = value
@property @property
def mobile_list(self): def mobie_list(self):
return self._mobile_list return self._mobie_list
# 获取列表数据 # 获取列表数据
def get_page(self): def get_page(self):
pass pass
# 获取手机详情数据 # 获取手机详情数据
def get_mobile(self, base_url,param_url,**kwargs): def get_mobie(self, base_url, param_url, **kwargs):
pass pass
# 保存手机数据 # 保存手机数据
def save_mobile(self, mobile): def save_mobie(self, mobie):
pass pass
def get_req(self,url,**kwargs): def get_req(self,url,**kwargs):

@ -9,14 +9,14 @@ from bs4 import BeautifulSoup
# 手机实体类 # 手机实体类
from openpyxl import load_workbook, Workbook from openpyxl import load_workbook, Workbook
from Crawler import MobilePhoneCrawler from Crawler import MobiePhoneCrawler
from config.config import cf, config_path from config.config import cf, config_path
from config.log import writeInfo, writeError from config.log import writeInfo, writeError
# 评测中心手机爬虫 # 评测中心手机爬虫
# http://product.cnmo.com/all/product.html # http://product.cnmo.com/all/product.html
class CnmoCrawler(MobilePhoneCrawler): class CnmoCrawler(MobiePhoneCrawler):
def __init__(self) -> None: def __init__(self) -> None:
super().__init__() super().__init__()
self.threads = [] self.threads = []
@ -62,7 +62,7 @@ class CnmoCrawler(MobilePhoneCrawler):
# 解析列表数据 # 解析列表数据
li_s = res_html.select("ul.all-con-con-ul.cf>li") li_s = res_html.select("ul.all-con-con-ul.cf>li")
for li in li_s: for li in li_s:
if len(self.mobile_list) > self.max_count: if len(self.mobie_list) > self.max_count:
return return
p = li.select_one('p.red') p = li.select_one('p.red')
# 多线程获取手机详情参数 # 多线程获取手机详情参数
@ -95,7 +95,7 @@ class CnmoCrawler(MobilePhoneCrawler):
self.clear_data() self.clear_data()
writeInfo('清洗脏数据完毕') writeInfo('清洗脏数据完毕')
def get_mobile(self, base_url, param_url, **kwargs): def get_mobie(self, base_url, param_url, **kwargs):
# 字典存储手机详细参数 # 字典存储手机详细参数
param_dict = {} param_dict = {}
writeInfo("开始解析手机详情参数页{0}".format(param_url)) writeInfo("开始解析手机详情参数页{0}".format(param_url))
@ -106,28 +106,28 @@ class CnmoCrawler(MobilePhoneCrawler):
score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser') score_res_html = BeautifulSoup(self.uzipData(score_res.content), 'html.parser')
param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span', param_dict['网友综合评分'] = score_res_html.select_one('div.pro-comm-stars').find_next('span',
{'class': 'red'}).text {'class': 'red'}).text
mobile_res = self.get_req(param_url) mobie_res = self.get_req(param_url)
# 判断响应状态码,200正常返回 # 判断响应状态码,200正常返回
if mobile_res is not None and mobile_res.status_code == 200: if mobie_res is not None and mobie_res.status_code == 200:
# 调用解析器解析网页请求体 # 调用解析器解析网页请求体
try: try:
mobile_res_html = BeautifulSoup(self.uzipData(mobile_res.content), 'html.parser') mobie_res_html = BeautifulSoup(self.uzipData(mobie_res.content), 'html.parser')
phone_name = mobile_res_html.select_one('#proName>a').text phone_name = mobie_res_html.select_one('#proName>a').text
param_dict['手机名称'] = phone_name param_dict['手机名称'] = phone_name
writeInfo("开始解析手机{0}详细参数".format(phone_name)) writeInfo("开始解析手机{0}详细参数".format(phone_name))
# 参考价格 # 参考价格
param_dict['参考价格'] = mobile_res_html.select_one('span:contains(参考价格)').find_next().text param_dict['参考价格'] = mobie_res_html.select_one('span:contains(参考价格)').find_next().text
# 电商报价 # 电商报价
param_dict['电商报价'] = mobile_res_html.select_one('span:contains(电商报价)').next_sibling.strip() param_dict['电商报价'] = mobie_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
# 获取参数名 # 获取参数名
param_name_list = mobile_res_html.select('div.right>p') param_name_list = mobie_res_html.select('div.right>p')
for param_name in param_name_list: for param_name in param_name_list:
# 获取参数值 # 获取参数值
param_dict[param_name['paramname']] = param_name['paramvalue'] param_dict[param_name['paramname']] = param_name['paramvalue']
# 获取锁,用于线程同步 # 获取锁,用于线程同步
self.threadLock.acquire() self.threadLock.acquire()
self.save_mobile(dict(param_dict, **kwargs)) self.save_mobie(dict(param_dict, **kwargs))
# 释放锁,开启下一个线程 # 释放锁,开启下一个线程
self.threadLock.release() self.threadLock.release()
except Exception as e: except Exception as e:
@ -135,13 +135,13 @@ class CnmoCrawler(MobilePhoneCrawler):
else: else:
writeError("解析手机详情参数页{0}失败".format(param_url)) writeError("解析手机详情参数页{0}失败".format(param_url))
def save_mobile(self, mobile, ingore=False): def save_mobie(self, mobie, ingore=False):
self.mobile_list.append(mobile) self.mobie_list.append(mobie)
writeInfo("当前已爬取{0}台手机".format(len(self.mobile_list))) writeInfo("当前已爬取{0}台手机".format(len(self.mobie_list)))
if not ingore and len(self.mobile_list) % self.data_size == 0: if not ingore and len(self.mobie_list) % self.data_size == 0:
self.save_excel(self.mobile_list[-self.data_size:]) self.save_excel(self.mobie_list[-self.data_size:])
elif ingore and len(self.mobile_list) % self.data_size != 0: elif ingore and len(self.mobie_list) % self.data_size != 0:
self.save_excel(self.mobile_list[-(len(self.mobile_list) % self.data_size):]) self.save_excel(self.mobie_list[-(len(self.mobie_list) % self.data_size):])
else: else:
writeInfo('缓存数据不足{0}条或没有剩余数据,不需要写入'.format(self.data_size)) writeInfo('缓存数据不足{0}条或没有剩余数据,不需要写入'.format(self.data_size))
@ -221,4 +221,4 @@ class myThread(threading.Thread):
self.kwargs = kwargs self.kwargs = kwargs
def run(self) -> None: def run(self) -> None:
self.crawler.get_mobile(self.base_url, self.param_url, **self.kwargs) self.crawler.get_mobie(self.base_url, self.param_url, **self.kwargs)

Loading…
Cancel
Save