master
panqihua 5 years ago
parent f4c1d435b1
commit 1e39d2fc72
  1. BIN
      chromedriver.exe
  2. 1
      config/config.py
  3. 35
      main.py
  4. 5
      requirements.txt
  5. 1
      test.py

Binary file not shown.

@ -13,6 +13,7 @@ logFile = cf.get('file', 'logFile')
logger=logging.getLogger()
logger.setLevel(logging.INFO)
def init():
logging.getLogger("urllib3").setLevel(logging.ERROR)
log_format=logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s",datefmt='%Y-%m-%d %H:%M:%S')
# 在控制台打印日志
streamHandler = logging.StreamHandler()

@ -95,6 +95,23 @@ class CnmoCrawler(MobiePhoneCrawler):
self.clear_data()
writeInfo('清洗脏数据完毕')
#获取价格标签数据
def find_chinese(self, html, text):
    """Find the first ``<span>`` selected by ``span:contains(<text>)``.

    The selector is first built from ``text`` as given. If that selects
    nothing, it is rebuilt from ``text`` encoded as GBK and decoded as
    ISO-8859-1, and the query is retried once with that variant.

    :param html: object exposing ``select_one(selector)``
        (presumably a BeautifulSoup document; confirm against callers).
    :param text: label text to look for inside a ``<span>``.
    :return: whatever ``select_one`` returns for the first selector that
        matches, or ``None`` when neither variant matches.
    """
    selector = 'span:contains({0})'
    # Query once and keep the result instead of repeating the same lookup.
    found = html.select_one(selector.format(text))
    if found is not None:
        return found
    try:
        garbled = text.encode('gbk').decode('iso8859-1')
    except UnicodeEncodeError:
        # Text that GBK cannot represent has no garbled variant to try.
        return None
    return html.select_one(selector.format(garbled))
def decode(self, text):
    """Try to recover GBK text that was decoded with a single-byte charset.

    ``text`` is re-encoded with each candidate charset in turn
    (ISO-8859-1, then ISO-8859-9) and the resulting bytes are decoded as
    GBK. The first candidate that round-trips without a codec error wins.

    :param text: string to repair; non-``str`` values are returned as-is.
    :return: the repaired string, or ``text`` unchanged when no candidate
        charset produces valid GBK.
    """
    if not isinstance(text, str):
        # Nothing to re-encode; hand the value back unchanged.
        return text
    for wrong_charset in ('iso8859-1', 'iso8859-9'):
        try:
            return text.encode(wrong_charset).decode('gbk')
        except UnicodeError:
            # Covers both encode and decode failures; only codec errors
            # mean "try the next charset" - anything else should surface.
            continue
    return text
def get_mobie(self, base_url, param_url, **kwargs):
# 字典存储手机详细参数
param_dict = {}
@ -112,19 +129,23 @@ class CnmoCrawler(MobiePhoneCrawler):
if mobie_res is not None and mobie_res.status_code == 200:
# 调用解析器解析网页请求体
try:
mobie_res_html = BeautifulSoup(self.uzipData(mobie_res.content), 'html.parser')
phone_name = mobie_res_html.select_one('#proName>a').text
mobie_res_html = BeautifulSoup(mobie_res.content, 'html.parser')
phone_name = self.decode(mobie_res_html.select_one('#proName>a').text)
param_dict['手机名称'] = phone_name
writeInfo("开始解析手机{0}详细参数".format(phone_name))
writeInfo("开始解析手机\"{0}\"详细参数".format(phone_name))
# 参考价格
param_dict['参考价格'] = mobie_res_html.select_one('span:contains(参考价格)').find_next().text
price=self.find_chinese(mobie_res_html,'参考价格')
if price is not None and price.find_next() is not None and "".join(filter(str.isdigit, price.find_next().text)).isdigit():
param_dict['参考价格'] = "".join(filter(str.isdigit, price.find_next().text))
# 电商报价
param_dict['电商报价'] = mobie_res_html.select_one('span:contains(电商报价)').next_sibling.strip()
price=self.find_chinese(mobie_res_html,'电商报价')
if price is not None and price.next_sibling is not None and "".join(filter(str.isdigit, price.next_sibling.strip())).isdigit():
param_dict['电商报价'] = "".join(filter(str.isdigit, price.next_sibling().strip))
# 获取参数名
param_name_list = mobie_res_html.select('div.right>p')
for param_name in param_name_list:
# 获取参数值
param_dict[param_name['paramname']] = param_name['paramvalue']
param_dict[param_name['paramname']] = self.decode(param_name['paramvalue'])
# 获取锁,用于线程同步
self.threadLock.acquire()
self.save_mobie(dict(param_dict, **kwargs))
@ -137,7 +158,7 @@ class CnmoCrawler(MobiePhoneCrawler):
def save_mobie(self, mobie, ingore=False):
self.mobie_list.append(mobie)
writeInfo("当前已爬取{0}台手机".format(len(self.mobie_list)))
writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list),max_count=self.max_count,num="{:.2%}".format(len(self.mobie_list)/self.max_count)))
if not ingore and len(self.mobie_list) % self.data_size == 0:
self.save_excel(self.mobie_list[-self.data_size:])
elif ingore and len(self.mobie_list) % self.data_size != 0:

@ -6,11 +6,6 @@ et-xmlfile==1.0.1
idna==2.8
jdcal==1.4.1
openpyxl==2.6.3
Pillow==6.1.0
pynput==1.4.2
pywin32==224
requests==2.22.0
selenium==3.141.0
six==1.12.0
soupsieve==1.9.3
urllib3==1.25.3

@ -1,3 +1,4 @@
from config.log import writeInfo
from main import CnmoCrawler

Loading…
Cancel
Save