|
|
|
@ -95,6 +95,23 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
self.clear_data() |
|
|
|
|
writeInfo('清洗脏数据完毕') |
|
|
|
|
|
|
|
|
|
#获取价格标签数据 |
|
|
|
|
def find_chinese(self, html, text):
    """Find the first ``<span>`` whose text contains ``text``.

    The label is looked up as given first. If that finds nothing, it is
    looked up again in the form it takes when its GBK bytes are read as
    ISO-8859-1 (presumably for pages that were decoded with the wrong
    charset -- confirm against the crawled site).

    :param html: object exposing ``select_one(css_selector)``.
    :param text: label to search for.
    :return: the result of ``html.select_one`` for the first form that
        matches, or ``None`` when neither form is found.
    """
    selector = 'span:contains({0})'
    # Query once and reuse the result instead of re-running the selector.
    found = html.select_one(selector.format(text))
    if found is not None:
        return found
    try:
        garbled = text.encode('gbk').decode('iso8859-1')
    except UnicodeEncodeError:
        # Not representable in GBK, so no garbled form of it can exist.
        return None
    return html.select_one(selector.format(garbled))
|
|
|
|
|
|
|
|
|
def decode(self, text):
    """Repair text whose GBK bytes were decoded with a single-byte charset.

    Re-encodes ``text`` as ISO-8859-1, then as ISO-8859-9, and decodes the
    resulting bytes as GBK. The first round trip that succeeds is returned.

    :param text: string to repair; non-string values are returned unchanged.
    :return: the repaired string, or ``text`` itself when no round trip
        succeeds.
    """
    if not isinstance(text, str):
        # Keep the best-effort contract: anything that is not text passes
        # through untouched instead of raising.
        return text
    for source_charset in ('iso8859-1', 'iso8859-9'):
        try:
            return text.encode(source_charset).decode('gbk')
        except UnicodeError:
            # Only codec failures mean "try the next charset"; anything
            # else (e.g. KeyboardInterrupt) must propagate.
            continue
    return text
|
|
|
|
|
|
|
|
|
def get_mobie(self, base_url, param_url, **kwargs): |
|
|
|
|
# 字典存储手机详细参数 |
|
|
|
|
param_dict = {} |
|
|
|
@ -112,19 +129,23 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
if mobie_res is not None and mobie_res.status_code == 200: |
|
|
|
|
# 调用解析器解析网页请求体 |
|
|
|
|
try: |
|
|
|
|
mobie_res_html = BeautifulSoup(self.uzipData(mobie_res.content), 'html.parser') |
|
|
|
|
phone_name = mobie_res_html.select_one('#proName>a').text |
|
|
|
|
mobie_res_html = BeautifulSoup(mobie_res.content, 'html.parser') |
|
|
|
|
phone_name = self.decode(mobie_res_html.select_one('#proName>a').text) |
|
|
|
|
param_dict['手机名称'] = phone_name |
|
|
|
|
writeInfo("开始解析手机{0}详细参数".format(phone_name)) |
|
|
|
|
writeInfo("开始解析手机\"{0}\"详细参数".format(phone_name)) |
|
|
|
|
# 参考价格 |
|
|
|
|
param_dict['参考价格'] = mobie_res_html.select_one('span:contains(参考价格)').find_next().text |
|
|
|
|
price=self.find_chinese(mobie_res_html,'参考价格') |
|
|
|
|
if price is not None and price.find_next() is not None and "".join(filter(str.isdigit, price.find_next().text)).isdigit(): |
|
|
|
|
param_dict['参考价格'] = "".join(filter(str.isdigit, price.find_next().text)) |
|
|
|
|
# 电商报价 |
|
|
|
|
param_dict['电商报价'] = mobie_res_html.select_one('span:contains(电商报价)').next_sibling.strip() |
|
|
|
|
price=self.find_chinese(mobie_res_html,'电商报价') |
|
|
|
|
if price is not None and price.next_sibling is not None and "".join(filter(str.isdigit, price.next_sibling.strip())).isdigit(): |
|
|
|
|
param_dict['电商报价'] = "".join(filter(str.isdigit, price.next_sibling().strip)) |
|
|
|
|
# 获取参数名 |
|
|
|
|
param_name_list = mobie_res_html.select('div.right>p') |
|
|
|
|
for param_name in param_name_list: |
|
|
|
|
# 获取参数值 |
|
|
|
|
param_dict[param_name['paramname']] = param_name['paramvalue'] |
|
|
|
|
param_dict[param_name['paramname']] = self.decode(param_name['paramvalue']) |
|
|
|
|
# 获取锁,用于线程同步 |
|
|
|
|
self.threadLock.acquire() |
|
|
|
|
self.save_mobie(dict(param_dict, **kwargs)) |
|
|
|
@ -137,7 +158,7 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
|
|
|
|
|
def save_mobie(self, mobie, ingore=False): |
|
|
|
|
self.mobie_list.append(mobie) |
|
|
|
|
writeInfo("当前已爬取{0}台手机".format(len(self.mobie_list))) |
|
|
|
|
writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list),max_count=self.max_count,num="{:.2%}".format(len(self.mobie_list)/self.max_count))) |
|
|
|
|
if not ingore and len(self.mobie_list) % self.data_size == 0: |
|
|
|
|
self.save_excel(self.mobie_list[-self.data_size:]) |
|
|
|
|
elif ingore and len(self.mobie_list) % self.data_size != 0: |
|
|
|
|