|
|
|
@ -80,7 +80,12 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
self.threads.append(thread) |
|
|
|
|
|
|
|
|
|
# 获取下一页链接 |
|
|
|
|
next_page_url = 'http:{0}'.format(res_html.select_one(".pnext")["href"]) |
|
|
|
|
href = res_html.select_one(".pnext")["href"] |
|
|
|
|
if len(href) == 0: |
|
|
|
|
writeInfo('已经没有更多数据,爬虫程序将结束') |
|
|
|
|
return |
|
|
|
|
else: |
|
|
|
|
next_page_url = 'http:{0}'.format(res_html.select_one(".pnext")["href"]) |
|
|
|
|
except Exception as e: |
|
|
|
|
writeError("解析列表页出现异常信息:{0}".format(e)) |
|
|
|
|
else: |
|
|
|
@ -89,22 +94,22 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
def run(self):
    """Crawl the list pages, then clean the collected data.

    Calls ``self.get_page()`` to collect data. Any exception it raises is
    logged rather than propagated, so ``self.clear_data()`` still runs
    afterwards in either case.
    """
    try:
        self.get_page()
    except Exception as e:
        # Best effort: report the failure (with its cause) and fall through
        # to the cleaning step instead of aborting.
        writeError("采集数据出现异常,开始清洗脏数据:{0}".format(e))
    else:
        # Logged exactly once, and only when crawling finished without error.
        writeInfo('采集数据完毕,开始清洗脏数据')
    self.clear_data()
    writeInfo('清洗脏数据完毕')
|
|
|
|
|
|
|
|
|
# Locate a label <span> (e.g. a price label) by the text it contains.
def find_chinese(self, html, text):
    """Return the first ``<span>`` in *html* whose text contains *text*.

    The selector ``span:contains(<text>)`` is tried with *text* as given.
    If nothing matches, it is retried with *text* encoded as GBK and decoded
    as ISO-8859-1 -- presumably to match pages whose GBK bytes were decoded
    with the wrong charset (TODO confirm against the fetch code).

    :param html: object exposing ``select_one(css_selector)``; presumably a
        BeautifulSoup document or tag.
    :param text: label text to search for.
    :return: whatever ``select_one`` returns for the first selector that
        matches, or the result of the fallback query (``None`` if that one
        does not match either).
    """
    selector = 'span:contains({0})'
    node = html.select_one(selector.format(text))
    if node is not None:
        # Reuse the first hit instead of running the same query a second time.
        return node
    garbled = text.encode('gbk').decode('iso8859-1')
    return html.select_one(selector.format(garbled))
|
|
|
|
|
|
|
|
|
def decode(self,text): |
|
|
|
|
encoding=['iso8859-1','iso8859-9'] |
|
|
|
|
decoding='gbk' |
|
|
|
|
def decode(self, text): |
|
|
|
|
encoding = ['iso8859-1', 'iso8859-9'] |
|
|
|
|
decoding = 'gbk' |
|
|
|
|
for i in encoding: |
|
|
|
|
try: |
|
|
|
|
return text.encode(i).decode(decoding) |
|
|
|
@ -134,12 +139,14 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
param_dict['手机名称'] = phone_name |
|
|
|
|
writeInfo("开始解析手机\"{0}\"详细参数".format(phone_name)) |
|
|
|
|
# 参考价格 |
|
|
|
|
price=self.find_chinese(mobie_res_html,'参考价格') |
|
|
|
|
if price is not None and price.find_next() is not None and "".join(filter(str.isdigit, price.find_next().text)).isdigit(): |
|
|
|
|
price = self.find_chinese(mobie_res_html, '参考价格') |
|
|
|
|
if price is not None and price.find_next() is not None and "".join( |
|
|
|
|
filter(str.isdigit, price.find_next().text)).isdigit(): |
|
|
|
|
param_dict['参考价格'] = "".join(filter(str.isdigit, price.find_next().text)) |
|
|
|
|
# 电商报价 |
|
|
|
|
price=self.find_chinese(mobie_res_html,'电商报价') |
|
|
|
|
if price is not None and price.next_sibling is not None and "".join(filter(str.isdigit, price.next_sibling.strip())).isdigit(): |
|
|
|
|
price = self.find_chinese(mobie_res_html, '电商报价') |
|
|
|
|
if price is not None and price.next_sibling is not None and "".join( |
|
|
|
|
filter(str.isdigit, price.next_sibling.strip())).isdigit(): |
|
|
|
|
param_dict['电商报价'] = "".join(filter(str.isdigit, price.next_sibling().strip)) |
|
|
|
|
# 获取参数名 |
|
|
|
|
param_name_list = mobie_res_html.select('div.right>p') |
|
|
|
@ -158,7 +165,10 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
|
|
|
|
|
def save_mobie(self, mobie, ingore=False): |
|
|
|
|
self.mobie_list.append(mobie) |
|
|
|
|
writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list),max_count=self.max_count,num="{:.2%}".format(len(self.mobie_list)/self.max_count))) |
|
|
|
|
writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list), |
|
|
|
|
max_count=self.max_count, |
|
|
|
|
num="{:.2%}".format( |
|
|
|
|
len(self.mobie_list) / self.max_count))) |
|
|
|
|
if not ingore and len(self.mobie_list) % self.data_size == 0: |
|
|
|
|
self.save_excel(self.mobie_list[-self.data_size:]) |
|
|
|
|
elif ingore and len(self.mobie_list) % self.data_size != 0: |
|
|
|
@ -197,7 +207,7 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
source_wb = load_workbook(self.file1) |
|
|
|
|
source_ws = source_wb.active |
|
|
|
|
# 清洗脏数据后的新报表 |
|
|
|
|
self.init_excel(self.file2,max_index=self.param_required_index) |
|
|
|
|
self.init_excel(self.file2, max_index=self.param_required_index) |
|
|
|
|
target_wb = load_workbook(self.file2) |
|
|
|
|
target_ws = target_wb.active |
|
|
|
|
write_row = 2 |
|
|
|
@ -206,7 +216,8 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
val = source_ws.cell(row=current_row, column=current_column).value |
|
|
|
|
if val is None or len(val) == 0 or ( |
|
|
|
|
'参考价格' in self.param_name_list and |
|
|
|
|
current_column == list(self.param_name_list).index('参考价格')+1 and val == '曝光' or val == '即将上市'): |
|
|
|
|
current_column == list(self.param_name_list).index( |
|
|
|
|
'参考价格') + 1 and val in ['曝光','即将上市']): |
|
|
|
|
for i in range(1, self.param_required_index + 1): |
|
|
|
|
target_ws.cell(row=write_row, column=i, value='') |
|
|
|
|
break |
|
|
|
@ -216,6 +227,7 @@ class CnmoCrawler(MobiePhoneCrawler): |
|
|
|
|
write_row += 1 |
|
|
|
|
# 保存清洗结果 |
|
|
|
|
target_wb.save(self.file2) |
|
|
|
|
target_wb.close() |
|
|
|
|
|
|
|
|
|
def get_req(self, url, max_retries=3, **kwargs): |
|
|
|
|
try: |
|
|
|
|