判断爬到最后一页则结束爬虫

master
panqihua 5 years ago
parent 1e39d2fc72
commit 07c02ccf55
  1. main.py — 22 changed lines (17 additions, 5 deletions)

@@ -80,6 +80,11 @@ class CnmoCrawler(MobiePhoneCrawler):
self.threads.append(thread)
# 获取下一页链接
href = res_html.select_one(".pnext")["href"]
if len(href) == 0:
writeInfo('已经没有更多数据,爬虫程序将结束')
return
else:
next_page_url = 'http:{0}'.format(res_html.select_one(".pnext")["href"])
except Exception as e:
writeError("解析列表页出现异常信息:{0}".format(e))
@@ -89,9 +94,9 @@ class CnmoCrawler(MobiePhoneCrawler):
def run(self):
try:
self.get_page()
writeInfo('采集数据完毕,开始清洗脏数据')
except Exception as e:
writeError("采集数据出现异常,开始清洗脏数据")
writeInfo('采集数据完毕,开始清洗脏数据')
self.clear_data()
writeInfo('清洗脏数据完毕')
@@ -135,11 +140,13 @@ class CnmoCrawler(MobiePhoneCrawler):
writeInfo("开始解析手机\"{0}\"详细参数".format(phone_name))
# 参考价格
price = self.find_chinese(mobie_res_html, '参考价格')
if price is not None and price.find_next() is not None and "".join(filter(str.isdigit, price.find_next().text)).isdigit():
if price is not None and price.find_next() is not None and "".join(
filter(str.isdigit, price.find_next().text)).isdigit():
param_dict['参考价格'] = "".join(filter(str.isdigit, price.find_next().text))
# 电商报价
price = self.find_chinese(mobie_res_html, '电商报价')
if price is not None and price.next_sibling is not None and "".join(filter(str.isdigit, price.next_sibling.strip())).isdigit():
if price is not None and price.next_sibling is not None and "".join(
filter(str.isdigit, price.next_sibling.strip())).isdigit():
param_dict['电商报价'] = "".join(filter(str.isdigit, price.next_sibling().strip))
# 获取参数名
param_name_list = mobie_res_html.select('div.right>p')
@@ -158,7 +165,10 @@ class CnmoCrawler(MobiePhoneCrawler):
def save_mobie(self, mobie, ingore=False):
self.mobie_list.append(mobie)
writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list),max_count=self.max_count,num="{:.2%}".format(len(self.mobie_list)/self.max_count)))
writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list),
max_count=self.max_count,
num="{:.2%}".format(
len(self.mobie_list) / self.max_count)))
if not ingore and len(self.mobie_list) % self.data_size == 0:
self.save_excel(self.mobie_list[-self.data_size:])
elif ingore and len(self.mobie_list) % self.data_size != 0:
@@ -206,7 +216,8 @@ class CnmoCrawler(MobiePhoneCrawler):
val = source_ws.cell(row=current_row, column=current_column).value
if val is None or len(val) == 0 or (
'参考价格' in self.param_name_list and
current_column == list(self.param_name_list).index('参考价格')+1 and val == '曝光' or val == '即将上市'):
current_column == list(self.param_name_list).index(
'参考价格') + 1 and val in ['曝光','即将上市']):
for i in range(1, self.param_required_index + 1):
target_ws.cell(row=write_row, column=i, value='')
break
@@ -216,6 +227,7 @@ class CnmoCrawler(MobiePhoneCrawler):
write_row += 1
# 保存清洗结果
target_wb.save(self.file2)
target_wb.close()
def get_req(self, url, max_retries=3, **kwargs):
try:

Loading…
Cancel
Save