diff --git a/chromedriver.exe b/chromedriver.exe deleted file mode 100644 index 0eedbe0..0000000 Binary files a/chromedriver.exe and /dev/null differ diff --git a/config/config.py b/config/config.py index d7534fe..fdbaeb9 100644 --- a/config/config.py +++ b/config/config.py @@ -13,6 +13,7 @@ logFile = cf.get('file', 'logFile') logger=logging.getLogger() logger.setLevel(logging.INFO) def init(): + logging.getLogger("urllib3").setLevel(logging.ERROR) log_format=logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s",datefmt='%Y-%m-%d %H:%M:%S') # 在控制台打印日志 streamHandler = logging.StreamHandler() diff --git a/main.py b/main.py index ba74c7e..fe568c5 100644 --- a/main.py +++ b/main.py @@ -95,6 +95,23 @@ class CnmoCrawler(MobiePhoneCrawler): self.clear_data() writeInfo('清洗脏数据完毕') + #获取价格标签数据 + def find_chinese(self,html,text): + if html.select_one('span:contains({0})'.format(text)) is None: + return html.select_one('span:contains({0})'.format(text.encode('gbk').decode('iso8859-1'))) + else: + return html.select_one('span:contains({0})'.format(text)) + + def decode(self,text): + encoding=['iso8859-1','iso8859-9'] + decoding='gbk' + for i in encoding: + try: + return text.encode(i).decode(decoding) + except: + pass + return text + def get_mobie(self, base_url, param_url, **kwargs): # 字典存储手机详细参数 param_dict = {} @@ -112,19 +129,23 @@ class CnmoCrawler(MobiePhoneCrawler): if mobie_res is not None and mobie_res.status_code == 200: # 调用解析器解析网页请求体 try: - mobie_res_html = BeautifulSoup(self.uzipData(mobie_res.content), 'html.parser') - phone_name = mobie_res_html.select_one('#proName>a').text + mobie_res_html = BeautifulSoup(mobie_res.content, 'html.parser') + phone_name = self.decode(mobie_res_html.select_one('#proName>a').text) param_dict['手机名称'] = phone_name - writeInfo("开始解析手机{0}详细参数".format(phone_name)) + writeInfo("开始解析手机\"{0}\"详细参数".format(phone_name)) # 参考价格 - param_dict['参考价格'] = mobie_res_html.select_one('span:contains(参考价格)').find_next().text + 
price=self.find_chinese(mobie_res_html,'参考价格') + if price is not None and price.find_next() is not None and "".join(filter(str.isdigit, price.find_next().text)).isdigit(): + param_dict['参考价格'] = "".join(filter(str.isdigit, price.find_next().text)) # 电商报价 - param_dict['电商报价'] = mobie_res_html.select_one('span:contains(电商报价)').next_sibling.strip() + price=self.find_chinese(mobie_res_html,'电商报价') + if price is not None and price.next_sibling is not None and "".join(filter(str.isdigit, price.next_sibling.strip())).isdigit(): + param_dict['电商报价'] = "".join(filter(str.isdigit, price.next_sibling.strip())) # 获取参数名 param_name_list = mobie_res_html.select('div.right>p') for param_name in param_name_list: # 获取参数值 - param_dict[param_name['paramname']] = param_name['paramvalue'] + param_dict[param_name['paramname']] = self.decode(param_name['paramvalue']) # 获取锁,用于线程同步 self.threadLock.acquire() self.save_mobie(dict(param_dict, **kwargs)) @@ -137,7 +158,7 @@ class CnmoCrawler(MobiePhoneCrawler): def save_mobie(self, mobie, ingore=False): self.mobie_list.append(mobie) - writeInfo("当前已爬取{0}台手机".format(len(self.mobie_list))) + writeInfo("目前爬虫进度,爬取数据量/目标数据量={current}/{max_count}({num})".format(current=len(self.mobie_list),max_count=self.max_count,num="{:.2%}".format(len(self.mobie_list)/self.max_count))) if not ingore and len(self.mobie_list) % self.data_size == 0: self.save_excel(self.mobie_list[-self.data_size:]) elif ingore and len(self.mobie_list) % self.data_size != 0: diff --git a/requirements.txt b/requirements.txt index 6eae377..18bebc8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,11 +6,6 @@ et-xmlfile==1.0.1 idna==2.8 jdcal==1.4.1 openpyxl==2.6.3 -Pillow==6.1.0 -pynput==1.4.2 -pywin32==224 requests==2.22.0 -selenium==3.141.0 -six==1.12.0 soupsieve==1.9.3 urllib3==1.25.3 diff --git a/test.py b/test.py index 06260f5..0917c20 100644 --- a/test.py +++ b/test.py @@ -1,3 +1,4 @@ +from config.log import writeInfo from main import CnmoCrawler