master
潘啟华 5 years ago
parent 0031d4f9d3
commit f85ab74267
  1. 283
      main.py
  2. 9
      test.py

@ -1,298 +1,17 @@
import gzip
import json
import os
import random
import os
import re
import threading
import time
import zlib
import io
from typing import Optional, Callable, Any, Iterable, Mapping
import requests
import win32api
import win32con
from PIL import Image
from bs4 import BeautifulSoup
# 手机实体类
from openpyxl import load_workbook, Workbook
from pynput.mouse import Controller, Button
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome import webdriver
from selenium.webdriver.chrome.options import Options
from urllib3.exceptions import HeaderParsingError
from Crawler import MobilePhoneCrawler
from config.config import cf, config_path
from config.log import writeInfo, writeError
from bs4 import BeautifulSoup
import re
# Default request headers. The hard-coded cookie was captured from a
# logged-in browser session and is what authenticates the crawler's requests.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; lid=tb84443556; enc=Di1rYCRWDQZC0UbccH38rIzuBg8LLFGKPSeQNu0fJ6Atw1lfF%2BtBE6Jm3vKtkZ%2FcJwoY%2FA2OAFq1CCgzrB0Wmg%3D%3D; t=2ce95276273d2f0fec4b735114efb9f0; uc3=id2=UonSf2s8K7H57A%3D%3D&nk2=F5RNYQezF9ZVJA%3D%3D&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dByuPZuePp%2FK4exO4%3D; tracknick=tb84443556; uc4=nk4=0%40FY4Gtg6GE3gLVPH74U0sgDg9VVYt&id4=0%40UOE4tAnGHWIKt7PI5bS6f4noV%2Bbp; lgc=tb84443556; _tb_token_=e83ebbe73eeff; cookie2=170434fc3c4cae5b8d05c6ca3e035a7d; cna=3uL0FRGpQX4CAdoUCaGR7Dw2; UM_distinctid=16d0a0bd4f67e9-0657eeb9eeb382-5373e62-384000-16d0a0bd4f7aff; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _med=dw:2560&dh:1440&pw:2560&ph:1440&ist:0; cq=ccp%3D1; swfstore=175036; res=scroll%3A1613*5900-client%3A1613*924-offset%3A1613*5900-screen%3A2560*1440; CNZZDATA1256793290=2086205104-1567833369-%7C1567844169; pnm_cku822=098%23E1hvKpvUvbpvUpCkvvvvvjiPRFS96jtjn2MWsjD2PmPp1jr8RLqwtjY8RFdUsjtbRpGCvvLMMQvvmphvLhbyi9mFecXPjV5vsEe4jVDQpGoHbdiQpznCwaZOecXPjV5vsEe4jVDQpYLhbdiQpzwBwaZOecEvjV5vsCkwjVDQpGFvbdiQpzvxwa7ivpvUphvhrpcsXvmEvpvVpyUUCE%2BfKphv8hCvCbyvvh89phvOJpvvpYYvpm9vvvCHtZCv8vvvvhcDphvOKpvvBHvCvpvVphhvvvvvRphvChCvvvm5vpvhphvhHv%3D%3D; isg=BHBwqfKM2QBI7oUELIuAZ-OhQT7CuVQDVfnti2rBAUuPJRHPEs91k6SXeW3gtQzb; l=cBO-RSocq17jyErCBOCZlurza77TvIRAguPzaNbMi_5QN1TQITQOkrDBxe96cjWdtj8B4JuaUMv9-etuigILNzGHtBUV.'
}
# Split the raw Cookie header into name/value dicts (the shape expected by
# Selenium's driver.add_cookie).
cookies = headers['cookie'].split(';')
cookie_list = []
for cookie in cookies:
    # BUG FIX: split on the FIRST '=' only — values such as 'uc4' contain
    # '=' characters themselves, which the old split('=')[1] truncated.
    # Also strip the space left behind by the '; ' separator so cookie
    # names don't carry a leading blank.
    name, _, value = cookie.strip().partition('=')
    cookie_list.append({'name': name, 'value': value})
# Tmall mobile-phone crawler
# Entry listing page:
# https://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&click_id=%CA%D6%BB%FA&from=mallfp..pc_1.7_hq&spm=875.7931836%2FB.a1z5h.8.66144265MD9ShM
class TmallCrawler(MobilePhoneCrawler):
    def __init__(self) -> None:
        """Create the crawler and immediately start crawling.

        NOTE(review): instantiating this class has network side effects —
        the constructor kicks off the whole paginated crawl via
        ``self.get_page()``.
        """
        super().__init__()
        # One shared HTTP session so cookies persist across all requests.
        self.session = requests.Session()
        # QR-code login flow (currently disabled; relies on the hard-coded
        # module-level cookie instead).
        # self.login()
        self.get_page()
    def login(self):
        """Log in to Tmall by Taobao QR-code scan.

        Flow: fetch a QR-code descriptor, save and display the QR image,
        prompt the user to scan it with the Taobao phone app, poll the scan
        status, then follow the redirect / identity-verification chain until
        the session cookies are established.

        Raises:
            Exception: when any step returns a non-200 status, or when the
                image captcha ("what did you buy recently") is triggered.
        """
        # Fetch the QR-code descriptor; the response is JSON wrapped in a
        # callback, so slice out the {...} part before parsing.
        login_url = 'https://qrlogin.taobao.com/qrcodelogin/generateQRCode4Login.do?from=tmall&appkey=00000000&umid_token=T4C16243DC287A311CA928E0D5EA177D443B864009178BBAA55A4CB86A4'
        writeInfo(login_url)
        login_res = self.session.get(login_url)
        res_content = login_res.content.decode()
        res_json = json.loads(res_content[res_content.index("{"):res_content.index("}") + 1])
        writeInfo(json.dumps(res_json, indent=1))
        img_url = res_json["url"]
        # The descriptor's URL is protocol-relative; prefix it explicitly.
        img_res = self.session.get("http:%s" % img_url)
        if img_res.status_code == 200:
            img_name = 'login.png'
            # Save the QR-code image to disk
            with open(img_name, 'wb') as file:
                file.write(img_res.content)
            # Show the QR code to the user
            Image.open(img_name).show()
            win32api.MessageBox(0, "请打开手机淘宝扫描二维码", "提醒", win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
            # Poll the scan status roughly once per second.
            while True:
                login_url = "https://qrlogin.taobao.com/qrcodelogin/qrcodeLoginCheck.do?lgToken={0}&defaulturl=https%3A%2F%2Fwww.tmall.com".format(
                    res_json['lgToken'])
                writeInfo("login_url:{0}".format(login_url))
                check_login_res = self.session.get(login_url)
                # Check the scan result
                if check_login_res.status_code == 200:
                    check_login_res_json = json.loads(check_login_res.content.decode())
                    writeInfo(json.dumps(check_login_res_json, indent=1))
                    if check_login_res_json['code'] == '10006':
                        # Code 10006: scan confirmed — follow the login URL.
                        check_login_url = check_login_res_json['url']
                        writeInfo("check_login_url={0}".format(check_login_url))
                        login_res = self.session.get(check_login_url)
                        if login_res.status_code == 200:
                            # The redirect page embeds the identity-verification
                            # target in an iframe.
                            login_res_html = BeautifulSoup(login_res.content, 'html.parser')
                            check_url = login_res_html.select_one("iframe")["src"]
                            writeInfo("check_url={0}".format(check_url))
                            # Identity-verification page
                            check_login_res = self.session.get(check_url)
                            if check_login_res.status_code == 200:
                                check_login_res_content = check_login_res.content.decode()
                                # "Alibaba Group | identity verification":
                                # scrape the verify_modes URL out of the page.
                                verify_modes_url = re.search("http.*verify_modes.*=",
                                                             check_login_res_content).group() + '1'
                                verify_modes_res = self.session.get(verify_modes_url)
                                if verify_modes_res.status_code == 200:
                                    verify_modes_res_content = verify_modes_res.content.decode()
                                    if '你最近购买过什么商品' in verify_modes_res_content:
                                        # Image captcha triggered — cannot proceed automatically.
                                        raise Exception("触发图片验证,模拟请求失败")
                                    else:
                                        win32api.MessageBox(0, "请在手机淘宝上点击确认按钮登录", "提醒",
                                                            win32con.MB_ICONWARNING | win32con.MB_SYSTEMMODAL)
                                        # Extract the htoken used to poll the
                                        # phone-side confirmation status.
                                        htoken = re.search("htoken\".*[a-zA-Z]", verify_modes_res_content).group()
                                        htoken = htoken[htoken.index(":") + 2:]
                                        while True:
                                            time.sleep(1)
                                            check_status_res = self.session.get(
                                                "https://passport.taobao.com/iv/onekey/check_status.do?htoken={0}".format(
                                                    htoken))
                                            if check_status_res.status_code == 200:
                                                check_status_res_json = json.loads(check_status_res.content.decode())
                                                if check_status_res_json['content']['code'] == '1':
                                                    # Code '1': user confirmed on the phone.
                                                    login_safe_res = self.session.get(
                                                        check_status_res_json['content']['url'])
                                                    if login_safe_res.status_code == 200:
                                                        # login_safe_res_content=login_safe_res.content.decode(login_safe_res.apparent_encoding)
                                                        # login_safe_href=re.search("https.*pass.tmall.com.*\w",login_safe_res_content).group()
                                                        # index_res = self.session.get(login_safe_href)
                                                        writeInfo("登录成功")
                                                        break
                                                    else:
                                                        raise Exception("模拟登陆请求失败!!!")
                                                else:
                                                    # Not confirmed yet — log and keep polling.
                                                    writeInfo(json.dumps(check_status_res_json, indent=1))
                                            else:
                                                raise Exception("模拟登陆请求失败!!!")
                                    # Verification finished — leave the outer QR polling loop.
                                    break
                                else:
                                    raise Exception("模拟登陆请求失败!!!")
                            else:
                                raise Exception("模拟登陆请求失败!!!")
                        else:
                            raise Exception("模拟登陆请求失败!!!")
                    elif check_login_res_json['code'] == '10004':
                        # Code 10004: QR code expired — regenerate and retry.
                        # NOTE(review): this recursion keeps polling with the
                        # stale lgToken afterwards — confirm intended.
                        self.login()
                # Throttle the polling loop.
                time.sleep(1)
        else:
            raise Exception("获取登陆二维码图片失败")
'''
获取分页数据
url:分页url
'''
def get_page(self):
# 商品列表页地址
domain = "https://list.tmall.com/search_product.htm"
url = '{0}?q=%CA%D6%BB%FA&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&xl=shouji_1&from=mallfp..pc_1_suggest'.format(
domain)
while True:
# 获取分页响应数据
res = self.session.get(url, headers=headers)
# 判断响应状态码200才做处理
if res.status_code == 200:
try:
# 使用BeautifulSoup解析html
res_html = BeautifulSoup(res.content, 'html.parser')
# 验证码检测
if 'security-X5' == res_html.select_one("title").text:
self.clickCaptcha(url)
# 获取当前页
current_page = res_html.select_one("b[class=ui-page-cur]")
writeInfo("开始解析第{0}页的数据,url:{1}".format(current_page.text, url))
# 获取商品列表里的每个超链接
product_hrefs = res_html.select("#J_ItemList .productTitle>a")
for product_href in product_hrefs:
# 轮询超链接获取商品详情数据
self.get_mobile("https:{0}".format(product_href['href']))
# 超过指定数据量结束循环
if len(self.mobile_list) == self.max_count:
break
except Exception as e:
writeError(e)
else:
writeError("获取分页信息失败,url:%s响应状态码:%d" % (url, res.status_code))
url = "{0}{1}".format(domain, current_page.find_next_siblings()[0]['href'])
    def clickCaptcha(self, url):
        """Solve the 'security-X5' slider captcha with a real browser.

        Launches Chrome via Selenium, opens *url*, and drags the slider
        across the track with the OS-level mouse (pynput) so the move looks
        human. Retries itself with a fresh browser when the page reports an
        error or the slider elements are missing.

        Raises:
            Exception: when browser start-up or the drag simulation fails.
        """
        try:
            chrome_options = Options()
            chrome_options.binary_location = cf.get('selenium', 'binary_location')
            # Allow running as root
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            # chrome_options.add_argument('--headless')
            # User data (profile) directory
            chrome_options.add_argument('--user-data-dir={0}'.format(cf.get('selenium', 'user_data_dir')))
            # Skip image loading
            chrome_options.add_argument('blink-settings=imagesEnabled=false')
            # Disable GPU acceleration
            chrome_options.add_argument('--disable-gpu')
            # Maximized window
            chrome_options.add_argument('--start-maximized')
            # Fullscreen mode
            chrome_options.add_argument('start-fullscreen')
            # Developer mode: hide the automation banner so Selenium is not detected
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            driver = webdriver.WebDriver(options=chrome_options, service_log_path="I:\ChromeUpdater\selenium.log")
            # driver.set_window_rect(0,0,1024,768)
            # Open the slider-captcha page
            driver.get(url)
            try:
                # The slider knob
                nc_1_n1z = driver.find_element_by_css_selector("#nc_1_n1z")
                # The slider track
                nc_1__scale_text = driver.find_element_by_css_selector("#nc_1__scale_text")
                # OS-level mouse controller
                mouse = Controller()
                # Centre of the slider knob
                x = nc_1_n1z.rect['x'] + nc_1_n1z.rect['width'] / 2
                y = nc_1_n1z.rect['y'] + nc_1_n1z.rect['height'] / 2
                mouse.position = (x, y)
                time.sleep(0.5)
                mouse.press(Button.left)
                time.sleep(0.5)
                # Drag across the full track width in one move
                mouse.move(x + nc_1__scale_text.rect['width'] - nc_1_n1z.rect['width'], y)
                time.sleep(0.5)
                mouse.release(Button.left)
                # NOTE(review): this loop only exits via the error branch (which
                # restarts the whole flow) — the get_cookies()/quit() lines below
                # appear unreachable on the success path; confirm intended.
                while True:
                    if len(driver.find_elements_by_css_selector(".errloading")) > 0:
                        # Verification failed — restart with a fresh browser
                        driver.quit()
                        self.clickCaptcha(url)
                        break
                    else:
                        pass
                        # sub_slide_width = random.randint(30, 50)
                        # action.move_by_offset(sub_slide_width, 0).perform()  # move the slider
                        # start += sub_slide_width
                    # Randomized polling delay (0.1–1.0 s)
                    time.sleep(random.randint(1, 10) / 10)
                cookie_list = driver.get_cookies()
                # Close the browser
                driver.quit()
            except NoSuchElementException as e:
                # Slider elements absent — retry from scratch
                writeError(e)
                driver.quit()
                self.clickCaptcha(url)
        except Exception as e:
            writeError(e)
            raise Exception("模拟滑动验证失败")
'''
获取手机详情数据
url手机链接
'''
def get_mobile(self, url, param_url=None, **kwargs):
res = self.session.get(url)
if res.status_code == 200:
res_html = BeautifulSoup(res.content, 'html.parser')
# 验证码检测
if 'security-X5' == res_html.select_one("title").text:
self.clickCaptcha(url)
# 获取手机规格参数
# 判断手机是否有规格参数
if res_html.select_one("#J_Attrs") is None:
writeInfo("手机详情url:%s没有规格参数" % url)
else:
try:
ths = res_html.select("table:contains('规格参数') tbody>tr:not([class='tm-tableAttrSub']) th")
# 轮询规格参数表格里的每一行参数
mobile_dict = {}
for th in ths:
if 'colspan' in th.attrs:
continue
# 字典存储规格参数
key = str(th.text).strip()
value = str(th.next_sibling.text).strip()
mobile_dict[key] = value
# 存放到列表里
self.mobile_list.append(mobile_dict)
writeInfo("添加手机:{0}信息".format(str(res_html.select_one("div[class=tb-detail-hd]>h1").text).strip()))
except Exception as e:
writeError(e)
else:
writeError("手机url:%s响应状态码:%d" % (url, res.status_code))
# 保存手机数据
def save_mobile(self, mobile):
self.mobile_list.append(mobile)
# 评测中心手机爬虫

@ -1,14 +1,7 @@
from main import CnmoCrawler
import requests
from main import TmallCrawler, CnmoCrawler
def abc():
    """Placeholder left over from manual testing; intentionally does nothing."""
    pass
if __name__ == '__main__':
    # Tmall crawler smoke test (disabled)
    # TmallCrawler().get_page()
    # Review-centre (CNMO) crawler test
    CnmoCrawler().run()
    # print(int(300/100))
Loading…
Cancel
Save