|
|
@@ -4,6 +4,9 @@ from gevent import monkey
 from gevent.pool import Pool
 
 monkey.patch_all(select=False)
+from requests.adapters import HTTPAdapter
+from urllib3 import Retry
+
 import time
 import os
 import logging
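
For context on the import ordering above: gevent's monkey patching is conventionally done before importing network libraries, so nothing binds the blocking stdlib sockets first. A minimal standalone sketch of that ordering (assuming gevent and requests are installed):

    from gevent import monkey

    # Patch the stdlib first, so code imported afterwards picks up the
    # cooperative, greenlet-aware socket implementation.
    monkey.patch_all(select=False)  # select=False leaves the select module unpatched

    import requests  # sockets created by requests are now non-blocking under gevent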
|
|
@@ -12,7 +15,6 @@ from queue import Queue
 from bs4 import BeautifulSoup
 
 
 def get_logger():
     """
     Create a logger instance.
|
|
@@ -43,6 +45,11 @@ LOG_LEVEL = logging.INFO  # log level
 POOL_MAXSIZE = 8  # maximum size of the worker pool
 
 logger = get_logger()
 
+session = requests.Session()
+retry = Retry(connect=3, backoff_factor=0.5)
+adapter = HTTPAdapter(max_retries=retry)
+session.mount('http://', adapter)
+session.mount('https://', adapter)
 
 class JobSpider:
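
The block added above gives every request made through session automatic connection retries. With connect=3 and backoff_factor=0.5, urllib3 retries a failed connection attempt up to three times, sleeping roughly 0.5 s, 1 s, 2 s between tries (the exact schedule depends on the urllib3 version). A minimal usage sketch of the same pattern, with a made-up URL:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3 import Retry

    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)  # retry connection errors only
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)   # applies to all http:// URLs
    session.mount('https://', adapter)  # applies to all https:// URLs

    # Retries happen transparently inside this call; only after the retry
    # budget is exhausted does requests raise ConnectionError.
    resp = session.get('https://example.com', timeout=10)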
|
|
@@ -65,7 +72,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content
+            html = session.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
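
For readers unfamiliar with the chained lookup in this hunk: find() returns the first matching tag and find_all() returns every match inside it. A standalone sketch of the same traversal, with made-up HTML standing in for the job list:

    from bs4 import BeautifulSoup

    html = """
    <div class="dw_table">
      <div class="el"><a title="job A" href="https://example.com/a">A</a></div>
      <div class="el"><a title="job B" href="https://example.com/b">B</a></div>
    </div>
    """

    # Mirror the spider: locate the listing table, then iterate its rows.
    rows = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
    for row in rows:
        link = row.find("a")
        print(link["title"], link["href"])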
|
|
@@ -83,9 +90,9 @@ class JobSpider:
         while True:
             # take a url from the queue
             url = self.desc_url_queue.get()
-            resp = requests.get(url, headers=HEADERS)
+            resp = session.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url,self.count))
+                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count))
                 html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
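
The loop above follows the standard Queue consumer contract: every get() must eventually be matched by a task_done() so that the join() call in the scheduler (see a later hunk) can unblock. A stripped-down, thread-based sketch of that contract (the spider itself runs the workers in gevent greenlets):

    import threading
    from queue import Queue

    q = Queue()
    for url in ["https://example.com/a", "https://example.com/b"]:
        q.put(url)

    def worker():
        while True:
            url = q.get()      # blocks until an item is available
            try:
                print("processing", url)
            finally:
                q.task_done()  # must pair with every get() for join() to return

    threading.Thread(target=worker, daemon=True).start()
    q.join()  # returns once every queued item has been marked task_done()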
|
|
@@ -99,7 +106,7 @@ class JobSpider:
                     bs_tmp2 = bs_tmp.select('strong')[0]
                     bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                     bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
 
                     if len(bs_tmp4) == 5:
                         with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                             tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                                    '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
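
A note on the 'ab+' mode in this hunk: binary append means the record has to be serialized to bytes before writing. The write call itself falls outside the captured lines, so the following is only an assumed sketch of that step, using json for the serialization:

    import json

    tmp = {"岗位": "Python", "薪资": "15-25k"}  # illustrative record, not real data
    with open("岗位信息.txt", "ab+") as f:
        # binary mode: encode the serialized record ourselves
        f.write(json.dumps(tmp, ensure_ascii=False).encode("utf-8") + b"\n")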
|
|
@@ -135,7 +142,7 @@ class JobSpider:
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # block the main thread until the queue is drained
 
-    def clearDir(self,rootdir):
+    def clearDir(self, rootdir):
         filelist = os.listdir(rootdir)
         for f in filelist:
             filepath = os.path.join(rootdir, f)
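
clearDir empties a directory without deleting the directory itself; note that the second positional argument to shutil.rmtree is ignore_errors. The isfile branch is not visible in the captured hunks, so this standalone equivalent is partly assumed:

    import os
    import shutil

    def clear_dir(rootdir):
        # Remove every file and subdirectory under rootdir, keeping rootdir.
        for name in os.listdir(rootdir):
            path = os.path.join(rootdir, name)
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path, True)  # True == ignore_errors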
|
|
@@ -145,6 +152,7 @@ class JobSpider:
             elif os.path.isdir(filepath):
                 shutil.rmtree(filepath, True)
 
 
 if __name__ == "__main__":
     spider = JobSpider()
     start = time.time()
|
|
|