自动清理旧数据

解决频繁爬取导致的 requests 报错“Max retries exceeded with url”
5 years ago · e05b64c18d
parent 5dbd2c792b
commit e05b64c18d
2 changed files with 39 additions and 15 deletions
--- a/Test.py
+++ b/Test.py
@ -0,0 +1,16 @@
+import requests
+from bs4 import BeautifulSoup
+HEADERS = {
+    "X-Requested-With": "XMLHttpRequest",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
+                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
+}
+html=requests.get('https://jobs.51job.com/guangzhou/119982727.html?s=01&t=0',headers=HEADERS).content
+bs_tmp = BeautifulSoup(html, 'lxml').select(
+                    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
+bs_tmp1 = bs_tmp.select('h1')[0]
+bs_tmp2 = bs_tmp.select('strong')[0]
+bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
+bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
+tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
+       '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
--- a/job_spider.py
+++ b/job_spider.py
@ -4,6 +4,9 @@ from gevent import monkey
 from gevent.pool import Pool

 monkey.patch_all(select=False)
+
+from requests.adapters import HTTPAdapter
+from urllib3 import Retry
 import time
 import os
 import logging
@ -12,7 +15,6 @@ from queue import Queue
 from bs4 import BeautifulSoup


-
 def get_logger():
    """
    创建日志实例
@ -43,6 +45,11 @@ LOG_LEVEL = logging.INFO  # 日志等级
 POOL_MAXSIZE = 8  # 线程池最大容量

 logger = get_logger()
+session = requests.Session()
+retry = Retry(connect=3, backoff_factor=0.5)
+adapter = HTTPAdapter(max_retries=retry)
+session.mount('http://', adapter)
+session.mount('https://', adapter)


 class JobSpider:
@ -65,7 +72,7 @@ class JobSpider:
        urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
        for url in urls:
            logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content
+            html = session.get(url, headers=HEADERS).content
            bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
@ -83,9 +90,9 @@ class JobSpider:
        while True:
            # 从队列中取 url
            url = self.desc_url_queue.get()
-            resp = requests.get(url, headers=HEADERS)
+            resp = session.get(url, headers=HEADERS)
            if resp.status_code == 200:
-                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url,self.count))
+                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count))
                html = resp.content
                self.desc_url_queue.task_done()
                self.count += 1
@ -99,15 +106,15 @@ class JobSpider:
                bs_tmp2 = bs_tmp.select('strong')[0]
                bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-
-                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
-                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
-                           '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
-                    f.write((str(tmp) + '\n').encode('utf-8'))
-                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
-                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
-                    f.write(s)
+                if len(bs_tmp4) == 5:
+                    with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
+                        tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
+                               '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
+                        f.write((str(tmp) + '\n').encode('utf-8'))
+                    bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
+                    s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
+                    with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
+                        f.write(s)
            except Exception as e:
                logger.error(e)
                logger.warning(url)
@ -135,7 +142,7 @@ class JobSpider:
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # 主线程阻塞,等待队列清空

-    def clearDir(self,rootdir):
+    def clearDir(self, rootdir):
        filelist = os.listdir(rootdir)
        for f in filelist:
            filepath = os.path.join(rootdir, f)
@ -145,8 +152,9 @@ class JobSpider:
            elif os.path.isdir(filepath):
                shutil.rmtree(filepath, True)

+
 if __name__ == "__main__":
    spider = JobSpider()
    start = time.time()
    spider.run()
-    logger.info("总耗时 {} 秒".format(time.time() - start))
+    logger.info("总耗时 {} 秒".format(time.time() - start))