From 5dbd2c792bf542733aa723068cb00c1b6d913d78 Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Tue, 30 Jun 2020 18:33:16 +0800
Subject: [PATCH] Reset the data directory on each run and pass raw bytes to
 BeautifulSoup

---
 job_spider.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/job_spider.py b/job_spider.py
index 150cf43..a52b57d 100644
--- a/job_spider.py
+++ b/job_spider.py
@@ -1,3 +1,5 @@
+import shutil
+
 from gevent import monkey
 from gevent.pool import Pool
 
@@ -9,8 +11,6 @@ import requests
 from queue import Queue
 from bs4 import BeautifulSoup
 
-# enable multithreading
-monkey.patch_all()
 
 
 def get_logger():
@@ -49,6 +49,7 @@ class JobSpider:
     """ Job site crawler class """
+    job_dir = 'data'
 
     def __init__(self):
         self.count = 1  # index of the record currently being crawled
@@ -64,7 +65,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content.decode("gbk")
+            html = requests.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
@@ -84,8 +85,8 @@
             url = self.desc_url_queue.get()
             resp = requests.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("爬取第 {} 条岗位详情".format(self.count))
-                html = resp.content.decode("gbk")
+                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count))
+                html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
             else:
@@ -98,13 +99,14 @@
                 bs_tmp2 = bs_tmp.select('strong')[0]
                 bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                 bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-                with open('data/岗位信息.txt', 'ab+') as f:
+
+                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                     tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0], '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                     f.write((str(tmp) + '\n').encode('utf-8'))
                 bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                 s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                     f.write(s)
             except Exception as e:
                 logger.error(e)
@@ -121,6 +123,11 @@
             self.pool.apply_async(target)
 
     def run(self):
         """
         Crawl data with multiple threads
         """
+        if os.path.exists(self.job_dir):
+            self.clear_dir(self.job_dir)
+        else:
+            os.mkdir(self.job_dir)
+
@@ -128,9 +135,18 @@
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # block the main thread until the queue is empty
 
+    def clear_dir(self, rootdir):
+        filelist = os.listdir(rootdir)
+        for f in filelist:
+            filepath = os.path.join(rootdir, f)
+            if os.path.isfile(filepath):
+                os.remove(filepath)
+                logger.info("{} removed".format(filepath))
+            elif os.path.isdir(filepath):
+                shutil.rmtree(filepath, ignore_errors=True)
 
 if __name__ == "__main__":
     spider = JobSpider()
     start = time.time()
     spider.run()
-    logger.info("总耗时 {} 秒".format(time.time() - start))
+    logger.info("总耗时 {} 秒".format(time.time() - start))
\ No newline at end of file
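
A note on the gevent side of this change: the patch removes monkey.patch_all(), but JobSpider still schedules its workers on gevent.pool.Pool. Without the patch step, the blocking requests calls never yield to other greenlets, so the pool effectively fetches one page at a time. A minimal sketch of the pattern the pool depends on; the pool size and URLs here are assumptions, not values from the patch:

    # Sketch only: gevent must patch the socket layer *before* requests is
    # imported, otherwise its blocking HTTP calls cannot yield to other
    # greenlets and the pool degrades to serial execution.
    from gevent import monkey
    monkey.patch_all()

    import requests
    from gevent.pool import Pool

    pool = Pool(8)  # pool size is an assumption

    def fetch(url):
        # each greenlet yields while waiting on the network
        return url, requests.get(url, timeout=10).status_code

    # placeholder URLs; the spider builds its own list from START_URL
    for url, status in pool.imap_unordered(fetch, ["https://example.com"] * 4):
        print(url, status)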
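
On the encoding change: dropping .decode("gbk") hands raw bytes to BeautifulSoup, which then sniffs the encoding from the document itself (meta charset or BOM). If a page declares its charset incorrectly, decoding explicitly from requests' apparent_encoding is one possible fallback; a sketch, with a placeholder URL:

    # Sketch: decode explicitly instead of letting BeautifulSoup guess.
    # apparent_encoding is itself a statistical guess, so treat this as a
    # fallback, not a guarantee.
    import requests
    from bs4 import BeautifulSoup

    resp = requests.get("https://example.com")  # placeholder URL
    resp.encoding = resp.apparent_encoding      # e.g. detects gbk/gb2312 pages
    soup = BeautifulSoup(resp.text, "lxml")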
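
clear_dir() deletes entries one by one, which keeps the directory itself (its inode and permissions) in place between runs. If that property is not needed, the whole reset in run() collapses to two standard-library calls; a sketch under that assumption, using a hypothetical reset_dir helper:

    import os
    import shutil

    def reset_dir(path):
        # remove the tree if present, then recreate it empty
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    reset_dir("data")  # "data" mirrors JobSpider.job_dir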
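
One more observation from the context lines rather than the hunks themselves: 岗位信息.txt stores each record as str(dict), which is awkward to parse back reliably. If the file format is free to change, writing one JSON object per line is a common alternative; a sketch with illustrative field values only:

    import json

    record = {"岗位": "Python工程师", "薪资": "10k-15k"}  # illustrative values
    with open("data/岗位信息.txt", "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")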