@@ -1,3 +1,5 @@
+import shutil
+
 from gevent import monkey
 from gevent.pool import Pool
 
@@ -9,8 +11,6 @@ import requests
 from queue import Queue
 from bs4 import BeautifulSoup
 
-# enable multithreading
-monkey.patch_all()
 
 
 def get_logger():
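
Aside on the deletion above: gevent's own docs advise calling monkey.patch_all() as early as possible, before network modules such as requests are imported, so that their sockets become cooperative. This hunk only removes the module-level call; where (or whether) it is re-added is not visible in the diff. A minimal sketch of the usual patch-early pattern, for reference:

    from gevent import monkey

    monkey.patch_all()  # patch sockets/threading before importing network libraries

    import requests  # now uses cooperative, non-blocking sockets under gevent
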
@@ -49,6 +49,7 @@ class JobSpider:
     """
     Job site spider class
     """
+    job_dir = 'data'
 
     def __init__(self):
         self.count = 1  # index of the record currently being crawled
@@ -64,7 +65,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("Crawling URL: {}\npage {}".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content.decode("gbk")
+            html = requests.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
@@ -84,8 +85,8 @@ class JobSpider:
             url = self.desc_url_queue.get()
             resp = requests.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("Crawling job detail No. {}".format(self.count))
-                html = resp.content.decode("gbk")
+                logger.info("Crawling URL: {}\njob detail No. {}".format(url, self.count))
+                html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
             else:
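
Note on the two .decode("gbk") removals in the hunks above: BeautifulSoup now receives raw bytes and has to sniff the document encoding itself, which usually works but adds overhead and can misdetect. If explicit decoding is still wanted without hard-coding GBK, requests can detect the charset; a minimal sketch (the URL is a hypothetical stand-in, not the spider's START_URL):

    import requests

    resp = requests.get("http://example.com/jobs?page=1", timeout=10)
    # apparent_encoding runs charset detection over the body; fall back to GBK,
    # which these pages are assumed to declare.
    resp.encoding = resp.apparent_encoding or "gbk"
    html = resp.text  # decoded str, so BeautifulSoup no longer has to guess
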
@@ -98,13 +99,14 @@ class JobSpider:
                 bs_tmp2 = bs_tmp.select('strong')[0]
                 bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                 bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-                with open('data/岗位信息.txt', 'ab+') as f:
+                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                     tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                            '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                     f.write((str(tmp) + '\n').encode('utf-8'))
+
                 bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                 s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                     f.write(s)
             except Exception as e:
                 logger.error(e)
@@ -121,6 +123,11 @@ class JobSpider:
             self.pool.apply_async(target)
 
     def run(self):
+        if os.path.exists(self.job_dir):
+            self.clearDir(self.job_dir)
+        else:
+            os.mkdir(self.job_dir)
+
         """
         Crawl data with multiple threads
         """
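
The added run() preamble resets the output directory: clearDir() (added in the next hunk) empties it if it exists, otherwise os.mkdir() creates it. A shorter equivalent, assuming a full wipe on every run is the intent:

    import os
    import shutil

    job_dir = "data"

    # Remove the whole tree if present, then recreate it empty.
    # ignore_errors=True mirrors the shutil.rmtree(filepath, True) call in clearDir.
    shutil.rmtree(job_dir, ignore_errors=True)
    os.makedirs(job_dir, exist_ok=True)
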
@@ -128,9 +135,18 @@ class JobSpider:
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # block the main thread until the queue is drained
 
+    def clearDir(self, rootdir):
+        filelist = os.listdir(rootdir)
+        for f in filelist:
+            filepath = os.path.join(rootdir, f)
+            if os.path.isfile(filepath):
+                os.remove(filepath)
+                print(filepath + " removed!")
+            elif os.path.isdir(filepath):
+                shutil.rmtree(filepath, True)
 
 if __name__ == "__main__":
     spider = JobSpider()
     start = time.time()
     spider.run()
     logger.info("Total time: {} seconds".format(time.time() - start))
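
For context on the join() choreography at the end of run(): the spider fills desc_url_queue with detail-page URLs, execute_more_tasks() spawns pool workers that drain it, and desc_url_queue.join() blocks until every item has been marked done. A self-contained sketch of that producer/consumer shape (worker count and names are illustrative, not taken from this patch):

    from gevent import monkey

    monkey.patch_all()  # must precede the imports below so blocking calls cooperate

    from queue import Queue

    from gevent.pool import Pool

    task_queue = Queue()
    for n in range(10):  # stand-in for the scraped detail URLs
        task_queue.put(n)


    def worker():
        while True:
            item = task_queue.get()  # blocks until an item is available
            print("processed", item)
            task_queue.task_done()  # pairs with join() below


    pool = Pool(5)  # stand-in for THREAD_NUMBER
    for _ in range(5):
        pool.apply_async(worker)

    task_queue.join()  # returns once every put() has a matching task_done()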