From 5dbd2c792bf542733aa723068cb00c1b6d913d78 Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Tue, 30 Jun 2020 18:33:16 +0800
Subject: [PATCH] Reset the data directory on each run and pass raw bytes to
 BeautifulSoup

---
 job_spider.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/job_spider.py b/job_spider.py
index 150cf43..a52b57d 100644
--- a/job_spider.py
+++ b/job_spider.py
@@ -1,3 +1,5 @@
+import shutil
+
 from gevent import monkey
 from gevent.pool import Pool
 
@@ -9,8 +11,6 @@ import requests
 from queue import Queue
 from bs4 import BeautifulSoup
 
-# enable multithreading
-monkey.patch_all()
 
 
 def get_logger():
@@ -49,6 +49,7 @@ class JobSpider:
     """ Job site crawler class """
+    job_dir = 'data'
 
     def __init__(self):
         self.count = 1  # index of the record currently being crawled
@@ -64,7 +65,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content.decode("gbk")
+            html = requests.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
@@ -84,8 +85,8 @@
             url = self.desc_url_queue.get()
             resp = requests.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("爬取第 {} 条岗位详情".format(self.count))
-                html = resp.content.decode("gbk")
+                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count))
+                html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
             else:
@@ -98,13 +99,14 @@
                 bs_tmp2 = bs_tmp.select('strong')[0]
                 bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                 bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-                with open('data/岗位信息.txt', 'ab+') as f:
+
+                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                     tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0], '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                     f.write((str(tmp) + '\n').encode('utf-8'))
                 bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                 s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                     f.write(s)
             except Exception as e:
                 logger.error(e)
@@ -121,6 +123,11 @@
             self.pool.apply_async(target)
 
     def run(self):
         """
         Crawl data with multiple threads
         """
+        if os.path.exists(self.job_dir):
+            self.clear_dir(self.job_dir)
+        else:
+            os.mkdir(self.job_dir)
+
@@ -128,9 +135,18 @@
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # block the main thread until the queue is empty
 
+    def clear_dir(self, rootdir):
+        filelist = os.listdir(rootdir)
+        for f in filelist:
+            filepath = os.path.join(rootdir, f)
+            if os.path.isfile(filepath):
+                os.remove(filepath)
+                logger.info("{} removed".format(filepath))
+            elif os.path.isdir(filepath):
+                shutil.rmtree(filepath, ignore_errors=True)
 
 if __name__ == "__main__":
     spider = JobSpider()
     start = time.time()
     spider.run()
-    logger.info("总耗时 {} 秒".format(time.time() - start))
+    logger.info("总耗时 {} 秒".format(time.time() - start))
\ No newline at end of file
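
A note on the gevent side of this change: the patch removes monkey.patch_all(), but JobSpider still schedules its workers on gevent.pool.Pool. Without the patch step, the blocking requests calls never yield to other greenlets, so the pool effectively fetches one page at a time. A minimal sketch of the pattern the pool depends on; the pool size and URLs here are assumptions, not values from the patch:

    # Sketch only: gevent must patch the socket layer *before* requests is
    # imported, otherwise its blocking HTTP calls cannot yield to other
    # greenlets and the pool degrades to serial execution.
    from gevent import monkey
    monkey.patch_all()

    import requests
    from gevent.pool import Pool

    pool = Pool(8)  # pool size is an assumption

    def fetch(url):
        # each greenlet yields while waiting on the network
        return url, requests.get(url, timeout=10).status_code

    # placeholder URLs; the spider builds its own list from START_URL
    for url, status in pool.imap_unordered(fetch, ["https://example.com"] * 4):
        print(url, status)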
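
On the encoding change: dropping .decode("gbk") hands raw bytes to BeautifulSoup, which then sniffs the encoding from the document itself (meta charset or BOM). If a page declares its charset incorrectly, decoding explicitly from requests' apparent_encoding is one possible fallback; a sketch, with a placeholder URL:

    # Sketch: decode explicitly instead of letting BeautifulSoup guess.
    # apparent_encoding is itself a statistical guess, so treat this as a
    # fallback, not a guarantee.
    import requests
    from bs4 import BeautifulSoup

    resp = requests.get("https://example.com")  # placeholder URL
    resp.encoding = resp.apparent_encoding      # e.g. detects gbk/gb2312 pages
    soup = BeautifulSoup(resp.text, "lxml")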
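
clear_dir() deletes entries one by one, which keeps the directory itself (its inode and permissions) in place between runs. If that property is not needed, the whole reset in run() collapses to two standard-library calls; a sketch under that assumption, using a hypothetical reset_dir helper:

    import os
    import shutil

    def reset_dir(path):
        # remove the tree if present, then recreate it empty
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    reset_dir("data")  # "data" mirrors JobSpider.job_dir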
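
One more observation from the context lines rather than the hunks themselves: 岗位信息.txt stores each record as str(dict), which is awkward to parse back reliably. If the file format is free to change, writing one JSON object per line is a common alternative; a sketch with illustrative field values only:

    import json

    record = {"岗位": "Python工程师", "薪资": "10k-15k"}  # illustrative values
    with open("data/岗位信息.txt", "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")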