import shutil

from gevent import monkey
from gevent.pool import Pool

monkey.patch_all(select=False)

import time
import os
import logging
import requests
from queue import Queue
from bs4 import BeautifulSoup


def get_logger():
    """
    Create the logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}

START_URL = (
    'https://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,{}.html?'
    'lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
    'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)

LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8          # maximum number of greenlets in the pool

logger = get_logger()


class JobSpider:
    """
    Spider for the 51job listing site.
    """

    job_dir = 'data'

    def __init__(self):
        self.count = 1  # number of the job-detail page currently being crawled
        self.company = []
        self.desc_url_queue = Queue()   # queue of job-detail URLs consumed by the pool
        self.pool = Pool(POOL_MAXSIZE)  # gevent pool, caps the number of concurrent greenlets

    # Collect the listings
    def job_spider(self):
        """
        Spider entry point: collect job-detail URLs from the listing pages.
        """
        urls = [START_URL.format(p) for p in range(1, 200)]
        # #resultList > div:nth-child(53)
        for page, url in enumerate(urls, 1):
            logger.info("Crawling link: {}\npage {}".format(url, page))
            html = requests.get(url, headers=HEADERS).content
            bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href = b.find("a")["href"]
                    self.desc_url_queue.put(href)  # enqueue the job-detail URL
                except Exception:
                    pass
            # Log the queue length, i.e. how many job-detail URLs have been collected
            logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the job-description pages.
        """
        while True:
            # Take a URL from the queue
            url = self.desc_url_queue.get()
            resp = requests.get(url, headers=HEADERS)
            if resp.status_code == 200:
                logger.info("Crawling link: {}\njob detail #{}".format(url, self.count))
                html = resp.content
                self.desc_url_queue.task_done()
                self.count += 1
            else:
                self.desc_url_queue.put(url)     # re-queue for a retry
                self.desc_url_queue.task_done()  # mark this attempt done so join() can finish
                continue
            try:
                bs_tmp = BeautifulSoup(html, 'lxml').select(
                    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
                bs_tmp1 = bs_tmp.select('h1')[0]
                bs_tmp2 = bs_tmp.select('strong')[0]
                bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                bs_tmp4 = bs_tmp.select('p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')

                # Job summary record: position / company / salary / location /
                # experience / education / openings / posting date
                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text,
                           '位置': bs_tmp4[0], '工作经验': bs_tmp4[1], '学历': bs_tmp4[2],
                           '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                    f.write((str(tmp) + '\n').encode('utf-8'))

                # Job description text, with the page's WeChat / share / email widgets stripped
                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                    f.write(s)
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the request task to the coroutine pool. This could be extended by moving
        parsing and storage onto their own queues to maximize throughput (see the
        sketch at the end of this file).
        :param target: task function
        """
        for _ in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl the data concurrently.
        """
        if os.path.exists(self.job_dir):
            self.clear_dir(self.job_dir)
        else:
            os.mkdir(self.job_dir)
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained

    def clear_dir(self, rootdir):
        """
        Remove everything inside rootdir.
        """
        for name in os.listdir(rootdir):
            filepath = os.path.join(rootdir, name)
            if os.path.isfile(filepath):
                os.remove(filepath)
                print(filepath + " removed!")
            elif os.path.isdir(filepath):
                shutil.rmtree(filepath, True)


if __name__ == "__main__":
    spider = JobSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))
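

# ------------------------------------------------------------------------------
# Sketch (not part of the original script): the execute_more_tasks() docstring
# notes that parsing and storage could be moved onto their own queues. A minimal
# outline of that idea follows, reusing the module's gevent Pool / Queue
# primitives. The worker names (fetch_worker, parse_worker, store_worker), the
# extra queues, and the pool sizing are illustrative assumptions only.
def pipeline_sketch(spider):
    parse_queue = Queue()  # raw HTML waiting to be parsed
    store_queue = Queue()  # extracted text waiting to be written to disk

    def fetch_worker():
        # Network-bound: download pages and hand the HTML off immediately.
        while True:
            url = spider.desc_url_queue.get()
            resp = requests.get(url, headers=HEADERS)
            if resp.status_code == 200:
                parse_queue.put(resp.content)
            else:
                spider.desc_url_queue.put(url)  # re-queue for a retry
            spider.desc_url_queue.task_done()

    def parse_worker():
        # Parse without blocking the fetchers; a stand-in for the real field
        # extraction done in post_require().
        while True:
            html = parse_queue.get()
            store_queue.put(BeautifulSoup(html, "lxml").get_text())
            parse_queue.task_done()

    def store_worker():
        # Disk-bound: slow writes never hold up fetching or parsing.
        while True:
            text = store_queue.get()
            with open(os.path.join(spider.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                f.write(text)
            store_queue.task_done()

    pool = Pool(POOL_MAXSIZE + 2)  # room for all fetchers plus one parser and one writer
    for _ in range(POOL_MAXSIZE):
        pool.apply_async(fetch_worker)
    pool.apply_async(parse_worker)
    pool.apply_async(store_worker)

    # Wait for every stage to drain.
    spider.desc_url_queue.join()
    parse_queue.join()
    store_queue.join()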