# house-job/job_spider.py

import shutil

from gevent import monkey
from gevent.pool import Pool

monkey.patch_all(select=False)
import time
import os
import logging
import requests
from queue import Queue
from bs4 import BeautifulSoup


def get_logger():
    """
    Return the shared "monitor" logger, configuring it on first use.

    The logger level is set to the module-level ``LOG_LEVEL`` and a single
    stream handler with a "<timestamp> - <message>" format is attached.

    ``logging.getLogger`` hands back the same logger object for the same
    name, so the handler is attached only when the logger has none yet;
    otherwise every additional call would duplicate each emitted line.

    :return: the configured ``logging.Logger`` instance
    """
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    if not logger.handlers:
        ch = logging.StreamHandler()
        ch.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
        logger.addHandler(ch)
    return logger


# Request headers sent with every HTTP call made by the spider.
HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    # The two literals are joined by implicit concatenation, so the space
    # separating "AppleWebKit/537.36" from "(KHTML, ..." must be spelled out.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}

# Listing-page URL template; the single "{}" placeholder is the page number.
START_URL = (
    'https://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,{}.html?'
    'lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
    'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)

LOG_LEVEL = logging.INFO  # Log level applied to the "monitor" logger
POOL_MAXSIZE = 8  # Size of the gevent pool, and the number of workers started

# Module-level logger; created once at import time via get_logger().
logger = get_logger()


class JobSpider:
    """
    Crawler for 51job job listings.

    The crawl has two phases: ``job_spider`` walks the listing pages and
    queues every job-detail link it finds, then a pool of workers running
    ``post_require`` downloads each detail page and appends the extracted
    fields and description text to files under ``job_dir``.
    """
    job_dir = 'data'  # Output directory for the generated text files
    max_retries = 3  # How many times a failed detail URL is re-queued
    request_timeout = 10  # Seconds before an HTTP request is abandoned

    def __init__(self):
        self.count = 1  # 1-based index of the detail page being crawled
        self.company = []
        self.desc_url_queue = Queue()  # Job-detail URLs waiting to be fetched
        self.pool = Pool(POOL_MAXSIZE)  # gevent pool that runs the workers
        self._retry_counts = {}  # url -> number of failed fetch attempts

    def job_spider(self):
        """
        Crawl the listing pages and queue every job-detail link.

        A page that cannot be downloaded, or that lacks the result table, is
        logged and skipped so one bad page does not abort the whole crawl.
        """
        for page in range(1, 200):
            url = START_URL.format(page)
            logger.info("爬取链接:{}\n第 {} 页".format(url, page))
            try:
                resp = requests.get(url, headers=HEADERS, timeout=self.request_timeout)
            except requests.RequestException as e:
                logger.error(e)
                logger.warning(url)
                continue
            table = BeautifulSoup(resp.content, "lxml").find("div", class_="dw_table")
            if table is None:
                logger.warning("dw_table not found: {}".format(url))
                continue
            for row in table.find_all("div", class_="el"):
                # Rows without an anchor or without an href carry no job link.
                link = row.find("a")
                href = link.get("href") if link is not None else None
                if href:
                    self.desc_url_queue.put(href)
        # Report how many job-detail URLs were collected.
        logger.info("队列长度为 {} ".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Worker loop: fetch queued job-detail pages and store their content.

        ``task_done`` is called exactly once per ``get``, and only after the
        item has been fully handled (saved, re-queued or dropped). This keeps
        the queue's unfinished-task counter balanced, so ``join`` returns
        once all work is really finished instead of hanging or returning
        while a page is still being written.
        """
        while True:
            url = self.desc_url_queue.get()
            try:
                html = self._fetch_detail(url)
                if html is not None:
                    self._save_detail(url, html)
            finally:
                self.desc_url_queue.task_done()

    def _fetch_detail(self, url):
        """Download one detail page; return its body, or None on failure."""
        try:
            resp = requests.get(url, headers=HEADERS, timeout=self.request_timeout)
        except requests.RequestException as e:
            logger.error(e)
            self._retry_or_drop(url)
            return None
        if resp.status_code != 200:
            self._retry_or_drop(url)
            return None
        logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count))
        self.count += 1
        return resp.content

    def _retry_or_drop(self, url):
        """Re-queue a failed URL until ``max_retries`` is reached, then drop it."""
        attempts = self._retry_counts.get(url, 0) + 1
        self._retry_counts[url] = attempts
        if attempts <= self.max_retries:
            # The caller marks the current item done only after this put, so
            # the unfinished-task counter cannot reach zero in between.
            self.desc_url_queue.put(url)
        else:
            logger.warning(url)

    def _save_detail(self, url, html):
        """Extract the job fields and description from html and append them to disk."""
        try:
            soup = BeautifulSoup(html, 'lxml')
            header = soup.select(
                'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
            title = header.select('h1')[0]
            salary = header.select('strong')[0]
            company = header.select('p.cname > a.catn')[0]
            meta = header.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')

            # Build the record before opening the file so a page with missing
            # fields does not touch the output.
            record = {"岗位": title.text, "公司": company['title'], "薪资": salary.text, '位置': meta[0],
                      '工作经验': meta[1], '学历': meta[2], '招聘人数': meta[3], '发布时间': meta[4]}
            with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab') as f:
                f.write((str(record) + '\n').encode('utf-8'))

            desc = soup.find("div", class_="bmsg job_msg inbox").text
            s = desc.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
            with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                f.write(s)
        except Exception as e:
            # Worker boundary: a page with an unexpected layout is logged and
            # skipped rather than terminating the worker.
            logger.error(e)
            logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Start ``POOL_MAXSIZE`` workers in the pool, each running ``target``.

        :param target: task function, called with no arguments
        """
        for _ in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Run the whole crawl.

        Empties (or creates) ``job_dir``, collects the detail URLs, starts the
        workers and blocks until every queued URL has been processed.
        """
        if os.path.exists(self.job_dir):
            self.clearDir(self.job_dir)
        else:
            os.mkdir(self.job_dir)

        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # Block until the queue has been drained

    def clearDir(self, rootdir):
        """
        Remove every entry inside ``rootdir`` while keeping the directory.

        Real sub-directories are removed recursively with errors ignored.
        Everything else, including symlinks and dangling links, is unlinked
        and reported on stdout.

        :param rootdir: path of the directory to empty
        """
        for name in os.listdir(rootdir):
            filepath = os.path.join(rootdir, name)
            if os.path.isdir(filepath) and not os.path.islink(filepath):
                shutil.rmtree(filepath, True)
            elif os.path.lexists(filepath):
                os.remove(filepath)
                print(filepath + " removed!")

if __name__ == "__main__":
    # Script entry point: run one full crawl and log the elapsed seconds.
    crawler = JobSpider()
    began = time.time()
    crawler.run()
    elapsed = time.time() - began
    logger.info("总耗时 {} 秒".format(elapsed))