commit dc9a49f501
@@ -0,0 +1,146 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
/images/both/
/images/house/
/images/job/
/data/
/.idea/
@@ -0,0 +1,98 @@
import pandas as pd
import re
import pyecharts.options as opts
from pyecharts.charts import Line, Bar


# Preprocess the job listings data
with open('data/岗位信息.txt', 'rb') as file:
    job_list = []
    while True:
        line = file.readline()
        if not line:
            break
        # each line is a str(dict) record; eval turns it back into a dict
        line = eval(line.decode('utf-8'))
        try:
            line['位置'] = re.split('-', line['位置'])[1]
            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
            if len(xinzi) < 2:  # single figure such as '5千/月': use it for both ends
                xinzi.append(xinzi[0])
            # normalize every salary to 万/月 (10k CNY per month)
            if danwei[0][0] == '万' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
            elif danwei[0][0] == '万' and danwei[1] == '年':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
            elif danwei[0] == '千' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
            elif danwei[0] == '元' and danwei[1] == '小时':
                # 8-hour days, 22 working days a month; 10000 converts 元 to 万
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
        except Exception:
            continue
        job_list.append(line)

job_list_DF = pd.DataFrame(job_list)
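
# Worked example (illustrative figures, not taken from the data): for a posting
# whose 薪资 field is '1.2-1.8万/月', danwei == ['万', '月'] and xinzi == ['1.2', '1.8'],
# so 薪资 becomes round((1.2 + 1.8) / 2, 2) == 1.5 (万/月). For '15-20万/年' the same
# average is further divided by 12, giving round(17.5 / 12, 2) == 1.46.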

# Preprocess the rental listings data
with open('data/房源信息.txt', 'rb') as file:
    house_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = eval(line.decode('utf-8'))
        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
        house_list.append(line)

house_list_DF = pd.DataFrame(house_list)

xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]


# Average monthly rent per square metre for each district
def houserGetAvgPrice(xingzhengqu):
    totalPrice = 0
    totalArea = 0
    for item in house_list:
        if item['行政区'] == xingzhengqu:
            totalArea = totalArea + item['面积']
            totalPrice = totalPrice + item['价格']
    return totalPrice / totalArea if totalArea > 0 else 1


# Daily rent per square metre
house_totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = houserGetAvgPrice(item)
    house_totalAvgPriceList.append(round(avg_price / 30, 2))
attr, house_value = (xingzhengqu, house_totalAvgPriceList)


# Average monthly salary (in 万) for each district
def jobGetAvgPrice(xingzhengqu):
    totalPrice = 0
    total = 0
    for item in job_list:
        if item['位置'] == xingzhengqu:
            total = total + 1
            totalPrice = totalPrice + item['薪资']
    return totalPrice / total if total > 0 else 0


# Hourly salary for each district
job_totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = jobGetAvgPrice(item + '区')  # job locations carry a trailing '区'
    job_totalAvgPriceList.append(round(avg_price * 10000 / 30 / 24, 2))  # 万/月 -> 元/小时
attr, job_value = (xingzhengqu, job_totalAvgPriceList)


# Guangzhou rent vs. salary line chart
line = Line(init_opts=opts.InitOpts(width='800px', height='800px'))
line.add_xaxis(xaxis_data=attr)
line.add_yaxis('房租:元/日(1平方米)', house_value)
line.add_yaxis("薪资:元/日", job_value)
line.render('images/both/广州房租-薪资.html')


difference = []
for i in range(len(job_value)):
    difference.append(round(job_value[i] - house_value[i], 2))

# Guangzhou rent-salary gap bar chart
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州房租-薪资差距图:元", difference)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州房租-薪资差距图:元'),
                    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/both/广州房租-薪资差距.html')
@@ -0,0 +1,91 @@
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Pie
import re


# Preprocess the rental listings data
with open('data/房源信息.txt', 'rb') as file:
    house_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = eval(line.decode('utf-8'))
        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
        house_list.append(line)

house_list_DF = pd.DataFrame(house_list)
xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]


# Rental floor-area distribution
bins = [-1, 30, 60, 90, 120, 200, 300, 400, 10000]
attr = ['0-30平方米', '30-60平方米', '60-90平方米', '90-120平方米', '120-200平方米', '200-300平方米', '300-400平方米', '400+平方米']
tmpDF = house_list_DF.groupby(pd.cut(house_list_DF['面积'], bins=bins, labels=attr)).size().reset_index(name='count')
value = list(map(int, tmpDF['count'].values))
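# Illustrative shape (counts are hypothetical): pd.cut drops every 面积 value into
# one of the eight labelled bins and groupby().size() counts each bin, so tmpDF
# has one row per label, e.g. 面积='0-30平方米', count=812; `value` keeps just the
# eight counts, index-aligned with `attr`.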
pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', list(zip(attr, value))).set_global_opts(title_opts=opts.TitleOpts(title='租房面积统计'))
pie.render('images/house/广州租房面积统计.html')


# Average monthly rent per square metre for each district
from pyecharts.charts import TreeMap

def getAvgPrice(xingzhengqu):
    totalPrice = 0
    totalArea = 0
    for item in house_list:
        if item['行政区'] == xingzhengqu:
            totalArea = totalArea + item['面积']
            totalPrice = totalPrice + item['价格']
    return totalPrice / totalArea if totalArea > 0 else 1

# Per-district monthly averages, shaped as the dicts TreeMap expects
def getTotalAvgPrice():
    totalAvgPriceList = []
    totalAvgPriceDirList = []
    for index, item in enumerate(xingzhengqu):
        avg_price = getAvgPrice(item)
        totalAvgPriceList.append(round(avg_price, 3))
        totalAvgPriceDirList.append({'value': round(avg_price, 3), 'name': item + " ¥" + str(round(avg_price, 3))})
    return totalAvgPriceDirList
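
# Each returned entry is shaped for TreeMap's data series, e.g. (hypothetical
# figures): {'value': 52.143, 'name': '天河 ¥52.143'}.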

# Monthly price per square metre, as a tree map
data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='900px', height='800px'))
treemap.add('广州各区房租单价:平方米/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/house/广州各区房租单价.html')


# Daily price per square metre for each district
from pyecharts.charts import Bar

totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = getAvgPrice(item)
    totalAvgPriceList.append(round(avg_price / 30, 3))
attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='900px', height='800px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区房租单价:平方米/日'))
bar.render('images/house/广州每日每平方米的价格.html')


# Floor-plan (rooms) word cloud
from pyecharts.charts import WordCloud

def getRooms():
    results = house_list_DF.groupby('房间').size().reset_index(name='count')
    room_list = list(results.房间.values)
    weight_list = list(map(int, results['count'].values))
    return (room_list, weight_list)

attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', list(zip(attr, value)), word_size_range=[2, 100])
wordcloud.render('images/house/广州户型数据.html')


# Share of listings in each district
from pyecharts.charts import Pie

def getAreaWeight():
    result = house_list_DF.groupby('行政区').size().reset_index(name='count')
    areaName = list(result.行政区.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='600px', height='400px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州房源分布'))
pie.render('images/house/广州房源分布.html')
@@ -0,0 +1,141 @@
import time
import logging
import requests
from gevent import monkey
from gevent.pool import Pool
from queue import Queue
import bs4
import re


# Patch blocking I/O so greenlets can run concurrently
monkey.patch_all()


def get_logger():
    """
    Create a logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)

    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}


START_URL = 'https://gz.zu.ke.com/zufang/pg{}/'

LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # max size of the coroutine pool

logger = get_logger()


class HouseSpider:
    """
    Spider for the rental site (gz.zu.ke.com).
    """

    def __init__(self):
        self.count = 1  # index of the record currently being crawled
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool capped at POOL_MAXSIZE greenlets

    # Collect the listing pages
    def job_spider(self):
        """
        Spider entry point.
        """
        urls = [START_URL.format(p) for p in range(1, 100)]
        for url in urls:
            logger.info("Crawling listing page {}".format(urls.index(url) + 1))
            response = requests.get(url=url, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'lxml')
            house_list = bs.select('#content > div.content__article > div.content__list > div')
            for item in house_list:
                tmp = {}
                try:
                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])  # enqueue the detail-page URL
                    title = item.select('div > p.content__list--item--title.twoline > a')[0].string.replace('\n', '').replace(' ', '')
                    tmp['标题'] = title
                    xinzhengqu = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
                    tmp['行政区'] = xinzhengqu
                    xinzhengqu_level = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
                    tmp['二级行政区'] = xinzhengqu_level
                    location = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string
                    tmp['地址'] = location
                    detail_house = str(item.select('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
                    detail_house_list = re.split('<i>/</i>', detail_house)[1:-1]
                    mianji = detail_house_list[0]
                    tmp['面积'] = mianji
                    chaoxiang = detail_house_list[1]
                    tmp['朝向'] = chaoxiang
                    rooms = detail_house_list[2][:detail_house_list[2].index('<')]
                    tmp['房间'] = rooms
                    price = item.select('div > span > em')[0].string
                    price_detail = re.findall('</em>.*', str(item.select('div > span')[0]))[0].replace(' ', '').replace('</em>', '').replace('</span>', '')
                    tmp['价格'] = price + price_detail
                    with open('data/房源信息.txt', 'ab+') as f:
                        f.write((str(tmp) + '\n').encode('utf-8'))
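                    # Each line of data/房源信息.txt is a str(dict) record, e.g.
                    # (hypothetical values): {'标题': '整租·某小区2室1厅', '行政区': '天河',
                    #  '二级行政区': '珠江新城', '地址': '某小区', '面积': '76㎡', '朝向': '南',
                    #  '房间': '2室1厅1卫', '价格': '4500元/月'};
                    # the analysis scripts read these lines back with eval().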
                except Exception:
                    continue
            # Log the queue length, i.e. how many detail-page URLs are pending
            logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the free-text description of each listing.
        """
        while True:
            # Take a URL from the queue
            url = self.desc_url_queue.get()
            url_ = 'https://gz.zu.ke.com/' + url
            response = requests.get(url=url_, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'html.parser')
            try:
                if response.status_code == 200:
                    logger.info("Crawling detail page {}".format(self.count))
                    desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc']).replace('\n', '').replace(' ', '').replace('<br/>', '')
                    with open('data/房源描述.txt', 'ab+') as f:
                        f.write((str(desc) + '\n').encode('utf-8'))
                    self.desc_url_queue.task_done()
                    self.count += 1
                else:
                    self.desc_url_queue.put(url)
                    continue
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. Parsing and storage could
        be moved to queues of their own to squeeze out more throughput.

        :param target: task function to run in each greenlet
        """
        for i in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl with the coroutine pool.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained
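        # Note: post_require loops forever, so the worker greenlets never return;
        # join() unblocks once every enqueued URL is marked task_done, after which
        # run() returns and the process exits, abandoning the idle workers.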


if __name__ == "__main__":
    spider = HouseSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))
@@ -0,0 +1,109 @@
import pandas as pd
import re


# Preprocess the job listings data
with open('data/岗位信息.txt', 'rb') as file:
    job_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = eval(line.decode('utf-8'))
        try:
            line['位置'] = re.split('-', line['位置'])[1]
            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
            if len(xinzi) < 2:  # single figure such as '5千/月': use it for both ends
                xinzi.append(xinzi[0])
            # normalize every salary to 万/月 (10k CNY per month)
            if danwei[0][0] == '万' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
            elif danwei[0][0] == '万' and danwei[1] == '年':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
            elif danwei[0] == '千' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
            elif danwei[0] == '元' and danwei[1] == '小时':
                # 8-hour days, 22 working days a month; 10000 converts 元 to 万
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
        except Exception:
            continue
        job_list.append(line)

job_list_DF = pd.DataFrame(job_list)
xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]


# Job distribution across Guangzhou's districts
from pyecharts import options as opts
from pyecharts.charts import Pie


def getAreaWeight():
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    areaName = list(result.位置.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
pie.render('images/job/广州各区岗位分布.html')


# Average monthly salary per district
from pyecharts.charts import TreeMap

def getAvgPrice(xingzhengqu):
    totalPrice = 0
    total = 0
    for item in job_list:
        if item['位置'] == xingzhengqu:
            total = total + 1
            totalPrice = totalPrice + item['薪资']
    return totalPrice / total if total > 0 else 0

# Per-district monthly salary, shaped as the dicts TreeMap expects
def getTotalAvgPrice():
    totalAvgPriceList = []
    totalAvgPriceDirList = []
    for index, item in enumerate(xingzhengqu):
        avg_price = getAvgPrice(item)
        totalAvgPriceList.append(round(avg_price, 2))
        totalAvgPriceDirList.append({'value': round(avg_price, 2), 'name': item + " ¥" + str(round(avg_price, 2)) + ' 万'})
    return totalAvgPriceDirList

data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/job/广州各区每月薪资.html')


# Average daily salary per district
from pyecharts.charts import Bar

totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = getAvgPrice(item)
    totalAvgPriceList.append(round(avg_price * 10000 / 30, 2))  # 万/月 -> 元/日
attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),
                    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/job/广州各区单日薪资.html')


# Job-title word cloud
from pyecharts.charts import WordCloud

def getRooms():
    results = job_list_DF.groupby('岗位').size().reset_index(name='count')
    room_list = list(results.岗位.values)
    weight_list = list(map(int, results['count'].values))
    return (room_list, weight_list)

attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', list(zip(attr, value)), word_size_range=[2, 100])
wordcloud.render('images/job/广州岗位数据.html')


# Share of job postings in each district
from pyecharts.charts import Pie

def getAreaWeight():
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    areaName = list(result.位置.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
pie.render('images/job/广州各区岗位数量分布.html')
@@ -0,0 +1,136 @@
from gevent import monkey
from gevent.pool import Pool

# Patch blocking I/O so greenlets can run concurrently; patching must happen
# before requests/socket are imported
monkey.patch_all(select=False)

import time
import os
import logging
import requests
from queue import Queue
from bs4 import BeautifulSoup


def get_logger():
    """
    Create a logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}

START_URL = (
    'https://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,{}.html?'
    'lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
    'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)

LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # max size of the coroutine pool

logger = get_logger()


class JobSpider:
    """
    Spider for the 51job site.
    """

    def __init__(self):
        self.count = 1  # index of the record currently being crawled
        self.company = []
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool capped at POOL_MAXSIZE greenlets

    # Collect the listing pages
    def job_spider(self):
        """
        Spider entry point.
        """
        urls = [START_URL.format(p) for p in range(1, 200)]
        for url in urls:
            logger.info("Crawling {} (page {})".format(url, urls.index(url) + 1))
            html = requests.get(url, headers=HEADERS).content.decode("gbk")
            bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href = b.find("a")["href"]
                    self.desc_url_queue.put(href)  # enqueue the detail-page URL
                except Exception:
                    pass
            # Log the queue length, i.e. how many detail-page URLs are pending
            logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the description of each job posting.
        """
        while True:
            # Take a URL from the queue
            url = self.desc_url_queue.get()
            resp = requests.get(url, headers=HEADERS)
            if resp.status_code == 200:
                logger.info("Crawling detail page {}".format(self.count))
                html = resp.content.decode("gbk")
                self.desc_url_queue.task_done()
                self.count += 1
            else:
                self.desc_url_queue.put(url)
                continue
            try:
                bs_tmp = BeautifulSoup(html, 'lxml').select(
                    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
                bs_tmp1 = bs_tmp.select('h1')[0]
                bs_tmp2 = bs_tmp.select('strong')[0]
                bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                bs_tmp4 = bs_tmp.select('p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
                with open('data/岗位信息.txt', 'ab+') as f:
                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                           '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                    f.write((str(tmp) + '\n').encode('utf-8'))
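                # Each line of data/岗位信息.txt is a str(dict) record, e.g.
                # (hypothetical values): {'岗位': 'Python开发工程师', '公司': '某科技有限公司',
                #  '薪资': '1-1.5万/月', '位置': '广州-天河区', '工作经验': '3-4年经验',
                #  '学历': '本科', '招聘人数': '招1人', '发布时间': '07-15发布'};
                # the analysis scripts read these lines back with eval().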
                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
                    f.write(s)
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. Parsing and storage could
        be moved to queues of their own to squeeze out more throughput.

        :param target: task function to run in each greenlet
        """
        for i in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl with the coroutine pool.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained


if __name__ == "__main__":
    spider = JobSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))
@@ -0,0 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
cffi==1.14.0
chardet==3.0.4
gevent==20.6.2
greenlet==0.4.16
idna==2.10
lxml==4.5.1
pycparser==2.20
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.9
zope.event==4.4
zope.interface==5.1.0