master
pan committed 5 years ago · commit dc9a49f501

Files changed:
  .gitignore                146 lines
  both_data_analysis1.py     98 lines
  house_data_analysis1.py    91 lines
  house_spider.py           141 lines
  job_data_analysis1.py     109 lines
  job_spider.py             136 lines
  requirement.txt            15 lines
  说明文档.docx               BIN

.gitignore (vendored)

@@ -0,0 +1,146 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
/images/both/
/images/house/
/images/job/
/data/
/.idea/

both_data_analysis1.py

@@ -0,0 +1,98 @@
import pandas as pd
import re
import ast
import pyecharts.options as opts
from pyecharts.charts import Line, Bar

# Preprocess the job records (one str(dict) per line in 岗位信息.txt)
with open('data/岗位信息.txt', 'rb') as file:
    job_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = ast.literal_eval(line.decode('utf-8'))
        try:
            line['位置'] = re.split('-', line['位置'])[1]  # keep the district part of "广州-天河区"
            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])      # unit runs, e.g. ['万', '月']
            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')  # numeric range, e.g. ['1.5', '2']
            if len(xinzi) == 1:          # a single figure rather than a range
                xinzi.append(xinzi[0])
            # Normalize every salary to 万 per month. The unit characters in the
            # conditions were stripped in the published diff; they are reconstructed
            # here from the conversion factors (12 months, 10 千 per 万, 8 h x 22 days).
            if danwei[0][0] == '万' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
            elif danwei[0][0] == '万' and danwei[1] == '年':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
            elif danwei[0] == '千' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
            elif danwei[0] == '元' and danwei[1] == '小时':
                # 8-hour day, 22 working days, 元 -> 万
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
        except Exception:
            continue
        job_list.append(line)
job_list_DF = pd.DataFrame(job_list)

# Preprocess the rental records (房源信息.txt)
with open('data/房源信息.txt', 'rb') as file:
    house_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = ast.literal_eval(line.decode('utf-8'))
        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])  # e.g. "70㎡" -> 70
        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])  # e.g. "3200元/月" -> 3200
        house_list.append(line)
house_list_DF = pd.DataFrame(house_list)
xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]

# Average monthly rent per square metre for one district
def houserGetAvgPrice(xingzhengqu):
    totalPrice = 0
    totalArea = 0
    for item in house_list:
        if item['行政区'] == xingzhengqu:
            totalArea = totalArea + item['面积']
            totalPrice = totalPrice + item['价格']
    return totalPrice / totalArea if totalArea > 0 else 1

# Daily rent per square metre, by district
house_totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = houserGetAvgPrice(item)
    house_totalAvgPriceList.append(round(avg_price / 30, 2))
attr, house_value = (xingzhengqu, house_totalAvgPriceList)

# Average monthly salary (万) for one district
def jobGetAvgPrice(xingzhengqu):
    totalPrice = 0
    total = 0
    for item in job_list:
        if item['位置'] == xingzhengqu:
            total = total + 1
            totalPrice = totalPrice + item['薪资']
    return totalPrice / total if total > 0 else 0

# Salary by district: 万/month -> 元, spread over 30 days and 24 hours.
# Job locations carry a trailing '区' that the rental districts lack
# (the suffix character was stripped in the published diff).
job_totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = jobGetAvgPrice(item + '区')
    job_totalAvgPriceList.append(round(avg_price * 10000 / 30 / 24, 2))
attr, job_value = (xingzhengqu, job_totalAvgPriceList)

# Rent vs salary line chart for Guangzhou
line = Line(init_opts=opts.InitOpts(width='800px', height='800px'))
line.add_xaxis(xaxis_data=attr)
line.add_yaxis('房租:元/日(1平方米)', house_value)
line.add_yaxis("薪资:元/日", job_value)
line.render('images/both/广州房租-薪资.html')

# Gap between daily salary and daily rent, by district
difference = []
for i in range(len(job_value)):
    difference.append(round(job_value[i] - house_value[i], 2))
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州房租-薪资差距图:元", difference)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州房租-薪资差距图:元'),
                    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/both/广州房租-薪资差距.html')
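
Note: the salary normalization above turns on two regular expressions. A minimal sketch of how they split a 51job-style salary string into its unit runs and its numeric range (the sample strings are illustrative, not taken from the dataset):

import re

for s in ['1.5-2万/月', '20-30万/年', '8-9千/月', '150-200元/小时']:
    danwei = re.findall(r'[\u4e00-\u9fa5]+', s)      # runs of CJK characters, e.g. ['万', '月']
    xinzi = re.findall(r'\d+.*\d', s)[0].split('-')  # numeric range, e.g. ['1.5', '2']
    print(s, danwei, xinzi)

Each record is then reduced to a single figure in 万 per month, which is what makes rents and salaries comparable district by district.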

house_data_analysis1.py

@@ -0,0 +1,91 @@
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Pie
import re
import ast

# Parse the rental records: one str(dict) per line
with open('data/房源信息.txt', 'rb') as file:
    house_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = ast.literal_eval(line.decode('utf-8'))
        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
        house_list.append(line)
house_list_DF = pd.DataFrame(house_list)
xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]

# Distribution of flat sizes
bins = [-1, 30, 60, 90, 120, 200, 300, 400, 10000]
attr = ['0-30平方米', '30-60平方米', '60-90平方米', '90-120平方米', '120-200平方米', '200-300平方米', '300-400平方米', '400+平方米']
tmpDF = house_list_DF.groupby(pd.cut(house_list_DF['面积'], bins=bins, labels=attr)).size().reset_index(name='count')
value = list(map(int, tmpDF['count'].values))
pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', list(zip(attr, value))).set_global_opts(title_opts=opts.TitleOpts(title='租房面积统计'))
pie.render('images/house/广州租房面积统计.html')

# Average monthly rent per square metre for one district
from pyecharts.charts import TreeMap

def getAvgPrice(xingzhengqu):
    totalPrice = 0
    totalArea = 0
    for item in house_list:
        if item['行政区'] == xingzhengqu:
            totalArea = totalArea + item['面积']
            totalPrice = totalPrice + item['价格']
    return totalPrice / totalArea if totalArea > 0 else 1

# Monthly rent per square metre, by district
def getTotalAvgPrice():
    totalAvgPriceList = []
    totalAvgPriceDirList = []
    for index, item in enumerate(xingzhengqu):
        avg_price = getAvgPrice(item)
        totalAvgPriceList.append(round(avg_price, 3))
        totalAvgPriceDirList.append({'value': round(avg_price, 3), 'name': item + "" + str(round(avg_price, 3))})
    return totalAvgPriceDirList

data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='900px', height='800px'))
treemap.add('广州各区房租单价:平方米/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/house/广州各区房租单价.html')

# Daily rent per square metre, by district
from pyecharts.charts import Bar
totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = getAvgPrice(item)
    totalAvgPriceList.append(round(avg_price / 30, 3))
attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='900px', height='800px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区房租单价:平方米/日'))
bar.render('images/house/广州每日每平方米的价格.html')

# Floor-plan (户型) word cloud
from pyecharts.charts import WordCloud

def getRooms():
    results = house_list_DF.groupby('房间').size().reset_index(name='count')
    room_list = list(results.房间.values)
    weight_list = list(map(int, results['count'].values))
    return (room_list, weight_list)

attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', list(zip(attr, value)), word_size_range=[2, 100])
wordcloud.render('images/house/广州户型数据.html')

# Share of listings per district
def getAreaWeight():
    result = house_list_DF.groupby('行政区').size().reset_index(name='count')
    areaName = list(result.行政区.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='600px', height='400px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州房源分布'))
pie.render('images/house/广州房源分布.html')
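
The area pie chart relies on pd.cut to bucket a numeric column into labelled bins before counting. The same pattern in isolation, on made-up areas (the values are illustrative):

import pandas as pd

df = pd.DataFrame({'面积': [25, 45, 75, 110, 150, 350]})
bins = [-1, 30, 60, 90, 120, 200, 300, 400, 10000]
labels = ['0-30', '30-60', '60-90', '90-120', '120-200', '200-300', '300-400', '400+']
print(df.groupby(pd.cut(df['面积'], bins=bins, labels=labels)).size())
# one row per bin, zero where no flat falls in the range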

house_spider.py

@@ -0,0 +1,141 @@
import time
import logging
import requests
from gevent import monkey
from gevent.pool import Pool
from queue import Queue
import bs4
import re

# Patch the standard library so blocking I/O yields to other greenlets
monkey.patch_all()

def get_logger():
    """
    Create the logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
START_URL = 'https://gz.zu.ke.com/zufang/pg{}/'
LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # maximum size of the coroutine pool
logger = get_logger()

class HouseSpider:
    """
    Spider for the House (贝壳租房) site.
    """
    def __init__(self):
        self.count = 1  # index of the record currently being crawled
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool

    def job_spider(self):
        """
        Spider entry point: crawl the listing pages.
        """
        urls = [START_URL.format(p) for p in range(1, 100)]
        for url in urls:
            logger.info("Crawling listing page {}".format(urls.index(url) + 1))
            response = requests.get(url=url, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'lxml')
            house_list = bs.select('#content > div.content__article > div.content__list > div')
            for item in house_list:
                tmp = {}
                try:
                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])  # enqueue the detail link
                    title = item.select('div > p.content__list--item--title.twoline > a')[0].string.replace('\n', '').replace(' ', '')
                    tmp['标题'] = title
                    xinzhengqu = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
                    tmp['行政区'] = xinzhengqu
                    xinzhengqu_level = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
                    tmp['二级行政区'] = xinzhengqu_level
                    location = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string
                    tmp['地址'] = location
                    detail_house = str(item.select('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
                    detail_house_list = re.split('<i>/</i>', detail_house)[1:-1]
                    miji = detail_house_list[0]
                    tmp['面积'] = miji
                    chaoxiang = detail_house_list[1]
                    tmp['朝向'] = chaoxiang
                    rooms = detail_house_list[2][:detail_house_list[2].index('<')]
                    tmp['房间'] = rooms
                    price = item.select('div > span > em')[0].string
                    price_detail = re.findall('</em>.*', str(item.select('div > span')[0]))[0].replace(' ', '').replace('</em>', '').replace('</span>', '')
                    tmp['价格'] = price + price_detail
                    with open('data/房源信息.txt', 'ab+') as f:
                        f.write((str(tmp) + '\n').encode('utf-8'))
                except Exception:
                    continue
        # Queue length, i.e. how many detail URLs are pending
        logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the description of each listing.
        """
        while True:
            url = self.desc_url_queue.get()  # take one URL from the queue
            url_ = 'https://gz.zu.ke.com/' + url
            response = requests.get(url=url_, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'html.parser')
            try:
                if response.status_code == 200:
                    logger.info("Crawling detail page {}".format(self.count))
                    desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc']).replace('\n', '').replace(' ', '').replace('<br/>', '')
                    with open('data/房源描述.txt', 'ab+') as f:
                        f.write((str(desc) + '\n').encode('utf-8'))
                    self.desc_url_queue.task_done()
                    self.count += 1
                else:
                    self.desc_url_queue.put(url)  # requeue on failure
                    continue
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. Parsing and storage
        could be moved to queues of their own for more throughput.
        :param target: task function
        """
        for i in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl with multiple greenlets.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained

if __name__ == "__main__":
    spider = HouseSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {}".format(time.time() - start))

job_data_analysis1.py

@@ -0,0 +1,109 @@
import pandas as pd
import re
import ast

# Preprocess the job records (same normalization as both_data_analysis1.py)
with open('data/岗位信息.txt', 'rb') as file:
    job_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = ast.literal_eval(line.decode('utf-8'))
        try:
            line['位置'] = re.split('-', line['位置'])[1]
            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
            if len(xinzi) == 1:
                xinzi.append(xinzi[0])
            # Normalize to 万/month; the unit characters were stripped in the
            # published diff and are reconstructed from the conversion factors.
            if danwei[0][0] == '万' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
            elif danwei[0][0] == '万' and danwei[1] == '年':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
            elif danwei[0] == '千' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
            elif danwei[0] == '元' and danwei[1] == '小时':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
        except Exception:
            continue
        job_list.append(line)
job_list_DF = pd.DataFrame(job_list)
xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]

# Job distribution across Guangzhou districts
from pyecharts import options as opts
from pyecharts.charts import Pie

def getAreaWeight():
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    areaName = list(result.位置.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
pie.render('images/job/广州各区岗位分布.html')

# Average monthly salary for one district
from pyecharts.charts import TreeMap

def getAvgPrice(xingzhengqu):
    totalPrice = 0
    total = 0
    for item in job_list:
        if item['位置'] == xingzhengqu:
            total = total + 1
            totalPrice = totalPrice + item['薪资']
    return totalPrice / total if total > 0 else 0

# Monthly salary, by district
def getTotalAvgPrice():
    totalAvgPriceList = []
    totalAvgPriceDirList = []
    for index, item in enumerate(xingzhengqu):
        avg_price = getAvgPrice(item)
        totalAvgPriceList.append(round(avg_price, 2))
        totalAvgPriceDirList.append({'value': round(avg_price, 2), 'name': item + "" + str(round(avg_price, 2)) + ''})
    return totalAvgPriceDirList

data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/job/广州各区每月薪资.html')

# Daily salary, by district (万/month -> 元, spread over 30 days)
from pyecharts.charts import Bar
totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = getAvgPrice(item)
    totalAvgPriceList.append(round(avg_price * 10000 / 30, 2))
attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'), xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/job/广州各区单日薪资.html')

# Job-title word cloud
from pyecharts.charts import WordCloud

def getRooms():
    results = job_list_DF.groupby('岗位').size().reset_index(name='count')
    room_list = list(results.岗位.values)
    weight_list = list(map(int, results['count'].values))
    return (room_list, weight_list)

attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', list(zip(attr, value)), word_size_range=[2, 100])
wordcloud.render('images/job/广州岗位数据.html')

# Share of postings per district, reusing getAreaWeight from above
pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
pie.render('images/job/广州各区岗位数量分布.html')
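
Both pie charts build their labels by appending each district's share to its name. The same computation in isolation (the district names and counts are made up):

counts = {'天河区': 120, '海珠区': 80}
total = sum(counts.values())
print([name + str(round(n / total * 100, 2)) + '%' for name, n in counts.items()])
# ['天河区60.0%', '海珠区40.0%']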

job_spider.py

@@ -0,0 +1,136 @@
from gevent import monkey
from gevent.pool import Pool
# Patch the standard library before anything else imports it,
# so blocking I/O yields to other greenlets
monkey.patch_all(select=False)
import time
import os
import logging
import requests
from queue import Queue
from bs4 import BeautifulSoup

def get_logger():
    """
    Create the logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger

HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}
START_URL = (
    'https://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,{}.html?'
    'lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
    'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)
LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # maximum size of the coroutine pool
logger = get_logger()

class JobSpider:
    """
    Spider for the 51job site.
    """
    def __init__(self):
        self.count = 1  # index of the record currently being crawled
        self.company = []
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool

    def job_spider(self):
        """
        Spider entry point: crawl the search-result pages.
        """
        urls = [START_URL.format(p) for p in range(1, 200)]
        for url in urls:
            logger.info("Crawling page {}: {}".format(urls.index(url) + 1, url))
            html = requests.get(url, headers=HEADERS).content.decode("gbk")
            bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href = b.find("a")["href"]
                    self.desc_url_queue.put(href)  # enqueue the job detail link
                except Exception:
                    pass
        # Queue length, i.e. how many detail URLs are pending
        logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the job descriptions.
        """
        while True:
            url = self.desc_url_queue.get()  # take one URL from the queue
            resp = requests.get(url, headers=HEADERS)
            if resp.status_code == 200:
                logger.info("Crawling detail page {}".format(self.count))
                html = resp.content.decode("gbk")
                self.desc_url_queue.task_done()
                self.count += 1
            else:
                self.desc_url_queue.put(url)  # requeue on failure
                continue
            try:
                bs_tmp = BeautifulSoup(html, 'lxml').select(
                    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
                bs_tmp1 = bs_tmp.select('h1')[0]
                bs_tmp2 = bs_tmp.select('strong')[0]
                bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
                with open('data/岗位信息.txt', 'ab+') as f:
                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                           '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                    f.write((str(tmp) + '\n').encode('utf-8'))
                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
                    f.write(s)
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. Parsing and storage
        could be moved to queues of their own for more throughput.
        :param target: task function
        """
        for i in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl with multiple greenlets.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained

if __name__ == "__main__":
    spider = JobSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {}".format(time.time() - start))

requirement.txt

@@ -0,0 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
cffi==1.14.0
chardet==3.0.4
gevent==20.6.2
greenlet==0.4.16
idna==2.10
lxml==4.5.1
pycparser==2.20
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.9
zope.event==4.4
zope.interface==5.1.0
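
Assuming a Python 3 virtual environment, the pinned versions above install in the usual way:

pip install -r requirement.txt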

说明文档.docx: Binary file not shown.