commit dc9a49f501
@@ -0,0 +1,146 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
/images/both/
/images/house/
/images/job/
/data/
/.idea/
@@ -0,0 +1,98 @@
import pandas as pd
import re
import pyecharts.options as opts
from pyecharts.charts import Line, Bar


# Preprocess the job listings data
with open('data/岗位信息.txt', 'rb') as file:
    job_list = []
    while True:
        line = file.readline()
        if not line:
            break
        # each line is a str(dict) record; eval turns it back into a dict
        line = eval(line.decode('utf-8'))
        try:
            line['位置'] = re.split('-', line['位置'])[1]
            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
            if len(xinzi) < 2:  # single figure such as '5千/月': use it for both ends
                xinzi.append(xinzi[0])
            # normalize every salary to 万/月 (10k CNY per month)
            if danwei[0][0] == '万' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
            elif danwei[0][0] == '万' and danwei[1] == '年':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
            elif danwei[0] == '千' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
            elif danwei[0] == '元' and danwei[1] == '小时':
                # 8-hour days, 22 working days a month; 10000 converts 元 to 万
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
        except Exception:
            continue
        job_list.append(line)

job_list_DF = pd.DataFrame(job_list)
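
# Worked example (illustrative figures, not taken from the data): for a posting
# whose 薪资 field is '1.2-1.8万/月', danwei == ['万', '月'] and xinzi == ['1.2', '1.8'],
# so 薪资 becomes round((1.2 + 1.8) / 2, 2) == 1.5 (万/月). For '15-20万/年' the same
# average is further divided by 12, giving round(17.5 / 12, 2) == 1.46.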

# Preprocess the rental listings data
with open('data/房源信息.txt', 'rb') as file:
    house_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = eval(line.decode('utf-8'))
        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
        house_list.append(line)

house_list_DF = pd.DataFrame(house_list)

xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]


# Average monthly rent per square metre for each district
def houserGetAvgPrice(xingzhengqu):
    totalPrice = 0
    totalArea = 0
    for item in house_list:
        if item['行政区'] == xingzhengqu:
            totalArea = totalArea + item['面积']
            totalPrice = totalPrice + item['价格']
    return totalPrice / totalArea if totalArea > 0 else 1


# Daily rent per square metre
house_totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = houserGetAvgPrice(item)
    house_totalAvgPriceList.append(round(avg_price / 30, 2))
attr, house_value = (xingzhengqu, house_totalAvgPriceList)


# Average monthly salary (in 万) for each district
def jobGetAvgPrice(xingzhengqu):
    totalPrice = 0
    total = 0
    for item in job_list:
        if item['位置'] == xingzhengqu:
            total = total + 1
            totalPrice = totalPrice + item['薪资']
    return totalPrice / total if total > 0 else 0


# Hourly salary for each district
job_totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = jobGetAvgPrice(item + '区')  # job locations carry a trailing '区'
    job_totalAvgPriceList.append(round(avg_price * 10000 / 30 / 24, 2))  # 万/月 -> 元/小时
attr, job_value = (xingzhengqu, job_totalAvgPriceList)


# Guangzhou rent vs. salary line chart
line = Line(init_opts=opts.InitOpts(width='800px', height='800px'))
line.add_xaxis(xaxis_data=attr)
line.add_yaxis('房租:元/日(1平方米)', house_value)
line.add_yaxis("薪资:元/日", job_value)
line.render('images/both/广州房租-薪资.html')


difference = []
for i in range(len(job_value)):
    difference.append(round(job_value[i] - house_value[i], 2))

# Guangzhou rent-salary gap bar chart
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州房租-薪资差距图:元", difference)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州房租-薪资差距图:元'),
                    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/both/广州房租-薪资差距.html')
@@ -0,0 +1,91 @@
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Pie
import re


# Preprocess the rental listings data
with open('data/房源信息.txt', 'rb') as file:
    house_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = eval(line.decode('utf-8'))
        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
        house_list.append(line)

house_list_DF = pd.DataFrame(house_list)
xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]


# Rental floor-area distribution
bins = [-1, 30, 60, 90, 120, 200, 300, 400, 10000]
attr = ['0-30平方米', '30-60平方米', '60-90平方米', '90-120平方米', '120-200平方米', '200-300平方米', '300-400平方米', '400+平方米']
tmpDF = house_list_DF.groupby(pd.cut(house_list_DF['面积'], bins=bins, labels=attr)).size().reset_index(name='count')
value = list(map(int, tmpDF['count'].values))
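# Illustrative shape (counts are hypothetical): pd.cut drops every 面积 value into
# one of the eight labelled bins and groupby().size() counts each bin, so tmpDF
# has one row per label, e.g. 面积='0-30平方米', count=812; `value` keeps just the
# eight counts, index-aligned with `attr`.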
pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', list(zip(attr, value))).set_global_opts(title_opts=opts.TitleOpts(title='租房面积统计'))
pie.render('images/house/广州租房面积统计.html')


# Average monthly rent per square metre for each district
from pyecharts.charts import TreeMap

def getAvgPrice(xingzhengqu):
    totalPrice = 0
    totalArea = 0
    for item in house_list:
        if item['行政区'] == xingzhengqu:
            totalArea = totalArea + item['面积']
            totalPrice = totalPrice + item['价格']
    return totalPrice / totalArea if totalArea > 0 else 1

# Per-district monthly averages, shaped as the dicts TreeMap expects
def getTotalAvgPrice():
    totalAvgPriceList = []
    totalAvgPriceDirList = []
    for index, item in enumerate(xingzhengqu):
        avg_price = getAvgPrice(item)
        totalAvgPriceList.append(round(avg_price, 3))
        totalAvgPriceDirList.append({'value': round(avg_price, 3), 'name': item + " ¥" + str(round(avg_price, 3))})
    return totalAvgPriceDirList
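
# Each returned entry is shaped for TreeMap's data series, e.g. (hypothetical
# figures): {'value': 52.143, 'name': '天河 ¥52.143'}.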

# Monthly price per square metre, as a tree map
data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='900px', height='800px'))
treemap.add('广州各区房租单价:平方米/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/house/广州各区房租单价.html')


# Daily price per square metre for each district
from pyecharts.charts import Bar

totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = getAvgPrice(item)
    totalAvgPriceList.append(round(avg_price / 30, 3))
attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='900px', height='800px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区房租单价:平方米/日'))
bar.render('images/house/广州每日每平方米的价格.html')


# Floor-plan (rooms) word cloud
from pyecharts.charts import WordCloud

def getRooms():
    results = house_list_DF.groupby('房间').size().reset_index(name='count')
    room_list = list(results.房间.values)
    weight_list = list(map(int, results['count'].values))
    return (room_list, weight_list)

attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', list(zip(attr, value)), word_size_range=[2, 100])
wordcloud.render('images/house/广州户型数据.html')


# Share of listings in each district
from pyecharts.charts import Pie

def getAreaWeight():
    result = house_list_DF.groupby('行政区').size().reset_index(name='count')
    areaName = list(result.行政区.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='600px', height='400px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州房源分布'))
pie.render('images/house/广州房源分布.html')
@@ -0,0 +1,141 @@
import time
import logging
import requests
from gevent import monkey
from gevent.pool import Pool
from queue import Queue
import bs4
import re


# Patch blocking I/O so greenlets can run concurrently
monkey.patch_all()


def get_logger():
    """
    Create a logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)

    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}


START_URL = 'https://gz.zu.ke.com/zufang/pg{}/'

LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # max size of the coroutine pool

logger = get_logger()


class HouseSpider:
    """
    Spider for the rental site (gz.zu.ke.com).
    """

    def __init__(self):
        self.count = 1  # index of the record currently being crawled
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool capped at POOL_MAXSIZE greenlets

    # Collect the listing pages
    def job_spider(self):
        """
        Spider entry point.
        """
        urls = [START_URL.format(p) for p in range(1, 100)]
        for url in urls:
            logger.info("Crawling listing page {}".format(urls.index(url) + 1))
            response = requests.get(url=url, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'lxml')
            house_list = bs.select('#content > div.content__article > div.content__list > div')
            for item in house_list:
                tmp = {}
                try:
                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])  # enqueue the detail-page URL
                    title = item.select('div > p.content__list--item--title.twoline > a')[0].string.replace('\n', '').replace(' ', '')
                    tmp['标题'] = title
                    xinzhengqu = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
                    tmp['行政区'] = xinzhengqu
                    xinzhengqu_level = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
                    tmp['二级行政区'] = xinzhengqu_level
                    location = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string
                    tmp['地址'] = location
                    detail_house = str(item.select('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
                    detail_house_list = re.split('<i>/</i>', detail_house)[1:-1]
                    mianji = detail_house_list[0]
                    tmp['面积'] = mianji
                    chaoxiang = detail_house_list[1]
                    tmp['朝向'] = chaoxiang
                    rooms = detail_house_list[2][:detail_house_list[2].index('<')]
                    tmp['房间'] = rooms
                    price = item.select('div > span > em')[0].string
                    price_detail = re.findall('</em>.*', str(item.select('div > span')[0]))[0].replace(' ', '').replace('</em>', '').replace('</span>', '')
                    tmp['价格'] = price + price_detail
                    with open('data/房源信息.txt', 'ab+') as f:
                        f.write((str(tmp) + '\n').encode('utf-8'))
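                    # Each line of data/房源信息.txt is a str(dict) record, e.g.
                    # (hypothetical values): {'标题': '整租·某小区2室1厅', '行政区': '天河',
                    #  '二级行政区': '珠江新城', '地址': '某小区', '面积': '76㎡', '朝向': '南',
                    #  '房间': '2室1厅1卫', '价格': '4500元/月'};
                    # the analysis scripts read these lines back with eval().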
                except Exception:
                    continue
            # Log the queue length, i.e. how many detail-page URLs are pending
            logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the free-text description of each listing.
        """
        while True:
            # Take a URL from the queue
            url = self.desc_url_queue.get()
            url_ = 'https://gz.zu.ke.com/' + url
            response = requests.get(url=url_, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'html.parser')
            try:
                if response.status_code == 200:
                    logger.info("Crawling detail page {}".format(self.count))
                    desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc']).replace('\n', '').replace(' ', '').replace('<br/>', '')
                    with open('data/房源描述.txt', 'ab+') as f:
                        f.write((str(desc) + '\n').encode('utf-8'))
                    self.desc_url_queue.task_done()
                    self.count += 1
                else:
                    self.desc_url_queue.put(url)
                    continue
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. Parsing and storage could
        be moved to queues of their own to squeeze out more throughput.

        :param target: task function to run in each greenlet
        """
        for i in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl with the coroutine pool.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained
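        # Note: post_require loops forever, so the worker greenlets never return;
        # join() unblocks once every enqueued URL is marked task_done, after which
        # run() returns and the process exits, abandoning the idle workers.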


if __name__ == "__main__":
    spider = HouseSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))
@@ -0,0 +1,109 @@
import pandas as pd
import re


# Preprocess the job listings data
with open('data/岗位信息.txt', 'rb') as file:
    job_list = []
    while True:
        line = file.readline()
        if not line:
            break
        line = eval(line.decode('utf-8'))
        try:
            line['位置'] = re.split('-', line['位置'])[1]
            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
            if len(xinzi) < 2:  # single figure such as '5千/月': use it for both ends
                xinzi.append(xinzi[0])
            # normalize every salary to 万/月 (10k CNY per month)
            if danwei[0][0] == '万' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
            elif danwei[0][0] == '万' and danwei[1] == '年':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
            elif danwei[0] == '千' and danwei[1] == '月':
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
            elif danwei[0] == '元' and danwei[1] == '小时':
                # 8-hour days, 22 working days a month; 10000 converts 元 to 万
                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
        except Exception:
            continue
        job_list.append(line)

job_list_DF = pd.DataFrame(job_list)
xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]


# Job distribution across Guangzhou's districts
from pyecharts import options as opts
from pyecharts.charts import Pie


def getAreaWeight():
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    areaName = list(result.位置.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
pie.render('images/job/广州各区岗位分布.html')


# Average monthly salary per district
from pyecharts.charts import TreeMap

def getAvgPrice(xingzhengqu):
    totalPrice = 0
    total = 0
    for item in job_list:
        if item['位置'] == xingzhengqu:
            total = total + 1
            totalPrice = totalPrice + item['薪资']
    return totalPrice / total if total > 0 else 0

# Per-district monthly salary, shaped as the dicts TreeMap expects
def getTotalAvgPrice():
    totalAvgPriceList = []
    totalAvgPriceDirList = []
    for index, item in enumerate(xingzhengqu):
        avg_price = getAvgPrice(item)
        totalAvgPriceList.append(round(avg_price, 2))
        totalAvgPriceDirList.append({'value': round(avg_price, 2), 'name': item + " ¥" + str(round(avg_price, 2)) + ' 万'})
    return totalAvgPriceDirList

data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/job/广州各区每月薪资.html')


# Average daily salary per district
from pyecharts.charts import Bar

totalAvgPriceList = []
for index, item in enumerate(xingzhengqu):
    avg_price = getAvgPrice(item)
    totalAvgPriceList.append(round(avg_price * 10000 / 30, 2))  # 万/月 -> 元/日
attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),
                    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/job/广州各区单日薪资.html')


# Job-title word cloud
from pyecharts.charts import WordCloud

def getRooms():
    results = job_list_DF.groupby('岗位').size().reset_index(name='count')
    room_list = list(results.岗位.values)
    weight_list = list(map(int, results['count'].values))
    return (room_list, weight_list)

attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', list(zip(attr, value)), word_size_range=[2, 100])
wordcloud.render('images/job/广州岗位数据.html')


# Share of job postings in each district
from pyecharts.charts import Pie

def getAreaWeight():
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    areaName = list(result.位置.values)
    areaWeight = list(map(int, result['count'].values))
    areaName_tmp = []
    for index, item in enumerate(areaName):
        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
    return list(zip(areaName_tmp, areaWeight))

pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
pie.render('images/job/广州各区岗位数量分布.html')
@@ -0,0 +1,136 @@
from gevent import monkey
from gevent.pool import Pool

# Patch blocking I/O so greenlets can run concurrently; patching must happen
# before requests/socket are imported
monkey.patch_all(select=False)

import time
import os
import logging
import requests
from queue import Queue
from bs4 import BeautifulSoup


def get_logger():
    """
    Create a logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}

START_URL = (
    'https://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,{}.html?'
    'lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
    'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)

LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # max size of the coroutine pool

logger = get_logger()


class JobSpider:
    """
    Spider for the 51job site.
    """

    def __init__(self):
        self.count = 1  # index of the record currently being crawled
        self.company = []
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool capped at POOL_MAXSIZE greenlets

    # Collect the listing pages
    def job_spider(self):
        """
        Spider entry point.
        """
        urls = [START_URL.format(p) for p in range(1, 200)]
        for url in urls:
            logger.info("Crawling {} (page {})".format(url, urls.index(url) + 1))
            html = requests.get(url, headers=HEADERS).content.decode("gbk")
            bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href = b.find("a")["href"]
                    self.desc_url_queue.put(href)  # enqueue the detail-page URL
                except Exception:
                    pass
            # Log the queue length, i.e. how many detail-page URLs are pending
            logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Crawl the description of each job posting.
        """
        while True:
            # Take a URL from the queue
            url = self.desc_url_queue.get()
            resp = requests.get(url, headers=HEADERS)
            if resp.status_code == 200:
                logger.info("Crawling detail page {}".format(self.count))
                html = resp.content.decode("gbk")
                self.desc_url_queue.task_done()
                self.count += 1
            else:
                self.desc_url_queue.put(url)
                continue
            try:
                bs_tmp = BeautifulSoup(html, 'lxml').select(
                    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
                bs_tmp1 = bs_tmp.select('h1')[0]
                bs_tmp2 = bs_tmp.select('strong')[0]
                bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                bs_tmp4 = bs_tmp.select('p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
                with open('data/岗位信息.txt', 'ab+') as f:
                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                           '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                    f.write((str(tmp) + '\n').encode('utf-8'))
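                # Each line of data/岗位信息.txt is a str(dict) record, e.g.
                # (hypothetical values): {'岗位': 'Python开发工程师', '公司': '某科技有限公司',
                #  '薪资': '1-1.5万/月', '位置': '广州-天河区', '工作经验': '3-4年经验',
                #  '学历': '本科', '招聘人数': '招1人', '发布时间': '07-15发布'};
                # the analysis scripts read these lines back with eval().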
                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
                    f.write(s)
            except Exception as e:
                logger.error(e)
                logger.warning(url)

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. Parsing and storage could
        be moved to queues of their own to squeeze out more throughput.

        :param target: task function to run in each greenlet
        """
        for i in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl with the coroutine pool.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained


if __name__ == "__main__":
    spider = JobSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))
@@ -0,0 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
cffi==1.14.0
chardet==3.0.4
gevent==20.6.2
greenlet==0.4.16
idna==2.10
lxml==4.5.1
pycparser==2.20
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.9
zope.event==4.4
zope.interface==5.1.0