修复岗位图表生成

master
pan 5 years ago
parent e05b64c18d
commit eeeebf8085
  1. 98
      job_data_analysis1.py
  2. 15
      job_spider.py
  3. 15
      requirement.txt
  4. 13
      util.py

@ -1,8 +1,11 @@
import pandas as pd import pandas as pd
import re import re
import os
# 数据预处理 # 数据预处理
with open('data/岗位信息.txt','rb') as file: import util
with open('data/岗位信息.txt', 'rb') as file:
job_list = [] job_list = []
while True: while True:
line = file.readline() line = file.readline()
@ -10,19 +13,19 @@ with open('data/岗位信息.txt','rb') as file:
break break
line = eval(line.decode('utf-8')) line = eval(line.decode('utf-8'))
try: try:
line['位置'] = re.split('-',line['位置'])[1] line['位置'] = re.split('-', line['位置'])[1]
danwei = re.findall('[\u4e00-\u9fa5]+',line['薪资']) danwei = re.findall('[\u4e00-\u9fa5]+', line['薪资'])
xinzi = re.findall('\d+.*\d',line['薪资'])[0].split('-') xinzi = re.findall('\d+.*\d', line['薪资'])[0].split('-')
if not xinzi[1]: if not xinzi[1]:
xinzi[1] = xinzi[0] xinzi[1] = xinzi[0]
if danwei[0][0] == '' and danwei[1] == '': if danwei[0][0] == '' and danwei[1] == '':
line['薪资'] = round((float(xinzi[0])+float(xinzi[1]))/2,2) line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
elif danwei[0][0] == '' and danwei[1] == '': elif danwei[0][0] == '' and danwei[1] == '':
line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 /12, 2) line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
elif danwei[0] == '' and danwei[1] == '': elif danwei[0] == '' and danwei[1] == '':
line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2) line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
elif danwei[0] == '' and danwei[1:] == '小时': elif danwei[0] == '' and danwei[1:] == '小时':
line['薪资'] = round((float(xinzi[0]) + float(xinzi[1]))*8*22 / 2 / 100, 2) line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 100, 2)
except: except:
continue continue
job_list.append(line) job_list.append(line)
@ -33,20 +36,32 @@ xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]
from pyecharts import options as opts from pyecharts import options as opts
from pyecharts.charts import Pie from pyecharts.charts import Pie
def getAreaWeight(): def getAreaWeight():
result = job_list_DF.groupby('位置').size().reset_index(name='count') result = job_list_DF.groupby('位置').size().reset_index(name='count')
areaName = list(result.位置.values) areaName = list(result.位置.values)
areaWeight = list(map(int,result['count'].values)) areaWeight = list(map(int, result['count'].values))
areaName_tmp = [] areaName_tmp = []
for index,item in enumerate(areaName): for index, item in enumerate(areaName):
areaName_tmp.append(item + str(round(areaWeight[index]/sum(areaWeight)*100,2))+'%') areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
return zip(areaName_tmp,areaWeight) return (areaName_tmp, areaWeight)
pie = Pie(init_opts=opts.InitOpts(width='800px',height='800px'))
pie.add('',getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
data = getAreaWeight()
pie.add("", [list(z) for z in zip(data[0], data[1])])
pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
image_dir="images/job"
if os.path.exists(image_dir):
util.clearDir(image_dir)
else:
os.makedirs(image_dir)
pie.render('images/job/广州各区岗位分布.html') pie.render('images/job/广州各区岗位分布.html')
# 求广州单月薪资 # 求广州单月薪资
from pyecharts.charts import TreeMap from pyecharts.charts import TreeMap
def getAvgPrice(xingzhengqu): def getAvgPrice(xingzhengqu):
totalPrice = 0 totalPrice = 0
total = 0 total = 0
@ -54,56 +69,73 @@ def getAvgPrice(xingzhengqu):
if item['位置'] == xingzhengqu: if item['位置'] == xingzhengqu:
total = total + 1 total = total + 1
totalPrice = totalPrice + item['薪资'] totalPrice = totalPrice + item['薪资']
return totalPrice / total if total >0 else 0 return totalPrice / total if total > 0 else 0
# 获取每个区 单月薪资 # 获取每个区 单月薪资
def getTotalAvgPrice(): def getTotalAvgPrice():
totalAvgPriceList = [] totalAvgPriceList = []
totalAvgPriceDirList = [] totalAvgPriceDirList = []
for index, item in enumerate(xingzhengqu): for index, item in enumerate(xingzhengqu):
avg_price = getAvgPrice(item) avg_price = getAvgPrice(item)
totalAvgPriceList.append(round(avg_price,2)) totalAvgPriceList.append(round(avg_price, 2))
totalAvgPriceDirList.append({'value':round(avg_price,2),'name':item + "" + str(round(avg_price,2)) +''}) totalAvgPriceDirList.append(
{'value': round(avg_price, 2), 'name': item + "" + str(round(avg_price, 2)) + ''})
return totalAvgPriceDirList return totalAvgPriceDirList
data = getTotalAvgPrice() data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='1200px',height='1400px')) treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
treemap.add('广州各区每月薪资:万/月',data,label_opts=opts.LabelOpts(is_show=True, position='inside',font_size=13)) treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
treemap.render('images/job/广州各区每月薪资.html') treemap.render('images/job/广州各区每月薪资.html')
# 获取每个区 单日薪资 # 获取每个区 单日薪资
from pyecharts.charts import Bar from pyecharts.charts import Bar
totalAvgPriceList = [] totalAvgPriceList = []
for index,item in enumerate(xingzhengqu): for index, item in enumerate(xingzhengqu):
avg_price = getAvgPrice(item) avg_price = getAvgPrice(item)
totalAvgPriceList.append(round(avg_price*10000/30,2)) totalAvgPriceList.append(round(avg_price * 10000 / 30, 2))
attr, value = (xingzhengqu,totalAvgPriceList) attr, value = (xingzhengqu, totalAvgPriceList)
bar = Bar(init_opts=opts.InitOpts(width='1200px',height='1400px')) bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr) bar.add_xaxis(attr)
bar.add_yaxis("广州",value) bar.add_yaxis("广州", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate":"270"})) bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),
xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
bar.render('images/job/广州各区单日薪资.html') bar.render('images/job/广州各区单日薪资.html')
# 获取岗位数据 # 获取岗位数据
from pyecharts.charts import WordCloud from pyecharts.charts import WordCloud
def getRooms(): def getRooms():
results = job_list_DF.groupby('岗位').size().reset_index(name='count') results = job_list_DF.groupby('岗位').size().reset_index(name='count')
room_list = list(results.岗位.values) room_list = list(results.岗位.values)
weight_list = list(map(int,results['count'].values)) weight_list = list(map(int, results['count'].values))
return (room_list, weight_list) return (room_list, weight_list)
attr, value = getRooms() attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px',height='400px')) wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('',zip(attr,value),word_size_range=[2,100]) wordcloud.add('', zip(attr, value), word_size_range=[2, 100])
wordcloud.render('images/job/广州岗位数据.html') wordcloud.render('images/job/广州岗位数据.html')
# 获取各个区的岗位数量比重 # 获取各个区的岗位数量比重
from pyecharts.charts import Pie from pyecharts.charts import Pie
def getAreaWeight(): def getAreaWeight():
result = job_list_DF.groupby('位置').size().reset_index(name='count') result = job_list_DF.groupby('位置').size().reset_index(name='count')
areaName = list(result.位置.values) areaName = list(result.位置.values)
areaWeight = list(map(int,result['count'].values)) areaWeight = list(map(int, result['count'].values))
areaName_tmp = [] areaName_tmp = []
for index,item in enumerate(areaName): for index, item in enumerate(areaName):
areaName_tmp.append(item + str(round(areaWeight[index]/sum(areaWeight)*100,2))+'%') areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
return zip(areaName_tmp,areaWeight) return (areaName_tmp, areaWeight)
pie = Pie(init_opts=opts.InitOpts(width='1200px',height='1200px'))
pie.add('',getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
data = getAreaWeight()
pie.add("", [list(z) for z in zip(data[0], data[1])])
pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
pie.render('images/job/广州各区岗位数量分布.html') pie.render('images/job/广州各区岗位数量分布.html')

@ -1,8 +1,8 @@
import shutil
from gevent import monkey from gevent import monkey
from gevent.pool import Pool from gevent.pool import Pool
import util
monkey.patch_all(select=False) monkey.patch_all(select=False)
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@ -131,7 +131,7 @@ class JobSpider:
def run(self): def run(self):
if os.path.exists(self.job_dir): if os.path.exists(self.job_dir):
self.clearDir(self.job_dir) util.clearDir(self.job_dir)
else: else:
os.mkdir(self.job_dir) os.mkdir(self.job_dir)
@ -142,15 +142,6 @@ class JobSpider:
self.execute_more_tasks(self.post_require) self.execute_more_tasks(self.post_require)
self.desc_url_queue.join() # 主线程阻塞,等待队列清空 self.desc_url_queue.join() # 主线程阻塞,等待队列清空
def clearDir(self, rootdir):
filelist = os.listdir(rootdir)
for f in filelist:
filepath = os.path.join(rootdir, f)
if os.path.isfile(filepath):
os.remove(filepath)
print(filepath + " removed!")
elif os.path.isdir(filepath):
shutil.rmtree(filepath, True)
if __name__ == "__main__": if __name__ == "__main__":

@ -3,12 +3,27 @@ bs4==0.0.1
certifi==2020.6.20 certifi==2020.6.20
cffi==1.14.0 cffi==1.14.0
chardet==3.0.4 chardet==3.0.4
cycler==0.10.0
gevent==20.6.2 gevent==20.6.2
greenlet==0.4.16 greenlet==0.4.16
idna==2.10 idna==2.10
Jinja2==2.11.2
kiwisolver==1.2.0
lxml==4.5.1 lxml==4.5.1
MarkupSafe==1.1.1
matplotlib==3.2.2
numpy==1.19.0
pandas==1.0.5
Pillow==7.1.2
prettytable==0.7.2
pycparser==2.20 pycparser==2.20
pyecharts==1.8.1
pyparsing==2.4.7
python-dateutil==2.8.1
pytz==2020.1
requests==2.24.0 requests==2.24.0
simplejson==3.17.0
six==1.15.0
soupsieve==2.0.1 soupsieve==2.0.1
urllib3==1.25.9 urllib3==1.25.9
zope.event==4.4 zope.event==4.4

@ -0,0 +1,13 @@
import os
import shutil
def clearDir(rootdir):
    """Remove every entry inside ``rootdir`` while keeping ``rootdir`` itself.

    Regular files and symbolic links are unlinked and reported on stdout;
    sub-directories are removed recursively.

    Args:
        rootdir: Path of an existing directory whose contents are deleted.

    Raises:
        OSError: If ``rootdir`` cannot be listed (for example it does not
            exist) or a file/link cannot be removed.
    """
    for name in os.listdir(rootdir):
        filepath = os.path.join(rootdir, name)
        # Check for links first: a link to a directory satisfies isdir(), but
        # shutil.rmtree() refuses to work on symlinks, and a dangling link is
        # neither a file nor a directory. Unlinking removes only the link and
        # never touches whatever it points to.
        if os.path.islink(filepath) or os.path.isfile(filepath):
            os.remove(filepath)
            print(filepath + " removed!")
        elif os.path.isdir(filepath):
            # ignore_errors=True keeps sub-directory removal best-effort,
            # matching the previous behaviour.
            shutil.rmtree(filepath, True)
Loading…
Cancel
Save