You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
house-job/job_data_analysis1.py

109 lines
4.7 KiB

5 years ago
import pandas as pd
import re
# Data preprocessing: load the scraped postings, keep only the district part of
# the location, and normalise every salary to an average monthly figure in
# units of 10,000 yuan (the unit the charts below label as '万/月').
def _parse_monthly_salary(salary_text):
    """Return the mean monthly salary, in 10k yuan, for a raw salary string.

    The string is expected to look like ``<low>-<high><unit>/<period>``.  A
    single figure with no ``-`` is used as both bounds.  The figure pattern
    needs at least two digits, so a lone one-digit figure still fails.

    Returns:
        The mean of the two bounds converted to 10k yuan per month and
        rounded to two decimals, or ``None`` when the unit/period combination
        is not one of the recognised ones.

    Raises:
        IndexError: no figure, or fewer than two unit words, were found.
        ValueError: a bound is not a valid number.
        TypeError: ``salary_text`` is not a string.
    """
    units = re.findall('[\u4e00-\u9fa5]+', salary_text)
    bounds = re.findall(r'\d+.*\d', salary_text)[0].split('-')
    # Previously a salary without '-' raised IndexError on bounds[1], so the
    # "missing upper bound" fallback could never take effect.
    if len(bounds) < 2 or not bounds[1]:
        bounds = [bounds[0], bounds[0]]
    mean = (float(bounds[0]) + float(bounds[1])) / 2
    amount_unit, period = units[0], units[1]

    # NOTE(review): in the reviewed copy these unit literals were empty
    # strings, which made every branch unreachable.  They are restored here
    # from the conversion factors (/12 => per year, /10 => thousands,
    # 8*22 => hourly); confirm against the real data file.
    if amount_unit[0] == '万' and period == '月':
        monthly = mean
    elif amount_unit[0] == '万' and period == '年':
        monthly = mean / 12
    elif amount_unit == '千' and period == '月':
        monthly = mean / 10
    elif amount_unit == '元' and period == '小时':
        # 8 * 22 hours per month (presumably 8 h/day, 22 days/month); dividing
        # by 10000 converts yuan to 10k yuan.  The original divided by 100 and
        # compared a list slice with a string, so this branch never ran.
        monthly = mean * 8 * 22 / 10000
    else:
        return None
    return round(monthly, 2)


job_list = []
with open('data/岗位信息.txt', 'rb') as file:
    for raw_line in file:
        # SECURITY NOTE: eval() executes arbitrary code found in the data
        # file.  If every line is a plain dict literal, ast.literal_eval is a
        # safe drop-in; kept as eval here to avoid changing what is accepted.
        line = eval(raw_line.decode('utf-8'))
        try:
            district = re.split('-', line['位置'])[1]
            salary = _parse_monthly_salary(line['薪资'])
        except (KeyError, IndexError, TypeError, ValueError):
            # Malformed or incomplete record: skip it, as before.
            continue
        if salary is None:
            # Unrecognised unit: skip rather than keep a raw salary string
            # that would break the numeric averaging further down.
            continue
        line['位置'] = district
        line['薪资'] = salary
        job_list.append(line)
job_list_DF = pd.DataFrame(job_list)
# Distinct, non-empty district names.
xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]
# Distribution of job postings across Guangzhou districts
from pyecharts import options as opts
from pyecharts.charts import Pie
def getAreaWeight():
    """Build the (label, count) pairs for the per-district pie chart.

    Rows of ``job_list_DF`` are counted per value of the ``位置`` column.
    Each label is the district name immediately followed by its share of all
    rows as a percentage rounded to two decimals, e.g. ``"<name>12.5%"``.

    Returns:
        A list of ``(label, count)`` tuples, one per district, in the order
        produced by the group-by.
    """
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    names = list(result['位置'].values)
    counts = list(map(int, result['count'].values))
    # Sum once instead of once per district.
    total = sum(counts)
    labels = [
        name + str(round(count / total * 100, 2)) + '%'
        for name, count in zip(names, counts)
    ]
    # A list rather than a bare zip: a zip iterator is exhausted after a
    # single pass, while a list can be iterated repeatedly by the chart.
    return list(zip(labels, counts))
# Draw the per-district counts as an 800x800 pie chart and write it to HTML.
pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
pie.add('', getAreaWeight())
pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
pie.render('images/job/广州各区岗位分布.html')
# Average monthly salary in Guangzhou
from pyecharts.charts import TreeMap
def getAvgPrice(xingzhengqu):
    """Return the mean ``薪资`` of the entries in ``job_list`` for one district.

    Args:
        xingzhengqu: district name to match against each entry's ``位置``.

    Returns:
        The sum of the matching salaries divided by their count, or ``0``
        when no entry matches.
    """
    matched = 0
    salary_sum = 0
    for job in job_list:
        if job['位置'] != xingzhengqu:
            continue
        matched += 1
        salary_sum += job['薪资']
    if matched > 0:
        return salary_sum / matched
    return 0
# Average monthly salary for each district
def getTotalAvgPrice():
    """Build the tree-map entries holding each district's average salary.

    For every district in ``xingzhengqu`` the average returned by
    ``getAvgPrice`` is rounded to two decimals.

    Returns:
        A list of ``{'value': avg, 'name': label}`` dicts, one per district,
        where ``label`` is the district name followed by the rounded average.
    """
    totalAvgPriceDirList = []
    for district in xingzhengqu:
        # Round once and reuse for both the value and the label.
        avg_price = round(getAvgPrice(district), 2)
        # NOTE(review): the two empty literals below concatenate nothing; they
        # look like separator/unit text that was lost - confirm and restore.
        totalAvgPriceDirList.append(
            {'value': avg_price, 'name': district + "" + str(avg_price) + ''}
        )
    return totalAvgPriceDirList
# Draw the per-district average monthly salary as a tree map with the labels
# shown inside each tile, then write the chart to HTML.
data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
treemap.add(
    '广州各区每月薪资:万/月',
    data,
    label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13),
)
treemap.render('images/job/广州各区每月薪资.html')
# Average daily salary for each district
from pyecharts.charts import Bar
# Convert each district's average monthly salary from 10k yuan to yuan per
# day (x 10000, / 30) and plot the result as a bar chart.
totalAvgPriceList = [
    round(getAvgPrice(district) * 10000 / 30, 2) for district in xingzhengqu
]
attr, value = xingzhengqu, totalAvgPriceList
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
bar.add_xaxis(attr)
bar.add_yaxis("广州", value)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),
    # Rotate the district labels so they do not overlap on the x axis.
    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}),
)
bar.render('images/job/广州各区单日薪资.html')
# Job title data
from pyecharts.charts import WordCloud
def getRooms():
    """Count the rows of ``job_list_DF`` per value of the ``岗位`` column.

    Returns:
        A ``(titles, counts)`` tuple of two parallel lists: the distinct
        ``岗位`` values and, for each one, the number of rows carrying it
        (as ``int``).
    """
    per_title = job_list_DF.groupby('岗位').size()
    table = per_title.reset_index(name='count')
    titles = list(table['岗位'].values)
    counts = [int(n) for n in table['count'].values]
    return titles, counts
# Draw a word cloud in which each job title is weighted by its row count,
# then write the chart to HTML.
attr, value = getRooms()
wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
wordcloud.add('', zip(attr, value), word_size_range=[2, 100])
wordcloud.render('images/job/广州岗位数据.html')
# Share of job postings in each district
from pyecharts.charts import Pie
def getAreaWeight():
    """Build the (label, count) pairs for the per-district pie chart.

    This redefines the function of the same name declared earlier in the
    script. Rows of ``job_list_DF`` are counted per value of the ``位置``
    column. Each label is the district name immediately followed by its share
    of all rows as a percentage rounded to two decimals, e.g.
    ``"<name>12.5%"``.

    Returns:
        A list of ``(label, count)`` tuples, one per district, in the order
        produced by the group-by.
    """
    result = job_list_DF.groupby('位置').size().reset_index(name='count')
    names = list(result['位置'].values)
    counts = list(map(int, result['count'].values))
    # Sum once instead of once per district.
    total = sum(counts)
    labels = [
        name + str(round(count / total * 100, 2)) + '%'
        for name, count in zip(names, counts)
    ]
    # A list rather than a bare zip: a zip iterator is exhausted after a
    # single pass, while a list can be iterated repeatedly by the chart.
    return list(zip(labels, counts))
# Draw the per-district counts as a 1200x1200 pie chart and write it to HTML.
pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
pie.add('', getAreaWeight())
pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
pie.render('images/job/广州各区岗位数量分布.html')