From eeeebf8085111ba78fa642feb814e25798711f0d Mon Sep 17 00:00:00 2001 From: pan <1029559041@qq.com> Date: Tue, 30 Jun 2020 20:49:11 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=B2=97=E4=BD=8D=E5=9B=BE?= =?UTF-8?q?=E8=A1=A8=E7=94=9F=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- job_data_analysis1.py | 100 ++++++++++++++++++++++++++++-------------- job_spider.py | 15 ++----- requirement.txt | 15 +++++++ util.py | 13 ++++++ 4 files changed, 97 insertions(+), 46 deletions(-) create mode 100644 util.py diff --git a/job_data_analysis1.py b/job_data_analysis1.py index f2be492..c32ed33 100644 --- a/job_data_analysis1.py +++ b/job_data_analysis1.py @@ -1,8 +1,11 @@ import pandas as pd import re +import os # 数据预处理 -with open('data/岗位信息.txt','rb') as file: +import util + +with open('data/岗位信息.txt', 'rb') as file: job_list = [] while True: line = file.readline() @@ -10,19 +13,19 @@ with open('data/岗位信息.txt','rb') as file: break line = eval(line.decode('utf-8')) try: - line['位置'] = re.split('-',line['位置'])[1] - danwei = re.findall('[\u4e00-\u9fa5]+',line['薪资']) - xinzi = re.findall('\d+.*\d',line['薪资'])[0].split('-') + line['位置'] = re.split('-', line['位置'])[1] + danwei = re.findall('[\u4e00-\u9fa5]+', line['薪资']) + xinzi = re.findall('\d+.*\d', line['薪资'])[0].split('-') if not xinzi[1]: xinzi[1] = xinzi[0] if danwei[0][0] == '万' and danwei[1] == '月': - line['薪资'] = round((float(xinzi[0])+float(xinzi[1]))/2,2) + line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2) elif danwei[0][0] == '万' and danwei[1] == '年': - line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 /12, 2) + line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2) elif danwei[0] == '千' and danwei[1] == '月': line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2) elif danwei[0] == '元' and danwei[1:] == '小时': - line['薪资'] = round((float(xinzi[0]) + float(xinzi[1]))*8*22 / 2 / 100, 2) + line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 100, 2) except: continue job_list.append(line) @@ -33,20 +36,32 @@ xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item] from pyecharts import options as opts from pyecharts.charts import Pie + def getAreaWeight(): result = job_list_DF.groupby('位置').size().reset_index(name='count') areaName = list(result.位置.values) - areaWeight = list(map(int,result['count'].values)) + areaWeight = list(map(int, result['count'].values)) areaName_tmp = [] - for index,item in enumerate(areaName): - areaName_tmp.append(item + str(round(areaWeight[index]/sum(areaWeight)*100,2))+'%') - return zip(areaName_tmp,areaWeight) -pie = Pie(init_opts=opts.InitOpts(width='800px',height='800px')) -pie.add('',getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布')) + for index, item in enumerate(areaName): + areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%') + return (areaName_tmp, areaWeight) + + +pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px')) +data = getAreaWeight() +pie.add("", [list(z) for z in zip(data[0], data[1])]) +pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布')) +image_dir="images/job" +if os.path.exists(image_dir): + util.clearDir(image_dir) +else: + os.makedirs(image_dir) pie.render('images/job/广州各区岗位分布.html') # 求广州单月薪资 from pyecharts.charts import TreeMap + + def getAvgPrice(xingzhengqu): totalPrice = 0 total = 0 @@ -54,56 +69,73 @@ def getAvgPrice(xingzhengqu): if item['位置'] == xingzhengqu: total = total + 1 totalPrice = totalPrice + item['薪资'] - return totalPrice / total if total >0 else 0 + return totalPrice / total if total > 0 else 0 + + # 获取每个区 单月薪资 def getTotalAvgPrice(): totalAvgPriceList = [] totalAvgPriceDirList = [] for index, item in enumerate(xingzhengqu): avg_price = getAvgPrice(item) - totalAvgPriceList.append(round(avg_price,2)) - totalAvgPriceDirList.append({'value':round(avg_price,2),'name':item + " ¥" + str(round(avg_price,2)) +' 万'}) + totalAvgPriceList.append(round(avg_price, 2)) + totalAvgPriceDirList.append( + {'value': round(avg_price, 2), 'name': item + " ¥" + str(round(avg_price, 2)) + ' 万'}) return totalAvgPriceDirList + + data = getTotalAvgPrice() -treemap = TreeMap(init_opts=opts.InitOpts(width='1200px',height='1400px')) -treemap.add('广州各区每月薪资:万/月',data,label_opts=opts.LabelOpts(is_show=True, position='inside',font_size=13)) +treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px')) +treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13)) treemap.render('images/job/广州各区每月薪资.html') # 获取每个区 单日薪资 from pyecharts.charts import Bar + totalAvgPriceList = [] -for index,item in enumerate(xingzhengqu): +for index, item in enumerate(xingzhengqu): avg_price = getAvgPrice(item) - totalAvgPriceList.append(round(avg_price*10000/30,2)) -attr, value = (xingzhengqu,totalAvgPriceList) -bar = Bar(init_opts=opts.InitOpts(width='1200px',height='1400px')) + totalAvgPriceList.append(round(avg_price * 10000 / 30, 2)) +attr, value = (xingzhengqu, totalAvgPriceList) +bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px')) bar.add_xaxis(attr) -bar.add_yaxis("广州",value) -bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate":"270"})) +bar.add_yaxis("广州", value) +bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'), + xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"})) bar.render('images/job/广州各区单日薪资.html') # 获取岗位数据 from pyecharts.charts import WordCloud + + def getRooms(): results = job_list_DF.groupby('岗位').size().reset_index(name='count') room_list = list(results.岗位.values) - weight_list = list(map(int,results['count'].values)) + weight_list = list(map(int, results['count'].values)) return (room_list, weight_list) + + attr, value = getRooms() -wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px',height='400px')) -wordcloud.add('',zip(attr,value),word_size_range=[2,100]) +wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px')) +wordcloud.add('', zip(attr, value), word_size_range=[2, 100]) wordcloud.render('images/job/广州岗位数据.html') # 获取各个区的岗位数量比重 from pyecharts.charts import Pie + + def getAreaWeight(): result = job_list_DF.groupby('位置').size().reset_index(name='count') areaName = list(result.位置.values) - areaWeight = list(map(int,result['count'].values)) + areaWeight = list(map(int, result['count'].values)) areaName_tmp = [] - for index,item in enumerate(areaName): - areaName_tmp.append(item + str(round(areaWeight[index]/sum(areaWeight)*100,2))+'%') - return zip(areaName_tmp,areaWeight) -pie = Pie(init_opts=opts.InitOpts(width='1200px',height='1200px')) -pie.add('',getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布')) -pie.render('images/job/广州各区岗位数量分布.html') \ No newline at end of file + for index, item in enumerate(areaName): + areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%') + return (areaName_tmp, areaWeight) + + +pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px')) +data = getAreaWeight() +pie.add("", [list(z) for z in zip(data[0], data[1])]) +pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布')) +pie.render('images/job/广州各区岗位数量分布.html') diff --git a/job_spider.py b/job_spider.py index 684b772..046e4dd 100644 --- a/job_spider.py +++ b/job_spider.py @@ -1,8 +1,8 @@ -import shutil - from gevent import monkey from gevent.pool import Pool +import util + monkey.patch_all(select=False) from requests.adapters import HTTPAdapter @@ -131,7 +131,7 @@ class JobSpider: def run(self): if os.path.exists(self.job_dir): - self.clearDir(self.job_dir) + util.clearDir(self.job_dir) else: os.mkdir(self.job_dir) @@ -142,15 +142,6 @@ class JobSpider: self.execute_more_tasks(self.post_require) self.desc_url_queue.join() # 主线程阻塞,等待队列清空 - def clearDir(self, rootdir): - filelist = os.listdir(rootdir) - for f in filelist: - filepath = os.path.join(rootdir, f) - if os.path.isfile(filepath): - os.remove(filepath) - print(filepath + " removed!") - elif os.path.isdir(filepath): - shutil.rmtree(filepath, True) if __name__ == "__main__": diff --git a/requirement.txt b/requirement.txt index cece435..7a605c2 100644 --- a/requirement.txt +++ b/requirement.txt @@ -3,12 +3,27 @@ bs4==0.0.1 certifi==2020.6.20 cffi==1.14.0 chardet==3.0.4 +cycler==0.10.0 gevent==20.6.2 greenlet==0.4.16 idna==2.10 +Jinja2==2.11.2 +kiwisolver==1.2.0 lxml==4.5.1 +MarkupSafe==1.1.1 +matplotlib==3.2.2 +numpy==1.19.0 +pandas==1.0.5 +Pillow==7.1.2 +prettytable==0.7.2 pycparser==2.20 +pyecharts==1.8.1 +pyparsing==2.4.7 +python-dateutil==2.8.1 +pytz==2020.1 requests==2.24.0 +simplejson==3.17.0 +six==1.15.0 soupsieve==2.0.1 urllib3==1.25.9 zope.event==4.4 diff --git a/util.py b/util.py new file mode 100644 index 0000000..f8a09b8 --- /dev/null +++ b/util.py @@ -0,0 +1,13 @@ +import os +import shutil + + +def clearDir(rootdir): + filelist = os.listdir(rootdir) + for f in filelist: + filepath = os.path.join(rootdir, f) + if os.path.isfile(filepath): + os.remove(filepath) + print(filepath + " removed!") + elif os.path.isdir(filepath): + shutil.rmtree(filepath, True) \ No newline at end of file