From eeeebf8085111ba78fa642feb814e25798711f0d Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Tue, 30 Jun 2020 20:49:11 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=B2=97=E4=BD=8D=E5=9B=BE?=
 =?UTF-8?q?=E8=A1=A8=E7=94=9F=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 job_data_analysis1.py | 100 ++++++++++++++++++++++++++++--------------
 job_spider.py         |  15 ++-----
 requirement.txt       |  15 +++++++
 util.py               |  13 ++++++
 4 files changed, 97 insertions(+), 46 deletions(-)
 create mode 100644 util.py

diff --git a/job_data_analysis1.py b/job_data_analysis1.py
index f2be492..c32ed33 100644
--- a/job_data_analysis1.py
+++ b/job_data_analysis1.py
@@ -1,8 +1,11 @@
 import pandas as pd
 import re
+import os
 
 # 数据预处理
-with open('data/岗位信息.txt','rb') as file:
+import util
+
+with open('data/岗位信息.txt', 'rb') as file:
     job_list = []
     while True:
         line = file.readline()
@@ -10,19 +13,19 @@ with open('data/岗位信息.txt','rb') as file:
             break
         line = eval(line.decode('utf-8'))
         try:
-            line['位置'] = re.split('-',line['位置'])[1]
-            danwei = re.findall('[\u4e00-\u9fa5]+',line['薪资'])
-            xinzi = re.findall('\d+.*\d',line['薪资'])[0].split('-')
+            line['位置'] = re.split('-', line['位置'])[1]
+            danwei = re.findall('[\u4e00-\u9fa5]+', line['薪资'])
+            xinzi = re.findall('\d+.*\d', line['薪资'])[0].split('-')
             if not xinzi[1]:
                 xinzi[1] = xinzi[0]
             if danwei[0][0] == '万' and danwei[1] == '月':
-                line['薪资'] = round((float(xinzi[0])+float(xinzi[1]))/2,2)
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
             elif danwei[0][0] == '万' and danwei[1] == '年':
-                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 /12, 2)
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
             elif danwei[0] == '千' and danwei[1] == '月':
                 line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
             elif danwei[0] == '元' and danwei[1:] == '小时':
-                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1]))*8*22 / 2 / 100, 2)
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 100, 2)
         except:
             continue
         job_list.append(line)
@@ -33,20 +36,32 @@ xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]
 from pyecharts import options as opts
 from pyecharts.charts import Pie
 
+
 def getAreaWeight():
     result = job_list_DF.groupby('位置').size().reset_index(name='count')
     areaName = list(result.位置.values)
-    areaWeight = list(map(int,result['count'].values))
+    areaWeight = list(map(int, result['count'].values))
     areaName_tmp = []
-    for index,item in enumerate(areaName):
-        areaName_tmp.append(item + str(round(areaWeight[index]/sum(areaWeight)*100,2))+'%')
-    return zip(areaName_tmp,areaWeight)
-pie = Pie(init_opts=opts.InitOpts(width='800px',height='800px'))
-pie.add('',getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
+    total = sum(areaWeight)  # sum once instead of once per area
+    areaName_tmp.extend(n + str(round(w / total * 100, 2)) + '%' for n, w in zip(areaName, areaWeight))
+    return (areaName_tmp, areaWeight)  # (labels "<area><share>%", row count per area)
+
+
+pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
+data = getAreaWeight()
+pie.add("", [list(z) for z in zip(data[0], data[1])])
+pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
+image_dir = "images/job"  # every chart in this script is rendered into this directory
+if os.path.exists(image_dir):
+    util.clearDir(image_dir)  # empty the directory before the charts are rendered
+else:
+    os.makedirs(image_dir)
 pie.render('images/job/广州各区岗位分布.html')
 
 # 求广州单月薪资
 from pyecharts.charts import TreeMap
+
+
 def getAvgPrice(xingzhengqu):
     totalPrice = 0
     total = 0
@@ -54,56 +69,73 @@ def getAvgPrice(xingzhengqu):
         if item['位置'] == xingzhengqu:
             total = total + 1
             totalPrice = totalPrice + item['薪资']
-    return totalPrice / total if total >0 else 0
+    return totalPrice / total if total > 0 else 0
+
+
 # 获取每个区 单月薪资
 def getTotalAvgPrice():
     totalAvgPriceList = []
     totalAvgPriceDirList = []
     for index, item in enumerate(xingzhengqu):
         avg_price = getAvgPrice(item)
-        totalAvgPriceList.append(round(avg_price,2))
-        totalAvgPriceDirList.append({'value':round(avg_price,2),'name':item + " ￥" + str(round(avg_price,2)) +' 万'})
+        rounded = round(avg_price, 2)  # rounded once, reused for the list, the value and the label
+        totalAvgPriceList.append(rounded)
+        totalAvgPriceDirList.append({'value': rounded, 'name': item + " ￥" + str(rounded) + ' 万'})
     return totalAvgPriceDirList
+
+
 data = getTotalAvgPrice()
-treemap = TreeMap(init_opts=opts.InitOpts(width='1200px',height='1400px'))
-treemap.add('广州各区每月薪资:万/月',data,label_opts=opts.LabelOpts(is_show=True, position='inside',font_size=13))
+treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
+treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
 treemap.render('images/job/广州各区每月薪资.html')
 
 # 获取每个区 单日薪资
 from pyecharts.charts import Bar
+
 totalAvgPriceList = []
-for index,item in enumerate(xingzhengqu):
+for index, item in enumerate(xingzhengqu):
     avg_price = getAvgPrice(item)
-    totalAvgPriceList.append(round(avg_price*10000/30,2))
-attr, value = (xingzhengqu,totalAvgPriceList)
-bar = Bar(init_opts=opts.InitOpts(width='1200px',height='1400px'))
+    totalAvgPriceList.append(round(avg_price * 10000 / 30, 2))
+attr, value = (xingzhengqu, totalAvgPriceList)
+bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
 bar.add_xaxis(attr)
-bar.add_yaxis("广州",value)
-bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate":"270"}))
+bar.add_yaxis("广州", value)
+bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'),
+                    xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
 bar.render('images/job/广州各区单日薪资.html')
 
 # 获取岗位数据
 from pyecharts.charts import WordCloud
+
+
 def getRooms():
     results = job_list_DF.groupby('岗位').size().reset_index(name='count')
     room_list = list(results.岗位.values)
-    weight_list = list(map(int,results['count'].values))
+    weight_list = [int(c) for c in results['count'].values]  # plain ints, one per '岗位' group
     return (room_list, weight_list)
+
+
 attr, value = getRooms()
-wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px',height='400px'))
-wordcloud.add('',zip(attr,value),word_size_range=[2,100])
+wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
+wordcloud.add('', [list(z) for z in zip(attr, value)], word_size_range=[2, 100])  # concrete pairs, not a one-shot zip
 wordcloud.render('images/job/广州岗位数据.html')
 
 # 获取各个区的岗位数量比重
 from pyecharts.charts import Pie
+
+
 def getAreaWeight():
     result = job_list_DF.groupby('位置').size().reset_index(name='count')
     areaName = list(result.位置.values)
-    areaWeight = list(map(int,result['count'].values))
+    areaWeight = list(map(int, result['count'].values))
     areaName_tmp = []
-    for index,item in enumerate(areaName):
-        areaName_tmp.append(item + str(round(areaWeight[index]/sum(areaWeight)*100,2))+'%')
-    return zip(areaName_tmp,areaWeight)
-pie = Pie(init_opts=opts.InitOpts(width='1200px',height='1200px'))
-pie.add('',getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
-pie.render('images/job/广州各区岗位数量分布.html')
\ No newline at end of file
+    total = sum(areaWeight)  # sum once instead of once per area
+    areaName_tmp.extend(n + str(round(w / total * 100, 2)) + '%' for n, w in zip(areaName, areaWeight))
+    return (areaName_tmp, areaWeight)  # (labels "<area><share>%", row count per area)
+
+
+pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
+data = getAreaWeight()
+pie.add("", [list(z) for z in zip(data[0], data[1])])
+pie.set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
+pie.render('images/job/广州各区岗位数量分布.html')
diff --git a/job_spider.py b/job_spider.py
index 684b772..046e4dd 100644
--- a/job_spider.py
+++ b/job_spider.py
@@ -1,8 +1,8 @@
-import shutil
-
 from gevent import monkey
 from gevent.pool import Pool
 
+import util
+
 monkey.patch_all(select=False)
 
 from requests.adapters import HTTPAdapter
@@ -131,7 +131,7 @@ class JobSpider:
 
     def run(self):
         if os.path.exists(self.job_dir):
-            self.clearDir(self.job_dir)
+            util.clearDir(self.job_dir)
         else:
             os.mkdir(self.job_dir)
 
@@ -142,15 +142,6 @@ class JobSpider:
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # 主线程阻塞,等待队列清空
 
-    def clearDir(self, rootdir):
-        filelist = os.listdir(rootdir)
-        for f in filelist:
-            filepath = os.path.join(rootdir, f)
-            if os.path.isfile(filepath):
-                os.remove(filepath)
-                print(filepath + " removed!")
-            elif os.path.isdir(filepath):
-                shutil.rmtree(filepath, True)
 
 
 if __name__ == "__main__":
diff --git a/requirement.txt b/requirement.txt
index cece435..7a605c2 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -3,12 +3,27 @@ bs4==0.0.1
 certifi==2020.6.20
 cffi==1.14.0
 chardet==3.0.4
+cycler==0.10.0
 gevent==20.6.2
 greenlet==0.4.16
 idna==2.10
+Jinja2==2.11.2
+kiwisolver==1.2.0
 lxml==4.5.1
+MarkupSafe==1.1.1
+matplotlib==3.2.2
+numpy==1.19.0
+pandas==1.0.5
+Pillow==7.1.2
+prettytable==0.7.2
 pycparser==2.20
+pyecharts==1.8.1
+pyparsing==2.4.7
+python-dateutil==2.8.1
+pytz==2020.1
 requests==2.24.0
+simplejson==3.17.0
+six==1.15.0
 soupsieve==2.0.1
 urllib3==1.25.9
 zope.event==4.4
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..f8a09b8
--- /dev/null
+++ b/util.py
@@ -0,0 +1,13 @@
+import os
+import shutil
+
+
+def clearDir(rootdir):
+    """Empty rootdir: delete the files and sub-directories in it, keep rootdir itself."""
+    for entry in os.listdir(rootdir):
+        target = os.path.join(rootdir, entry)
+        if os.path.isdir(target):
+            shutil.rmtree(target, ignore_errors=True)  # best-effort: failures are swallowed
+        elif os.path.isfile(target):
+            os.remove(target)
+            print(f"{target} removed!")
\ No newline at end of file