+import pandas as pd
+import re
+import pyecharts.options as opts
+from pyecharts.charts import Line, Bar
# Job-posting data preprocessing
import ast  # used below instead of eval() to parse the stored records


def _monthly_salary_wan(salary_text):
    """Normalise a raw salary string to a mid-point monthly salary in 万 (10,000 yuan).

    Accepted shapes are "<low>-<high><unit>/<period>" or a single
    "<amount><unit>/<period>", where unit/period is 万/月, 万/年, 千/月 or
    元/小时 (hourly pay is scaled by 8 hours x 22 days per month).

    :param salary_text: salary text as stored in the data file, e.g. "1-1.5万/月".
    :return: monthly salary in 万 rounded to 2 decimals, or None when the
        unit/period combination is not one of the recognised ones.
    :raises IndexError: if the text has no number or fewer than two unit words.
    """
    units = re.findall('[\u4e00-\u9fa5]+', salary_text)
    amounts = [float(number) for number in re.findall(r'\d+(?:\.\d+)?', salary_text)]
    low = amounts[0]
    # A single figure ("2万/月") is treated as both the lower and upper bound.
    high = amounts[1] if len(amounts) > 1 else low
    midpoint = (low + high) / 2
    amount_unit, period = units[0], units[1]
    if amount_unit[0] == '万' and period == '月':
        return round(midpoint, 2)
    if amount_unit[0] == '万' and period == '年':
        return round(midpoint / 12, 2)
    if amount_unit == '千' and period == '月':
        return round(midpoint / 10, 2)
    if amount_unit == '元' and period == '小时':
        # yuan/hour -> yuan/month (8 h x 22 days) -> 万/month
        return round(midpoint * 8 * 22 / 10000, 2)
    return None


job_list = []
with open('data/岗位信息.txt', 'rb') as file:
    for raw in file:
        if not raw.strip():
            continue  # tolerate blank lines in the data file
        # Each line is the repr() of a dict; literal_eval parses it without
        # executing arbitrary code the way eval() would.
        record = ast.literal_eval(raw.decode('utf-8'))
        try:
            # "city-district" -> "district"; records without a district are dropped.
            record['位置'] = record['位置'].split('-')[1]
            salary = _monthly_salary_wan(record['薪资'])
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            continue
        if salary is None:
            # Unknown unit: keeping the raw string would break the numeric
            # averaging done later, so the record is skipped.
            continue
        record['薪资'] = salary
        job_list.append(record)
job_list_DF = pd.DataFrame(job_list)
# Housing-listing data preprocessing
import ast  # used below instead of eval() to parse the stored records

house_list = []
with open('data/房源信息.txt', 'rb') as file:
    for raw in file:
        if not raw.strip():
            continue  # tolerate blank lines in the data file
        # Each line is the repr() of a dict; literal_eval parses it without
        # executing arbitrary code the way eval() would.
        listing = ast.literal_eval(raw.decode('utf-8'))
        # Keep only the first run of digits of the area and price texts.
        listing['面积'] = int(re.findall(r'\d+', listing['面积'])[0])
        listing['价格'] = int(re.findall(r'\d+', listing['价格'])[0])
        house_list.append(listing)
house_list_DF = pd.DataFrame(house_list)
# Distinct, non-empty district names in first-seen order (a plain set would
# make the chart ordering differ from run to run).
xingzhengqu = [item for item in dict.fromkeys(house_list_DF.get(key='行政区')) if item]
# Price per unit of area for one district
def houserGetAvgPrice(xingzhengqu, houses=None):
    """Return total price divided by total area over one district's listings.

    :param xingzhengqu: district name compared against each listing's '行政区'.
    :param houses: iterable of listing dicts with '行政区', '面积' and '价格'
        keys; defaults to the module-level ``house_list``.
    :return: sum of '价格' / sum of '面积' for the matching listings, or 1 when
        the matching area is not positive (including when nothing matches).
    """
    listings = house_list if houses is None else houses
    matching = [entry for entry in listings if entry['行政区'] == xingzhengqu]
    total_area = sum(entry['面积'] for entry in matching)
    total_price = sum(entry['价格'] for entry in matching)
    # The fallback of 1 (not 0) is kept from the original implementation.
    return total_price / total_area if total_area > 0 else 1
# Daily rent per unit of area: each district's price-per-area divided by 30.
house_totalAvgPriceList = [
    round(houserGetAvgPrice(district) / 30, 2) for district in xingzhengqu
]
attr, house_value = xingzhengqu, house_totalAvgPriceList
# Average salary for one district
def jobGetAvgPrice(xingzhengqu, jobs=None):
    """Return the mean '薪资' of the postings located in one district.

    :param xingzhengqu: district name compared against each posting's '位置'.
    :param jobs: iterable of posting dicts with '位置' and numeric '薪资' keys;
        defaults to the module-level ``job_list``.
    :return: arithmetic mean of the matching salaries, or 0 when none match.
    """
    postings = job_list if jobs is None else jobs
    salaries = [entry['薪资'] for entry in postings if entry['位置'] == xingzhengqu]
    return sum(salaries) / len(salaries) if salaries else 0
# Per-district average salary scaled by 10000 / 30 / 24. House districts carry
# no '区' suffix, so it is appended before the job lookup.
job_totalAvgPriceList = [
    round(jobGetAvgPrice(district + '区') * 10000 / 30 / 24, 2)
    for district in xingzhengqu
]
attr, job_value = xingzhengqu, job_totalAvgPriceList
# Guangzhou rent vs. salary line chart
# NOTE(review): the salary series is divided by 24 (an hourly figure) but is
# labelled "元/日" and compared with a daily rent below - confirm which is meant.
line = Line(init_opts=opts.InitOpts(width='800px', height='800px'))
line.add_yaxis('房租:元/日(1平方米)', house_value)
line.add_yaxis("薪资:元/日", job_value)
# Salary minus rent, district by district (both lists follow xingzhengqu order).
difference = [round(salary - rent, 2) for salary, rent in zip(job_value, house_value)]
# Guangzhou rent/salary gap chart
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
+import pandas as pd
+import pyecharts.options as opts
+from pyecharts.charts import Pie
+import re
import ast  # used below instead of eval() to parse the stored records

# Load the scraped housing listings, one dict repr per line.
house_list = []
with open('data/房源信息.txt', 'rb') as file:
    for raw in file:
        if not raw.strip():
            continue  # tolerate blank lines in the data file
        # literal_eval parses the dict repr without executing arbitrary code
        # the way eval() would.
        listing = ast.literal_eval(raw.decode('utf-8'))
        # Keep only the first run of digits of the area and price texts.
        listing['面积'] = int(re.findall(r'\d+', listing['面积'])[0])
        listing['价格'] = int(re.findall(r'\d+', listing['价格'])[0])
        house_list.append(listing)
house_list_DF = pd.DataFrame(house_list)
# Distinct, non-empty district names in first-seen order (a plain set would
# make the chart ordering differ from run to run).
xingzhengqu = [item for item in dict.fromkeys(house_list_DF.get(key='行政区')) if item]
# Listing counts per floor-area band
bins = [-1, 30, 60, 90, 120, 200, 300, 400, 10000]
attr = ['0-30平方米', '30-60平方米', '60-90平方米', '90-120平方米', '120-200平方米', '200-300平方米', '300-400平方米', '400+平方米']
# observed=False keeps empty bands in the result so that `value` always lines
# up one-to-one with the labels in `attr`.
tmpDF = (
    house_list_DF
    .groupby(pd.cut(house_list_DF['面积'], bins=bins, labels=attr), observed=False)
    .size()
    .reset_index(name='count')
)
value = list(map(int, tmpDF['count'].values))
pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
+# 求每个区的每平方米的租房单价
+from pyecharts.charts import TreeMap
def getAvgPrice(xingzhengqu, houses=None):
    """Return total price divided by total area over one district's listings.

    :param xingzhengqu: district name compared against each listing's '行政区'.
    :param houses: iterable of listing dicts with '行政区', '面积' and '价格'
        keys; defaults to the module-level ``house_list``.
    :return: sum of '价格' / sum of '面积' for the matching listings, or 1 when
        the matching area is not positive (including when nothing matches).
    """
    listings = house_list if houses is None else houses
    matching = [entry for entry in listings if entry['行政区'] == xingzhengqu]
    total_area = sum(entry['面积'] for entry in matching)
    total_price = sum(entry['价格'] for entry in matching)
    # The fallback of 1 (not 0) is kept from the original implementation.
    return total_price / total_area if total_area > 0 else 1
# Price per unit of area for every district, as treemap entries
def getTotalAvgPrice():
    """Build one treemap entry per district in ``xingzhengqu``.

    :return: list of ``{'value': price, 'name': '<district> ¥<price>'}`` dicts,
        where price is ``getAvgPrice(district)`` rounded to 3 decimals.
    """
    entries = []
    for district in xingzhengqu:
        price = round(getAvgPrice(district), 3)
        entries.append({'value': price, 'name': district + " ¥" + str(price)})
    return entries
# Treemap of the price per unit of area for each district
data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='900px', height='800px'))
treemap.add('广州各区房租单价:平方米/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
# Daily price per unit of area for each district (price-per-area / 30)
from pyecharts.charts import Bar
totalAvgPriceList = [round(getAvgPrice(district) / 30, 3) for district in xingzhengqu]
attr, value = xingzhengqu, totalAvgPriceList
bar = Bar(init_opts=opts.InitOpts(width='900px', height='800px'))
+# 获取户型数据
+from pyecharts.charts import WordCloud
def getRooms(frame=None):
    """Count listings per distinct '房间' value.

    :param frame: DataFrame with a '房间' column; defaults to the module-level
        ``house_list_DF``.
    :return: ``(labels, counts)`` - two parallel lists ordered by label, with
        counts as plain Python ints.
    """
    source = house_list_DF if frame is None else frame
    counts = source.groupby('房间').size()
    return (counts.index.tolist(), counts.tolist())
+attr, value = getRooms() # labels and their counts, as returned by getRooms()
+wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px',height='400px')) # 900x400 px word-cloud canvas; no series is added in the visible code
+# 获取各个区的房源比重
+from pyecharts.charts import Pie
def getAreaWeight(frame=None):
    """Pair each district with its listing count, labelled with its share.

    :param frame: DataFrame with a '行政区' column; defaults to the
        module-level ``house_list_DF``.
    :return: a one-shot ``zip`` iterator of ``('<district><pct>%', count)``
        tuples ordered by district, where pct is the count's share of all
        rows in percent, rounded to 2 decimals.
    """
    source = house_list_DF if frame is None else frame
    counts = source.groupby('行政区').size()
    weights = counts.tolist()
    total = sum(weights)  # computed once instead of once per district
    labels = [
        name + str(round(weight / total * 100, 2)) + '%'
        for name, weight in zip(counts.index.tolist(), weights)
    ]
    return zip(labels, weights)
+pie = Pie(init_opts=opts.InitOpts(width='600px',height='400px')) # 600x400 px pie-chart canvas; no series is added in the visible code
\ No newline at end of file
+import time
+import logging
+import requests
+from gevent import monkey
+from gevent.pool import Pool
+from queue import Queue
+import bs4
+import re
+# 开启多线程
def get_logger(level=None):
    """Return the shared "monitor" logger, configured for console output.

    The stream handler is attached only the first time, so calling this more
    than once does not duplicate every log line.

    :param level: logging level to set; defaults to the module-level
        ``LOG_LEVEL``.
    :return: the ``logging.Logger`` named "monitor".
    """
    log = logging.getLogger("monitor")
    log.setLevel(LOG_LEVEL if level is None else level)
    if not log.handlers:
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
        log.addHandler(console)
    return log
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'} # request headers sent with every requests.get call below
+START_URL = ('{}/') # page-URL template filled with the page number; NOTE(review): the site prefix appears to be missing - confirm
+LOG_LEVEL = logging.INFO # log level
+POOL_MAXSIZE = 8 # maximum size of the worker pool
+logger = get_logger() # module-wide logger
+class HouseSpider:
+ """
+ Crawler for a housing-listing website.
+ NOTE(review): several statements in this class have lost their call target
+ (log calls, CSS-selector lookups) and some string contents; restore them from
+ the original file before running.
+ """
+ def __init__(self):
+ self.count = 1 # running number of the detail record currently being scraped
+ self.desc_url_queue = Queue() # queue of detail-page links shared with the pool workers
+ self.pool = Pool(POOL_MAXSIZE) # gevent pool, at most POOL_MAXSIZE concurrent workers
+ # Fetch listing information
+ def job_spider(self):
+ """
+ Crawler entry point: request pages 1-99 built from START_URL, append each
+ parsed listing dict to data/房源信息.txt and queue its detail link.
+ """
+ urls = [START_URL.format(p) for p in range(1, 100)]
+ for url in urls:
+"爬取第 {} 页".format(urls.index(url) + 1)) # NOTE(review): call target missing (presumably a logger call) - confirm
+ response = requests.get(url=url, headers=headers)
+ response.enconding = response.apparent_encoding # NOTE(review): "enconding" is misspelled, so this sets an unused attribute rather than response.encoding
+ bs = bs4.BeautifulSoup(response.text, 'lxml')
+ house_list ='#content > div.content__article > div.content__list > div') # NOTE(review): selector call target missing (presumably bs.select) - confirm
+ house = [] # not used anywhere in the visible code
+ for item in house_list:
+ tmp = {} # one listing record, written out as a dict repr
+ try:
+ self.desc_url_queue.put('a')[0].attrs['href']) # enqueue the listing's detail link
+ title ='div > p.content__list--item--title.twoline > a')[0].string.replace('\n','').replace(' ', '')
+ tmp['标题'] = title
+ xinzhengqu ='div > p.content__list--item--des > a:nth-child(1)')[0].string
+ tmp['行政区'] = xinzhengqu
+ xinzhengqu_level ='div > p.content__list--item--des > a:nth-child(2)')[0].string
+ tmp['二级行政区'] = xinzhengqu_level
+ location ='div > p.content__list--item--des > a:nth-child(3)')[0].string
+ tmp['地址'] = location
+ detail_house = str('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
+ detail_house_list = re.split('/', detail_house)[1:-1] # '/'-separated fields, first and last dropped
+ miji = detail_house_list[0]
+ tmp['面积'] = miji
+ chaoxiang = detail_house_list[1]
+ tmp['朝向'] = chaoxiang
+ rooms = detail_house_list[2][:detail_house_list[2].index('<')] # keep the text before the first '<'
+ tmp['房间'] = rooms
+ price ='div > span > em')[0].string
+ price_detail = re.findall('.*', str('div > span')[0]))[0].replace(' ', '').replace('', '').replace('', '') # NOTE(review): pattern and replace arguments look truncated - confirm
+ tmp['价格'] = price + price_detail
+ with open('data/房源信息.txt', 'ab+') as f: # append the record as one UTF-8 line
+ f.write((str(tmp) + '\n').encode('utf-8'))
+ except: # any failure while parsing or writing silently skips this listing
+ continue
+ # Log the queue length, i.e. how many detail URLs were collected
+"队列长度为 {} ".format(self.desc_url_queue.qsize())) # NOTE(review): call target missing (presumably a logger call) - confirm
+ def post_require(self):
+ """
+ Worker loop: take detail links from the queue, fetch each page and append
+ its description text to data/房源描述.txt. Never returns.
+ """
+ while True:
+ # Take a URL from the queue
+ url = self.desc_url_queue.get()
+ url_ = '' + url # NOTE(review): the prefix is an empty string; a site base URL presumably belongs here - confirm
+ response = requests.get(url=url_, headers=headers)
+ response.enconding = response.apparent_encoding # NOTE(review): "enconding" is misspelled, so this sets an unused attribute rather than response.encoding
+ bs = bs4.BeautifulSoup(response.text, 'html.parser')
+ try:
+ if response.status_code == 200:
+"爬取第 {} 条房源详情".format(self.count)) # NOTE(review): call target missing here and on the next statement's selector lookup - confirm
+ desc = str('#desc > p:nth-child(3)')[0]['data-desc']).replace('\n', '').replace(' ','').replace('
', '')
+ with open('data/房源描述.txt', 'ab+') as f:
+ f.write((str(desc) + '\n').encode('utf-8'))
+ self.desc_url_queue.task_done()
+ self.count += 1
+ else:
+ self.desc_url_queue.put(url) # NOTE(review): re-queued without task_done(), so each retry leaves one unfinished task and join() in run() may never return
+ continue
+ except Exception as e: # task_done() is also skipped on this path
+ logger.error(e)
+ logger.warning(url)
+ def execute_more_tasks(self, target):
+ """
+ Submit the task function to the pool POOL_MAXSIZE times.
+ :param target: task function, called with no arguments
+ """
+ for i in range(POOL_MAXSIZE):
+ self.pool.apply_async(target)
+ def run(self):
+ """
+ Crawl the listing pages, start the detail workers, then wait on the queue.
+ """
+ self.job_spider()
+ self.execute_more_tasks(self.post_require)
+ self.desc_url_queue.join() # block the caller until every queued item is marked done
+if __name__ == "__main__":
+ spider = HouseSpider()
+ start = time.time() # NOTE(review): no spider.run() call is visible between the two timestamps - confirm
+"总耗时 {} 秒".format(time.time() - start)) # NOTE(review): call target missing (presumably a logger call) - confirm
+import pandas as pd
+import re
# Data preprocessing
import ast  # used below instead of eval() to parse the stored records


def _monthly_salary_wan(salary_text):
    """Normalise a raw salary string to a mid-point monthly salary in 万 (10,000 yuan).

    Accepted shapes are "<low>-<high><unit>/<period>" or a single
    "<amount><unit>/<period>", where unit/period is 万/月, 万/年, 千/月 or
    元/小时 (hourly pay is scaled by 8 hours x 22 days per month).

    :param salary_text: salary text as stored in the data file, e.g. "1-1.5万/月".
    :return: monthly salary in 万 rounded to 2 decimals, or None when the
        unit/period combination is not one of the recognised ones.
    :raises IndexError: if the text has no number or fewer than two unit words.
    """
    units = re.findall('[\u4e00-\u9fa5]+', salary_text)
    amounts = [float(number) for number in re.findall(r'\d+(?:\.\d+)?', salary_text)]
    low = amounts[0]
    # A single figure ("2万/月") is treated as both the lower and upper bound.
    high = amounts[1] if len(amounts) > 1 else low
    midpoint = (low + high) / 2
    amount_unit, period = units[0], units[1]
    if amount_unit[0] == '万' and period == '月':
        return round(midpoint, 2)
    if amount_unit[0] == '万' and period == '年':
        return round(midpoint / 12, 2)
    if amount_unit == '千' and period == '月':
        return round(midpoint / 10, 2)
    if amount_unit == '元' and period == '小时':
        # yuan/hour -> yuan/month (8 h x 22 days) -> 万/month
        return round(midpoint * 8 * 22 / 10000, 2)
    return None


job_list = []
with open('data/岗位信息.txt', 'rb') as file:
    for raw in file:
        if not raw.strip():
            continue  # tolerate blank lines in the data file
        # Each line is the repr() of a dict; literal_eval parses it without
        # executing arbitrary code the way eval() would.
        record = ast.literal_eval(raw.decode('utf-8'))
        try:
            # "city-district" -> "district"; records without a district are dropped.
            record['位置'] = record['位置'].split('-')[1]
            salary = _monthly_salary_wan(record['薪资'])
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            continue
        if salary is None:
            # Unknown unit: keeping the raw string would break the numeric
            # averaging done later, so the record is skipped.
            continue
        record['薪资'] = salary
        job_list.append(record)
job_list_DF = pd.DataFrame(job_list)
# Distinct, non-empty district names in first-seen order (a plain set would
# make the chart ordering differ from run to run).
xingzhengqu = [item for item in dict.fromkeys(job_list_DF.get(key='位置')) if item]
+# 广州各区岗位分布
+from pyecharts import options as opts
+from pyecharts.charts import Pie
def getAreaWeight(frame=None):
    """Pair each district with its posting count, labelled with its share.

    :param frame: DataFrame with a '位置' column; defaults to the module-level
        ``job_list_DF``.
    :return: a one-shot ``zip`` iterator of ``('<district><pct>%', count)``
        tuples ordered by district, where pct is the count's share of all
        rows in percent, rounded to 2 decimals.
    """
    source = job_list_DF if frame is None else frame
    counts = source.groupby('位置').size()
    weights = counts.tolist()
    total = sum(weights)  # computed once instead of once per district
    labels = [
        name + str(round(weight / total * 100, 2)) + '%'
        for name, weight in zip(counts.index.tolist(), weights)
    ]
    return zip(labels, weights)
+pie = Pie(init_opts=opts.InitOpts(width='800px',height='800px')) # 800x800 px pie-chart canvas; no series is added in the visible code
+# 求广州单月薪资
+from pyecharts.charts import TreeMap
def getAvgPrice(xingzhengqu, jobs=None):
    """Return the mean '薪资' of the postings located in one district.

    :param xingzhengqu: district name compared against each posting's '位置'.
    :param jobs: iterable of posting dicts with '位置' and numeric '薪资' keys;
        defaults to the module-level ``job_list``.
    :return: arithmetic mean of the matching salaries, or 0 when none match.
    """
    postings = job_list if jobs is None else jobs
    salaries = [entry['薪资'] for entry in postings if entry['位置'] == xingzhengqu]
    return sum(salaries) / len(salaries) if salaries else 0
# Average salary for every district, as treemap entries
def getTotalAvgPrice():
    """Build one treemap entry per district in ``xingzhengqu``.

    :return: list of ``{'value': salary, 'name': '<district> ¥<salary> 万'}``
        dicts, where salary is ``getAvgPrice(district)`` rounded to 2 decimals.
    """
    entries = []
    for district in xingzhengqu:
        salary = round(getAvgPrice(district), 2)
        entries.append({'value': salary, 'name': district + " ¥" + str(salary) + ' 万'})
    return entries
# Treemap of the average salary for each district
data = getTotalAvgPrice()
treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
# Daily salary for each district: average x 10000 / 30
from pyecharts.charts import Bar
totalAvgPriceList = [round(getAvgPrice(district) * 10000 / 30, 2) for district in xingzhengqu]
attr, value = xingzhengqu, totalAvgPriceList
bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
+# 获取岗位数据
+from pyecharts.charts import WordCloud
def getRooms(frame=None):
    """Count postings per distinct '岗位' value.

    :param frame: DataFrame with a '岗位' column; defaults to the module-level
        ``job_list_DF``.
    :return: ``(labels, counts)`` - two parallel lists ordered by label, with
        counts as plain Python ints.
    """
    source = job_list_DF if frame is None else frame
    counts = source.groupby('岗位').size()
    return (counts.index.tolist(), counts.tolist())
+attr, value = getRooms() # labels and their counts, as returned by getRooms()
+wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px',height='400px')) # 900x400 px word-cloud canvas; no series is added in the visible code
+# 获取各个区的岗位数量比重
+from pyecharts.charts import Pie
def getAreaWeight(frame=None):
    """Pair each district with its posting count, labelled with its share.

    :param frame: DataFrame with a '位置' column; defaults to the module-level
        ``job_list_DF``.
    :return: a one-shot ``zip`` iterator of ``('<district><pct>%', count)``
        tuples ordered by district, where pct is the count's share of all
        rows in percent, rounded to 2 decimals.
    """
    source = job_list_DF if frame is None else frame
    counts = source.groupby('位置').size()
    weights = counts.tolist()
    total = sum(weights)  # computed once instead of once per district
    labels = [
        name + str(round(weight / total * 100, 2)) + '%'
        for name, weight in zip(counts.index.tolist(), weights)
    ]
    return zip(labels, weights)
+pie = Pie(init_opts=opts.InitOpts(width='1200px',height='1200px')) # 1200x1200 px pie-chart canvas; no series is added in the visible code
\ No newline at end of file
+from gevent import monkey
+from gevent.pool import Pool
+import time
+import os
+import logging
+import requests
+from queue import Queue
+from bs4 import BeautifulSoup
+# 开启多线程
def get_logger(level=None):
    """Return the shared "monitor" logger, configured for console output.

    The stream handler is attached only the first time, so calling this more
    than once does not duplicate every log line.

    :param level: logging level to set; defaults to the module-level
        ``LOG_LEVEL``.
    :return: the ``logging.Logger`` named "monitor".
    """
    log = logging.getLogger("monitor")
    log.setLevel(LOG_LEVEL if level is None else level)
    if not log.handlers:
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
        log.addHandler(console)
    return log
+ "X-Requested-With": "XMLHttpRequest", # NOTE(review): dict entries with no opening line; presumably the body of the HEADERS mapping used below - confirm
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
+ "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
+ ',000000,0000,00,9,99,%2520,2,{}.html?' # NOTE(review): string fragments with no assignment; presumably the START_URL template, whose {} takes the page number - confirm
+ 'lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&' # NOTE(review): "°reefrom" looks like a mis-decoded "&degreefrom" - confirm
+ 'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
+ 'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
+LOG_LEVEL = logging.INFO # log level
+POOL_MAXSIZE = 8 # maximum size of the worker pool
+logger = get_logger() # module-wide logger
+class JobSpider:
+ """
+ Crawler for a job-posting website.
+ NOTE(review): several statements in this class have lost their call target
+ (log calls, CSS-selector lookups) or assignment target; restore them from the
+ original file before running.
+ """
+ def __init__(self):
+ self.count = 1 # running number of the detail record currently being scraped
+ = [] # NOTE(review): assignment target missing - confirm
+ self.desc_url_queue = Queue() # queue of detail-page links shared with the pool workers
+ self.pool = Pool(POOL_MAXSIZE) # gevent pool, at most POOL_MAXSIZE concurrent workers
+ # Fetch posting information
+ def job_spider(self):
+ """
+ Crawler entry point: request pages 1-199 built from START_URL and queue the
+ first link found in every "el" row of the "dw_table" container.
+ """
+ urls = [START_URL.format(p) for p in range(1, 200)] # #resultList > div:nth-child(53)
+ for url in urls:
+"爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1)) # NOTE(review): call target missing (presumably a logger call) - confirm
+ html = requests.get(url, headers=HEADERS).content.decode("gbk") # page bytes are decoded as GBK
+ bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
+ for b in bs:
+ try:
+ href = b.find("a")["href"]
+ self.desc_url_queue.put(href) # enqueue the posting's detail link
+ except Exception: # rows without a usable link are skipped silently
+ pass
+ # Log the queue length, i.e. how many detail URLs were collected
+"队列长度为 {} ".format(self.desc_url_queue.qsize())) # NOTE(review): call target missing (presumably a logger call) - confirm
+ def post_require(self):
+ """
+ Worker loop: take detail links from the queue, fetch each page, append the
+ structured record to data/岗位信息.txt and the cleaned description text to
+ data/岗位描述.txt. Never returns.
+ """
+ while True:
+ # Take a URL from the queue
+ url = self.desc_url_queue.get()
+ resp = requests.get(url, headers=HEADERS)
+ if resp.status_code == 200:
+"爬取第 {} 条岗位详情".format(self.count)) # NOTE(review): call target missing (presumably a logger call) - confirm
+ html = resp.content.decode("gbk")
+ self.desc_url_queue.task_done() # marked done before parsing, so parse failures below still count as finished
+ self.count += 1
+ else:
+ self.desc_url_queue.put(url) # NOTE(review): re-queued without task_done(), so each retry leaves one unfinished task and join() in run() may never return
+ continue
+ try:
+ bs_tmp = BeautifulSoup(html, 'lxml').select(
+ 'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div >')[0]
+ bs_tmp1 ='h1')[0] # NOTE(review): selector call target missing on this and the next three lines - confirm
+ bs_tmp2 ='strong')[0]
+ bs_tmp3 ='p.cname > a.catn')[0]
+ bs_tmp4 =' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|') # '|'-separated fields with non-breaking spaces removed
+ with open('data/岗位信息.txt', 'ab+') as f: # append the record as one UTF-8 line
+ tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
+ '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
+ f.write((str(tmp) + '\n').encode('utf-8'))
+ bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
+ s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip() # drop the listed words and tabs from the description
+ with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+ f.write(s) # appended without a trailing newline
+ except Exception as e:
+ logger.error(e)
+ logger.warning(url)
+ def execute_more_tasks(self, target):
+ """
+ Submit the task function to the pool POOL_MAXSIZE times.
+ :param target: task function, called with no arguments
+ """
+ for i in range(POOL_MAXSIZE):
+ self.pool.apply_async(target)
+ def run(self):
+ """
+ Crawl the result pages, start the detail workers, then wait on the queue.
+ """
+ self.job_spider()
+ self.execute_more_tasks(self.post_require)
+ self.desc_url_queue.join() # block the caller until every queued item is marked done
+if __name__ == "__main__":
+ spider = JobSpider()
+ start = time.time() # NOTE(review): no spider.run() call is visible between the two timestamps - confirm
+"总耗时 {} 秒".format(time.time() - start)) # NOTE(review): call target missing (presumably a logger call) - confirm
