commit dc9a49f50101177e5ce58c9305d49d43c3230bed Author: pan <1029559041@qq.com> Date: Mon Jun 29 21:25:13 2020 +0800 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c49df9a --- /dev/null +++ b/.gitignore @@ -0,0 +1,146 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+/images/both/
+/images/house/
+/images/job/
+/data/
+/.idea/
diff --git a/both_data_analysis1.py b/both_data_analysis1.py
new file mode 100644
index 0000000..b9a28e4
--- /dev/null
+++ b/both_data_analysis1.py
@@ -0,0 +1,98 @@
+import pandas as pd
+import re
+import pyecharts.options as opts
+from pyecharts.charts import Line, Bar
+
+# Pre-process the job listing data
+with open('data/岗位信息.txt', 'rb') as file:
+    job_list = []
+    while True:
+        line = file.readline()
+        if not line:
+            break
+        # each line is a dict literal written by the spider
+        line = eval(line.decode('utf-8'))
+        try:
+            line['位置'] = re.split('-', line['位置'])[1]
+            # salary strings look like "1-1.5万/月", "8千/月" or "150元/小时":
+            # pull out the unit characters and the numeric range separately
+            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
+            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
+            if len(xinzi) == 1:
+                xinzi.append(xinzi[0])
+            # normalise everything to 万 per month
+            if danwei[0][0] == '万' and danwei[1] == '月':
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
+            elif danwei[0][0] == '万' and danwei[1] == '年':
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
+            elif danwei[0] == '千' and danwei[1] == '月':
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
+            elif danwei[0] == '元' and danwei[1] == '小时':
+                # 元/hour -> 万/month, assuming 8 hours x 22 working days
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
+            else:
+                # skip postings whose salary format was not recognised
+                continue
+        except Exception:
+            continue
+        job_list.append(line)
+job_list_DF = pd.DataFrame(job_list)
+
+# Pre-process the rental listing data
+with open('data/房源信息.txt', 'rb') as file:
+    house_list = []
+    while True:
+        line = file.readline()
+        if not line:
+            break
+        line = eval(line.decode('utf-8'))
+        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
+        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
+        house_list.append(line)
+house_list_DF = pd.DataFrame(house_list)
+
+xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]
+
+# Average monthly rent per square metre for a district
+def houserGetAvgPrice(xingzhengqu):
+    totalPrice = 0
+    totalArea = 0
+    for item in house_list:
+        if item['行政区'] == xingzhengqu:
+            totalArea = totalArea + item['面积']
+            totalPrice = totalPrice + item['价格']
+    return totalPrice / totalArea if totalArea > 0 else 1
+
+# Daily rent per square metre for each district
+house_totalAvgPriceList = []
+for index, item in enumerate(xingzhengqu):
+    avg_price = houserGetAvgPrice(item)
+    house_totalAvgPriceList.append(round(avg_price / 30, 2))
+attr, house_value = (xingzhengqu, house_totalAvgPriceList)
+
+# Average monthly salary (in 万) for a district
+def jobGetAvgPrice(xingzhengqu):
+    totalPrice = 0
+    total = 0
+    for item in job_list:
+        if item['位置'] == xingzhengqu:
+            total = total + 1
+            totalPrice = totalPrice + item['薪资']
+    return totalPrice / total if total > 0 else 0
+
+# Hourly salary for each district
+job_totalAvgPriceList = []
+for index, item in enumerate(xingzhengqu):
+    avg_price = jobGetAvgPrice(item + '区')
+    job_totalAvgPriceList.append(round(avg_price * 10000 / 30 / 24, 2))
+attr, job_value = (xingzhengqu, job_totalAvgPriceList)
+
+# Guangzhou rent vs. salary line chart
+line = Line(init_opts=opts.InitOpts(width='800px', height='800px'))
+line.add_xaxis(xaxis_data=attr)
+line.add_yaxis('房租:元/日(1平方米)', house_value)
+line.add_yaxis("薪资:元/日", job_value)
+line.render('images/both/广州房租-薪资.html')
+
+difference = []
+for i in range(len(job_value)):
+    difference.append(round(job_value[i] - house_value[i], 2))
+# Guangzhou rent-salary gap chart
+bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
+bar.add_xaxis(attr)
+bar.add_yaxis("广州房租-薪资差距图:元", difference)
+bar.set_global_opts(title_opts=opts.TitleOpts(title='广州房租-薪资差距图:元'), xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
+bar.render('images/both/广州房租-薪资差距.html')
+
diff --git a/house_data_analysis1.py b/house_data_analysis1.py
new file mode 100644
index 0000000..58a160a
--- /dev/null
+++ b/house_data_analysis1.py
@@ -0,0 +1,91 @@
+import pandas as pd
+import pyecharts.options as opts
+from pyecharts.charts import Pie
+import re
+
+
+# Pre-process the rental listing data
+with open('data/房源信息.txt', 'rb') as file:
+    house_list = []
+    while True:
+        line = file.readline()
+        if not line:
+            break
+        line = eval(line.decode('utf-8'))
+        line['面积'] = int(re.findall(r'\d+', line['面积'])[0])
+        line['价格'] = int(re.findall(r'\d+', line['价格'])[0])
+        house_list.append(line)
+house_list_DF = pd.DataFrame(house_list)
+xingzhengqu = [item for item in set(house_list_DF.get(key='行政区')) if item]
+
+# Distribution of rental sizes
+bins = [-1, 30, 60, 90, 120, 200, 300, 400, 10000]
+attr = ['0-30平方米', '30-60平方米', '60-90平方米', '90-120平方米', '120-200平方米', '200-300平方米', '300-400平方米', '400+平方米']
+tmpDF = house_list_DF.groupby(pd.cut(house_list_DF['面积'], bins=bins, labels=attr)).size().reset_index(name='count')
+value = list(map(int, tmpDF['count'].values))
+pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
+pie.add('', zip(attr, value)).set_global_opts(title_opts=opts.TitleOpts(title='租房面积统计'))
+pie.render('images/house/广州租房面积统计.html')
+
+# Monthly rent per square metre in a district
+from pyecharts.charts import TreeMap
+def getAvgPrice(xingzhengqu):
+    totalPrice = 0
+    totalArea = 0
+    for item in house_list:
+        if item['行政区'] == xingzhengqu:
+            totalArea = totalArea + item['面积']
+            totalPrice = totalPrice + item['价格']
+    return totalPrice / totalArea if totalArea > 0 else 1
+# Monthly price per square metre for every district
+def getTotalAvgPrice():
+    totalAvgPriceList = []
+    totalAvgPriceDirList = []
+    for index, item in enumerate(xingzhengqu):
+        avg_price = getAvgPrice(item)
+        totalAvgPriceList.append(round(avg_price, 3))
+        totalAvgPriceDirList.append({'value': round(avg_price, 3), 'name': item + " ¥" + str(round(avg_price, 3))})
+    return totalAvgPriceDirList
+# Render the monthly price per square metre
+data = getTotalAvgPrice()
+treemap = TreeMap(init_opts=opts.InitOpts(width='900px', height='800px'))
+treemap.add('广州各区房租单价:平方米/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
+treemap.render('images/house/广州各区房租单价.html')
+
+# Daily price per square metre for every district
+from pyecharts.charts import Bar
+totalAvgPriceList = []
+for index, item in enumerate(xingzhengqu):
+    avg_price = getAvgPrice(item)
+    totalAvgPriceList.append(round(avg_price / 30, 3))
+attr, value = (xingzhengqu, totalAvgPriceList)
+bar = Bar(init_opts=opts.InitOpts(width='900px', height='800px'))
+bar.add_xaxis(attr)
+bar.add_yaxis("广州", value)
+bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区房租单价:平方米/日'))
+bar.render('images/house/广州每日每平方米的价格.html')
+
+# Word cloud of floor-plan types
+from pyecharts.charts import WordCloud
+def getRooms():
+    results = house_list_DF.groupby('房间').size().reset_index(name='count')
+    room_list = list(results.房间.values)
+    weight_list = list(map(int, results['count'].values))
+    return (room_list, weight_list)
+attr, value = getRooms()
+wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
+wordcloud.add('', zip(attr, value), word_size_range=[2, 100])
+wordcloud.render('images/house/广州户型数据.html')
+
+# Share of listings in each district
+from pyecharts.charts import Pie
+def getAreaWeight():
+    result = house_list_DF.groupby('行政区').size().reset_index(name='count')
+    areaName = list(result.行政区.values)
+    areaWeight = list(map(int, result['count'].values))
+    areaName_tmp = []
+    for index, item in enumerate(areaName):
+        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
+    return zip(areaName_tmp, areaWeight)
+pie = Pie(init_opts=opts.InitOpts(width='600px', height='400px'))
+pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州房源分布'))
+pie.render('images/house/广州房源分布.html')
\ No newline at end of file
diff --git a/house_spider.py b/house_spider.py
new file mode 100644
index 0000000..c8d4307
--- /dev/null
+++ b/house_spider.py
@@ -0,0 +1,141 @@
+import time
+import logging
+import requests
+from gevent import monkey
+from gevent.pool import Pool
+from queue import Queue
+import bs4
+import re
+
+# Patch the standard library so blocking I/O cooperates with gevent coroutines
+monkey.patch_all()
+
+
+def get_logger():
+    """
+    Create the logger instance
+    """
+    formatter = logging.Formatter("%(asctime)s - %(message)s")
+    logger = logging.getLogger("monitor")
+    logger.setLevel(LOG_LEVEL)
+
+    ch = logging.StreamHandler()
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+    return logger
+
+
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
+
+
+START_URL = ('https://gz.zu.ke.com/zufang/pg{}/')
+
+LOG_LEVEL = logging.INFO  # log level
+POOL_MAXSIZE = 8  # maximum size of the coroutine pool
+
+logger = get_logger()
+
+
+class HouseSpider:
+    """
+    Crawler for the rental-listing site
+    """
+
+    def __init__(self):
+        self.count = 1  # index of the listing currently being crawled
+        self.desc_url_queue = Queue()  # queue of detail-page urls
+        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool
+
+    # collect the listings
+    def job_spider(self):
+        """
+        Crawler entry point: walk the listing pages
+        """
+        urls = [START_URL.format(p) for p in range(1, 100)]
+        for url in urls:
+            logger.info("爬取第 {} 页".format(urls.index(url) + 1))
+            response = requests.get(url=url, headers=headers)
+            response.encoding = response.apparent_encoding
+            bs = bs4.BeautifulSoup(response.text, 'lxml')
+            house_list = bs.select('#content > div.content__article > div.content__list > div')
+            for item in house_list:
+                tmp = {}
+                try:
+                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])  # push the detail-page link onto the queue
+                    title = item.select('div > p.content__list--item--title.twoline > a')[0].string.replace('\n', '').replace(' ', '')
+                    tmp['标题'] = title
+                    xinzhengqu = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
+                    tmp['行政区'] = xinzhengqu
+                    xinzhengqu_level = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
+                    tmp['二级行政区'] = xinzhengqu_level
+                    location = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string
+                    tmp['地址'] = location
+                    detail_house = str(item.select('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
+                    detail_house_list = re.split('/', detail_house)[1:-1]
+                    miji = detail_house_list[0]
+                    tmp['面积'] = miji
+                    chaoxiang = detail_house_list[1]
+                    tmp['朝向'] = chaoxiang
+                    rooms = detail_house_list[2][:detail_house_list[2].index('<')]
+                    tmp['房间'] = rooms
+                    price = item.select('div > span > em')[0].string
+                    price_detail = re.findall('.*', str(item.select('div > span')[0]))[0].replace(' ', '')
+                    tmp['价格'] = price + price_detail
+                    with open('data/房源信息.txt', 'ab+') as f:
+                        f.write((str(tmp) + '\n').encode('utf-8'))
+                except Exception:
+                    continue
+        # log the queue length, i.e. how many detail-page urls were collected
+        logger.info("队列长度为 {} ".format(self.desc_url_queue.qsize()))
+
+    def post_require(self):
+        """
+        Crawl the description of each listing
+        """
+        while True:
+            # pull a url from the queue
+            url = self.desc_url_queue.get()
+            url_ = 'https://gz.zu.ke.com/' + url
+            response = requests.get(url=url_, headers=headers)
+            response.encoding = response.apparent_encoding
+            bs = bs4.BeautifulSoup(response.text, 'html.parser')
+            try:
+                if response.status_code == 200:
+                    logger.info("爬取第 {} 条房源详情".format(self.count))
+                    desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc']).replace('\n', '').replace(' ', '')
+                    with open('data/房源描述.txt', 'ab+') as f:
+                        f.write((str(desc) + '\n').encode('utf-8'))
+                    self.desc_url_queue.task_done()
+                    self.count += 1
+                else:
+                    self.desc_url_queue.put(url)
+                    continue
+            except Exception as e:
+                logger.error(e)
+                logger.warning(url)
+
+    def execute_more_tasks(self, target):
+        """
+        Hand the task function to the coroutine pool; parsing and storage could be
+        moved onto their own queues to squeeze out more throughput.
+
+        :param target: task function
+        """
+        for i in range(POOL_MAXSIZE):
+            self.pool.apply_async(target)
+
+    def run(self):
+        """
+        Crawl the data with the coroutine pool
+        """
+        self.job_spider()
+        self.execute_more_tasks(self.post_require)
+        self.desc_url_queue.join()  # block the main thread until the queue is drained
+
+
+if __name__ == "__main__":
+    spider = HouseSpider()
+    start = time.time()
+    spider.run()
+    logger.info("总耗时 {} 秒".format(time.time() - start))
+
diff --git a/job_data_analysis1.py b/job_data_analysis1.py
new file mode 100644
index 0000000..f2be492
--- /dev/null
+++ b/job_data_analysis1.py
@@ -0,0 +1,109 @@
+import pandas as pd
+import re
+
+# Pre-process the job listing data
+with open('data/岗位信息.txt', 'rb') as file:
+    job_list = []
+    while True:
+        line = file.readline()
+        if not line:
+            break
+        line = eval(line.decode('utf-8'))
+        try:
+            line['位置'] = re.split('-', line['位置'])[1]
+            danwei = re.findall(r'[\u4e00-\u9fa5]+', line['薪资'])
+            xinzi = re.findall(r'\d+.*\d', line['薪资'])[0].split('-')
+            if len(xinzi) == 1:
+                xinzi.append(xinzi[0])
+            # normalise everything to 万 per month
+            if danwei[0][0] == '万' and danwei[1] == '月':
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2, 2)
+            elif danwei[0][0] == '万' and danwei[1] == '年':
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 12, 2)
+            elif danwei[0] == '千' and danwei[1] == '月':
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) / 2 / 10, 2)
+            elif danwei[0] == '元' and danwei[1] == '小时':
+                # 元/hour -> 万/month, assuming 8 hours x 22 working days
+                line['薪资'] = round((float(xinzi[0]) + float(xinzi[1])) * 8 * 22 / 2 / 10000, 2)
+            else:
+                # skip postings whose salary format was not recognised
+                continue
+        except Exception:
+            continue
+        job_list.append(line)
+job_list_DF = pd.DataFrame(job_list)
+xingzhengqu = [item for item in set(job_list_DF.get(key='位置')) if item]
+
+# Job distribution across Guangzhou districts
+from pyecharts import options as opts
+from pyecharts.charts import Pie
+
+def getAreaWeight():
+    result = job_list_DF.groupby('位置').size().reset_index(name='count')
+    areaName = list(result.位置.values)
+    areaWeight = list(map(int, result['count'].values))
+    areaName_tmp = []
+    for index, item in enumerate(areaName):
+        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
+    return zip(areaName_tmp, areaWeight)
+pie = Pie(init_opts=opts.InitOpts(width='800px', height='800px'))
+pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位分布'))
+pie.render('images/job/广州各区岗位分布.html')
+
+# Average monthly salary in Guangzhou
+from pyecharts.charts import TreeMap
+def getAvgPrice(xingzhengqu):
+    totalPrice = 0
+    total = 0
+    for item in job_list:
+        if item['位置'] == xingzhengqu:
+            total = total + 1
+            totalPrice = totalPrice + item['薪资']
+    return totalPrice / total if total > 0 else 0
+# Monthly salary (万) for every district
+def getTotalAvgPrice():
+    totalAvgPriceList = []
+    totalAvgPriceDirList = []
+    for index, item in enumerate(xingzhengqu):
+        avg_price = getAvgPrice(item)
+        totalAvgPriceList.append(round(avg_price, 2))
+        totalAvgPriceDirList.append({'value': round(avg_price, 2), 'name': item + " ¥" + str(round(avg_price, 2)) + ' 万'})
+    return totalAvgPriceDirList
+data = getTotalAvgPrice()
+treemap = TreeMap(init_opts=opts.InitOpts(width='1200px', height='1400px'))
+treemap.add('广州各区每月薪资:万/月', data, label_opts=opts.LabelOpts(is_show=True, position='inside', font_size=13))
+treemap.render('images/job/广州各区每月薪资.html')
+
+# Daily salary for every district
+from pyecharts.charts import Bar
+totalAvgPriceList = []
+for index, item in enumerate(xingzhengqu):
+    avg_price = getAvgPrice(item)
+    totalAvgPriceList.append(round(avg_price * 10000 / 30, 2))
+attr, value = (xingzhengqu, totalAvgPriceList)
+bar = Bar(init_opts=opts.InitOpts(width='1200px', height='1400px'))
+bar.add_xaxis(attr)
+bar.add_yaxis("广州", value)
+bar.set_global_opts(title_opts=opts.TitleOpts(title='广州各区单日薪资:元/日'), xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": "270"}))
+bar.render('images/job/广州各区单日薪资.html')
+
+# Word cloud of job titles
+from pyecharts.charts import WordCloud
+def getRooms():
+    results = job_list_DF.groupby('岗位').size().reset_index(name='count')
+    room_list = list(results.岗位.values)
+    weight_list = list(map(int, results['count'].values))
+    return (room_list, weight_list)
+attr, value = getRooms()
+wordcloud = WordCloud(init_opts=opts.InitOpts(width='900px', height='400px'))
+wordcloud.add('', zip(attr, value), word_size_range=[2, 100])
+wordcloud.render('images/job/广州岗位数据.html')
+
+# Share of job postings in each district
+from pyecharts.charts import Pie
+def getAreaWeight():
+    result = job_list_DF.groupby('位置').size().reset_index(name='count')
+    areaName = list(result.位置.values)
+    areaWeight = list(map(int, result['count'].values))
+    areaName_tmp = []
+    for index, item in enumerate(areaName):
+        areaName_tmp.append(item + str(round(areaWeight[index] / sum(areaWeight) * 100, 2)) + '%')
+    return zip(areaName_tmp, areaWeight)
+pie = Pie(init_opts=opts.InitOpts(width='1200px', height='1200px'))
+pie.add('', getAreaWeight()).set_global_opts(title_opts=opts.TitleOpts(title='广州各区岗位数量分布'))
+pie.render('images/job/广州各区岗位数量分布.html')
\ No newline at end of file
diff --git a/job_spider.py b/job_spider.py
new file mode 100644
index 0000000..150cf43
--- /dev/null
+++ b/job_spider.py
@@ -0,0 +1,136 @@
+from gevent import monkey
+from gevent.pool import Pool
+
+monkey.patch_all(select=False)
+import time
+import os
+import logging
+import requests
+from queue import Queue
+from bs4 import BeautifulSoup
+
+# Patch the standard library so blocking I/O cooperates with gevent coroutines
+monkey.patch_all()
+
+
+def get_logger():
+    """
+    Create the logger instance
+    """
+    formatter = logging.Formatter("%(asctime)s - %(message)s")
+    logger = logging.getLogger("monitor")
+    logger.setLevel(LOG_LEVEL)
+    ch = logging.StreamHandler()
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+    return logger
+
+
+HEADERS = {
+    "X-Requested-With": "XMLHttpRequest",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
+    "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
+}
+
+START_URL = (
+    'https://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,{}.html?'
+    'lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
+    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&'
+    'fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
+)
+
+LOG_LEVEL = logging.INFO  # log level
+POOL_MAXSIZE = 8  # maximum size of the coroutine pool
+
+logger = get_logger()
+
+
+class JobSpider:
+    """
+    Crawler for the job-listing site
+    """
+
+    def __init__(self):
+        self.count = 1  # index of the posting currently being crawled
+        self.company = []
+        self.desc_url_queue = Queue()  # queue of detail-page urls
+        self.pool = Pool(POOL_MAXSIZE)  # coroutine pool
+
+    # collect the postings
+    def job_spider(self):
+        """
+        Crawler entry point: walk the listing pages
+        """
+        urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
+        for url in urls:
+            logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1))
+            html = requests.get(url, headers=HEADERS).content.decode("gbk")
+            bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
+            for b in bs:
+                try:
+                    href = b.find("a")["href"]
+                    self.desc_url_queue.put(href)  # push the posting's detail-page link onto the queue
+                except Exception:
+                    pass
+        # log the queue length, i.e. how many detail-page urls were collected
+        logger.info("队列长度为 {} ".format(self.desc_url_queue.qsize()))
+
+    def post_require(self):
+        """
+        Crawl the job descriptions
+        """
+        while True:
+            # pull a url from the queue
+            url = self.desc_url_queue.get()
+            resp = requests.get(url, headers=HEADERS)
+            if resp.status_code == 200:
+                logger.info("爬取第 {} 条岗位详情".format(self.count))
+                html = resp.content.decode("gbk")
+                self.desc_url_queue.task_done()
+                self.count += 1
+            else:
+                self.desc_url_queue.put(url)
+                continue
+            try:
+                bs_tmp = BeautifulSoup(html, 'lxml').select(
+                    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
+                bs_tmp1 = bs_tmp.select('h1')[0]
+                bs_tmp2 = bs_tmp.select('strong')[0]
+                bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
+                bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
+                with open('data/岗位信息.txt', 'ab+') as f:
+                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
+                           '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
+                    f.write((str(tmp) + '\n').encode('utf-8'))
+                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
+                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
+                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+                    f.write(s)
+            except Exception as e:
+                logger.error(e)
+                logger.warning(url)
+
+    def execute_more_tasks(self, target):
+        """
+        Hand the task function to the coroutine pool; parsing and storage could be
+        moved onto their own queues to squeeze out more throughput.
+
+        :param target: task function
+        """
+        for i in range(POOL_MAXSIZE):
+            self.pool.apply_async(target)
+
+    def run(self):
+        """
+        Crawl the data with the coroutine pool
+        """
+        self.job_spider()
+        self.execute_more_tasks(self.post_require)
+        self.desc_url_queue.join()  # block the main thread until the queue is drained
+
+
+if __name__ == "__main__":
+    spider = JobSpider()
+    start = time.time()
+    spider.run()
+    logger.info("总耗时 {} 秒".format(time.time() - start))
diff --git a/requirement.txt b/requirement.txt
new file mode 100644
index 0000000..cece435
--- /dev/null
+++ b/requirement.txt
@@ -0,0 +1,15 @@
+beautifulsoup4==4.9.1
+bs4==0.0.1
+certifi==2020.6.20
+cffi==1.14.0
+chardet==3.0.4
+gevent==20.6.2
+greenlet==0.4.16
+idna==2.10
+lxml==4.5.1
+pycparser==2.20
+requests==2.24.0
+soupsieve==2.0.1
+urllib3==1.25.9
+zope.event==4.4
+zope.interface==5.1.0
+# pandas and pyecharts are imported by the *_data_analysis scripts; versions are not pinned here
+pandas
+pyecharts
diff --git a/说明文档.docx b/说明文档.docx
new file mode 100644
index 0000000..6408f3d
Binary files /dev/null and b/说明文档.docx differ