You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
house-job/house_spider.py

141 lines
5.4 KiB

from gevent import monkey

# Apply gevent's monkey patches before importing anything that touches
# sockets/ssl/threading (requests pulls in ssl). gevent documents that
# patching after those modules are imported can lead to errors, so this call
# must stay above the remaining imports.
monkey.patch_all()

import logging
import re
import time
from queue import Queue

import bs4
import requests
from gevent.pool import Pool
def get_logger():
    """
    Return the shared "monitor" logger, configuring it on first use.

    The logger level is set to the module-level LOG_LEVEL and records are
    emitted through a StreamHandler formatted as "<asctime> - <message>".

    :return: the ``logging.Logger`` registered under the name "monitor"
    """
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    # logging.getLogger returns the same object for the same name, so attach
    # the handler only once; adding one per call would make every record be
    # printed once per previous call.
    if not logger.handlers:
        ch = logging.StreamHandler()
        ch.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
        logger.addHandler(ch)
    return logger
# Log level applied to the "monitor" logger.
LOG_LEVEL = logging.INFO
# Maximum number of concurrent workers in the pool.
POOL_MAXSIZE = 8
# Listing-page URL template; "{}" is filled with the page number.
START_URL = 'https://gz.zu.ke.com/zufang/pg{}/'
# Request headers sent with every HTTP call (desktop browser User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    ),
}
# Module-wide logger; created after LOG_LEVEL because get_logger reads it.
logger = get_logger()
class HouseSpider:
    """
    Crawler for the rental listing pages under START_URL.

    ``job_spider`` walks the paginated listing pages, appends one record per
    listing to ``data/房源信息.txt`` and queues each listing's detail link.
    ``post_require`` is the worker loop that drains that queue and appends
    each listing description to ``data/房源描述.txt``.
    """

    # Seconds to wait for a response before giving up on a request, so a
    # stalled connection cannot block a worker (and therefore ``run``) forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.count = 1  # 1-based number of the detail page being scraped
        self.desc_url_queue = Queue()  # detail-page links waiting to be fetched
        self.pool = Pool(POOL_MAXSIZE)  # pool capping the number of concurrent workers

    @staticmethod
    def _parse_item(item):
        """
        Extract the fields of one listing element into a dict.

        :param item: one element selected from the listing container
        :return: dict of listing fields, in the order they are written out
        :raises IndexError, AttributeError, ValueError, TypeError: when the
            element does not have the expected listing structure
        """
        record = {}
        title = item.select('div > p.content__list--item--title.twoline > a')[0].string
        record['标题'] = title.replace('\n', '').replace(' ', '')
        record['行政区'] = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
        record['二级行政区'] = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
        record['地址'] = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string

        # The description paragraph separates its fields with "<i>/</i>";
        # the first and last pieces are dropped, the rest are taken in order.
        detail_house = str(item.select('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
        detail_house_list = re.split('<i>/</i>', detail_house)[1:-1]
        record['面积'] = detail_house_list[0]
        record['朝向'] = detail_house_list[1]
        # Keep only the text that precedes the first tag in the third piece.
        record['房间'] = detail_house_list[2][:detail_house_list[2].index('<')]

        price = item.select('div > span > em')[0].string
        # Whatever follows the closing </em> inside the span is appended to the price.
        price_detail = re.findall('</em>.*', str(item.select('div > span')[0]))[0]
        price_detail = price_detail.replace(' ', '').replace('</em>', '').replace('</span>', '')
        record['价格'] = price + price_detail
        return record

    def job_spider(self):
        """
        Crawl listing pages 1-99, store each listing and queue its detail link.

        A page that cannot be downloaded is logged and skipped. A listing
        element that cannot be parsed or stored is logged and skipped; its
        detail link stays queued if the link itself was found.
        """
        for page in range(1, 100):
            url = START_URL.format(page)
            logger.info("爬取第 {}".format(page))
            try:
                response = requests.get(url=url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                logger.error(e)
                logger.warning(url)
                continue
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'lxml')
            house_list = bs.select('#content > div.content__article > div.content__list > div')
            for item in house_list:
                try:
                    # Queue the detail link first so it is kept even when the
                    # rest of the element fails to parse.
                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])
                    record = self._parse_item(item)
                    with open('data/房源信息.txt', 'ab+') as f:
                        f.write((str(record) + '\n').encode('utf-8'))
                except Exception as e:
                    # Best effort per element: report the failure and move on
                    # without hiding it (or swallowing KeyboardInterrupt).
                    logger.warning("skipped listing item: %r", e)
                    continue
        # Report how many detail links were collected.
        logger.info("队列长度为 {} ".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Worker loop: fetch queued detail pages and store their descriptions.

        Runs forever, blocking on the queue. Every item taken from the queue
        is marked done exactly once, whether it succeeded, failed or was
        re-queued, so ``desc_url_queue.join()`` can return.
        """
        while True:
            url = self.desc_url_queue.get()
            try:
                url_ = 'https://gz.zu.ke.com/' + url
                response = requests.get(url=url_, headers=headers, timeout=self.REQUEST_TIMEOUT)
                response.encoding = response.apparent_encoding
                if response.status_code != 200:
                    # Retry later. The put registers a new pending task; the
                    # ``finally`` below closes the one just taken.
                    # NOTE(review): retries are unbounded, so a URL that
                    # never returns 200 keeps join() waiting.
                    self.desc_url_queue.put(url)
                    continue
                bs = bs4.BeautifulSoup(response.text, 'html.parser')
                logger.info("爬取第 {} 条房源详情".format(self.count))
                desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc'])
                desc = desc.replace('\n', '').replace(' ', '').replace('<br/>', '')
                with open('data/房源描述.txt', 'ab+') as f:
                    f.write((str(desc) + '\n').encode('utf-8'))
                self.count += 1
            except Exception as e:
                logger.error(e)
                logger.warning(url)
            finally:
                self.desc_url_queue.task_done()

    def execute_more_tasks(self, target):
        """
        Start POOL_MAXSIZE workers in the pool, each running ``target``.

        :param target: task function, called with no arguments
        """
        for _ in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """
        Crawl the listing pages, then fetch all queued detail pages concurrently.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block until every queued link is marked done
if __name__ == "__main__":
    # Build the crawler, run one full crawl and log the elapsed seconds.
    crawler = HouseSpider()
    began = time.time()
    crawler.run()
    elapsed = time.time() - began
    logger.info("总耗时 {}".format(elapsed))