from gevent import monkey

# Patch the blocking parts of the stdlib (sockets, threads, queue locks)
# before anything that uses them is imported, so greenlets can switch
# cooperatively instead of blocking each other.
monkey.patch_all()

import logging
import re
import time
from queue import Queue

import bs4
import requests
from gevent.pool import Pool


def get_logger():
    """
    Create and configure the logger instance.
    """
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)

    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger
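
# Note: each call to get_logger() attaches another StreamHandler to the same
# "monitor" logger, so calling it more than once would duplicate every log
# line; the script below calls it exactly once.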


headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
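# The desktop-browser User-Agent above makes the site more likely to serve
# the normal HTML page instead of rejecting a default python-requests client.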

START_URL = 'https://gz.zu.ke.com/zufang/pg{}/'
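# e.g. START_URL.format(2) -> 'https://gz.zu.ke.com/zufang/pg2/'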

LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8  # maximum number of concurrent greenlets in the pool

logger = get_logger()


class HouseSpider:
    """
    Spider class for the house-rental website.
    """

    def __init__(self):
        self.count = 1  # index of the listing detail currently being scraped
        self.desc_url_queue = Queue()  # queue of detail-page URLs
        self.pool = Pool(POOL_MAXSIZE)  # gevent pool capping the number of concurrent greenlets
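        # Note: the stdlib Queue is safe to share between greenlets because
        # monkey.patch_all() above patched its locks to be gevent-aware.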

    # Fetch the listing information.
    def job_spider(self):
        """
        Crawler entry point: scrape the paginated listing pages.
        """
        urls = [START_URL.format(p) for p in range(1, 100)]
        for page, url in enumerate(urls, start=1):
            logger.info("Scraping listing page {}".format(page))
            # timeout is an addition: without it, a stalled connection could
            # hang the crawl indefinitely.
            response = requests.get(url=url, headers=headers, timeout=10)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'lxml')
            house_list = bs.select('#content > div.content__article > div.content__list > div')
            for item in house_list:
                tmp = {}
                try:
                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])  # enqueue the detail-page link
                    title = item.select('div > p.content__list--item--title.twoline > a')[0].string.replace('\n', '').replace(' ', '')
                    tmp['title'] = title
                    xinzhengqu = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
                    tmp['district'] = xinzhengqu
                    xinzhengqu_level = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
                    tmp['sub_district'] = xinzhengqu_level
                    location = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string
                    tmp['address'] = location
                    detail_house = str(item.select('div > p.content__list--item--des')[0]).replace(' ', '').replace('\n', '')
                    detail_house_list = re.split('<i>/</i>', detail_house)[1:-1]
                    miji = detail_house_list[0]
                    tmp['area'] = miji
                    chaoxiang = detail_house_list[1]
                    tmp['orientation'] = chaoxiang
                    rooms = detail_house_list[2][:detail_house_list[2].index('<')]
                    tmp['rooms'] = rooms
                    price = item.select('div > span > em')[0].string
                    price_detail = re.findall('</em>.*', str(item.select('div > span')[0]))[0].replace(' ', '').replace('</em>', '').replace('</span>', '')
                    tmp['price'] = price + price_detail
                    with open('data/house_info.txt', 'ab+') as f:
                        f.write((str(tmp) + '\n').encode('utf-8'))
                except Exception:
                    # Skip cards whose markup does not match the selectors
                    # (e.g. ads mixed into the listing).
                    continue
        # Log the queue length, i.e. how many detail-page URLs were collected.
        logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """
        Scrape the free-text description of each house listing.
        """
        while True:
            # Take a detail-page URL from the queue.
            url = self.desc_url_queue.get()
            url_ = 'https://gz.zu.ke.com/' + url
            try:
                response = requests.get(url=url_, headers=headers, timeout=10)
                response.encoding = response.apparent_encoding
                bs = bs4.BeautifulSoup(response.text, 'html.parser')
                if response.status_code == 200:
                    logger.info("Scraping house detail {}".format(self.count))
                    desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc']).replace('\n', '').replace(' ', '').replace('<br/>', '')
                    with open('data/house_desc.txt', 'ab+') as f:
                        f.write((str(desc) + '\n').encode('utf-8'))
                    self.count += 1
                else:
                    # Requeue the URL for another attempt.
                    self.desc_url_queue.put(url)
            except Exception as e:
                logger.error(e)
                logger.warning(url)
            finally:
                # Always balance the get() above, even after an error or a
                # requeue; otherwise desc_url_queue.join() in run() would
                # never unblock.
                self.desc_url_queue.task_done()

    def execute_more_tasks(self, target):
        """
        Hand the task function to the coroutine pool. This could be extended
        so that parsing and storage each get their own queue and workers,
        overlapping the slow stages for maximum throughput (see the sketch
        below).

        :param target: the task function to run in each greenlet
        """
        for _ in range(POOL_MAXSIZE):
            self.pool.apply_async(target)
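
    # A minimal sketch of the extension described above, assuming a
    # `self.save_queue = Queue()` added in __init__ and a
    # `self.execute_more_tasks(self.save_writer)` plus `self.save_queue.join()`
    # added in run(). `save_writer` and its output path are illustrative
    # assumptions, not part of the original script.
    def save_writer(self):
        """
        Drain records from a dedicated storage queue so file I/O runs in its
        own greenlets instead of blocking the fetching ones.
        """
        while True:
            record = self.save_queue.get()
            with open('data/house_desc.txt', 'ab+') as f:
                f.write((str(record) + '\n').encode('utf-8'))
            self.save_queue.task_done()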

    def run(self):
        """
        Scrape the data concurrently.
        """
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained

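
# Note: the open() calls above write into data/, which must already exist;
# open() does not create missing directories.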

if __name__ == "__main__":
    spider = HouseSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))