import logging
import re
import time
from queue import Queue

from gevent import monkey

# Monkey-patching must run before requests is imported so that its
# sockets become cooperative, greenlet-friendly ones.
monkey.patch_all()

import bs4
import requests
from gevent.pool import Pool


def get_logger():
    """Create a logger instance."""
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    logger = logging.getLogger("monitor")
    logger.setLevel(LOG_LEVEL)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
                  'Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}
START_URL = 'https://gz.zu.ke.com/zufang/pg{}/'
LOG_LEVEL = logging.INFO  # log level
POOL_MAXSIZE = 8          # maximum size of the coroutine pool

logger = get_logger()


class HouseSpider:
    """Crawler for the Ke.com rental-listing site."""

    def __init__(self):
        self.count = 1                  # index of the listing currently being crawled
        self.desc_url_queue = Queue()   # queue of detail-page URLs for the pool
        self.pool = Pool(POOL_MAXSIZE)  # pool managing the worker greenlets

    def job_spider(self):
        """Crawler entry point: scrape the listing pages."""
        urls = [START_URL.format(p) for p in range(1, 100)]
        for page, url in enumerate(urls, start=1):
            logger.info("Crawling page {}".format(page))
            response = requests.get(url=url, headers=headers)
            response.encoding = response.apparent_encoding
            bs = bs4.BeautifulSoup(response.text, 'lxml')
            house_list = bs.select('#content > div.content__article > div.content__list > div')
            for item in house_list:
                tmp = {}
                try:
                    # Push the detail-page link onto the queue
                    self.desc_url_queue.put(item.select('a')[0].attrs['href'])
                    title = item.select('div > p.content__list--item--title.twoline > a')[0] \
                        .string.replace('\n', '').replace(' ', '')
                    tmp['标题'] = title
                    xinzhengqu = item.select('div > p.content__list--item--des > a:nth-child(1)')[0].string
                    tmp['行政区'] = xinzhengqu
                    xinzhengqu_level = item.select('div > p.content__list--item--des > a:nth-child(2)')[0].string
                    tmp['二级行政区'] = xinzhengqu_level
                    location = item.select('div > p.content__list--item--des > a:nth-child(3)')[0].string
                    tmp['地址'] = location
                    detail_house = str(item.select('div > p.content__list--item--des')[0]) \
                        .replace(' ', '').replace('\n', '')
                    detail_house_list = re.split('/', detail_house)[1:-1]
                    miji = detail_house_list[0]
                    tmp['面积'] = miji
                    chaoxiang = detail_house_list[1]
                    tmp['朝向'] = chaoxiang
                    rooms = detail_house_list[2][:detail_house_list[2].index('<')]
                    tmp['房间'] = rooms
                    price = item.select('div > span > em')[0].string
                    # The unit (e.g. "元/月") is the span text minus the <em> price
                    price_detail = item.select('div > span')[0].get_text().replace(price, '').strip()
                    tmp['价格'] = price + price_detail
                    with open('data/房源信息.txt', 'ab+') as f:
                        f.write((str(tmp) + '\n').encode('utf-8'))
                except Exception:
                    continue
            # Log the queue length, i.e. how many detail-page URLs are pending
            logger.info("Queue length: {}".format(self.desc_url_queue.qsize()))

    def post_require(self):
        """Scrape the description of each listing."""
        while True:
            # Take a URL from the queue
            url = self.desc_url_queue.get()
            try:
                url_ = 'https://gz.zu.ke.com/' + url
                response = requests.get(url=url_, headers=headers)
                response.encoding = response.apparent_encoding
                bs = bs4.BeautifulSoup(response.text, 'html.parser')
                if response.status_code == 200:
                    logger.info("Crawling listing description {}".format(self.count))
                    # Strip whitespace and inline <br/> tags from the description
                    desc = str(bs.select('#desc > p:nth-child(3)')[0]['data-desc']) \
                        .replace('\n', '').replace(' ', '').replace('<br/>', '')
                    with open('data/房源描述.txt', 'ab+') as f:
                        f.write((desc + '\n').encode('utf-8'))
                    self.count += 1
                else:
                    # Re-queue the URL for a retry on a non-200 response
                    self.desc_url_queue.put(url)
            except Exception as e:
                logger.error(e)
                logger.warning(url)
            finally:
                # Always mark the fetched item as done so queue.join() can return
                self.desc_url_queue.task_done()

    def execute_more_tasks(self, target):
        """
        Hand a task function to the coroutine pool. This could be extended by
        giving the time-consuming parsing and storage steps their own queues
        to maximize throughput.
        :param target: task function
        """
        for _ in range(POOL_MAXSIZE):
            self.pool.apply_async(target)

    def run(self):
        """Crawl the data with a pool of greenlets."""
        self.job_spider()
        self.execute_more_tasks(self.post_require)
        self.desc_url_queue.join()  # block the main greenlet until the queue is drained


if __name__ == "__main__":
    spider = HouseSpider()
    start = time.time()
    spider.run()
    logger.info("Total time: {} seconds".format(time.time() - start))