import json
import math
import os
import time
from concurrent import futures

import django
import requests
from bs4 import BeautifulSoup

from PixivSearch.settings import logger

# Django must be configured before the models below can be imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()
from PixivSearch.model.config import mediaInfo, stat

current_mediaInfo = mediaInfo(id=0, chn_name='null')
isStop = False
executors = None


def check():
    """Watcher loop: shuts the thread pool down once a stop is requested.

    Intended to run in its own thread alongside threadSave().
    """
    while True:
        if isStop:
            logger.info('Stopping the multithreaded crawler')
            executors.shutdown()
            break
        time.sleep(1)


def save(md):
    """Fetch one bangumi media page and persist its info and stats."""
    if isStop:
        return
    url = "https://www.bilibili.com/bangumi/media/md%d" % md
    try:
        req = requests.get(url, timeout=10)
    except Exception as e:
        logger.error(e)
        return save(md)  # retry; recurses until a request goes through
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
        tag = BeautifulSoup(req.text, 'lxml')
        # The page embeds its data as a JS object literal in the fourth
        # <script> tag; slice out the JSON between '=' and 'function'.
        script = tag.select("script")[3].text
        json_str = script[script.index("=") + 1:script.index("function") - 2]
        json_obj = json.loads(json_str)
        try:
            stat_info = json_obj['mediaInfo']['stat']
            print(json_obj['mediaInfo']['chn_name'])
            print(stat_info)
            mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
            global current_mediaInfo
            current_mediaInfo = mediaInfo.objects.get(pk=md)
            stat(id=md, danmakus=int(stat_info['danmakus']),
                 favorites=stat_info['favorites'],
                 views=stat_info['views']).save()
        except Exception as e:
            logger.error('An exception occurred')
            logger.error(e)


def get_():
    return current_mediaInfo


page_size = 100
pages = None
ids = None


def getIds():
    """Collect every season_id from the paginated index API."""
    global ids
    global pages
    if ids is None or len(ids) == 0:  # only fetch if the id list is not built yet
        ids = []
        page = 1
        while pages is None or page <= pages:
            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
            logger.info(url)
            try:
                req = requests.get(url, timeout=10)
                json_obj = json.loads(req.text)
                bangumiList = json_obj['result']['list']
                for bangumi in bangumiList:
                    ids.append(int(bangumi['season_id']))
                if pages is None:
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
            except Exception as e:
                logger.error('Request failed or timed out')
                logger.error(e)
                continue  # retry the current page
            page += 1


def threadSave():
    """Build the id list, then crawl every id on a 32-thread pool."""
    getIds()
    logger.info(len(ids))
    try:
        global executors
        executors = futures.ThreadPoolExecutor(32)
        global isStop
        isStop = False
        with executors as executor:
            executor.map(save, ids)
        logger.info('Crawl finished')
    except Exception as e:
        logger.error(e)


def stop_():
    global isStop
    isStop = True
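
# --- Usage sketch (an assumption: the original module never invokes these
# functions itself, presumably leaving that to Django views). A minimal way
# to drive the crawler is to run the check() watcher and threadSave() in
# background threads, then call stop_() to request a shutdown. The
# 60-second run window below is an illustrative value, not from the source.
if __name__ == '__main__':
    import threading

    threading.Thread(target=check, daemon=True).start()  # watcher that shuts the pool down
    worker = threading.Thread(target=threadSave)
    worker.start()
    time.sleep(60)  # let the crawler work for a while (arbitrary duration)
    stop_()         # flip isStop so pending save() calls return immediately
    worker.join()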