"""Crawler that pages through bilibili's bangumi index and persists media
info / stats via the Django ORM.

Flow: getIds() walks the season index API, resolves each season to a
(bangumi_id, season_id, media_id) tuple and puts batches on `queue`;
a daemon thread (listen) drains the queue and fans each batch out to
save() on a thread pool.
"""
import _thread
import json
import math
import os
from concurrent import futures
from queue import Queue

# The settings module must be configured BEFORE any import that touches
# django.conf (the original set it after importing PixivSearch.settings).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")

import django

django.setup()

import requests
from bs4 import BeautifulSoup

from PixivSearch.settings import logger
from PixivSearch.model.config import mediaInfo, stat

current_mediaInfo = None  # most recently persisted mediaInfo row, exposed via get_()
isStop = None  # cooperative cancellation flag shared by crawler threads


def stop_():
    """Request the running crawl to stop at its next check point."""
    global isStop
    isStop = True


def save(params):
    """Fetch one bangumi's media page and persist its mediaInfo and stat rows.

    params: a (bangumi_id, season_id, media_id) tuple as produced by getIds().
    Network errors are logged and the fetch is retried.
    """
    if isStop:
        return
    logger.info(params)
    bangumi_id, season_id, media_id = params
    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(repr(e))
        # BUG FIX: original retried with save(media_id) (a bare int), which
        # would crash at params[0] on the retry; it also fell through with
        # `req` unbound. Retry with the full tuple and return.
        save(params)
        return
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
        tag = BeautifulSoup(req.text, 'lxml')
        # NOTE(review): assumes the 4th <script> tag holds the page-state
        # JSON ("... = {...};function..."); brittle against page redesigns —
        # confirm against the live page if parsing starts failing.
        script = tag.select("script")[3].text
        json_str = script[script.index("=") + 1:script.index("function") - 2]
        json_obj = json.loads(json_str)
        try:
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
                print(stat_info)
                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
                          chn_name=json_obj['mediaInfo']['chn_name']).save()
                global current_mediaInfo
                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
                stat(id=season_id,
                     danmakus=int(stat_info['danmakus']),
                     favorites=stat_info['favorites'],
                     views=stat_info['views']).save()
        except BaseException as e:
            logger.error(repr(e))


def get_():
    """Return the most recently persisted mediaInfo row (or None)."""
    return current_mediaInfo


page_size = 10  # items per index page; also sizes the queue and worker pool
queue = Queue(page_size)


def listen():
    """Background consumer: drain id batches from the queue and crawl each
    batch concurrently with a thread pool."""
    while True:
        ids = queue.get()
        try:
            with futures.ThreadPoolExecutor(page_size) as executor:
                executor.map(save, ids)
            logger.info('结束爬虫')
        except BaseException as e:
            logger.error(repr(e))


_thread.start_new_thread(listen, ())


def getIds():
    """Walk the season index API page by page, resolve each unseen season to
    a (bangumi_id, season_id, media_id) tuple and enqueue one batch per page.

    Runs until all pages are consumed or stop_() flips isStop.
    """
    seasonIdList = []  # season_ids already enqueued, to skip duplicates
    page = 1
    pages = None  # total page count, learned from the first response's 'count'
    name = 'seasonListCallback'
    global isStop
    isStop = False
    while not isStop and (pages is None or page <= pages):
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        # BUG FIX: `ids` was only bound inside the 'result' branch, so the
        # len(ids) log below could raise NameError, which the broad except
        # turned into an infinite retry of the same page.
        ids = []
        try:
            req = requests.get(url, timeout=10)
            if req.status_code == 200:
                json_obj = json.loads(req.text)
                if 'result' in json_obj and 'list' in json_obj['result']:
                    bangumiList = json_obj['result']['list']
                    for bangumi in bangumiList:
                        if isStop:
                            break
                        if 'season_id' not in bangumi:
                            continue
                        season_id = int(bangumi['season_id'])
                        if season_id in seasonIdList:
                            continue  # already collected on an earlier page
                        url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
                            season_id, name)
                        logger.info(url)
                        req = requests.get(url, timeout=10)
                        if req.status_code == 200:
                            # Strip the JSONP wrapper to get plain JSON.
                            child_json_obj = json.loads(
                                req.text.replace('seasonListCallback(', '').replace(');', ''))
                            if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
                                bangumi_id = int(child_json_obj['result']['bangumi_id'])
                                if 'media' in child_json_obj['result']:
                                    media_id = int(child_json_obj['result']['media']['media_id'])
                                    ids.append((bangumi_id, season_id, media_id))
                                    seasonIdList.append(season_id)
                # Learn total page count once (guard 'result' — original
                # indexed it unchecked and relied on the except to recover).
                if pages is None and 'result' in json_obj and 'count' in json_obj['result']:
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
                page = page + 1
                logger.info('获取id数量%d' % len(ids))
                queue.put(ids)
        except BaseException as e:
            logger.error(repr(e))
            continue  # retry the same page on transient failure


def A(e):
    """Trivial logging callback (kept for backward compatibility)."""
    logger.info(e)


if __name__ == '__main__':
    getIds()