import _thread
import json
import math
import os
import threading
from concurrent import futures
from queue import Queue

import django
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Configure Django before importing anything that touches the ORM.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()

from PixivSearch.model.config import mediaInfo, stat
from PixivSearch.settings import logger

current_mediaInfo = None  # most recently persisted mediaInfo row
isStop = None  # cooperative stop flag shared by getIds() and the workers


def stop_():
    """Ask the running crawl to stop after the current item."""
    global isStop
    isStop = True


def save(params):
    """Fetch one media page and persist its mediaInfo and stat rows."""
    if isStop:
        return
    logger.info(params)
    bangumi_id, season_id, media_id = params
    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(repr(e))
        save(params)  # retry after a network error
        return
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
        json_obj = getJsonText(req, 3)
        try:
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
                print(stat_info)
                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
                          chn_name=json_obj['mediaInfo']['chn_name']).save()
                global current_mediaInfo
                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
                stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                     views=stat_info['views']).save()
        except BaseException as e:
            logger.error(repr(e))


def getJsonText(req, index):
    """Parse the JSON blob embedded in the page's <script> tag at *index*.

    The blob sits between the first '=' of the script and the trailing
    'function' call, so only that slice is handed to json.loads().
    """
    tag = BeautifulSoup(req.text, 'lxml')
    script = tag.select("script")[index].text
    json_str = script[script.index("=") + 1:script.index("function") - 2]
    return json.loads(json_str)


def get_():
    """Return the most recently persisted mediaInfo row (or None)."""
    return current_mediaInfo


page_size = 10
queue = Queue(page_size)


def listen():
    """Consume batches of ids from the queue and crawl them with a thread pool."""
    while True:
        ids = queue.get()
        try:
            with futures.ThreadPoolExecutor(page_size) as executor:
                executor.map(save, ids)
            logger.info('crawl batch finished')
        except BaseException as e:
            logger.error(repr(e))


_thread.start_new_thread(listen, ())


def getIds():
    """Page through the global season index and queue (bangumi, season, media) id triples."""
    seasonIdList = []
    page = 1
    pages = None
    name = 'seasonListCallback'
    global isStop
    isStop = False
    while not isStop and (pages is None or page <= pages):
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            req = requests.get(url, timeout=10)
            if req.status_code == 200:
                json_obj = json.loads(req.text)
                if 'result' in json_obj and 'list' in json_obj['result']:
                    bangumiList = json_obj['result']['list']
                    ids = []
                    for bangumi in bangumiList:
                        if isStop:
                            break
                        if 'season_id' not in bangumi:
                            continue
                        season_id = int(bangumi['season_id'])
                        if season_id in seasonIdList:
                            continue  # already queued in an earlier page
                        url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
                            season_id, name)
                        logger.info(url)
                        req = requests.get(url, timeout=10)
                        if req.status_code == 200:
                            # Strip the JSONP wrapper before parsing.
                            child_json_obj = json.loads(
                                req.text.replace('seasonListCallback(', '').replace(');', ''))
                            if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
                                bangumi_id = int(child_json_obj['result']['bangumi_id'])
                                if 'media' in child_json_obj['result']:
                                    media_id = int(child_json_obj['result']['media']['media_id'])
                                    ids.append((bangumi_id, season_id, media_id))
                                    seasonIdList.append(season_id)
                    if pages is None and 'count' in json_obj['result']:
                        pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
                    page = page + 1
                    logger.info('fetched %d ids' % len(ids))
                    queue.put(ids)
        except BaseException as e:
            logger.error(repr(e))
            continue  # retry the same page
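
# Minimal usage sketch (added; _crawl_demo is a hypothetical helper, not part
# of the original flow): getIds() blocks until every index page has been
# queued, so a caller runs it on its own thread, polls get_() for the most
# recently saved record, and can abort the pass with stop_().
def _crawl_demo(poll_seconds=5):
    import time
    worker = threading.Thread(target=getIds, daemon=True)
    worker.start()
    while worker.is_alive():
        time.sleep(poll_seconds)
        print(get_())  # most recently persisted mediaInfo, or None
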
# def testA():
#     req = requests.post('https://api.bilibili.com/x/report/web/heartbeat',
#                         data={"aid": 29416, "cid": 49052, "csrf": "c0d296db7e33085f9f4730cfee66660b"},
#                         cookies=_cookies)
#     print(req.status_code)

_cookies = {'DedeUserID': '4372744', 'DedeUserID__ckMd5': 'e8179b74444cae8e',
            'SESSDATA': '919b17d2%2C1524917631%2C3eede719'}


def getCid(aid, descend=True):
    """Walk av-numbers from *aid* until a video with a valid first-page cid is found.

    *descend* picks the scan direction: True walks downwards, False upwards.
    """
    while aid > 0:
        url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid
        print(url)
        req = requests.get(url, cookies=_cookies)
        code = json.loads(req.text)["code"]
        if code == 0:
            req = requests.get("https://www.bilibili.com/video/av%d" % aid)
            if req.status_code == 200:
                json_obj = getJsonText(req, 9)
                if "videoData" in json_obj and "pages" in json_obj['videoData'] and len(
                        json_obj['videoData']['pages']) > 0 and "cid" in json_obj['videoData']['pages'][0]:
                    cid = json_obj['videoData']['pages'][0]['cid']
                    print('cid=%s' % cid)
                    return cid
        if descend:
            aid = aid - 1
        else:
            aid = aid + 1


def getCids(aid):
    """Return the nearest valid cid at or below and at or above *aid*."""
    return {"min": getCid(aid, True), "max": getCid(aid, False)}


def episodeIdToCid(episode_id):
    """Resolve every cid in a bangumi episode's play list."""
    cids = []
    url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id
    print("url=%s" % url)
    req = requests.get(url)
    json_obj = getJsonText(req, 8)
    if "epList" in json_obj:
        for i in json_obj["epList"]:
            cids.append(i['cid'])
    return cids


def parseXml(url):
    """Tally every <d> danmaku line of one comment XML file into the shared obj."""
    print("url=%s" % url)
    comment_selector = etree.HTML(requests.get(url).content)
    comment_content = comment_selector.xpath('//i')
    for comment_each in comment_content:
        comments = comment_each.xpath('//d/text()')
        if comments:
            for comment in comments:
                with lock:
                    if comment in obj["data"]:
                        obj["data"][comment] = obj["data"][comment] + 1
                    else:
                        obj["data"][comment] = 1
                if not obj["flag"]:
                    for keyword in keywords:
                        if keyword in comment:
                            obj["flag"] = True


lock = threading.Lock()  # global lock shared by the parseXml worker threads


def loadData(cids):
    """Fetch the current and every historical danmaku file for each cid and tally them."""
    params = []
    for cid in cids:
        url = "https://comment.bilibili.com/rolldate,%d" % cid
        req = requests.get(url)
        urls = ["https://comment.bilibili.com/%d.xml" % cid]
        if len(req.text) > 0:
            # rolldate lists the snapshot dates of the historical danmaku files.
            for i in json.loads(req.text):
                urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid))
        for url in urls:
            params.append(url)
    with futures.ThreadPoolExecutor(32) as executor:
        executor.map(parseXml, params)
    return obj


def getCommentSort(cids, keywords_):
    """Count danmaku text frequencies for *cids*; set flag if any keyword appears."""
    global keywords, obj
    keywords = keywords_
    obj = {"data": {}, "flag": False}
    return loadData(cids)


if __name__ == '__main__':
    # print(getCids(29416))
    # obj = loadData(
    #     [49052, 49053, 51525, 51526, 53407, 54180, 55295, 55296, 57255, 57256, 59288, 59289, 61559, 61560,
    #      64034, 64035, 67024, 67025, 69284, 73333, 73334, 74024, 74025], ['穹'])
    f = getCommentSort(episodeIdToCid(172095), [])
    # obj = loadData([34807341], [])
    for i in sorted(f["data"].items(), key=lambda d: d[1], reverse=True)[:50]:
        print(i)
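
# Minimal usage sketch (added; _video_demo is a hypothetical helper): the same
# counting pipeline for a plain video instead of a bangumi episode. getCid()
# resolves the first page's cid from an av-number, and getCommentSort() then
# tallies every current and historical danmaku file for it. aid 29416 is the
# one used in the commented tests above.
def _video_demo(aid=29416, top=10):
    result = getCommentSort([getCid(aid)], [])
    for text, count in sorted(result["data"].items(), key=lambda d: d[1], reverse=True)[:top]:
        print(count, text)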