import json
import os
import shutil
import threading
from concurrent import futures

import requests
from lxml import etree


class Comment:
    """Collects and ranks bilibili danmaku (bullet comments) across episodes."""

    lock = threading.Lock()  # global lock shared by all worker threads

    def __init__(self, keywords_=None) -> None:
        super().__init__()
        # 'data' maps comment text -> occurrence count; 'flag' records whether
        # any comment contained one of the watched keywords.
        self.obj = {'data': {}, 'flag': False}
        # Normalize None -> [] so count() can always iterate the keywords
        # (the original crashed with TypeError when keywords_ was omitted).
        self.keywords = keywords_ if keywords_ is not None else []

    # Build the danmaku ranking for a series of episode cids.
    def getCommentSort(self, cids):
        """Fetch danmaku for every cid concurrently and print the top-50 ranking."""
        urls = []
        for cid in cids:
            urls.extend(getCidUrls(cid))
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(self.count, urls)
        ranking = sorted(self.obj["data"].items(),
                         key=lambda d: d[1], reverse=True)[:50]
        for index, data in enumerate(ranking, start=1):
            print('{index}:{data}'.format(index=index, data=data))

    # Worker: tally the comments found at one danmaku URL.
    def count(self, url, desc=None):
        """Download one danmaku XML page and accumulate per-comment counts.

        Thread-safe: the check-and-increment runs entirely under the class
        lock. (The original tested membership outside the lock, so two
        threads could both see "missing" and one increment would be lost.)
        """
        content = requests.get(url).content
        comment_selector = etree.HTML(content)
        if desc is not None:
            print(desc)
        print("url=%s" % url)
        for comment in comment_selector.xpath('//i//d/text()'):
            with self.lock:
                self.obj["data"][comment] = self.obj["data"].get(comment, 0) + 1
            if not self.obj["flag"]:
                for keyword in self.keywords:
                    if keyword in comment:
                        self.obj["flag"] = True


def getCidUrls(cid):
    """Return the historical danmaku URLs for *cid*.

    Falls back to the single current danmaku file when the roll-date
    index is empty.
    """
    urls = []
    url = "https://comment.bilibili.com/rolldate,%d" % cid
    req = requests.get(url)
    if len(req.text) > 0:
        for i in json.loads(req.text):
            urls.append("https://comment.bilibili.com/dmroll,%s,%d"
                        % (i['timestamp'], cid))
    else:
        urls.append("https://comment.bilibili.com/%d.xml" % cid)
    return urls


def downloadXml(path, cid, size=None, histroy=True):
    """Download danmaku for *cid* into '<path>/<cid>.xml'.

    path: target directory; it is wiped and recreated before writing.
    size: optional fill-rate threshold as a percent string (e.g. '80%');
          when given together with histroy=True the file is written only
          if parseXml reported the threshold was reached.
    histroy: fetch the full history (True) or only the current file (False).
             (Misspelled parameter name kept for caller compatibility.)
    """
    dlist = set()
    flag = None
    if histroy:
        flag = parseXml(getCidUrls(cid), dlist, size)
    else:
        parseXml("https://comment.bilibili.com/%d.xml" % cid, dlist, size)
    # Equivalent to the original `size is None or (histroy and size is not
    # None and flag)` — the second `size` test is redundant under the `or`.
    if size is None or (histroy and flag):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        # 'with' guarantees the handle is closed even if a write fails
        # (the original used a bare open/close pair).
        with open('{path}/{cid}.xml'.format(path=path, cid=cid), 'wb') as f:
            for i in dlist:
                f.write(('\r\n' + i).encode())
            f.write(b'\r\n')


def xml(url):
    """Fetch *url* and parse the response body as an lxml HTML tree."""
    content = requests.get(url).content
    return etree.HTML(content)


def parseXml(urls, dlist, size=None):
    """Parse danmaku XML page(s), adding each comment string to *dlist*.

    urls: a single URL string or a list of URLs.
    size: optional percent string (e.g. '50%'); when the accumulated pool
          fill rate reaches it, returns True immediately.
    Returns False after all pages if the threshold was never reached.
    """
    if isinstance(urls, str):
        urls = [urls]
    if size is not None:
        size = float(size.strip('%')) / 100.0
    for url in urls:
        comment_selector = xml(url)
        comments = comment_selector.xpath('//i//d/text()')
        maxlimit = int(comment_selector.xpath('//i//maxlimit/text()')[0])
        if len(comments) > 0:
            print('弹幕数:{list},最大弹幕数:{maxlimit},弹幕池填充:{p}'.format(
                list=len(comments), maxlimit=maxlimit,
                p='%.2f%%' % (len(comments) / maxlimit * 100)))
        for element in comment_selector.xpath('//i//d'):
            if len(element.xpath("text()")) > 0:
                # NOTE(review): the original format template was '{content}',
                # so the 'p' attribute it also extracted was never part of
                # the output — only the comment text is stored.
                dlist.add(str(element.xpath("text()")[0]))
        currentSize = len(dlist) / maxlimit
        print('填充率:{l}'.format(l='%.2f%%' % (currentSize * 100)))
        if size is not None and currentSize >= size:
            return True
    return False


if __name__ == '__main__':
    cids = [7636499, 7636501, 7636500, 7636503, 7636504, 7636502, 7636509,
            7636508, 7636506, 7636507, 7636505]
    downloadXml('F:/ABC', 12026697, histroy=False)