import json
import os
import shutil
import threading
from concurrent import futures

import requests
from lxml import etree


class Comment:
    # Global lock guarding the shared counters across worker threads
    lock = threading.Lock()

    def __init__(self, keywords_=None) -> None:
        super().__init__()
        # 'data' maps each danmaku (bullet comment) text to its occurrence
        # count; 'flag' records whether any keyword has been seen so far.
        self.obj = {'data': {}, 'flag': False}
        self.keywords = keywords_

    # Print a ranking of the most frequent danmaku across a series' episode cids
    def getCommentSort(self, cids):
        urls = []
        for cid in cids:
            urls.extend(getCidUrls(cid))
        # Fetch and count every danmaku snapshot concurrently
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(self.count, urls)
        # Print the 50 most frequent danmaku, highest count first
        ranking = sorted(self.obj["data"].items(), key=lambda d: d[1], reverse=True)[:50]
        for index, data in enumerate(ranking):
            print('{index}:{data}'.format(index=index + 1, data=data))

    # Count the danmaku in one snapshot URL and flag any keyword matches
    def count(self, url, desc=None):
        content = requests.get(url).content
        comment_selector = etree.HTML(content)
        if desc is not None:
            print(desc)
        print("url=%s" % url)
        for comment in comment_selector.xpath('//i//d/text()'):
            # Check and update under the lock so concurrent workers do not lose increments
            with self.lock:
                if comment in self.obj["data"]:
                    self.obj["data"][comment] += 1
                else:
                    self.obj["data"][comment] = 1
            if self.keywords and not self.obj["flag"]:
                for keyword in self.keywords:
                    if keyword in comment:
                        self.obj["flag"] = True


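# Usage sketch for the Comment ranking helper above; the keyword is purely
# illustrative, and the cids are the series cids listed in __main__ below:
#
#     ranker = Comment(keywords_=['前方高能'])
#     ranker.getCommentSort([7636499, 7636501])
#     print(ranker.obj['flag'])  # True once any danmaku contained a keyword

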
# Build the list of historical danmaku snapshot URLs for a cid
def getCidUrls(cid):
    urls = []
    url = "https://comment.bilibili.com/rolldate,%d" % cid
    req = requests.get(url)
    if len(req.text) > 0:
        # One snapshot URL per roll date reported by the rolldate endpoint
        for i in json.loads(req.text):
            urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid))
    else:
        # No history reported: fall back to the current danmaku pool
        urls.append("https://comment.bilibili.com/%d.xml" % cid)
    return urls


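# For reference, getCidUrls(12026697) returns either a list of
# "https://comment.bilibili.com/dmroll,<timestamp>,12026697" snapshot URLs
# (one per rolldate entry, each assumed to carry a 'timestamp' field) or,
# when rolldate returns an empty body, the single URL
# "https://comment.bilibili.com/12026697.xml".

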
# Download the danmaku of a cid and write them to <path>/<cid>.xml
def downloadXml(path, cid, size=None, history=True):
    dlist = set()
    flag = None
    if history:
        # Merge every historical snapshot into dlist
        flag = parseXml(getCidUrls(cid), dlist, size)
    else:
        # Only the current danmaku pool
        parseXml("https://comment.bilibili.com/%d.xml" % cid, dlist, size)
    # Write the file when no fill-rate target was given, or (for historical
    # downloads) once the requested fill rate has been reached
    if size is None or (history and flag):
        # Recreate the output directory from scratch (removes existing contents)
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        with open('{path}/{cid}.xml'.format(path=path, cid=cid), 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="UTF-8"?><i>')
            for i in dlist:
                f.write(('\r\n' + i).encode())
            f.write(b'\r\n</i>')


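# Usage sketch for downloadXml; the output directory below is hypothetical:
#
#     downloadXml('./danmaku', 12026697, history=False)  # current pool only
#     downloadXml('./danmaku', 12026697, size='80%')     # merge history until 80% of maxlimit
#
# Note that the target directory is removed and recreated on every write.

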
# Fetch a danmaku XML document and parse it with lxml
def xml(url):
    content = requests.get(url).content
    return etree.HTML(content)


# Parse one or more danmaku XML documents into dlist (a set, so overlapping
# snapshots are deduplicated); return True as soon as the collected danmaku
# reach the requested share of the pool's maxlimit
def parseXml(urls, dlist, size=None):
    if isinstance(urls, str):
        urls = [urls]
    if size is not None:
        # e.g. '80%' -> 0.8
        size = float(size.strip('%')) / 100.0
    for url in urls:
        comment_selector = xml(url)
        texts = comment_selector.xpath('//i//d/text()')
        maxlimit = int(comment_selector.xpath('//i//maxlimit/text()')[0])

        if len(texts) > 0:
            print('danmaku: {count}, pool limit: {maxlimit}, pool fill: {p}'.format(
                count=len(texts), maxlimit=maxlimit,
                p='%.2f%%' % (len(texts) / maxlimit * 100)))
        for element in comment_selector.xpath('//i//d'):
            if len(element.xpath("text()")) > 0:
                # Keep the original 'p' attribute so players can still render the danmaku
                fstr = '<d p="{p}">{content}</d>'.format(p=str(element.xpath("@p")[0]),
                                                         content=str(element.xpath("text()")[0]))
                dlist.add(fstr)

        currentSize = len(dlist) / maxlimit
        print('fill rate: {l}'.format(l='%.2f%%' % (currentSize * 100)))
        if size is not None and currentSize >= size:
            return True
    return False


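# The size argument accepted by parseXml and downloadXml is a percentage
# string such as '80%': parseXml returns True once the deduplicated danmaku
# reach that share of maxlimit, and downloadXml (with history=True) writes
# the merged file only after that threshold has been reached.

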
if __name__ == '__main__':
    cids = [7636499, 7636501, 7636500, 7636503, 7636504, 7636502, 7636509,
            7636508, 7636506, 7636507, 7636505]

    downloadXml('F:/ABC', 12026697, history=False)