|
|
|
import json
|
|
|
|
import threading
|
|
|
|
from concurrent import futures
|
|
|
|
|
|
|
|
import requests
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
|
|
class Comment:
|
|
|
|
    lock = threading.Lock()  # global lock protecting the shared counter across worker threads
|
|
|
|
|
|
|
|
def __init__(self, keywords_=None) -> None:
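        """keywords_: optional iterable of keywords to look for in danmaku text.

        self.obj['data'] maps danmaku text to its occurrence count;
        self.obj['flag'] is set to True once any keyword is seen.
        """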
|
|
|
|
super().__init__()
|
|
|
|
self.obj = {'data': {}, 'flag': False}
|
|
|
|
self.keywords = keywords_
|
|
|
|
|
|
|
|
    # Build a frequency ranking of danmaku for a whole bangumi collection
|
|
|
|
def getCommentSort(self, cids):
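        """Collect danmaku (including historical snapshots) for every cid and print the top 50 by frequency."""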
|
|
|
|
|
|
|
|
urls = []
|
|
|
|
for cid in cids:
|
|
|
|
urls.extend(getCidUrls(cid))
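        # Fetch and tally all snapshot URLs concurrently; count() updates the shared dict under the class lock.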
|
|
|
|
with futures.ThreadPoolExecutor(32) as executor:
|
|
|
|
executor.map(self.count, urls)
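        # Print the 50 most frequent danmaku, highest count first.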
|
|
|
|
        ranking = sorted(self.obj["data"].items(), key=lambda d: d[1], reverse=True)[:50]
        for index, data in enumerate(ranking):
            print('{index}:{data}'.format(index=index + 1, data=data))
|
|
|
|
|
|
|
|
    # Download one danmaku XML and tally every danmaku line into the shared counter
|
|
|
|
def count(self, url, desc=None):
|
|
|
|
        content = requests.get(url).content
|
|
|
|
        comment_selector = etree.HTML(content)
|
|
|
|
        if desc is not None:
|
|
|
|
print(desc)
|
|
|
|
print("url=%s" % url)
|
|
|
|
for comment in comment_selector.xpath('//i//d/text()'):
|
|
|
|
            # Check and update under the lock so concurrent workers do not lose counts.
            with self.lock:
                self.obj["data"][comment] = self.obj["data"].get(comment, 0) + 1
|
|
|
|
if not self.obj["flag"]:
|
|
|
|
for keyword in self.keywords:
|
|
|
|
if keyword in comment:
|
|
|
|
self.obj["flag"] = True
|
|
|
|
|
|
|
|
|
|
|
|
# Build the list of historical danmaku URLs for a cid
|
|
|
|
def getCidUrls(cid):
|
|
|
|
urls = []
|
|
|
|
url = "https://comment.bilibili.com/rolldate,%d" % cid
|
|
|
|
req = requests.get(url)
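    # A non-empty rolldate response is a JSON list of snapshots; each entry's timestamp
    # yields a dmroll URL. Otherwise fall back to the current danmaku pool XML.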
|
|
|
|
if len(req.text) > 0:
|
|
|
|
for i in json.loads(req.text):
|
|
|
|
urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid))
|
|
|
|
else:
|
|
|
|
urls.append("https://comment.bilibili.com/%d.xml" % cid)
|
|
|
|
return urls
|
|
|
|
|
|
|
|
|
|
|
|
# Download historical danmaku for a cid and save them to an XML file
|
|
|
|
def parseXmlByHistory(path, cid, size=None):
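    """Download danmaku (including historical snapshots) for a cid and write them to path.

    size, if given, is a fill-rate threshold such as '50%'; the file is written only
    when no threshold is set or the threshold has been reached.
    """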
|
|
|
|
dlist = set()
|
|
|
|
    flag = parseXml(getCidUrls(cid), dlist, size)
|
|
|
|
    # Write the file when no threshold was given, or when the threshold was reached.
    if size is None or flag:
|
|
|
|
        with open(path, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="UTF-8"?><i>')
            for i in dlist:
                f.write(('\r\n' + i).encode())
            f.write(b'\r\n</i>')
|
|
|
|
|
|
|
|
def xml(url):
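    """Fetch url and parse the response body with lxml.etree.HTML."""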
|
|
|
|
    content = requests.get(url).content
|
|
|
|
    return etree.HTML(content)
|
|
|
|
|
|
|
|
def parseXml(urls, dlist, size=None):
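    """Parse one or more danmaku XML URLs, adding each <d> element (as a string) to dlist.

    Returns True as soon as the pool fill rate reaches size (a string like '50%'), else False.
    """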
|
|
|
|
if isinstance(urls, str):
|
|
|
|
urls = [urls]
|
|
|
|
    if size is not None:
        size = float(size.strip('%')) / 100.0  # e.g. '50%' -> 0.5
|
|
|
|
for url in urls:
|
|
|
|
comment_selector = xml(url)
|
|
|
|
        danmaku_texts = comment_selector.xpath('//i//d/text()')
|
|
|
|
        maxlimit = int(comment_selector.xpath('//i//maxlimit/text()')[0])  # capacity of the danmaku pool reported in the XML
|
|
|
|
|
|
|
|
        if len(danmaku_texts) > 0:
|
|
|
|
            print('danmaku count: {count}, pool limit: {maxlimit}, pool fill: {p}'.format(
                count=len(danmaku_texts), maxlimit=maxlimit,
                p='%.2f%%' % (len(danmaku_texts) / maxlimit * 100)))
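        # Rebuild each <d> element as a string; dlist is expected to be a set, so danmaku
        # repeated across snapshots are stored only once.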
|
|
|
|
for element in comment_selector.xpath('//i//d'):
|
|
|
|
if len(element.xpath("text()")) > 0:
|
|
|
|
fstr = '<d p="{p}">{content}</d>'.format(p=str(element.xpath("@p")[0]),
|
|
|
|
content=str(element.xpath("text()")[0]))
|
|
|
|
dlist.add(fstr)
|
|
|
|
|
|
|
|
currentSize = len(dlist) / maxlimit
|
|
|
|
        print('fill rate: {l}'.format(l='%.2f%%' % (currentSize * 100)))
|
|
|
|
        if size is not None and currentSize >= size:
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
# parseXmlByHistory('10815558.xml', 10815558)
|
|
|
|
# Comment('').getCommentSort(episodeIdToCid(172095))
|
|
|
|
cids=[653701,653702,653703,653704,653705,653738,653739,653740,653741,653742,653743,655269,655270,655271,655272,655273,655274,655275,655276,655277,655278,655279,1153426,1160531,1160532,1160533,1160534,1166828,3462691,3526913,4566294,4566295,4566296,4566297,4566298,4566299,4997067,4997068,4997069,4997070,4997071,4997072,4997073,4997074,4997075,4997076,4997077,4997078,4997079,4997080,4997081,4997082,4997083,4997084,4997085,4997086,4997087,4997088,4997089,5679996,7091113,7091114,7091115,7091116,7091117,7091118,7091119,7091120,7102322,7102323,7102324,7105422,7105423,7105424,7105425,7105426,7105427,7105428,7105429,7105430,7105431,7105432,7111387,7862786,7862787,7862788,7862789,7862790,7862791,7862792,7862793,7862794,7862795,7862796,8097054,8462197,8462198,8463198,8463856,8464066,8465726,8466012,8466263,8466715,8467897,8468074,8469339,8470303,8470792,8471514,8471910,8472168,8472542,8473404,8474331,8476130,8974866,9006609,9914921,9914922,9930188,9930189,9930190,9935817,9935818,9935819,9946753,10240252,10240253,10240254,10240255,10240256,10240257,10240258,10240259,10240260,10240261,10240262,12029055,12107619,13109117,13109118,13109119,13109120,13109121,13109122,13109123,13109124,13109125,13109126,13109127,13109128,13109129,13109130,13109131,13109132,13109133,13109134,13109135,13109136,13109137,13109138,13109139,13109140,13109141,13109142,13109143,13109144,13109145,13109146,13109147,13109148,13109149,13109150,13109151,13109152,13109153,13109154,13109155,13280053,13386513,13386514,13386515,13386516,13386517,13386518,13386519,13386520,13386521,13386522,13386523,13494289,14369143,14842278]
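    # NOTE: the cids list below overrides the one above; keep whichever collection you want to scan.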
|
|
|
|
|
|
|
|
cids=[6675884,6675885,6675886,6722497,6722498,6722499,6722500,6761162,6761163,6761164,6761165,7008191,7008192,7008193,7008194,7008195,7008196,7008197,7008198,7008199,7008200,7008201,7018471,7018472,7018473,7018474,7018475,7018476,7018477,7018478,7018479,7018480,7018481,7636499,7636500,7636501,7636502,7636503,7636504,7636505,7636506,7636507,7636508,7636509,9019798,9019799,9019800,9019801,9019802,9019803,9019804,9019805,9019806,9019807,9019808,9019809,9832709]
|
|
|
|
for cid in cids:
|
|
|
|
        comment_selector = xml('https://comment.bilibili.com/{cid}.xml'.format(cid=cid))
|
|
|
|
        print('cid: {cid}, danmaku count: {length}'.format(cid=cid, length=len(comment_selector.xpath('//i//d/text()'))))
|
|
|
|
# parseXmlByHistory('{cid}.xml'.format(cid=cid),cid)
|
|
|
|
|
|
|
|
# path = 'D:/QQ/1029559041/FileRecv/tmp001.xlsx'
|
|
|
|
# ExcelFile = xlrd.open_workbook(path)
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# for sheetName in ExcelFile.sheet_names():
|
|
|
|
# sheet = ExcelFile.sheet_by_name(sheetName)
|
|
|
|
# for row in sheet.get_rows():
|
|
|
|
# try:
|
|
|
|
# row[5].value.index('日剧')
|
|
|
|
# cid = int(row[2].value)
|
|
|
|
# print('aid={aid},cid={cid},title={title}'.format(aid=int(row[1].value),cid=cid,title=row[5].value))
|
|
|
|
# parseXmlByHistory('{cid}.xml'.format(cid=cid),cid,'50%')
|
|
|
|
# except BaseException as e:
|
|
|
|
# repr(e)
|