parent
4dbbb2aac2
commit
667c78dc22
@ -0,0 +1,132 @@ |
||||
import json |
||||
import threading |
||||
from concurrent import futures |
||||
|
||||
import requests |
||||
import xlrd |
||||
from lxml import etree |
||||
|
||||
from PixivSearch.dao.bangumi import episodeIdToCid |
||||
|
||||
|
||||
class Comment:
    """Count how often each danmaku (bullet comment) text occurs.

    ``obj['data']`` maps a comment text to the number of times it was seen.
    ``obj['flag']`` is set to True once a comment containing one of
    ``keywords`` has been seen.
    """

    # Class-level lock shared by all instances and worker threads; it guards
    # the read-modify-write of the counters in obj['data'].
    lock = threading.Lock()

    def __init__(self, keywords_=None) -> None:
        """Start with an empty tally.

        Args:
            keywords_: Optional iterable of substrings to look for in the
                comments. ``None`` disables the keyword check.
        """
        super().__init__()
        self.obj = {'data': {}, 'flag': False}
        self.keywords = keywords_

    def getCommentSort(self, cids):
        """Tally the comments of every cid and print the 50 most frequent.

        Args:
            cids: Iterable of cids; each one is expanded to its comment URLs
                with ``getCidUrls``.
        """
        urls = []
        for cid in cids:
            urls.extend(getCidUrls(cid))
        # Leaving the ``with`` block waits for all submitted fetches. The
        # iterator returned by map() is not consumed, so an exception raised
        # inside a worker is not re-raised here.
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(self.count, urls)
        ranking = sorted(self.obj["data"].items(), key=lambda d: d[1], reverse=True)[:50]
        for index, data in enumerate(ranking):
            print('{index}:{data}'.format(index=index + 1, data=data))

    def count(self, url, desc=None):
        """Download one comment document and add its comments to the tally.

        Args:
            url: Address of the comment XML document.
            desc: Optional text printed before the URL.
        """
        payload = requests.get(url).content
        comment_selector = etree.HTML(payload)
        if desc is not None:
            print(desc)
        print("url=%s" % url)
        self._record(comment_selector.xpath('//i//d/text()'))

    def _record(self, comments):
        """Add each text in ``comments`` to the tally and update the flag."""
        # A missing keyword list means "no keyword check", not an error.
        keywords = self.keywords or ()
        for comment in comments:
            # Lookup and increment happen under one lock acquisition so that
            # concurrent workers cannot overwrite each other's counts.
            with self.lock:
                tally = self.obj["data"]
                tally[comment] = tally.get(comment, 0) + 1
            if not self.obj["flag"] and any(keyword in comment for keyword in keywords):
                self.obj["flag"] = True
||||
|
||||
|
||||
# 根据cid获取历史弹幕地址 |
||||
def getCidUrls(cid):
    """Return the list of comment-document URLs to download for ``cid``.

    The ``rolldate`` index of the cid is requested first. If the response
    body is non-empty it is parsed as JSON and one ``dmroll`` URL is built
    from the ``timestamp`` field of every entry. If the body is empty, the
    single ``<cid>.xml`` URL is returned instead.
    """
    index_text = requests.get("https://comment.bilibili.com/rolldate,%d" % cid).text
    if not index_text:
        return ["https://comment.bilibili.com/%d.xml" % cid]
    return [
        "https://comment.bilibili.com/dmroll,%s,%d" % (entry['timestamp'], cid)
        for entry in json.loads(index_text)
    ]
||||
|
||||
|
||||
# 下载历史弹幕 |
||||
def parseXmlByHistory(path, cid, size=None):
    """Collect the comments of ``cid`` and write them to ``path`` as XML.

    Args:
        path: Destination file; it is opened in binary write mode.
        cid: Comment id whose URLs are obtained from ``getCidUrls``.
        size: Optional fill threshold passed on to ``parseXml`` (a percent
            string such as ``'50%'``). When given, the file is written only
            if ``parseXml`` reports that the threshold was reached.
    """
    dlist = set()
    reached = parseXml(getCidUrls(cid), dlist, size)
    # Without a threshold always write; with one, write only when reached.
    if size is None or reached:
        # ``with`` closes the file even if one of the writes fails.
        with open(path, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="UTF-8"?><i>')
            for item in dlist:
                f.write(('\r\n' + item).encode('utf-8'))
            f.write(b'\r\n</i>')
||||
|
||||
|
||||
def parseXml(urls, dlist, size=None):
    """Download comment documents and add their ``<d>`` entries to ``dlist``.

    Args:
        urls: One URL string or an iterable of URL strings.
        dlist: Set that receives one ``<d p="...">text</d>`` string per
            comment element that has text; it is modified in place.
        size: Optional fill threshold as a percent string such as ``'50%'``.

    Returns:
        True as soon as ``len(dlist) / maxlimit`` reaches ``size`` after
        processing a document; False if ``size`` is None or was never reached.

    Raises:
        ValueError: If ``size`` is not a number followed by optional ``%``.
        IndexError: If a document has no ``maxlimit`` element.
    """
    # Local import: the values are re-serialised as XML and must be escaped.
    from xml.sax.saxutils import escape

    if isinstance(urls, str):
        urls = [urls]
    if size is not None:
        size = float(size.strip('%')) / 100.0
    for url in urls:
        payload = requests.get(url).content
        comment_selector = etree.HTML(payload)
        texts = comment_selector.xpath('//i//d/text()')
        maxlimit = int(comment_selector.xpath('//i//maxlimit/text()')[0])

        if len(texts) > 0:
            print('弹幕数:{list},最大弹幕数:{maxlimit},弹幕池填充:{p}'.format(list=len(texts), maxlimit=maxlimit,
                                                                  p='%.2f%%' % (len(texts) / maxlimit * 100)))
            for element in comment_selector.xpath('//i//d'):
                content = element.xpath("text()")
                if len(content) > 0:
                    # The parser hands back decoded text, so '&', '<', '>'
                    # (and '"' inside the attribute) are escaped again to
                    # keep the generated markup well-formed.
                    fstr = '<d p="{p}">{content}</d>'.format(
                        p=escape(str(element.xpath("@p")[0]), {'"': '&quot;'}),
                        content=escape(str(content[0])))
                    dlist.add(fstr)

            # Fill ratio of everything collected so far against this
            # document's maxlimit.
            currentSize = len(dlist) / maxlimit
            print('填充率:{l}'.format(l='%.2f%%' % (currentSize * 100)))
            if size is not None and currentSize >= size:
                return True
    return False
||||
|
||||
if __name__ == '__main__':
    # Earlier manual invocations, kept disabled for reference:
    #   parseXmlByHistory('10815558.xml', 10815558)
    #   Comment('').getCommentSort(episodeIdToCid(172095))

    # Write the collected comment history of each cid to '<cid>.xml'.
    cids = [11664778, 11662541, 11661412, 11664304, 11666093]
    for cid in cids:
        parseXmlByHistory('{cid}.xml'.format(cid=cid), cid)

    # Disabled batch run: read cids from a workbook with xlrd and download
    # only the rows whose column 5 contains the literal below, using a 50%
    # fill threshold.
    # path = 'D:/QQ/1029559041/FileRecv/tmp001.xlsx'
    # ExcelFile = xlrd.open_workbook(path)
    #
    # for sheetName in ExcelFile.sheet_names():
    #     sheet = ExcelFile.sheet_by_name(sheetName)
    #     for row in sheet.get_rows():
    #         try:
    #             row[5].value.index('日剧')
    #             cid = int(row[2].value)
    #             print('aid={aid},cid={cid},title={title}'.format(aid=int(row[1].value),cid=cid,title=row[5].value))
    #             parseXmlByHistory('{cid}.xml'.format(cid=cid),cid,'50%')
    #         except BaseException as e:
    #             repr(e)
Loading…
Reference in new issue