pixiv/PixivSearch/dao/Comment.py


import json
import threading
from concurrent import futures
import requests
import xlrd  # only used by the commented-out Excel example in __main__
from lxml import etree
from PixivSearch.dao.bangumi import episodeIdToCid


class Comment:
    lock = threading.Lock()  # global lock shared by all worker threads

    def __init__(self, keywords_=None) -> None:
        super().__init__()
        self.obj = {'data': {}, 'flag': False}
        self.keywords = keywords_

    # Print a frequency ranking of the danmaku (bullet comments) across a bangumi's episodes
    def getCommentSort(self, cids):
        urls = []
        for cid in cids:
            urls.extend(getCidUrls(cid))
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(self.count, urls)
        # Sort by occurrence count and print the top 50
        for index, data in enumerate(
                sorted(self.obj["data"].items(), key=lambda d: d[1], reverse=True)[:50]):
            print('{index}:{data}'.format(index=index + 1, data=data))
    # Count each danmaku fetched from one URL; also flag any keyword hit
    def count(self, url, desc=None):
        content = requests.get(url).content
        comment_selector = etree.HTML(content)
        if desc is not None:
            print(desc)
        print("url=%s" % url)
        for comment in comment_selector.xpath('//i//d/text()'):
            # Check-and-increment must be atomic, so both steps run under the lock
            with self.lock:
                if comment in self.obj["data"]:
                    self.obj["data"][comment] += 1
                else:
                    self.obj["data"][comment] = 1
            if not self.obj["flag"] and self.keywords:
                for keyword in self.keywords:
                    if keyword in comment:
                        self.obj["flag"] = True
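
# Usage sketch: rank danmaku for a whole season. episodeIdToCid(172095) comes
# from the commented-out example in __main__ below; the keyword list here is
# illustrative only ('flag' flips to True once any keyword appears):
#
#     Comment(keywords_=['名场面']).getCommentSort(episodeIdToCid(172095))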


# Build the list of historical danmaku snapshot URLs for a cid
def getCidUrls(cid):
    urls = []
    url = "https://comment.bilibili.com/rolldate,%d" % cid
    req = requests.get(url)
    if len(req.text) > 0:
        # rolldate returns a JSON array whose entries carry a 'timestamp' field;
        # each timestamp addresses one dmroll snapshot
        for i in json.loads(req.text):
            urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid))
    else:
        # No history available: fall back to the current danmaku pool
        urls.append("https://comment.bilibili.com/%d.xml" % cid)
    return urls
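
# Quick check (network required; the cid is the one from the commented-out
# example in __main__):
#
#     for u in getCidUrls(10815558)[:3]:
#         print(u)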


# Download historical danmaku for a cid and merge them into one XML file
def parseXmlByHistory(path, cid, size=None):
    dlist = set()
    flag = parseXml(getCidUrls(cid), dlist, size)
    # Write the file when no fill target was given, or when the target was reached
    if size is None or flag:
        with open(path, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="UTF-8"?><i>')
            for i in dlist:
                f.write(('\r\n' + i).encode())
            f.write(b'\r\n</i>')
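
# The resulting file is a minimal bilibili-style danmaku XML, e.g.:
#
#     <?xml version="1.0" encoding="UTF-8"?><i>
#     <d p="...">danmaku text</d>
#     </i>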


# Parse danmaku XML from one or more URLs into dlist; returns True as soon as
# the merged set fills the requested fraction of the danmaku pool, else False
def parseXml(urls, dlist, size=None):
    if isinstance(urls, str):
        urls = [urls]
    if size is not None:
        size = float(size.strip('%')) / 100.0  # e.g. '50%' -> 0.5
    for url in urls:
        content = requests.get(url).content
        comment_selector = etree.HTML(content)
        comments = comment_selector.xpath('//i//d/text()')
        maxlimit = int(comment_selector.xpath('//i//maxlimit/text()')[0])
        if len(comments) > 0:
            print('danmaku: {count}, pool limit: {maxlimit}, pool fill: {p}'.format(
                count=len(comments), maxlimit=maxlimit,
                p='%.2f%%' % (len(comments) / maxlimit * 100)))
        for element in comment_selector.xpath('//i//d'):
            if len(element.xpath("text()")) > 0:
                fstr = '<d p="{p}">{content}</d>'.format(p=str(element.xpath("@p")[0]),
                                                         content=str(element.xpath("text()")[0]))
                dlist.add(fstr)
        currentSize = len(dlist) / maxlimit
        print('fill rate: {l}'.format(l='%.2f%%' % (currentSize * 100)))
        if size is not None and currentSize >= size:
            return True
    return False
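
# Usage sketch: stop merging once half of the pool is covered. The '50%'
# threshold mirrors the commented-out Excel loop below; the cid is the one
# from __main__'s first example:
#
#     dlist = set()
#     reached = parseXml(getCidUrls(10815558), dlist, '50%')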


if __name__ == '__main__':
    # parseXmlByHistory('10815558.xml', 10815558)
    # Comment('').getCommentSort(episodeIdToCid(172095))
    cids = [11664778,
            11662541,
            11661412,
            11664304,
            11666093,
            ]
    for cid in cids:
        parseXmlByHistory('{cid}.xml'.format(cid=cid), cid)
    # path = 'D:/QQ/1029559041/FileRecv/tmp001.xlsx'
    # ExcelFile = xlrd.open_workbook(path)
    #
    # for sheetName in ExcelFile.sheet_names():
    #     sheet = ExcelFile.sheet_by_name(sheetName)
    #     for row in sheet.get_rows():
    #         try:
    #             row[5].value.index('日剧')
    #             cid = int(row[2].value)
    #             print('aid={aid},cid={cid},title={title}'.format(aid=int(row[1].value), cid=cid, title=row[5].value))
    #             parseXmlByHistory('{cid}.xml'.format(cid=cid), cid, '50%')
    #         except BaseException as e:
    #             repr(e)