# pixiv/PixivSearch/dao/Comment.py

import json
import threading
from concurrent import futures

import requests
import xlrd
from lxml import etree

from PixivSearch.dao.bangumi import episodeIdToCid


class Comment:
    """Tally how often each danmaku (bullet-comment) text occurs across comment feeds.

    Results accumulate in ``self.obj``:

    * ``self.obj['data']`` maps comment text to its occurrence count.
    * ``self.obj['flag']`` becomes True once any counted comment contains
      one of the configured keywords.
    """

    # Class-level lock, shared by every instance; guards updates to obj['data'].
    lock = threading.Lock()

    def __init__(self, keywords_=None) -> None:
        """Create an empty tally.

        Args:
            keywords_: iterable of substrings to look for in comments, or
                None to disable keyword detection.
        """
        super().__init__()
        self.obj = {'data': {}, 'flag': False}
        self.keywords = keywords_

    def getCommentSort(self, cids):
        """Print the 50 most frequent comments across all given cids.

        Args:
            cids: iterable of integer cids; each is expanded to its comment
                URLs via ``getCidUrls`` and fetched on a 32-thread pool.
        """
        urls = []
        for cid in cids:
            urls.extend(getCidUrls(cid))
        with futures.ThreadPoolExecutor(32) as executor:
            # NOTE: the iterator returned by map() is never consumed, so an
            # exception raised inside a worker is not re-raised here.
            executor.map(self.count, urls)
        ranking = sorted(self.obj["data"].items(), key=lambda d: d[1], reverse=True)[:50]
        for index, data in enumerate(ranking, start=1):
            print('{index}:{data}'.format(index=index, data=data))

    def count(self, url, desc=None):
        """Fetch one comment document and add its comments to the tally.

        Args:
            url: address of the comment XML document.
            desc: optional description printed before the URL.
        """
        content = requests.get(url).content
        comment_selector = etree.HTML(content)
        if desc is not None:
            print(desc)
        print("url=%s" % url)
        self._tally(comment_selector.xpath('//i//d/text()'))

    def _tally(self, comments):
        """Add each comment text in ``comments`` to the counts and update the keyword flag."""
        # A None (or empty) keyword collection simply disables keyword matching.
        keywords = self.keywords or ()
        data = self.obj["data"]
        for comment in comments:
            # The membership test and the write must happen under the same
            # lock acquisition; otherwise two workers can both see the key
            # as missing and one increment is lost.
            with self.lock:
                data[comment] = data.get(comment, 0) + 1
            if not self.obj["flag"] and any(keyword in comment for keyword in keywords):
                self.obj["flag"] = True


# Resolve the historical danmaku URLs for a cid.
def getCidUrls(cid):
    """Return the list of comment-document URLs for ``cid``.

    The ``rolldate`` endpoint is queried first. Its body is parsed as JSON
    and one ``dmroll`` URL is built per entry from that entry's
    ``'timestamp'`` value. When the body is empty, the single
    ``<cid>.xml`` URL is returned instead.

    Args:
        cid: integer comment-pool id (formatted with ``%d``).

    Returns:
        A list of URL strings.
    """
    response = requests.get("https://comment.bilibili.com/rolldate,%d" % cid)
    body = response.text
    if not body:
        return ["https://comment.bilibili.com/%d.xml" % cid]
    return [
        "https://comment.bilibili.com/dmroll,%s,%d" % (entry['timestamp'], cid)
        for entry in json.loads(body)
    ]


# Download the historical danmaku of a cid into a single XML file.
def parseXmlByHistory(path, cid, size=None):
    """Collect every historical comment for ``cid`` and write them to ``path``.

    Args:
        path: destination file; it is opened in binary write mode and
            overwritten.
        cid: integer comment-pool id passed to ``getCidUrls``.
        size: optional fill threshold as a percentage string (for example
            ``'50%'``), forwarded to ``parseXml``. When given, the file is
            written only if ``parseXml`` reports the threshold was reached.
            When None, the file is always written.
    """
    dlist = set()
    flag = parseXml(getCidUrls(cid), dlist, size)
    # No threshold: always write. Threshold given: write only if it was reached.
    if size is None or flag:
        # The context manager closes the file even if a write fails part-way.
        with open(path, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="UTF-8"?><i>')
            for i in dlist:
                f.write(('\r\n' + i).encode('utf-8'))
            f.write(b'\r\n</i>')


def parseXml(urls, dlist, size=None):
    """Fetch comment documents and collect their ``<d>`` entries into ``dlist``.

    Args:
        urls: one URL string or an iterable of URL strings.
        dlist: set that receives one serialized ``<d p="...">text</d>``
            string per comment; it is mutated in place, so duplicates across
            documents collapse.
        size: optional fill threshold as a percentage string such as
            ``'50%'``. The fill ratio is ``len(dlist) / maxlimit``, where
            ``maxlimit`` is read from the document being processed.

    Returns:
        True as soon as ``size`` is given and the fill ratio reaches it,
        otherwise False after every URL has been processed.
    """
    # Function-scope import so the block does not depend on the file header.
    from xml.sax.saxutils import escape

    if isinstance(urls, str):
        urls = [urls]
    threshold = None if size is None else float(size.strip('%')) / 100.0
    for url in urls:
        payload = requests.get(url).content
        comment_selector = etree.HTML(payload)
        texts = comment_selector.xpath('//i//d/text()')
        if not texts:
            continue

        # A document without a usable <maxlimit> must not abort the whole run:
        # its comments are still collected, only the fill-ratio reporting and
        # the threshold check are skipped for it.
        limit_nodes = comment_selector.xpath('//i//maxlimit/text()')
        maxlimit = int(limit_nodes[0]) if limit_nodes else 0
        if maxlimit > 0:
            print('弹幕数：{list},最大弹幕数：{maxlimit},弹幕池填充：{p}'.format(list=len(texts), maxlimit=maxlimit,
                                                                 p='%.2f%%' % (len(texts) / maxlimit * 100)))
        for element in comment_selector.xpath('//i//d'):
            content = element.xpath("text()")
            if content:
                # The parser hands back decoded text, so markup characters
                # must be re-escaped or the written file is not valid XML.
                fstr = '<d p="{p}">{content}</d>'.format(
                    p=escape(str(element.xpath("@p")[0]), {'"': '&quot;'}),
                    content=escape(str(content[0])))
                dlist.add(fstr)

            if maxlimit > 0:
                currentSize = len(dlist) / maxlimit
                print('填充率：{l}'.format(l='%.2f%%' % (currentSize * 100)))
                if threshold is not None and currentSize >= threshold:
                    return True
    return False

if __name__ == '__main__':
    # Earlier ad-hoc invocations, kept for reference:
    # parseXmlByHistory('10815558.xml', 10815558)
    # Comment('').getCommentSort(episodeIdToCid(172095))

    # Each cid below is downloaded into "<cid>.xml", a relative path.
    cids = [
        11664778,
        11662541,
        11661412,
        11664304,
        11666093,
    ]

    for cid in cids:
        parseXmlByHistory('%d.xml' % cid, cid)

    # path = 'D:/QQ/1029559041/FileRecv/tmp001.xlsx'
    # ExcelFile = xlrd.open_workbook(path)
    #
    #
    # for sheetName in ExcelFile.sheet_names():
    #     sheet = ExcelFile.sheet_by_name(sheetName)
    #     for row in sheet.get_rows():
    #         try:
    #             row[5].value.index('日剧')
    #             cid = int(row[2].value)
    #             print('aid={aid},cid={cid},title={title}'.format(aid=int(row[1].value),cid=cid,title=row[5].value))
    #             parseXmlByHistory('{cid}.xml'.format(cid=cid),cid,'50%')
    #         except BaseException as e:
    #             repr(e)
阿里云、弹幕 7 years ago			`import json`
			`import threading`
			`from concurrent import futures`

			`import requests`
			`import xlrd`
			`from lxml import etree`

			`from PixivSearch.dao.bangumi import episodeIdToCid`


			`class Comment:`
			`lock = threading.Lock() # 多线程全局资源锁`

			`def __init__(self, keywords_=None) -> None:`
			`super().__init__()`
			`self.obj = {'data': {}, 'flag': False}`
			`self.keywords = keywords_`

			`# 获取番剧合集弹幕排行榜`
			`def getCommentSort(self, cids):`

			`urls = []`
			`for cid in cids:`
			`urls.extend(getCidUrls(cid))`
			`with futures.ThreadPoolExecutor(32) as executor:`
			`executor.map(self.count, urls)`
			`for index, data in enumerate(`
			`sorted(self.obj["data"].items(), key=lambda d: d[1], reverse=True)[`
			`:50]):`
			`print('{index}:{data}'.format(index=index+1, data=data))`

			`# 获取番剧合集弹幕排行榜`
			`def count(self, url, desc=None):`
			`bytes = requests.get(url).content`
			`comment_selector = etree.HTML(bytes)`
			`if not desc is None:`
			`print(desc)`
			`print("url=%s" % url)`
			`for comment in comment_selector.xpath('//i//d/text()'):`
			`if comment in self.obj["data"]:`
			`with self.lock:`
			`self.obj["data"][comment] = self.obj["data"][comment] + 1`
			`else:`
			`with self.lock:`
			`self.obj["data"][comment] = 1`
			`if not self.obj["flag"]:`
			`for keyword in self.keywords:`
			`if keyword in comment:`
			`self.obj["flag"] = True`


			`# 根据cid获取历史弹幕地址`
			`def getCidUrls(cid):`
			`urls = []`
			`url = "https://comment.bilibili.com/rolldate,%d" % cid`
			`req = requests.get(url)`
			`if len(req.text) > 0:`
			`for i in json.loads(req.text):`
			`urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid))`
			`else:`
			`urls.append("https://comment.bilibili.com/%d.xml" % cid)`
			`return urls`


			`# 下载历史弹幕`
			`def parseXmlByHistory(path, cid,size=None):`
			`dlist = set()`
			`flag=parseXml(getCidUrls(cid), dlist, size)`
			`if size is None or not size is None and flag:`
			`f = open(path, 'wb')`
			`f.write(b'<?xml version="1.0" encoding="UTF-8"?><i>')`
			`for i in dlist:`
			`f.write(('\r\n' + i).encode())`
			`f.write(b'\r\n</i>')`
			`f.close()`


			`def parseXml(urls, dlist, size=None):`
			`if isinstance(urls, str):`
			`urls = [urls]`
			`if not size is None:`
			`size = float(size.strip('%')) / 100.0`
			`for url in urls:`
			`bytes = requests.get(url).content`
			`comment_selector = etree.HTML(bytes)`
			`list = comment_selector.xpath('//i//d/text()')`
			`maxlimit = int(comment_selector.xpath('//i//maxlimit/text()')[0])`

			`if len(list) > 0:`
			`print('弹幕数：{list},最大弹幕数：{maxlimit},弹幕池填充：{p}'.format(list=len(list), maxlimit=maxlimit,`
			`p='%.2f%%' % (len(list) / maxlimit * 100)))`
			`for element in comment_selector.xpath('//i//d'):`
			`if len(element.xpath("text()")) > 0:`
			`fstr = '<d p="{p}">{content}</d>'.format(p=str(element.xpath("@p")[0]),`
			`content=str(element.xpath("text()")[0]))`
			`dlist.add(fstr)`

			`currentSize = len(dlist) / maxlimit`
			`print('填充率：{l}'.format(l='%.2f%%' % (currentSize * 100)))`
			`if not size is None and currentSize >= size:`
			`return True`
			`return False`

			`if __name__ == '__main__':`
			`# parseXmlByHistory('10815558.xml', 10815558)`
			`# Comment('').getCommentSort(episodeIdToCid(172095))`
			`cids=[11664778,`
			`11662541,`
			`11661412,`
			`11664304,`
			`11666093,`

			`]`

			`for cid in cids:`
			`parseXmlByHistory('{cid}.xml'.format(cid=cid),cid)`

			`# path = 'D:/QQ/1029559041/FileRecv/tmp001.xlsx'`
			`# ExcelFile = xlrd.open_workbook(path)`
			`#`
			`#`
			`# for sheetName in ExcelFile.sheet_names():`
			`# sheet = ExcelFile.sheet_by_name(sheetName)`
			`# for row in sheet.get_rows():`
			`# try:`
			`# row[5].value.index('日剧')`
			`# cid = int(row[2].value)`
			`# print('aid={aid},cid={cid},title={title}'.format(aid=int(row[1].value),cid=cid,title=row[5].value))`
			`# parseXmlByHistory('{cid}.xml'.format(cid=cid),cid,'50%')`
			`# except BaseException as e:`
			`# repr(e)`