You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
247 lines
8.3 KiB
247 lines
8.3 KiB
import _thread
|
|
import json
|
|
import math
|
|
import os
|
|
import random
|
|
import threading
|
|
|
|
import zlib
|
|
|
|
from concurrent import futures
|
|
from queue import Queue
|
|
|
|
from lxml import etree
|
|
import django
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from PixivSearch.settings import logger
|
|
|
|
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
|
|
django.setup()
|
|
from PixivSearch.model.config import mediaInfo, stat
|
|
|
|
# Most recently persisted mediaInfo row; exposed to callers through get_().
current_mediaInfo = None

# Cooperative cancellation flag: stop_() sets it True, getIds() resets it to
# False at the start of each crawl; save()/getIds() check it between items.
isStop = None
|
|
|
|
|
def stop_():
    """Request that the running crawl (getIds/save) halt at its next check."""
    global isStop
    isStop = True
|
|
|
|
|
|
def save(params):
    """Fetch one bangumi's media page and persist its mediaInfo/stat rows.

    params: a (bangumi_id, season_id, media_id) tuple, as produced by getIds().
    Network failures are logged and retried; honors the global isStop flag so
    stop_() can abort a crawl in progress.
    """
    if isStop:
        return
    logger.info(params)
    bangumi_id = params[0]
    season_id = params[1]
    media_id = params[2]
    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(repr(e))
        # Bug fix: the retry must not fall through -- previously execution
        # continued past this handler with `req` unbound, raising NameError
        # on the logging line below.
        save(params)
        return
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
        # The media page embeds its state as JSON inside its 4th <script> tag.
        json_obj = getJsonText(req, 3)
        try:
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
                print(stat_info)
                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
                          chn_name=json_obj['mediaInfo']['chn_name']).save()
                global current_mediaInfo
                # Re-read the saved row so get_() exposes the persisted instance.
                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
                stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                     views=stat_info['views']).save()
        except BaseException as e:
            logger.error(repr(e))
|
|
|
|
|
|
def getJsonText(req, index):
    """Parse the JSON blob embedded in the page's <script> tag at *index*.

    The script is assumed to look like `window.x = {...};function(...)`,
    so the JSON sits between the first '=' and the trailing "function" part.
    """
    soup = BeautifulSoup(req.text, 'lxml')
    source = soup.select("script")[index].text
    start = source.index("=") + 1
    end = source.index("function") - 2
    return json.loads(source[start:end])
|
|
|
|
|
|
def get_():
    """Return the most recently persisted mediaInfo instance (may be None)."""
    return current_mediaInfo
|
|
|
|
|
|
# Items per index page; also sizes the crawl thread pool in listen().
page_size = 10
# Bounded hand-off of id batches from getIds() (producer) to listen() (consumer).
queue = Queue(page_size)
|
|
|
|
|
|
def listen():
    """Background consumer: block on the queue for each batch of ids and
    crawl the whole batch with a pool of page_size worker threads."""
    while True:
        batch = queue.get()
        try:
            with futures.ThreadPoolExecutor(page_size) as executor:
                executor.map(save, batch)
            logger.info('结束爬虫')
        except BaseException as e:
            logger.error(repr(e))
|
|
|
|
|
|
# Start the queue consumer in a background thread as a module-import side effect.
_thread.start_new_thread(listen, ())
|
|
|
|
|
|
def getIds():
    """Walk Bilibili's global season index page by page, resolve every season
    to a (bangumi_id, season_id, media_id) tuple, and enqueue each page's
    batch for the listen() worker.

    Resets isStop on entry; stop_() can abort between pages or mid-page.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source; verify the nesting of the page-advance block against the original.
    """
    seasonIdList = []  # season_ids already emitted, used to skip duplicates
    page = 1
    pages = None  # total page count; learned from the first response's 'count'
    name = 'seasonListCallback'  # JSONP callback wrapper to strip from replies
    global isStop
    isStop = False
    while isStop == False and (pages is None or page <= pages):
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            req = requests.get(url, timeout=10)
            if req.status_code == 200:
                json_obj = json.loads(req.text)
                if 'result' in json_obj and 'list' in json_obj['result']:
                    bangumiList = json_obj['result']['list']
                    ids = []
                    for bangumi in bangumiList:
                        if isStop:
                            break  # stop_() was called mid-page
                        if 'season_id' in bangumi:
                            season_id = int(bangumi['season_id'])
                            if season_id in seasonIdList:
                                continue
                            # Per-season JSONP detail endpoint yielding bangumi_id/media_id.
                            url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
                                season_id, name)
                            logger.info(url)
                            req = requests.get(url, timeout=10)
                            if req.status_code == 200:
                                # Strip the JSONP wrapper to obtain plain JSON.
                                child_json_obj = json.loads(
                                    req.text.replace('seasonListCallback(', '').replace(');', ''))
                                if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
                                    bangumi_id = int(child_json_obj['result']['bangumi_id'])
                                    if 'media' in child_json_obj['result']:
                                        media_id = int(child_json_obj['result']['media']['media_id'])
                                        ids.append((bangumi_id, season_id, media_id))
                                        seasonIdList.append(season_id)
                # Learn the total page count once, from the first successful page.
                # NOTE(review): `ids` (and `json_obj['result']`) may be unbound or
                # missing when the response shape differs; the resulting
                # NameError/KeyError is swallowed by the handler below.
                if pages is None and 'count' in json_obj['result']:
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
                page = page + 1
                logger.info('获取id数量%d' % len(ids))
                queue.put(ids)
        except BaseException as e:
            # NOTE(review): the page counter is not advanced on error, so a
            # persistently failing page retries forever until stop_() is called.
            logger.error(repr(e))
            continue
|
|
|
|
|
|
# def testA():
|
|
# req = requests.post('https://api.bilibili.com/x/report/web/heartbeat',
|
|
# data={"aid": 29416,"cid":49052,"csrf": "c0d296db7e33085f9f4730cfee66660b"},
|
|
# cookies=_cookies)
|
|
# print(req.status_code)
|
|
|
|
# Hard-coded session cookies sent with authenticated api.bilibili.com requests.
# NOTE(review): credentials committed to source control -- rotate/remove; they
# have presumably expired.
_cookies = {'DedeUserID': '4372744', 'DedeUserID__ckMd5': 'e8179b74444cae8e',
            'SESSDATA': '919b17d2%2C1524917631%2C3eede719'}
|
|
|
|
|
|
def getCid(aid, type=True):
    """Scan av-ids starting at *aid* (downward when type is True, upward
    otherwise) and return the first-page cid of the first valid video found.

    Returns None if aid reaches 0 without a hit.
    """
    step = -1 if type else 1
    while aid > 0:
        url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid
        print(url)
        req = requests.get(url, cookies=_cookies)
        code = json.loads(req.text)["code"]
        if code == 0:
            # The archive exists; scrape its player page for the cid.
            req = requests.get("https://www.bilibili.com/video/av%d" % aid)
            if req.status_code == 200:
                json_obj = getJsonText(req, 9)
                has_cid = ("videoData" in json_obj
                           and "pages" in json_obj['videoData']
                           and len(json_obj['videoData']['pages']) > 0
                           and "cid" in json_obj['videoData']['pages'][0])
                if has_cid:
                    cid = json_obj['videoData']['pages'][0]['cid']
                    print('cid=%s' % cid)
                    return cid
        aid = aid + step
|
|
|
|
|
|
def getCids(aid):
    """Return the nearest valid cids on either side of *aid* as a dict."""
    return {"min": getCid(aid, True), "max": getCid(aid, False)}
|
|
|
|
|
|
def episodeIdToCid(episode_id):
    """Resolve a bangumi episode id to the cid list of its whole series."""
    url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id
    print("url=%s" % url)
    page = requests.get(url)
    # The episode state JSON is embedded in the player page's 9th <script> tag.
    state = getJsonText(page, 8)
    cids = []
    if "epList" in state:
        cids = [episode['cid'] for episode in state["epList"]]
    return cids
|
|
|
|
|
|
def parseXml(url):
    """Download one danmaku XML snapshot and fold its comments into the
    shared accumulator.

    Side effects (on module globals, guarded by `lock` where mutated):
      - obj["data"][comment] is incremented once per occurrence
      - obj["flag"] becomes True when any entry of `keywords` appears
        in a comment
    Runs concurrently on loadData()'s thread pool.
    """
    print("url=%s" % url)
    comment_selector = etree.HTML(requests.get(url).content)
    comment_content = comment_selector.xpath('//i')
    for comment_each in comment_content:
        comments = comment_each.xpath('//d/text()')
        if comments:
            for comment in comments:
                # Bug fix: the membership test and the update must happen under
                # a single lock acquisition. The old check-then-lock pattern let
                # two threads both observe "missing" and both write 1, losing
                # counts.
                with lock:
                    obj["data"][comment] = obj["data"].get(comment, 0) + 1
                if not obj["flag"]:
                    for keyword in keywords:
                        if keyword in comment:
                            obj["flag"] = True
|
|
|
|
|
|
lock = threading.Lock()  # global lock guarding the shared comment counters in parseXml()
|
|
|
|
|
|
def loadData(cids):
    """Collect every danmaku XML snapshot URL for *cids* and parse them all
    on a 32-thread pool; returns the shared accumulator `obj`."""
    targets = []
    for cid in cids:
        roll = requests.get("https://comment.bilibili.com/rolldate,%d" % cid)
        # The live danmaku file always exists; historical rolls are optional.
        targets.append("https://comment.bilibili.com/%d.xml" % cid)
        if len(roll.text) > 0:
            for entry in json.loads(roll.text):
                targets.append("https://comment.bilibili.com/dmroll,%s,%d" % (entry['timestamp'], cid))
    with futures.ThreadPoolExecutor(32) as executor:
        executor.map(parseXml, targets)
    return obj
|
|
|
|
|
|
def getCommentSort(cids, keywords_):
    """Tally danmaku comment frequencies across *cids*.

    Resets the shared accumulator, records *keywords_* for flag matching,
    then delegates the crawl to loadData() and returns its result dict
    ({"data": {comment: count}, "flag": bool}).
    """
    global keywords, obj
    obj = {"data": {}, "flag": False}
    keywords = keywords_
    return loadData(cids)
|
|
|
|
|
|
if __name__ == '__main__':
    # print(getCids(29416))
    # obj = loadData(
    #     [49052, 49053, 51525, 51526, 53407, 54180, 55295, 55296, 57255, 57256, 59288, 59289, 61559, 61560, 64034, 64035,
    #      67024, 67025, 69284, 73333, 73334, 74024, 74025], ['穹'])
    # Resolve episode 172129 to its series' cid list, then count every danmaku
    # comment, flagging occurrences of '小樱'.
    f = getCommentSort(episodeIdToCid(172129), ['小樱'])

    # obj = loadData([34807341], [])
    # Print the 50 most frequent comments, highest count first.
    for i in sorted(f["data"].items(), key=lambda d: d[1], reverse=True)[:50]:
        print(i)
|
|
|