# PixivSearch/dao/bangumi.py
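# Crawler DAO for Bilibili bangumi data: a producer/consumer pipeline that
# pages through the season index, resolves (bangumi_id, season_id, media_id)
# triples and stores their stats via the Django ORM, plus helpers that download
# danmaku (bullet-comment) XML and count comment frequencies across threads.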

import _thread
import json
import math
import os
import threading
from concurrent import futures
from queue import Queue

import django
import requests
from bs4 import BeautifulSoup
from lxml import etree

from PixivSearch.settings import logger

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()
from PixivSearch.model.config import mediaInfo, stat

current_mediaInfo = None
isStop = None
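

# stop_() flips the shared isStop flag, which both save() and the paging loop
# in getIds() check, so a running crawl can be cancelled from another thread.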
def stop_():
    global isStop
    isStop = True
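

# save() fetches one bangumi's media page, pulls the embedded page-state JSON
# out of a <script> tag via getJsonText(), and persists the title and the
# danmaku/favorite/view counters through the mediaInfo and stat models.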
def save(params):
    if isStop:
        return
    logger.info(params)
    bangumi_id, season_id, media_id = params
    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(repr(e))
        save(params)  # retry with the same params, then bail out of this attempt
        return
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
        json_obj = getJsonText(req, 3)
        try:
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
                print(stat_info)
                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
                          chn_name=json_obj['mediaInfo']['chn_name']).save()
                global current_mediaInfo
                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
                stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                     views=stat_info['views']).save()
        except BaseException as e:
            logger.error(repr(e))
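

# The media and video pages embed their state as a JavaScript assignment
# inside one of the page's <script> tags; getJsonText() slices out the object
# literal between the '=' and the trailing 'function' and parses it as JSON.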
def getJsonText(req, index):
    tag = BeautifulSoup(req.text, 'lxml')
    script = tag.select("script")[index].text
    json_str = script[script.index("=") + 1:script.index("function") - 2]
    return json.loads(json_str)
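

# get_() exposes the most recently saved mediaInfo row, presumably for
# polling by a status view.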
def get_():
    return current_mediaInfo
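

# Producer/consumer wiring: getIds() puts batches of id triples on a bounded
# queue and the background listen() thread drains it, fanning each batch out
# to a ThreadPoolExecutor that runs save() concurrently.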
page_size = 10
queue = Queue(page_size)


def listen():
    while True:
        ids = queue.get()
        try:
            with futures.ThreadPoolExecutor(page_size) as executor:
                executor.map(save, ids)
            logger.info('Finished crawling batch')
        except BaseException as e:
            logger.error(repr(e))


_thread.start_new_thread(listen, ())
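

# getIds() is the producer: it walks the paginated season index, resolves each
# season's bangumi_id/media_id through the per-season JSONP endpoint, and
# enqueues one batch of (bangumi_id, season_id, media_id) tuples per page.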
def getIds():
    seasonIdList = []
    page = 1
    pages = None
    name = 'seasonListCallback'
    global isStop
    isStop = False
    while not isStop and (pages is None or page <= pages):
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            req = requests.get(url, timeout=10)
            if req.status_code == 200:
                json_obj = json.loads(req.text)
                if 'result' in json_obj and 'list' in json_obj['result']:
                    bangumiList = json_obj['result']['list']
                    ids = []
                    for bangumi in bangumiList:
                        if isStop:
                            break
                        if 'season_id' in bangumi:
                            season_id = int(bangumi['season_id'])
                            if season_id in seasonIdList:
                                continue
                            url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
                                season_id, name)
                            logger.info(url)
                            req = requests.get(url, timeout=10)
                            if req.status_code == 200:
                                child_json_obj = json.loads(
                                    req.text.replace('seasonListCallback(', '').replace(');', ''))
                                if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
                                    bangumi_id = int(child_json_obj['result']['bangumi_id'])
                                    if 'media' in child_json_obj['result']:
                                        media_id = int(child_json_obj['result']['media']['media_id'])
                                        ids.append((bangumi_id, season_id, media_id))
                                        seasonIdList.append(season_id)
                    if pages is None and 'count' in json_obj['result']:
                        pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
                    page = page + 1
                    logger.info('Fetched %d ids' % len(ids))
                    queue.put(ids)
        except BaseException as e:
            logger.error(repr(e))
            continue


# def testA():
#     req = requests.post('https://api.bilibili.com/x/report/web/heartbeat',
#                         data={"aid": 29416, "cid": 49052, "csrf": "c0d296db7e33085f9f4730cfee66660b"},
#                         cookies=_cookies)
#     print(req.status_code)

_cookies = {'DedeUserID': '4372744', 'DedeUserID__ckMd5': 'e8179b74444cae8e',
            'SESSDATA': '919b17d2%2C1524917631%2C3eede719'}
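

# getCid() walks av ids downward (descend=True) or upward from `aid` until it
# finds an archive whose stat endpoint answers code 0 and whose video page
# carries a first-page cid, then returns that cid.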
def getCid(aid, descend=True):
    while aid > 0:
        url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid
        print(url)
        req = requests.get(url, cookies=_cookies)
        code = json.loads(req.text)["code"]
        if code == 0:
            req = requests.get("https://www.bilibili.com/video/av%d" % aid)
            if req.status_code == 200:
                json_obj = getJsonText(req, 9)
                if "videoData" in json_obj and "pages" in json_obj['videoData'] and len(
                        json_obj['videoData']['pages']) > 0 and "cid" in json_obj['videoData']['pages'][0]:
                    cid = json_obj['videoData']['pages'][0]['cid']
                    print('cid=%s' % cid)
                    return cid
        if descend:
            aid = aid - 1
        else:
            aid = aid + 1
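

# getCids() brackets an aid: the nearest valid cid searching downward ("min")
# and the nearest searching upward ("max").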
def getCids(aid):
    return {"min": getCid(aid, True), "max": getCid(aid, False)}
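

# episodeIdToCid() resolves a bangumi episode page to the cid list of every
# episode in its epList, ready to be fed to loadData().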
def episodeIdToCid(episode_id):
    cids = []
    url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id
    print("url=%s" % url)
    req = requests.get(url)
    json_obj = getJsonText(req, 8)
    if "epList" in json_obj:
        for i in json_obj["epList"]:
            cids.append(i['cid'])
    return cids
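

# parseXml() downloads one danmaku XML file and tallies every comment line
# (<d> text nodes) into the shared obj["data"] counter; obj["flag"] is set as
# soon as any comment contains one of the search keywords.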
def parseXml(url):
    print("url=%s" % url)
    comment_selector = etree.HTML(requests.get(url).content)
    comment_content = comment_selector.xpath('//i')
    for comment_each in comment_content:
        comments = comment_each.xpath('//d/text()')
        if comments:
            for comment in comments:
                with lock:  # check and update atomically so no count is lost
                    obj["data"][comment] = obj["data"].get(comment, 0) + 1
                if not obj["flag"]:
                    for keyword in keywords:
                        if keyword in comment:
                            obj["flag"] = True


lock = threading.Lock()  # global resource lock shared by the parser threads
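

# loadData() expands each cid into its current danmaku file plus every
# historical snapshot listed by the rolldate endpoint, then parses all of the
# XML urls on a 32-thread pool.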
def loadData(cids):
    params = []
    for cid in cids:
        url = "https://comment.bilibili.com/rolldate,%d" % cid
        req = requests.get(url)
        urls = ["https://comment.bilibili.com/%d.xml" % cid]
        if len(req.text) > 0:
            for i in json.loads(req.text):
                urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid))
        for url in urls:
            params.append(url)
    with futures.ThreadPoolExecutor(32) as executor:
        executor.map(parseXml, params)
    return obj
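

# getCommentSort() resets the shared keyword list and counter dict, then
# returns the aggregated counts for the given cids.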
def getCommentSort(cids, keywords_):
    global keywords, obj
    keywords = keywords_
    obj = {"data": {}, "flag": False}
    return loadData(cids)
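

# Minimal usage sketch (assumes episode 172095 is still reachable): resolve an
# episode to its cids, aggregate danmaku counts, and print the top 50.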
if __name__ == '__main__':
    # print(getCids(29416))
    # obj = loadData(
    #     [49052, 49053, 51525, 51526, 53407, 54180, 55295, 55296, 57255, 57256, 59288, 59289, 61559, 61560, 64034,
    #      64035, 67024, 67025, 69284, 73333, 73334, 74024, 74025], ['穹'])
    f = getCommentSort(episodeIdToCid(172095), [])
    # obj = loadData([34807341], [])
    for i in sorted(f["data"].items(), key=lambda d: d[1], reverse=True)[:50]:
        print(i)