# pixiv/PixivSearch/dao/bangumi.py
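"""Crawler for bilibili bangumi metadata.

getIds() pages through the season index and queues (bangumi_id, season_id,
media_id) tuples; the background listen() thread drains the queue and
persists mediaInfo/stat rows through the Django ORM. stop_() and get_()
expose stop and progress hooks for callers.
"""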
import _thread
import json
import math
import os
from concurrent import futures
from queue import Queue

import django
import requests
from bs4 import BeautifulSoup

from PixivSearch.settings import logger

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()
from PixivSearch.model.config import mediaInfo, stat

# Most recently saved mediaInfo row, exposed through get_() for progress polling.
current_mediaInfo = None

# Cooperative stop flag checked by save() and getIds().
isStop = None


def stop_():
    global isStop
    isStop = True


def save(params):
    if isStop:
        return
    logger.info(params)
    bangumi_id, season_id, media_id = params

    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(repr(e))
        # Retry on a network error, then return so `req` is never used unbound.
        save(params)
        return

    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
        json_obj = getJsonText(req, 3)
        try:
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
                logger.info(stat_info)
                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
                          chn_name=json_obj['mediaInfo']['chn_name']).save()
                global current_mediaInfo
                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
                stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                     views=stat_info['views']).save()
        except BaseException as e:
            logger.error(repr(e))


def getJsonText(req, index):
    # Pull the JSON literal out of the page's index-th <script> tag: the
    # payload sits between the first '=' and the trailing 'function' call.
    tag = BeautifulSoup(req.text, 'lxml')
    script = tag.select("script")[index].text
    json_str = script[script.index("=") + 1:script.index("function") - 2]
    return json.loads(json_str)
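# getJsonText relies on bilibili pages embedding their state roughly as
#   window.__INITIAL_STATE__={...};(function(){...})();
# (an assumption about the markup at the time), so the slice between '=' and
# 'function' yields the {...} object. The index argument (3 here, 8 and 9
# below) is position-dependent and breaks if the page layout changes.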


def get_():
    # Progress hook: the most recently persisted mediaInfo row.
    return current_mediaInfo


page_size = 10
queue = Queue(page_size)


def listen():
    # Consumer loop: take a batch of id tuples off the queue and save them
    # concurrently with a thread pool.
    while True:
        ids = queue.get()
        try:
            with futures.ThreadPoolExecutor(page_size) as executor:
                executor.map(save, ids)
            logger.info('crawler finished')
        except BaseException as e:
            logger.error(repr(e))


_thread.start_new_thread(listen, ())


# Walk every page of the season index and queue each season's
# (bangumi_id, season_id, media_id) for the listen() thread.
def getIds():
    seasonIdList = []
    page = 1
    pages = None
    name = 'seasonListCallback'
    global isStop
    isStop = False
    while not isStop and (pages is None or page <= pages):
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            req = requests.get(url, timeout=10)
            if req.status_code == 200:
                json_obj = json.loads(req.text)
                if 'result' in json_obj and 'list' in json_obj['result']:
                    bangumiList = json_obj['result']['list']
                    ids = []
                    for bangumi in bangumiList:
                        if isStop:
                            break
                        if 'season_id' in bangumi:
                            season_id = int(bangumi['season_id'])
                            if season_id in seasonIdList:
                                continue
                            # Resolve bangumi_id/media_id via the JSONP season-info endpoint.
                            url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
                                season_id, name)
                            logger.info(url)
                            req = requests.get(url, timeout=10)
                            if req.status_code == 200:
                                child_json_obj = json.loads(
                                    req.text.replace('seasonListCallback(', '').replace(');', ''))
                                if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
                                    bangumi_id = int(child_json_obj['result']['bangumi_id'])
                                    if 'media' in child_json_obj['result']:
                                        media_id = int(child_json_obj['result']['media']['media_id'])
                                        ids.append((bangumi_id, season_id, media_id))
                                        seasonIdList.append(season_id)
                    if pages is None and 'count' in json_obj['result']:
                        pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
                    page += 1
                    logger.info('collected %d ids' % len(ids))
                    queue.put(ids)
        except BaseException as e:
            logger.error(repr(e))
            continue
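
# getIds paginates with ceil(count / page_size): for example, a reported
# count of 3215 with page_size=10 gives ceil(321.5) = 322 pages to fetch.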


# Resolve an aid to the cid of its first page. With type=True walk aid
# downward (type=False upward) until a valid video is found; with type=None
# try only the given aid.
def getCid(aid, type=None):
    while aid > 0:
        url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid
        logger.info(url)
        req = requests.get(url)
        code = json.loads(req.text)["code"]
        if code == 0:
            req = requests.get("https://www.bilibili.com/video/av%d" % aid)
            if req.status_code == 200:
                json_obj = getJsonText(req, 9)
                if "videoData" in json_obj and "pages" in json_obj['videoData'] and len(
                        json_obj['videoData']['pages']) > 0 and "cid" in json_obj['videoData']['pages'][0]:
                    cid = json_obj['videoData']['pages'][0]['cid']
                    logger.info('cid=%s' % cid)
                    return cid
        if type is None:
            break
        elif type:
            aid = aid - 1
        else:
            aid = aid + 1


# Bracket the resolvable aid range around `aid`: probe downward and upward
# for the nearest valid cids.
def getCids(aid):
    return {"min": getCid(aid, True), "max": getCid(aid, False)}


# Collect the cid of every episode in a bangumi playlist.
def episodeIdToCid(episode_id):
    cids = []
    url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id
    logger.info("url=%s" % url)
    req = requests.get(url)
    json_obj = getJsonText(req, 8)
    if "epList" in json_obj:
        for i in json_obj["epList"]:
            cids.append(i['cid'])
    return cids


if __name__ == '__main__':
    print(getCids(29416))
    # obj = loadData([34807341], [])
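
    # A minimal sketch of a full crawl instead, assuming a configured Django
    # database and reachable bilibili endpoints (listen() already runs in a
    # background thread; getIds() blocks until every index page is fetched or
    # stop_() is called from another thread):
    # getIds()
    # print(get_())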