You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pixiv/PixivSearch/dao/bangumi.py

118 lines
3.0 KiB

7 years ago
import _thread
7 years ago
import json
7 years ago
import math
7 years ago
import os
7 years ago
import random
7 years ago
import time
7 years ago
from concurrent import futures
7 years ago
import django
import requests
from bs4 import BeautifulSoup
from PixivSearch.settings import logger
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()
from PixivSearch.model.config import mediaInfo, stat
current_mediaInfo = mediaInfo(id=0, chn_name='null')
# Cooperative stop flag: stop_() raises it; check() and save() poll it.
isStop = False

# ThreadPoolExecutor created in threadSave(); check() shuts it down on stop.
executors = None
def check():
    """Poll the stop flag once per second; shut the worker pool down when raised."""
    while not isStop:
        time.sleep(1)
    logger.info('停止多线程爬虫')
    executors.shutdown()
7 years ago
7 years ago
def save(md):
    """Crawl the bilibili media page for id *md* and persist its title and stats.

    Retries the request indefinitely on failure. Does nothing once the
    module-level stop flag is raised.
    """
    if isStop:
        return

    url = "https://www.bilibili.com/bangumi/media/md%d" % md
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(e)
        # Bug fix: the retry must end THIS invocation. Falling through left
        # ``req`` unbound, so the logging line below raised NameError.
        save(md)
        return
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))

    if req.status_code == 200:
        tag = BeautifulSoup(req.text, 'lxml')
        # The 4th <script> embeds the page state (presumably
        # ``window.__INITIAL_STATE__ = {...};(function(){...`` — confirm against
        # the live page); slice out the JSON literal between '=' and 'function'.
        script = tag.select("script")[3].text
        json_str = script[script.index("=") + 1:script.index("function") - 2]
        json_obj = json.loads(json_str)
        try:
            stat_info = json_obj['mediaInfo']['stat']
            print(json_obj['mediaInfo']['chn_name'])
            print(stat_info)
            mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
            global current_mediaInfo
            # Re-read the row we just wrote so readers (get_) see the stored record.
            current_mediaInfo = mediaInfo.objects.get(pk=md)
            stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                 views=stat_info['views']).save()
        except BaseException as e:
            logger.error("发生异常")
            logger.error(e)
7 years ago
# asdasd
7 years ago
def get_():
    """Return the mediaInfo record of the most recently crawled bangumi."""
    # Read-only access: no ``global`` declaration needed for a lookup.
    return current_mediaInfo
7 years ago
# Pagination state for the season-index crawl.
page_size = 100  # items requested per API page
pages = None     # total page count, derived from the first successful response
ids = None       # collected season_id values


def getIds():
    """Fill the module-level ``ids`` list with every bangumi season_id.

    Walks bilibili's paginated season index API. The page count comes from
    the ``count`` field of the first successful response. A failed page is
    retried indefinitely (``page`` is not advanced on error).
    """
    global ids
    global pages
    if ids is None or len(ids) != 0:
        ids = []
        page = 1
        while pages is None or page <= pages:
            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
            logger.info(url)
            try:
                req = requests.get(url, timeout=10)
                json_obj = json.loads(req.text)
                bangumiList = json_obj['result']['list']
                for bangumi in bangumiList:
                    ids.append(int(bangumi['season_id']))
                if pages is None:
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
            except BaseException as e:
                logger.error('连接超时')
                # Bug fix: was ``logger(e)`` — a Logger instance is not
                # callable, so every error path raised TypeError.
                logger.error(e)
                continue  # retry the same page
            page = page + 1
7 years ago
7 years ago
def threadSave():
    """Collect every season id, then crawl them all on a 32-thread pool."""
    getIds()
    logger.info(len(ids))
    try:
        global executors, isStop
        isStop = False
        executors = futures.ThreadPoolExecutor(32)
        with executors as pool:
            pool.map(save, ids)
        logger.info('结束爬虫')
    except BaseException as e:
        logger.error(e)
7 years ago
7 years ago
def stop_():
    """Raise the stop flag so the crawler threads wind down."""
    global isStop
    isStop = True