You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
117 lines
3.0 KiB
117 lines
3.0 KiB
import _thread
|
|
import json
|
|
import math
|
|
import os
|
|
import random
|
|
import time
|
|
from concurrent import futures
|
|
|
|
import django
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from PixivSearch.settings import logger
|
|
|
|
# Django has to be told which settings module to use and then be initialised
# before any model module is imported, so the order of the next three
# statements is significant.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()
from PixivSearch.model.config import mediaInfo, stat

# Media record most recently stored by save(); until the first successful save
# it is an unsaved placeholder instance.
current_mediaInfo = mediaInfo(id=0, chn_name='null')

# Stop flag: raised by stop_(), cleared by threadSave(), polled by check()
# and by every save() call.
isStop = False

# Thread pool of the current crawl; assigned in threadSave().
executors = None
|
|
|
|
|
|
def check():
    """Block until a stop is requested, then shut the crawler pool down.

    Polls the module-level ``isStop`` flag once per second. When the flag is
    set, logs the stop message and calls ``shutdown()`` on the executor stored
    in the module-level ``executors`` variable, then returns.

    ``executors`` stays ``None`` until ``threadSave`` has created a pool, so a
    stop requested before any crawl started must not try to shut it down.
    """
    while not isStop:
        time.sleep(1)

    logger.info('停止多线程爬虫')
    # No pool exists yet if no crawl has been started; nothing to shut down.
    if executors is not None:
        executors.shutdown()
|
|
|
|
|
|
def save(md, max_retries=3):
    """Scrape one bangumi media page and store its title and statistics.

    Downloads ``https://www.bilibili.com/bangumi/media/md<md>``, extracts the
    JSON object embedded in the page's fourth ``<script>`` element, and saves
    a ``mediaInfo`` row (id, chn_name) and a ``stat`` row (id, danmakus,
    favorites, views). On success the module-level ``current_mediaInfo`` is
    replaced by the freshly stored record.

    Args:
        md: Integer media id; formatted into the URL with ``%d`` and used as
            the primary key of both saved rows.
        max_retries: Number of additional attempts made after a failed HTTP
            request before this id is abandoned.

    Returns:
        None. Nothing is saved when a stop was requested, the request kept
        failing, the response status is not 200, or the page could not be
        parsed; parse/storage failures are logged, not raised.
    """
    global current_mediaInfo

    url = "https://www.bilibili.com/bangumi/media/md%d" % md

    response = None
    attempts = max_retries + 1
    for _ in range(attempts):
        # Checked before every attempt so a stop request also ends retrying.
        if isStop:
            return
        try:
            response = requests.get(url, timeout=10)
            break
        except requests.RequestException as e:
            logger.error(e)

    if response is None:
        # Bounded retry replaces the former unbounded recursion, which also
        # fell through to use the never-assigned response object.
        logger.error("request_url=%s,giving up after %d attempts" % (url, attempts))
        return

    logger.info("request_url=%s,status_code=%d" % (url, response.status_code))
    if response.status_code != 200:
        return

    try:
        page = BeautifulSoup(response.text, 'lxml')
        script = page.select("script")[3].text
        # The payload is the text between the first "=" and two characters
        # before the first "function" in that script.
        # NOTE(review): this depends on the page layout -- confirm it still holds.
        json_str = script[script.index("=") + 1:script.index("function") - 2]
        media = json.loads(json_str)['mediaInfo']
        stat_info = media['stat']
        logger.info("chn_name=%s,stat=%s" % (media['chn_name'], stat_info))

        mediaInfo(id=md, chn_name=media['chn_name']).save()
        current_mediaInfo = mediaInfo.objects.get(pk=md)
        stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
             views=stat_info['views']).save()
    except Exception as e:
        # Best effort per id: one malformed page must not abort the crawl.
        logger.error("发生异常")
        logger.error(e)
|
|
|
|
|
|
def get_():
    """Return the object currently bound to the module-level ``current_mediaInfo``."""
    return current_mediaInfo
|
|
|
|
|
|
# Number of entries requested per index page by getIds().
page_size = 100

# Total number of index pages; computed by getIds() from the first response.
pages = None

# Season ids collected by getIds(); None until it has run.
ids = None
|
|
|
|
|
|
def getIds(max_retries=3):
    """Collect every season id from the bangumi index into the global ``ids``.

    Resets the module-level ``ids`` list, then walks the paged index API
    (``page_size`` entries per page) and appends the integer ``season_id`` of
    every entry. The total page count is derived from ``result.count`` of the
    first successful response and cached in the module-level ``pages``, so
    later calls reuse it.

    Args:
        max_retries: Number of additional attempts for a page whose request
            or parsing failed. When they are exhausted the page is skipped,
            or the crawl is aborted if the page count is still unknown.

    Returns:
        None; results are published through the ``ids`` and ``pages`` globals.
    """
    global ids
    global pages

    ids = []
    page = 1
    failures = 0
    while pages is None or page <= pages:
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            response = requests.get(url, timeout=10)
            result = json.loads(response.text)['result']
            # Parse the whole page before touching ``ids`` so that a page
            # failing half-way is not partially recorded and then duplicated
            # by the retry.
            page_ids = [int(bangumi['season_id']) for bangumi in result['list']]
            total_pages = pages
            if total_pages is None:
                total_pages = int(math.ceil(int(result['count']) / page_size))
        except Exception as e:
            logger.error('连接超时')
            logger.error(e)
            failures += 1
            if failures > max_retries:
                if pages is None:
                    # Without a page count there is no way to know where the
                    # index ends, so stop instead of looping forever.
                    logger.error('page=%d,giving up: page count unknown' % page)
                    break
                logger.error('page=%d,skipped after %d attempts' % (page, failures))
                failures = 0
                page += 1
            continue

        failures = 0
        ids.extend(page_ids)
        pages = total_pages
        page += 1
|
|
|
|
|
|
def threadSave(max_workers=32):
    """Run a full crawl: collect all season ids and save each one in a pool.

    Calls ``getIds()`` to refresh the module-level ``ids``, clears the
    ``isStop`` flag, creates a thread pool (kept in the module-level
    ``executors`` so ``check()`` can shut it down) and runs ``save`` for every
    id. Returns once all tasks have finished.

    Args:
        max_workers: Size of the thread pool.

    Returns:
        None. Errors raised by individual ``save`` calls or by the pool setup
        are logged, not raised.
    """
    global executors
    global isStop

    getIds()
    logger.info(len(ids))
    try:
        executors = futures.ThreadPoolExecutor(max_workers)
        isStop = False
        with executors as executor:
            pending = [executor.submit(save, md) for md in ids]
            # Inspect every future: an exception raised inside a worker is
            # otherwise stored on the future and never seen by anyone.
            for finished in futures.as_completed(pending):
                if finished.cancelled():
                    continue
                error = finished.exception()
                if error is not None:
                    logger.error(error)
        logger.info('结束爬虫')
    except Exception as e:
        logger.error(e)
|
|
|
|
|
|
def stop_():
    """Request a crawler stop by setting the module-level ``isStop`` flag to True."""
    global isStop
    isStop = True
|
|
|