|
|
@ -2,9 +2,8 @@ import _thread |
|
|
|
import json |
|
|
|
import json |
|
|
|
import math |
|
|
|
import math |
|
|
|
import os |
|
|
|
import os |
|
|
|
import random |
|
|
|
|
|
|
|
import time |
|
|
|
|
|
|
|
from concurrent import futures |
|
|
|
from concurrent import futures |
|
|
|
|
|
|
|
from queue import Queue |
|
|
|
|
|
|
|
|
|
|
|
import django |
|
|
|
import django |
|
|
|
import requests |
|
|
|
import requests |
|
|
@ -16,102 +15,125 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings") |
|
|
|
django.setup() |
|
|
|
django.setup() |
|
|
|
from PixivSearch.model.config import mediaInfo, stat |
|
|
|
from PixivSearch.model.config import mediaInfo, stat |
|
|
|
|
|
|
|
|
|
|
|
current_mediaInfo = mediaInfo(id=0, chn_name='null') |
|
|
|
current_mediaInfo = None |
|
|
|
|
|
|
|
|
|
|
|
isStop = False |
|
|
|
isStop = None |
|
|
|
|
|
|
|
|
|
|
|
executors = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check(): |
|
|
|
def stop_(): |
|
|
|
while True: |
|
|
|
global isStop |
|
|
|
if isStop: |
|
|
|
isStop = True |
|
|
|
logger.info('停止多线程爬虫') |
|
|
|
|
|
|
|
executors.shutdown() |
|
|
|
|
|
|
|
break |
|
|
|
|
|
|
|
time.sleep(1) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save(md): |
|
|
|
def save(params): |
|
|
|
if isStop: |
|
|
|
if isStop: |
|
|
|
return |
|
|
|
return |
|
|
|
|
|
|
|
logger.info(params) |
|
|
|
|
|
|
|
bangumi_id = params[0] |
|
|
|
|
|
|
|
season_id = params[1] |
|
|
|
|
|
|
|
media_id = params[2] |
|
|
|
|
|
|
|
|
|
|
|
url = "https://www.bilibili.com/bangumi/media/md%d" % md |
|
|
|
url = "https://www.bilibili.com/bangumi/media/md%d" % media_id |
|
|
|
try: |
|
|
|
try: |
|
|
|
req = requests.get(url, timeout=10) |
|
|
|
req = requests.get(url, timeout=10) |
|
|
|
except BaseException as e: |
|
|
|
except BaseException as e: |
|
|
|
logger.error(e) |
|
|
|
logger.error(repr(e)) |
|
|
|
save(md) |
|
|
|
save(media_id) |
|
|
|
logger.info("request_url=%s,status_code=%d" % (url, req.status_code)) |
|
|
|
logger.info("request_url=%s,status_code=%d" % (url, req.status_code)) |
|
|
|
if (req.status_code == 200): |
|
|
|
if req.status_code == 200: |
|
|
|
tag = BeautifulSoup(req.text, 'lxml') |
|
|
|
tag = BeautifulSoup(req.text, 'lxml') |
|
|
|
script = tag.select("script")[3].text |
|
|
|
script = tag.select("script")[3].text |
|
|
|
json_str = script[script.index("=") + 1:script.index("function") - 2] |
|
|
|
json_str = script[script.index("=") + 1:script.index("function") - 2] |
|
|
|
json_obj = json.loads(json_str) |
|
|
|
json_obj = json.loads(json_str) |
|
|
|
try: |
|
|
|
try: |
|
|
|
|
|
|
|
if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']: |
|
|
|
stat_info = json_obj['mediaInfo']['stat'] |
|
|
|
stat_info = json_obj['mediaInfo']['stat'] |
|
|
|
print(json_obj['mediaInfo']['chn_name']) |
|
|
|
|
|
|
|
print(stat_info) |
|
|
|
print(stat_info) |
|
|
|
mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save() |
|
|
|
mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id, |
|
|
|
|
|
|
|
chn_name=json_obj['mediaInfo']['chn_name']).save() |
|
|
|
global current_mediaInfo |
|
|
|
global current_mediaInfo |
|
|
|
current_mediaInfo = mediaInfo.objects.get(pk=md) |
|
|
|
current_mediaInfo = mediaInfo.objects.get(pk=season_id) |
|
|
|
stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'], |
|
|
|
stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'], |
|
|
|
views=stat_info['views']).save() |
|
|
|
views=stat_info['views']).save() |
|
|
|
except BaseException as e: |
|
|
|
except BaseException as e: |
|
|
|
logger.error("发生异常") |
|
|
|
logger.error(repr(e)) |
|
|
|
logger.error(e) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# asdasd |
|
|
|
|
|
|
|
def get_(): |
|
|
|
def get_(): |
|
|
|
global current_mediaInfo |
|
|
|
global current_mediaInfo |
|
|
|
return current_mediaInfo |
|
|
|
return current_mediaInfo |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
page_size = 100 |
|
|
|
page_size = 10 |
|
|
|
pages = None |
|
|
|
queue = Queue(page_size) |
|
|
|
ids = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def listen(): |
|
|
|
|
|
|
|
while True: |
|
|
|
|
|
|
|
ids = queue.get() |
|
|
|
|
|
|
|
try: |
|
|
|
|
|
|
|
executors = futures.ThreadPoolExecutor(page_size) |
|
|
|
|
|
|
|
with executors as executor: |
|
|
|
|
|
|
|
executor.map(save, ids) |
|
|
|
|
|
|
|
logger.info('结束爬虫') |
|
|
|
|
|
|
|
except BaseException as e: |
|
|
|
|
|
|
|
logger.error(repr(e)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_thread.start_new_thread(listen, ()) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getIds(): |
|
|
|
def getIds(): |
|
|
|
global ids |
|
|
|
seasonIdList = [] |
|
|
|
global pages |
|
|
|
|
|
|
|
if ids is None or len(ids) != 0: |
|
|
|
|
|
|
|
ids = [] |
|
|
|
|
|
|
|
page = 1 |
|
|
|
page = 1 |
|
|
|
while pages is None or page <= pages: |
|
|
|
pages = None |
|
|
|
|
|
|
|
name = 'seasonListCallback' |
|
|
|
|
|
|
|
global isStop |
|
|
|
|
|
|
|
isStop = False |
|
|
|
|
|
|
|
while isStop == False and (pages is None or page <= pages): |
|
|
|
|
|
|
|
|
|
|
|
url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size) |
|
|
|
url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size) |
|
|
|
logger.info(url) |
|
|
|
logger.info(url) |
|
|
|
try: |
|
|
|
try: |
|
|
|
req = requests.get(url, timeout=10) |
|
|
|
req = requests.get(url, timeout=10) |
|
|
|
|
|
|
|
if req.status_code == 200: |
|
|
|
json_obj = json.loads(req.text) |
|
|
|
json_obj = json.loads(req.text) |
|
|
|
|
|
|
|
if 'result' in json_obj and 'list' in json_obj['result']: |
|
|
|
bangumiList = json_obj['result']['list'] |
|
|
|
bangumiList = json_obj['result']['list'] |
|
|
|
|
|
|
|
ids = [] |
|
|
|
for bangumi in bangumiList: |
|
|
|
for bangumi in bangumiList: |
|
|
|
ids.append(int(bangumi['season_id'])) |
|
|
|
if isStop: |
|
|
|
if pages is None: |
|
|
|
break |
|
|
|
|
|
|
|
if 'season_id' in bangumi: |
|
|
|
|
|
|
|
season_id = int(bangumi['season_id']) |
|
|
|
|
|
|
|
if season_id in seasonIdList: |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % ( |
|
|
|
|
|
|
|
season_id, name) |
|
|
|
|
|
|
|
logger.info(url) |
|
|
|
|
|
|
|
req = requests.get(url, timeout=10) |
|
|
|
|
|
|
|
if req.status_code == 200: |
|
|
|
|
|
|
|
child_json_obj = json.loads( |
|
|
|
|
|
|
|
req.text.replace('seasonListCallback(', '').replace(');', '')) |
|
|
|
|
|
|
|
if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']: |
|
|
|
|
|
|
|
bangumi_id = int(child_json_obj['result']['bangumi_id']) |
|
|
|
|
|
|
|
if 'media' in child_json_obj['result']: |
|
|
|
|
|
|
|
media_id = int(child_json_obj['result']['media']['media_id']) |
|
|
|
|
|
|
|
ids.append((bangumi_id, season_id, media_id)) |
|
|
|
|
|
|
|
seasonIdList.append(season_id) |
|
|
|
|
|
|
|
if pages is None and 'count' in json_obj['result']: |
|
|
|
pages = int(math.ceil(int(json_obj['result']['count']) / page_size)) |
|
|
|
pages = int(math.ceil(int(json_obj['result']['count']) / page_size)) |
|
|
|
|
|
|
|
page = page + 1 |
|
|
|
|
|
|
|
logger.info('获取id数量%d' % len(ids)) |
|
|
|
|
|
|
|
queue.put(ids) |
|
|
|
except BaseException as e: |
|
|
|
except BaseException as e: |
|
|
|
logger.error('连接超时') |
|
|
|
logger.error(repr(e)) |
|
|
|
logger(e) |
|
|
|
|
|
|
|
continue |
|
|
|
continue |
|
|
|
page = page + 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def threadSave(): |
|
|
|
def A(e): |
|
|
|
getIds() |
|
|
|
logger.info(e) |
|
|
|
logger.info(len(ids)) |
|
|
|
|
|
|
|
try: |
|
|
|
|
|
|
|
global executors |
|
|
|
|
|
|
|
executors = futures.ThreadPoolExecutor(32) |
|
|
|
|
|
|
|
global isStop |
|
|
|
|
|
|
|
isStop = False |
|
|
|
|
|
|
|
with executors as executor: |
|
|
|
|
|
|
|
executor.map(save, ids) |
|
|
|
|
|
|
|
logger.info('结束爬虫') |
|
|
|
|
|
|
|
except BaseException as e: |
|
|
|
|
|
|
|
logger.error(e) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def stop_(): |
|
|
|
if __name__ == '__main__': |
|
|
|
global isStop |
|
|
|
getIds() |
|
|
|
isStop = True |
|
|
|
|
|
|
|