|
|
@ -1,10 +1,12 @@ |
|
|
|
|
|
|
|
import _thread |
|
|
|
import json |
|
|
|
import json |
|
|
|
import os |
|
|
|
import os |
|
|
|
|
|
|
|
import random |
|
|
|
import time |
|
|
|
import time |
|
|
|
|
|
|
|
from concurrent import futures |
|
|
|
|
|
|
|
|
|
|
|
import django |
|
|
|
import django |
|
|
|
import requests |
|
|
|
import requests |
|
|
|
import threading |
|
|
|
|
|
|
|
from bs4 import BeautifulSoup |
|
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
|
|
|
|
from PixivSearch.settings import logger |
|
|
|
from PixivSearch.settings import logger |
|
|
@ -15,56 +17,72 @@ from PixivSearch.model.config import mediaInfo, stat |
|
|
|
|
|
|
|
|
|
|
|
# Module-level crawler state shared by the functions below.

# Most recently stored media record; replaced by save() after each successful
# write and returned by get_() / get().  Starts as a placeholder row.
current_mediaInfo = mediaInfo(id=0, chn_name='null')

# Run flag for the sequential crawler: go() sets it to True and checks it on
# every iteration, stop() sets it to False.
flag = True

# Stop request for the thread-pool crawler: set by stop_(), cleared by
# threadSave(), and checked by save(md) and check().
isStop = False

# Thread pool created by threadSave(); None until the first pooled crawl.
executors = None
|
|
|
|
|
|
|
|
|
|
|
class bangumi(threading.Thread):
    """Thread that crawls a range of media ids one at a time."""

    # First id of the range to crawl; overwritten per instance by __init__.
    begin = 0
    # Upper bound of the range (exclusive: it is passed straight to range()).
    end = 0
    # Id currently being processed; go() assigns it before each save().
    id = 0
    # NOTE(review): go() and stop() declare ``global flag`` and so use the
    # module-level flag, not this class attribute -- confirm whether this
    # attribute is still needed.
    flag = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Two definitions shared these lines in the diff view (old and new revision
# interleaved); they are written out separately here.

def __init__(self, begin, end):
    """Initialise the crawler thread with the id range it will walk.

    Args:
        begin: first media id to crawl.
        end: upper bound of the id range (used as the ``range()`` stop value).
    """
    threading.Thread.__init__(self)
    self.begin = begin
    self.end = end


def check():
    """Poll ``isStop`` once a second and shut the thread pool down when set.

    Returns once the stop request has been observed and handled.
    """
    while True:
        if isStop:
            logger.info('停止多线程爬虫')
            # executors stays None until threadSave() has created a pool, so a
            # stop request that arrives earlier must not call .shutdown() on it.
            if executors is not None:
                executors.shutdown()
            break
        time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
# Two revisions of ``save`` shared these lines in the diff view (old and new
# interleaved); they are written out separately here.

def save(self):
    """Fetch the media page for ``self.id`` and store its name and statistics.

    Older revision, written as a method (it reads ``self.id``).  Does nothing
    when the page does not answer with HTTP 200.  Parsing or database errors
    propagate to the caller.
    """
    global current_mediaInfo

    req = requests.get("https://www.bilibili.com/bangumi/media/md%d" % self.id)
    if req.status_code != 200:
        return

    tag = BeautifulSoup(req.text, 'lxml')
    # The fourth <script> element is expected to hold "<name> = {json};function..."
    script = tag.select("script")[3].text
    # Keep what lies between the first "=" and two characters before "function".
    json_str = script[script.index("=") + 1:script.index("function") - 2]
    json_obj = json.loads(json_str)

    stat_info = json_obj['mediaInfo']['stat']
    print(json_obj['mediaInfo']['chn_name'])
    print(stat_info)
    mediaInfo(id=self.id, chn_name=json_obj['mediaInfo']['chn_name']).save()
    current_mediaInfo = mediaInfo.objects.get(pk=self.id)
    stat(id=self.id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
         views=stat_info['views']).save()


def save(md):
    """Fetch the media page for id ``md`` and store its name and statistics.

    Newer revision, written as a plain function taking the id.  Returns
    immediately when a stop has been requested (``isStop``); otherwise sleeps
    1-3 seconds before issuing the request.

    Args:
        md: numeric media id used to build the page URL and as primary key.
    """
    global current_mediaInfo

    if isStop:
        return

    time.sleep(random.randint(1, 3))
    url = "https://www.bilibili.com/bangumi/media/md%d" % md
    req = requests.get(url)
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code != 200:
        return

    tag = BeautifulSoup(req.text, 'lxml')
    # The fourth <script> element is expected to hold "<name> = {json};function..."
    script = tag.select("script")[3].text
    # Keep what lies between the first "=" and two characters before "function".
    json_str = script[script.index("=") + 1:script.index("function") - 2]
    json_obj = json.loads(json_str)
    try:
        stat_info = json_obj['mediaInfo']['stat']
        print(json_obj['mediaInfo']['chn_name'])
        print(stat_info)
        mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
        current_mediaInfo = mediaInfo.objects.get(pk=md)
        stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
             views=stat_info['views']).save()
    except Exception as e:
        # Exception rather than BaseException, so SystemExit and
        # KeyboardInterrupt are not swallowed.  One bad record is logged and
        # skipped rather than aborting the crawl.
        logger.error("发生异常")
        logger.error(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#asdasd |
|
|
|
|
|
|
|
def get_():
    """Return the module-level ``current_mediaInfo`` object."""
    # Reading a module global needs no ``global`` declaration.
    return current_mediaInfo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def threadSave(start, end):
    """Crawl media ids ``start`` .. ``end - 1`` on a 32-worker thread pool.

    Creates a new pool in the module-level ``executors``, clears ``isStop``,
    submits ``save`` for every id, and blocks until the pool has shut down.
    Errors raised here are logged rather than propagated.

    Args:
        start: first media id to crawl.
        end: upper bound of the id range (exclusive).
    """
    global executors, isStop

    ids = list(range(start, end))
    try:
        executors = futures.ThreadPoolExecutor(32)
        isStop = False
        # Leaving the ``with`` block shuts the pool down and waits for workers.
        with executors as executor:
            # NOTE: the iterator returned by map() is never consumed, so an
            # exception raised inside save() is not re-raised here.
            executor.map(save, ids)
        logger.info('结束爬虫')
    except Exception as e:
        # Exception rather than BaseException, so interpreter-exit signals
        # still propagate.
        logger.error(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Two definitions shared these lines in the diff view (old and new revision
# interleaved); they are written out separately here.

def run(self) -> None:
    """Thread entry point: crawl from ``self.begin`` up to ``self.end``."""
    self.go(self.begin, self.end)


def stop_():
    """Request that the thread-pool crawl stop by setting ``isStop``."""
    global isStop
    isStop = True
|
|
|
def get(self):
    """Return the module-level ``current_mediaInfo`` object."""
    # Reading a module global needs no ``global`` declaration.
    return current_mediaInfo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def stop(self):
    """Ask the sequential crawl loop to stop.

    Clears the module-level ``flag``, which go() checks before each id.
    """
    global flag
    flag = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def go(self, start, end):
    """Crawl media ids ``start`` .. ``end - 1`` sequentially, one per second.

    Sets the module-level ``flag`` to True, then for each id sleeps one
    second, logs progress, stores the id in ``self.id`` and calls
    ``self.save()``.  Stops early once ``flag`` has been cleared.

    Args:
        start: first media id to crawl.
        end: upper bound of the id range (exclusive).
    """
    global flag
    flag = True
    for num in range(start, end):
        if not flag:
            logger.info("停止爬虫")
            break
        time.sleep(1)
        logger.info("爬虫进度:%d" % num)
        self.id = num
        self.save()
|
|
|
|
|
|
|