From 09b53c1bb5d558391b73c8a8ae1e687d83c7456e Mon Sep 17 00:00:00 2001
From: 10295 <1029559041@qq.com>
Date: Sun, 25 Mar 2018 00:55:21 +0800
Subject: [PATCH] Multithreaded crawler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PixivSearch/dao/bangumi.py  | 146 +++++++++++++++++++++---------------
 PixivSearch/model/config.py |   9 +--
 PixivSearch/view.py         |   4 +-
 3 files changed, 90 insertions(+), 69 deletions(-)

diff --git a/PixivSearch/dao/bangumi.py b/PixivSearch/dao/bangumi.py
index 5915739..a66c738 100644
--- a/PixivSearch/dao/bangumi.py
+++ b/PixivSearch/dao/bangumi.py
@@ -2,9 +2,8 @@ import _thread
 import json
 import math
 import os
-import random
-import time
 from concurrent import futures
+from queue import Queue
 
 import django
 import requests
@@ -16,102 +15,125 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
 django.setup()
 
 from PixivSearch.model.config import mediaInfo, stat
 
-current_mediaInfo = mediaInfo(id=0, chn_name='null')
+current_mediaInfo = None
 
-isStop = False
+isStop = None
 
-executors = None
-
-def check():
-    while True:
-        if isStop:
-            logger.info('停止多线程爬虫')
-            executors.shutdown()
-            break
-        time.sleep(1)
+def stop_():
+    global isStop
+    isStop = True
 
 
-def save(md):
+def save(params):
     if isStop:
         return
+    logger.info(params)
+    bangumi_id = params[0]
+    season_id = params[1]
+    media_id = params[2]
 
-    url = "https://www.bilibili.com/bangumi/media/md%d" % md
+    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
     try:
         req = requests.get(url, timeout=10)
     except BaseException as e:
-        logger.error(e)
-        save(md)
+        logger.error(repr(e))
+        return save(params)
     logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
-    if (req.status_code == 200):
+    if req.status_code == 200:
         tag = BeautifulSoup(req.text, 'lxml')
         script = tag.select("script")[3].text
         json_str = script[script.index("=") + 1:script.index("function") - 2]
         json_obj = json.loads(json_str)
         try:
-            stat_info = json_obj['mediaInfo']['stat']
-            print(json_obj['mediaInfo']['chn_name'])
-            print(stat_info)
-            mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
-            global current_mediaInfo
-            current_mediaInfo = mediaInfo.objects.get(pk=md)
-            stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
-                 views=stat_info['views']).save()
+            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
+                stat_info = json_obj['mediaInfo']['stat']
+                print(stat_info)
+                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
+                          chn_name=json_obj['mediaInfo']['chn_name']).save()
+                global current_mediaInfo
+                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
+                stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
+                     views=stat_info['views']).save()
         except BaseException as e:
-            logger.error("发生异常")
-            logger.error(e)
+            logger.error(repr(e))
 
 
-# asdasd
 def get_():
     global current_mediaInfo
     return current_mediaInfo
 
 
-page_size = 100
-pages = None
-ids = None
+page_size = 10
+queue = Queue(page_size)
+
+
+def listen():
+    while True:
+        ids = queue.get()
+        try:
+            executors = futures.ThreadPoolExecutor(page_size)
+            with executors as executor:
+                executor.map(save, ids)
+            logger.info('结束爬虫')
+        except BaseException as e:
+            logger.error(repr(e))
+
+
+_thread.start_new_thread(listen, ())
 
 
 def getIds():
-    global ids
-    global pages
-    if ids is None or len(ids) != 0:
-        ids = []
+    seasonIdList = []
     page = 1
-    while pages is None or page <= pages:
+    pages = None
+    name = 'seasonListCallback'
+    global isStop
+    isStop = False
+    while isStop == False and (pages is None or page <= pages):
+
         url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
         logger.info(url)
         try:
             req = requests.get(url, timeout=10)
-            json_obj = json.loads(req.text)
-            bangumiList = json_obj['result']['list']
-            for bangumi in bangumiList:
-                ids.append(int(bangumi['season_id']))
-            if pages is None:
-                pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
+            if req.status_code == 200:
+                json_obj = json.loads(req.text)
+                if 'result' in json_obj and 'list' in json_obj['result']:
+                    bangumiList = json_obj['result']['list']
+                    ids = []
+                    for bangumi in bangumiList:
+                        if isStop:
+                            break
+                        if 'season_id' in bangumi:
+                            season_id = int(bangumi['season_id'])
+                            if season_id in seasonIdList:
+                                continue
+                            url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
+                                season_id, name)
+                            logger.info(url)
+                            req = requests.get(url, timeout=10)
+                            if req.status_code == 200:
+                                child_json_obj = json.loads(
+                                    req.text.replace('seasonListCallback(', '').replace(');', ''))
+                                if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
+                                    bangumi_id = int(child_json_obj['result']['bangumi_id'])
+                                    if 'media' in child_json_obj['result']:
+                                        media_id = int(child_json_obj['result']['media']['media_id'])
+                                        ids.append((bangumi_id, season_id, media_id))
+                                        seasonIdList.append(season_id)
+                if pages is None and 'count' in json_obj['result']:
+                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
+            page = page + 1
+            logger.info('获取id数量%d' % len(ids))
+            queue.put(ids)
         except BaseException as e:
-            logger.error('连接超时')
-            logger(e)
+            logger.error(repr(e))
             continue
-        page = page + 1
 
 
-def threadSave():
-    getIds()
-    logger.info(len(ids))
-    try:
-        global executors
-        executors = futures.ThreadPoolExecutor(32)
-        global isStop
-        isStop = False
-        with executors as executor:
-            executor.map(save, ids)
-        logger.info('结束爬虫')
-    except BaseException as e:
-        logger.error(e)
+def A(e):
+    logger.info(e)
 
 
-def stop_():
-    global isStop
-    isStop = True
+if __name__ == '__main__':
+    getIds()
diff --git a/PixivSearch/model/config.py b/PixivSearch/model/config.py
index 2b8e7b0..315d937 100644
--- a/PixivSearch/model/config.py
+++ b/PixivSearch/model/config.py
@@ -1,5 +1,4 @@
 from django.db import models
-import json
 
 
 class param(models.Model):
@@ -18,13 +17,13 @@ class stat(models.Model):
 
 
 class mediaInfo(models.Model):
-    id = models.IntegerField(primary_key=True)
+    bangumi_id = models.IntegerField()
+    season_id = models.IntegerField(primary_key=True)
+    media_id = models.IntegerField()
     chn_name = models.CharField(max_length=128)
 
     def __str__(self) -> str:
-        i = {}
-        i['media_id'] = self.id
-        i['chn_name'] = self.chn_name
+        i = {'media_id': self.media_id, 'chn_name': self.chn_name}
 
         return i
 
diff --git a/PixivSearch/view.py b/PixivSearch/view.py
index e0659b2..d01f309 100644
--- a/PixivSearch/view.py
+++ b/PixivSearch/view.py
@@ -6,7 +6,7 @@ import django
 from django.http import Http404, StreamingHttpResponse, HttpResponse
 from django.shortcuts import render
 
-from PixivSearch.dao.bangumi import threadSave, get_, stop_
+from PixivSearch.dao.bangumi import get_, stop_, getIds
 from PixivSearch.settings import logger
 
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
@@ -89,7 +89,7 @@ def get(request):
 
 # 测试方法
 def start(request):
-    _thread.start_new_thread(threadSave, ())
+    _thread.start_new_thread(getIds, ())
     return HttpResponse("start success")
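
Note on the structure introduced above: the bangumi.py hunks replace the one-shot threadSave() flow with a producer/consumer pipeline. getIds() pages through the season index, resolves each season to a (bangumi_id, season_id, media_id) tuple, and puts one batch per page on a bounded Queue; a background listen() thread drains the queue and fans each batch out to save() through a ThreadPoolExecutor, with isStop/stop_() acting as a cooperative cancel flag. The following is a minimal, self-contained sketch of that queue-plus-pool pattern with the Bilibili and Django specifics stubbed out; BATCH_SIZE, process, and fetch_id_batches are illustrative stand-ins, not identifiers from this patch.

    import _thread
    from concurrent import futures
    from queue import Queue

    BATCH_SIZE = 10          # stands in for page_size in the patch
    batches = Queue(BATCH_SIZE)
    is_stop = False          # cooperative stop flag, like isStop/stop_() in the patch


    def process(item):
        # Stand-in for save((bangumi_id, season_id, media_id)): skip work once stopped.
        if is_stop:
            return
        print('processing', item)


    def listen():
        # Consumer thread: take one batch at a time and fan it out to a thread pool,
        # mirroring the queue.get() + ThreadPoolExecutor.map() loop added in bangumi.py.
        while True:
            batch = batches.get()
            try:
                with futures.ThreadPoolExecutor(BATCH_SIZE) as executor:
                    executor.map(process, batch)
            finally:
                batches.task_done()


    def fetch_id_batches(pages=3):
        # Producer: stand-in for getIds(); each "page" yields one batch of work items.
        for page in range(pages):
            if is_stop:
                break
            batches.put([(page, n) for n in range(BATCH_SIZE)])


    _thread.start_new_thread(listen, ())
    fetch_id_batches()
    batches.join()           # block until every queued batch has been consumed

The bounded queue keeps the producer from outrunning the consumer, and handing each batch to a fresh executor keeps the stop flag cheap to honour between pages.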