多线程爬虫

master
10295 7 years ago
parent d210153bdb
commit 09b53c1bb5
  1. 130
      PixivSearch/dao/bangumi.py
  2. 9
      PixivSearch/model/config.py
  3. 4
      PixivSearch/view.py

@ -2,9 +2,8 @@ import _thread
import json import json
import math import math
import os import os
import random
import time
from concurrent import futures from concurrent import futures
from queue import Queue
import django import django
import requests import requests
@ -16,102 +15,125 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup() django.setup()
from PixivSearch.model.config import mediaInfo, stat from PixivSearch.model.config import mediaInfo, stat
current_mediaInfo = mediaInfo(id=0, chn_name='null') current_mediaInfo = None
isStop = False isStop = None
executors = None
def check(): def stop_():
while True: global isStop
if isStop: isStop = True
logger.info('停止多线程爬虫')
executors.shutdown()
break
time.sleep(1)
def save(md): def save(params):
if isStop: if isStop:
return return
logger.info(params)
bangumi_id = params[0]
season_id = params[1]
media_id = params[2]
url = "https://www.bilibili.com/bangumi/media/md%d" % md url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
try: try:
req = requests.get(url, timeout=10) req = requests.get(url, timeout=10)
except BaseException as e: except BaseException as e:
logger.error(e) logger.error(repr(e))
save(md) save(media_id)
logger.info("request_url=%s,status_code=%d" % (url, req.status_code)) logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
if (req.status_code == 200): if req.status_code == 200:
tag = BeautifulSoup(req.text, 'lxml') tag = BeautifulSoup(req.text, 'lxml')
script = tag.select("script")[3].text script = tag.select("script")[3].text
json_str = script[script.index("=") + 1:script.index("function") - 2] json_str = script[script.index("=") + 1:script.index("function") - 2]
json_obj = json.loads(json_str) json_obj = json.loads(json_str)
try: try:
if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
stat_info = json_obj['mediaInfo']['stat'] stat_info = json_obj['mediaInfo']['stat']
print(json_obj['mediaInfo']['chn_name'])
print(stat_info) print(stat_info)
mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save() mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
chn_name=json_obj['mediaInfo']['chn_name']).save()
global current_mediaInfo global current_mediaInfo
current_mediaInfo = mediaInfo.objects.get(pk=md) current_mediaInfo = mediaInfo.objects.get(pk=season_id)
stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'], stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
views=stat_info['views']).save() views=stat_info['views']).save()
except BaseException as e: except BaseException as e:
logger.error("发生异常") logger.error(repr(e))
logger.error(e)
# asdasd
def get_(): def get_():
global current_mediaInfo global current_mediaInfo
return current_mediaInfo return current_mediaInfo
page_size = 100 page_size = 10
pages = None queue = Queue(page_size)
ids = None
def listen():
while True:
ids = queue.get()
try:
executors = futures.ThreadPoolExecutor(page_size)
with executors as executor:
executor.map(save, ids)
logger.info('结束爬虫')
except BaseException as e:
logger.error(repr(e))
_thread.start_new_thread(listen, ())
def getIds(): def getIds():
global ids seasonIdList = []
global pages
if ids is None or len(ids) != 0:
ids = []
page = 1 page = 1
while pages is None or page <= pages: pages = None
name = 'seasonListCallback'
global isStop
isStop = False
while isStop == False and (pages is None or page <= pages):
url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size) url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
logger.info(url) logger.info(url)
try: try:
req = requests.get(url, timeout=10) req = requests.get(url, timeout=10)
if req.status_code == 200:
json_obj = json.loads(req.text) json_obj = json.loads(req.text)
if 'result' in json_obj and 'list' in json_obj['result']:
bangumiList = json_obj['result']['list'] bangumiList = json_obj['result']['list']
ids = []
for bangumi in bangumiList: for bangumi in bangumiList:
ids.append(int(bangumi['season_id'])) if isStop:
if pages is None: break
if 'season_id' in bangumi:
season_id = int(bangumi['season_id'])
if season_id in seasonIdList:
continue
url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
season_id, name)
logger.info(url)
req = requests.get(url, timeout=10)
if req.status_code == 200:
child_json_obj = json.loads(
req.text.replace('seasonListCallback(', '').replace(');', ''))
if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
bangumi_id = int(child_json_obj['result']['bangumi_id'])
if 'media' in child_json_obj['result']:
media_id = int(child_json_obj['result']['media']['media_id'])
ids.append((bangumi_id, season_id, media_id))
seasonIdList.append(season_id)
if pages is None and 'count' in json_obj['result']:
pages = int(math.ceil(int(json_obj['result']['count']) / page_size)) pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
page = page + 1
logger.info('获取id数量%d' % len(ids))
queue.put(ids)
except BaseException as e: except BaseException as e:
logger.error('连接超时') logger.error(repr(e))
logger(e)
continue continue
page = page + 1
def threadSave(): def A(e):
getIds() logger.info(e)
logger.info(len(ids))
try:
global executors
executors = futures.ThreadPoolExecutor(32)
global isStop
isStop = False
with executors as executor:
executor.map(save, ids)
logger.info('结束爬虫')
except BaseException as e:
logger.error(e)
def stop_(): if __name__ == '__main__':
global isStop getIds()
isStop = True

@ -1,5 +1,4 @@
from django.db import models from django.db import models
import json
class param(models.Model): class param(models.Model):
@ -18,13 +17,13 @@ class stat(models.Model):
class mediaInfo(models.Model): class mediaInfo(models.Model):
id = models.IntegerField(primary_key=True) bangumi_id = models.IntegerField()
season_id = models.IntegerField(primary_key=True)
media_id = models.IntegerField()
chn_name = models.CharField(max_length=128) chn_name = models.CharField(max_length=128)
def __str__(self) -> str: def __str__(self) -> str:
i = {} i = {'media_id': self.id, 'chn_name': self.chn_name}
i['media_id'] = self.id
i['chn_name'] = self.chn_name
return i return i

@ -6,7 +6,7 @@ import django
from django.http import Http404, StreamingHttpResponse, HttpResponse from django.http import Http404, StreamingHttpResponse, HttpResponse
from django.shortcuts import render from django.shortcuts import render
from PixivSearch.dao.bangumi import threadSave, get_, stop_ from PixivSearch.dao.bangumi import get_, stop_, getIds
from PixivSearch.settings import logger from PixivSearch.settings import logger
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
@ -89,7 +89,7 @@ def get(request):
# 测试方法 # 测试方法
def start(request): def start(request):
_thread.start_new_thread(threadSave, ()) _thread.start_new_thread(getIds, ())
return HttpResponse("start success") return HttpResponse("start success")

Loading…
Cancel
Save