Multi-threaded crawler

master
commit 09b53c1bb5 (parent d210153bdb), by 10295, 6 years ago
  1. PixivSearch/dao/bangumi.py (146 lines changed)
  2. PixivSearch/model/config.py (9 lines changed)
  3. PixivSearch/view.py (4 lines changed)

@@ -2,9 +2,8 @@ import _thread
import json
import math
import os
import random
import time
from concurrent import futures
from queue import Queue
import django
import requests
@@ -16,102 +15,125 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
django.setup()
from PixivSearch.model.config import mediaInfo, stat
current_mediaInfo = mediaInfo(id=0, chn_name='null')
current_mediaInfo = None
isStop = False
isStop = None
executors = None
def check():
    while True:
        if isStop:
            logger.info('stopping multi-threaded crawler')
            executors.shutdown()
            break
        time.sleep(1)
def stop_():
    global isStop
    isStop = True
def save(md):
def save(params):
    if isStop:
        return
    logger.info(params)
    bangumi_id = params[0]
    season_id = params[1]
    media_id = params[2]
    url = "https://www.bilibili.com/bangumi/media/md%d" % md
    url = "https://www.bilibili.com/bangumi/media/md%d" % media_id
    try:
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(e)
        save(md)
        logger.error(repr(e))
        save(media_id)
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if (req.status_code == 200):
    if req.status_code == 200:
        tag = BeautifulSoup(req.text, 'lxml')
        script = tag.select("script")[3].text
        json_str = script[script.index("=") + 1:script.index("function") - 2]
        json_obj = json.loads(json_str)
        try:
            stat_info = json_obj['mediaInfo']['stat']
            print(json_obj['mediaInfo']['chn_name'])
            print(stat_info)
            mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
            global current_mediaInfo
            current_mediaInfo = mediaInfo.objects.get(pk=md)
            stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                 views=stat_info['views']).save()
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
                print(stat_info)
                mediaInfo(bangumi_id=bangumi_id, season_id=season_id, media_id=media_id,
                          chn_name=json_obj['mediaInfo']['chn_name']).save()
                global current_mediaInfo
                current_mediaInfo = mediaInfo.objects.get(pk=season_id)
                stat(id=season_id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
                     views=stat_info['views']).save()
        except BaseException as e:
            logger.error("exception occurred")
            logger.error(e)
            logger.error(repr(e))
def get_():
    global current_mediaInfo
    return current_mediaInfo
page_size = 100
pages = None
ids = None
page_size = 10
queue = Queue(page_size)
def listen():
    while True:
        ids = queue.get()
        try:
            executors = futures.ThreadPoolExecutor(page_size)
            with executors as executor:
                executor.map(save, ids)
            logger.info('crawler finished')
        except BaseException as e:
            logger.error(repr(e))
_thread.start_new_thread(listen, ())
def getIds():
    global ids
    global pages
    if ids is None or len(ids) != 0:
        ids = []
    seasonIdList = []
    page = 1
    while pages is None or page <= pages:
    pages = None
    name = 'seasonListCallback'
    global isStop
    isStop = False
    while isStop == False and (pages is None or page <= pages):
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            req = requests.get(url, timeout=10)
            json_obj = json.loads(req.text)
            bangumiList = json_obj['result']['list']
            for bangumi in bangumiList:
                ids.append(int(bangumi['season_id']))
            if pages is None:
                pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
            if req.status_code == 200:
                json_obj = json.loads(req.text)
                if 'result' in json_obj and 'list' in json_obj['result']:
                    bangumiList = json_obj['result']['list']
                    ids = []
                    for bangumi in bangumiList:
                        if isStop:
                            break
                        if 'season_id' in bangumi:
                            season_id = int(bangumi['season_id'])
                            if season_id in seasonIdList:
                                continue
                            url = 'https://bangumi.bilibili.com/jsonp/seasoninfo/%d.ver?callback=%s&jsonp=jsonp' % (
                                season_id, name)
                            logger.info(url)
                            req = requests.get(url, timeout=10)
                            if req.status_code == 200:
                                child_json_obj = json.loads(
                                    req.text.replace('seasonListCallback(', '').replace(');', ''))
                                if 'result' in child_json_obj and 'bangumi_id' in child_json_obj['result']:
                                    bangumi_id = int(child_json_obj['result']['bangumi_id'])
                                    if 'media' in child_json_obj['result']:
                                        media_id = int(child_json_obj['result']['media']['media_id'])
                                        ids.append((bangumi_id, season_id, media_id))
                                        seasonIdList.append(season_id)
                if pages is None and 'count' in json_obj['result']:
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
                page = page + 1
                logger.info('fetched %d ids' % len(ids))
                queue.put(ids)
        except BaseException as e:
            logger.error('connection timed out')
            logger(e)
            logger.error(repr(e))
            continue
        page = page + 1
def threadSave():
    getIds()
    logger.info(len(ids))
    try:
        global executors
        executors = futures.ThreadPoolExecutor(32)
        global isStop
        isStop = False
        with executors as executor:
            executor.map(save, ids)
        logger.info('crawler finished')
    except BaseException as e:
        logger.error(e)
def A(e):
    logger.info(e)
def stop_():
    global isStop
    isStop = True
if __name__ == '__main__':
    getIds()
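
Taken together, the reworked bangumi.py is a producer/consumer pipeline: getIds() pages through bilibili's season index, resolves each season to a (bangumi_id, season_id, media_id) tuple via the seasoninfo JSONP endpoint, and puts each page's batch on a Queue, while the listen() thread started at import time drains the queue and maps save() over each batch with a ThreadPoolExecutor. Below is a minimal, self-contained sketch of that pattern only; save_one and fetch_batches are placeholder names, not functions from this repository.

import _thread
import time
from concurrent import futures
from queue import Queue

queue = Queue()   # batches of (bangumi_id, season_id, media_id) tuples
is_stop = False   # cooperative stop flag, analogous to isStop above

def save_one(item):
    # stand-in for save(): fetch and persist one media entry
    if is_stop:
        return
    print('saving', item)

def listen():
    # consumer: block until a batch arrives, then fan it out to worker threads
    while True:
        batch = queue.get()
        with futures.ThreadPoolExecutor(10) as executor:
            executor.map(save_one, batch)

def fetch_batches():
    # producer stand-in for getIds(): enqueue one small fake batch
    queue.put([(1, 101, 1001), (2, 102, 1002)])

if __name__ == '__main__':
    _thread.start_new_thread(listen, ())
    fetch_batches()
    time.sleep(1)  # give the listener thread time to drain the queue before exit
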

@@ -1,5 +1,4 @@
from django.db import models
import json
class param(models.Model):
@@ -18,13 +17,13 @@ class stat(models.Model):
class mediaInfo(models.Model):
    id = models.IntegerField(primary_key=True)
    bangumi_id = models.IntegerField()
    season_id = models.IntegerField(primary_key=True)
    media_id = models.IntegerField()
    chn_name = models.CharField(max_length=128)
    def __str__(self) -> str:
        i = {}
        i['media_id'] = self.id
        i['chn_name'] = self.chn_name
        i = {'media_id': self.id, 'chn_name': self.chn_name}
        return i
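
One caveat in the revised model (an observation, not part of the commit): __str__ must return a str, but the new implementation returns the dict itself, and with season_id as the primary key the model no longer defines an id field, so self.id will not resolve. A minimal corrected sketch for the method body, assuming the intended fields are media_id and chn_name:

import json

def __str__(self) -> str:
    # __str__ must return a string; serialize the dict instead of returning it,
    # and use media_id since there is no longer an `id` field on this model
    return json.dumps({'media_id': self.media_id, 'chn_name': self.chn_name})
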

@@ -6,7 +6,7 @@ import django
from django.http import Http404, StreamingHttpResponse, HttpResponse
from django.shortcuts import render
from PixivSearch.dao.bangumi import threadSave, get_, stop_
from PixivSearch.dao.bangumi import get_, stop_, getIds
from PixivSearch.settings import logger
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
@@ -89,7 +89,7 @@ def get(request):
# test method
def start(request):
    _thread.start_new_thread(threadSave, ())
    _thread.start_new_thread(getIds, ())
    return HttpResponse("start success")
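
The start view now spawns getIds on a background thread and returns immediately, so the HTTP response does not wait for the crawl to finish. A quick way to exercise it against a local dev server, assuming the view is routed at /start (the URL configuration is not shown in this commit):

import requests

# fire-and-forget: the view returns at once while getIds() keeps crawling server-side
resp = requests.get('http://127.0.0.1:8000/start', timeout=5)
print(resp.status_code, resp.text)  # expect 200 and "start success"
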
