master
10295 6 years ago
parent bd556a441d
commit 7312fa2315
  1. 106
      PixivSearch/dao/bangumi.py
  2. 3
      PixivSearch/logging.conf
  3. 12
      PixivSearch/settings.py
  4. 3
      PixivSearch/urls.py
  5. 20
      PixivSearch/view.py

@ -1,10 +1,12 @@
import _thread
import json import json
import os import os
import random
import time import time
from concurrent import futures
import django import django
import requests import requests
import threading
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from PixivSearch.settings import logger from PixivSearch.settings import logger
@ -15,56 +17,72 @@ from PixivSearch.model.config import mediaInfo, stat
current_mediaInfo = mediaInfo(id=0, chn_name='null') current_mediaInfo = mediaInfo(id=0, chn_name='null')
flag = True isStop = False
executors = None
class bangumi(threading.Thread):
begin = 0
end = 0
id = 0
flag = True
def __init__(self, begin, end): def check():
threading.Thread.__init__(self) while True:
self.begin = begin if isStop:
self.end = end logger.info('停止多线程爬虫')
executors.shutdown()
break
time.sleep(1)
def save(self):
req = requests.get("https://www.bilibili.com/bangumi/media/md%d" % self.id)
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
script = tag.select("script")[3].text def save(md):
json_str = script[script.index("=") + 1:script.index("function") - 2] if isStop:
json_obj = json.loads(json_str) return
time.sleep(random.randint(1, 3))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
req = requests.get(url)
logger.info("request_url=%s,status_code=%d" % (url,req.status_code))
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
script = tag.select("script")[3].text
json_str = script[script.index("=") + 1:script.index("function") - 2]
json_obj = json.loads(json_str)
try:
stat_info = json_obj['mediaInfo']['stat'] stat_info = json_obj['mediaInfo']['stat']
print(json_obj['mediaInfo']['chn_name']) print(json_obj['mediaInfo']['chn_name'])
print(stat_info) print(stat_info)
mediaInfo(id=self.id, chn_name=json_obj['mediaInfo']['chn_name']).save() mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
global current_mediaInfo global current_mediaInfo
current_mediaInfo = mediaInfo.objects.get(pk=self.id) current_mediaInfo = mediaInfo.objects.get(pk=md)
stat(id=self.id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'], stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
views=stat_info['views']).save() views=stat_info['views']).save()
except BaseException as e:
logger.error("发生异常")
logger.error(e)
#asdasd
def get_():
    """Return the object currently bound to the module-level ``current_mediaInfo``.

    ``save`` rebinds that global after it stores a record, so the value
    returned here is whatever was stored most recently (or the initial
    placeholder if nothing has been stored yet).
    """
    # Only reading the module global, so no ``global`` declaration is needed.
    return current_mediaInfo
def threadSave(start, end):
    """Run ``save`` for every media id in ``range(start, end)`` on a thread pool.

    The pool is bound to the module-level ``executors`` name so that other
    code in this module can reach it, and the module-level ``isStop`` flag is
    reset to ``False`` so that a crawl started after ``stop_()`` is not
    skipped immediately by ``save``.

    Blocks until every submitted task has finished, because leaving the
    ``with`` block shuts the pool down and waits for its workers.

    :param start: first media id to crawl (inclusive).
    :param end: media id at which to stop (exclusive).
    :raises TypeError: if ``start``/``end`` are not valid ``range`` arguments.
    """
    global executors, isStop

    # Built outside the ``try`` so that bad arguments still surface to the
    # caller; ``range`` is already iterable, no intermediate list is needed.
    media_ids = range(start, end)

    try:
        executors = futures.ThreadPoolExecutor(max_workers=32)
        isStop = False
        with executors as executor:
            # NOTE: the iterator returned by ``map`` is intentionally not
            # consumed, so an exception escaping ``save`` is not re-raised here.
            executor.map(save, media_ids)
        logger.info('结束爬虫')
    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so that SystemExit and
        # KeyboardInterrupt are not swallowed; ``exception`` keeps the traceback.
        logger.exception(e)
def run(self) -> None: def stop_():
self.go(self.begin, self.end) global isStop
isStop = True
def get(self):
global current_mediaInfo
return current_mediaInfo
def stop(self):
global flag
flag = False
def go(self, start, end):
global flag
flag = True
for num in range(start, end):
if flag:
time.sleep(1)
logger.info("爬虫进度:%d" % num)
self.id = num
self.save()
else:
logger.info("停止爬虫")
break

@ -29,4 +29,5 @@ formatter=fmt
args=('pixiv.log','a','utf-8',False) args=('pixiv.log','a','utf-8',False)
[formatter_fmt] [formatter_fmt]
format=%(asctime)s - %(module)s:%(lineno)d - %(levelname)s - %(message)s format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
datefmt=%Y-%m-%d %H:%M:%S

@ -76,8 +76,12 @@ WSGI_APPLICATION = 'PixivSearch.wsgi.application'
DATABASES = { DATABASES = {
'default': { 'default': {
'ENGINE': 'django.db.backends.sqlite3', 'ENGINE': 'django.db.backends.mysql',
'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 'NAME': 'bangumi',
'USER': 'bilibili',
'PASSWORD': '2233',
'HOST': '127.0.0.1',
'PORT': '3306',
} }
} }
@ -106,13 +110,13 @@ AUTH_PASSWORD_VALIDATORS = [
LANGUAGE_CODE = 'en-us' LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC' TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True USE_I18N = True
USE_L10N = True USE_L10N = True
USE_TZ = True USE_TZ = False
# Static files (CSS, JavaScript, Images) # Static files (CSS, JavaScript, Images)

@ -14,14 +14,13 @@ Including another URLconf
2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls'))
""" """
from django.conf.urls import url from django.conf.urls import url
from PixivSearch.view import search, index, download, saveConfig, save, get, start, stop from PixivSearch.view import search, index, download, saveConfig, get, start, stop
urlpatterns = [ urlpatterns = [
url(r'^$', index), url(r'^$', index),
url(r'^pixiv/search', search), url(r'^pixiv/search', search),
url(r'^pixiv/download', download), url(r'^pixiv/download', download),
url(r'^tsdm', saveConfig), url(r'^tsdm', saveConfig),
url(r'^bangumi/save', save),
url(r'^bangumi/get', get), url(r'^bangumi/get', get),
url(r'^bangumi/start', start), url(r'^bangumi/start', start),
url(r'^bangumi/stop', stop) url(r'^bangumi/stop', stop)

@ -1,11 +1,12 @@
# coding=utf-8 # coding=utf-8
import _thread
import os import os
import django import django
from django.http import Http404, StreamingHttpResponse, HttpResponse from django.http import Http404, StreamingHttpResponse, HttpResponse
from django.shortcuts import render from django.shortcuts import render
from PixivSearch.dao.bangumi import bangumi from PixivSearch.dao.bangumi import threadSave, get_, stop_
from PixivSearch.settings import logger from PixivSearch.settings import logger
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
@ -82,23 +83,18 @@ def saveConfig(request):
return render(request, 'addConfig.html') return render(request, 'addConfig.html')
def save(request):
media_id = int(request.GET.get('id'))
bangumi.save(media_id)
return HttpResponse("save success")
def get(request): def get(request):
return HttpResponse(str(bangumi.get(bangumi).__str__())) return HttpResponse(str(get_().__str__()))
# 测试方法
def start(request): def start(request):
begin = int(request.GET.get('start')) begin = int(request.GET.get('start'))
end = int(request.GET.get('end')) end = int(request.GET.get('end'))
bangumi(begin=begin, end=end).start() _thread.start_new_thread(threadSave, (begin, end))
return HttpResponse("start success") return HttpResponse("start success")
def stop(request): def stop(request):
bangumi.stop(bangumi) stop_()
return HttpResponse("stop success") return HttpResponse("stop success")

Loading…
Cancel
Save