master
10295 7 years ago
parent bd556a441d
commit 7312fa2315
  1. 106
      PixivSearch/dao/bangumi.py
  2. 3
      PixivSearch/logging.conf
  3. 12
      PixivSearch/settings.py
  4. 3
      PixivSearch/urls.py
  5. 20
      PixivSearch/view.py

@ -1,10 +1,12 @@
import _thread
import json
import os
import random
import time
from concurrent import futures
import django
import requests
import threading
from bs4 import BeautifulSoup
from PixivSearch.settings import logger
@ -15,56 +17,72 @@ from PixivSearch.model.config import mediaInfo, stat
# Most recently stored mediaInfo row; starts as a placeholder and is replaced by save().
current_mediaInfo = mediaInfo(id=0, chn_name='null')
# Run flag for the sequential crawler: go() sets it to True, stop() sets it to False.
flag = True
# Stop request for the thread-pool crawler: set by stop_(), cleared by threadSave().
isStop = False
# Thread pool created by threadSave(); stays None until then.
executors = None
class bangumi(threading.Thread):
    """Thread that carries a begin/end range of media ids."""

    # Class-level defaults; __init__ overrides begin and end per instance.
    begin = 0
    end = 0
    id = 0
    # NOTE(review): stop() and go() use the module-level ``flag`` global,
    # not this attribute.
    flag = True

    def __init__(self, begin, end):
        """Store the range bounds after initialising the Thread base class."""
        super().__init__()
        self.begin = begin
        self.end = end
def check():
    """Wait for the stop flag, then shut the crawler thread pool down.

    Polls the module-level ``isStop`` flag once per second. When the flag is
    set, logs the stop and calls ``shutdown()`` on the pool stored in
    ``executors``.
    """
    while not isStop:
        time.sleep(1)
    logger.info('停止多线程爬虫')
    # executors is None until threadSave() has created a pool, so a stop
    # request that arrives first must not dereference it.
    if executors is not None:
        executors.shutdown()
# NOTE(review): two variants of save appear interleaved in this span: one takes
# ``self`` and reads ``self.id``, the other takes the media id ``md`` directly.
# Indentation is absent, so the nesting of the statements cannot be read from here.
def save(self):
req = requests.get("https://www.bilibili.com/bangumi/media/md%d" % self.id)
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
# Take the text of the fourth <script> element on the page.
script = tag.select("script")[3].text
# Keep what lies between the first "=" and two characters before the first "function".
json_str = script[script.index("=") + 1:script.index("function") - 2]
json_obj = json.loads(json_str)
def save(md):
# Do nothing once a stop has been requested.
if isStop:
return
# Pause a random 1-3 seconds before each request.
time.sleep(random.randint(1, 3))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
req = requests.get(url)
logger.info("request_url=%s,status_code=%d" % (url,req.status_code))
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
# Take the text of the fourth <script> element on the page.
script = tag.select("script")[3].text
# Keep what lies between the first "=" and two characters before the first "function".
json_str = script[script.index("=") + 1:script.index("function") - 2]
json_obj = json.loads(json_str)
try:
stat_info = json_obj['mediaInfo']['stat']
print(json_obj['mediaInfo']['chn_name'])
print(stat_info)
# Store the media row, then re-read it into the module-level current_mediaInfo.
mediaInfo(id=self.id, chn_name=json_obj['mediaInfo']['chn_name']).save()
mediaInfo(id=md, chn_name=json_obj['mediaInfo']['chn_name']).save()
global current_mediaInfo
current_mediaInfo = mediaInfo.objects.get(pk=self.id)
stat(id=self.id, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
current_mediaInfo = mediaInfo.objects.get(pk=md)
stat(id=md, danmakus=int(stat_info['danmakus']), favorites=stat_info['favorites'],
views=stat_info['views']).save()
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit;
# the error is logged and not re-raised.
except BaseException as e:
logger.error("发生异常")
logger.error(e)
def get_():
    """Return the module-level ``current_mediaInfo`` object."""
    # Reading a module global needs no ``global`` declaration.
    return current_mediaInfo
def threadSave(start, end):
    """Crawl media ids in ``range(start, end)`` on a pool of 32 threads.

    Stores the pool in the module-level ``executors`` and resets ``isStop``
    before submitting one ``save`` call per id. Blocks until every task has
    finished, then logs completion.

    Args:
        start: first media id to crawl (inclusive).
        end: media id to stop before (exclusive).
    """
    global executors, isStop
    try:
        executors = futures.ThreadPoolExecutor(32)
        isStop = False
        with executors as executor:
            pending = {executor.submit(save, md): md for md in range(start, end)}
            # Collect every result so that an exception raised inside a worker
            # is reported instead of being dropped with the unused future.
            for finished in futures.as_completed(pending):
                error = finished.exception()
                if error is not None:
                    logger.error("md%d failed: %r", pending[finished], error)
        logger.info('结束爬虫')
    except Exception as e:
        logger.error(e)
def run(self) -> None:
    """Thread body: pass the stored begin/end bounds to go()."""
    first, last = self.begin, self.end
    self.go(first, last)
def get(self):
    """Return the module-level ``current_mediaInfo`` object; ``self`` is not used."""
    # Reading a module global needs no ``global`` declaration.
    return current_mediaInfo
def stop(self):
    """Clear the module-level run flag that go() checks on each iteration."""
    # Writes the module global ``flag``, not the class attribute of the same name.
    global flag
    flag = False
def go(self, start, end):
    """Crawl ids in ``range(start, end)`` one at a time until stopped.

    Sets the module-level ``flag`` to True, then for each id waits one
    second, logs the progress, stores the id on ``self.id`` and calls
    ``self.save()``. Leaves the loop as soon as ``flag`` is found False.
    """
    global flag
    flag = True
    for num in range(start, end):
        if not flag:
            logger.info("停止爬虫")
            break
        time.sleep(1)
        logger.info("爬虫进度:%d" % num)
        self.id = num
        self.save()
def stop_():
    """Set the module-level ``isStop`` flag that save() and check() read."""
    global isStop
    isStop = True

@ -29,4 +29,5 @@ formatter=fmt
args=('pixiv.log','a','utf-8',False)
[formatter_fmt]
format=%(asctime)s - %(module)s:%(lineno)d - %(levelname)s - %(message)s
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
datefmt=%Y-%m-%d %H:%M:%S

@ -76,8 +76,12 @@ WSGI_APPLICATION = 'PixivSearch.wsgi.application'
# Database connection: the MySQL database "bangumi" on the local host.
# Each value can be overridden through an environment variable so that
# credentials need not live in the source tree; the literals are the former
# hard-coded values and remain the defaults.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': os.environ.get('BANGUMI_DB_NAME', 'bangumi'),
        'USER': os.environ.get('BANGUMI_DB_USER', 'bilibili'),
        'PASSWORD': os.environ.get('BANGUMI_DB_PASSWORD', '2233'),
        'HOST': os.environ.get('BANGUMI_DB_HOST', '127.0.0.1'),
        'PORT': os.environ.get('BANGUMI_DB_PORT', '3306'),
    }
}
@ -106,13 +110,13 @@ AUTH_PASSWORD_VALIDATORS = [
# Locale and time settings. Each name is assigned exactly once; the values
# are the ones that were effective after the earlier duplicate assignments.
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_L10N = True
# NOTE(review): with USE_TZ off Django is expected to work with naive
# datetimes in TIME_ZONE; confirm against the Django settings reference.
USE_TZ = False
# Static files (CSS, JavaScript, Images)

@ -14,14 +14,13 @@ Including another URLconf
2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls'))
"""
from django.conf.urls import url
from PixivSearch.view import search, index, download, saveConfig, save, get, start, stop
from PixivSearch.view import search, index, download, saveConfig, get, start, stop
urlpatterns = [
url(r'^$', index),
url(r'^pixiv/search', search),
url(r'^pixiv/download', download),
url(r'^tsdm', saveConfig),
url(r'^bangumi/save', save),
url(r'^bangumi/get', get),
url(r'^bangumi/start', start),
url(r'^bangumi/stop', stop)

@ -1,11 +1,12 @@
# coding=utf-8
import _thread
import os
import django
from django.http import Http404, StreamingHttpResponse, HttpResponse
from django.shortcuts import render
from PixivSearch.dao.bangumi import bangumi
from PixivSearch.dao.bangumi import threadSave, get_, stop_
from PixivSearch.settings import logger
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PixivSearch.settings")
@ -82,23 +83,18 @@ def saveConfig(request):
return render(request, 'addConfig.html')
def save(request):
    """Crawl and store the single media id given by the ``id`` query parameter.

    Returns a 400 response when ``id`` is missing or not an integer,
    otherwise "save success" once the crawl call has returned.
    """
    try:
        media_id = int(request.GET.get('id'))
    except (TypeError, ValueError):
        # int(None) raises TypeError, int("abc") raises ValueError.
        return HttpResponse("invalid id", status=400)
    # bangumi.save is an instance method that reads self.id, so it needs a
    # real instance rather than the bare integer.
    crawler = bangumi(begin=media_id, end=media_id + 1)
    crawler.id = media_id
    crawler.save()
    return HttpResponse("save success")
def get(request):
    """Return the text form of the crawler's current media record."""
    # get_() returns the module-level current_mediaInfo of the bangumi module;
    # str() already calls __str__, so no explicit call is needed.
    return HttpResponse(str(get_()))
# Test endpoint
def start(request):
    """Start a background crawl over the ``start``..``end`` query range.

    Returns a 400 response when either parameter is missing or not an
    integer, otherwise "start success" immediately after the crawl thread
    has been launched.
    """
    try:
        begin = int(request.GET.get('start'))
        end = int(request.GET.get('end'))
    except (TypeError, ValueError):
        # int(None) raises TypeError, int("abc") raises ValueError.
        return HttpResponse("invalid start/end", status=400)
    # Only the thread-pool crawler is launched; starting a bangumi thread as
    # well would crawl every id in the range twice.
    _thread.start_new_thread(threadSave, (begin, end))
    return HttpResponse("start success")
def stop(request):
    """Ask both crawler variants to stop and acknowledge the request."""
    # Clears the run flag polled by bangumi.go(); the class stands in for
    # self because the method only writes a module-level global.
    bangumi.stop(bangumi)
    # Sets the stop flag that the thread-pool workers check before each request.
    stop_()
    return HttpResponse("stop success")

Loading…
Cancel
Save