From d210153bdb3b6151b781b271edbf200a28b1ee0b Mon Sep 17 00:00:00 2001
From: 10295 <1029559041@qq.com>
Date: Sat, 24 Mar 2018 15:03:40 +0800
Subject: [PATCH] Multithreaded crawler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PixivSearch/dao/bangumi.py |  53 +++++++++++++-----
 PixivSearch/logging.conf   |   2 +-
 PixivSearch/pixiv.py       | 107 +++++++++++++++++++------------------
 PixivSearch/settings.py    |   7 +--
 PixivSearch/view.py        |   4 +-
 5 files changed, 103 insertions(+), 70 deletions(-)

diff --git a/PixivSearch/dao/bangumi.py b/PixivSearch/dao/bangumi.py
index 863709f..5915739 100644
--- a/PixivSearch/dao/bangumi.py
+++ b/PixivSearch/dao/bangumi.py
@@ -1,5 +1,6 @@
 import _thread
 import json
+import math
 import os
 import random
 import time
@@ -31,17 +32,18 @@ def check():
         time.sleep(1)
 
 
-
-
-
 def save(md):
     if isStop:
         return
-    time.sleep(random.randint(1, 3))
-    url = "https://www.bilibili.com/bangumi/media/md%d" % md
-    req = requests.get(url)
 
-    logger.info("request_url=%s,status_code=%d" % (url,req.status_code))
+    url = "https://www.bilibili.com/bangumi/media/md%d" % md
+    try:
+        req = requests.get(url, timeout=10)
+    except BaseException as e:
+        logger.error(e)
+        save(md)  # retry the request
+        return  # without this return, req would be unbound below
+    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
     if (req.status_code == 200):
         tag = BeautifulSoup(req.text, 'lxml')
         script = tag.select("script")[3].text
@@ -60,17 +61,44 @@ def save(md):
             logger.error("Exception occurred")
             logger.error(e)
 
-#asdasd
+
 def get_():
     global current_mediaInfo
     return current_mediaInfo
 
-def threadSave(start, end):
-    ids = []
+page_size = 100
+pages = None
+ids = None
+
+
+def getIds():
+    global ids
+    global pages
+    if ids is None or len(ids) != 0:
+        ids = []
+        page = 1
+        while pages is None or page <= pages:
+            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
+            logger.info(url)
+            try:
+                req = requests.get(url, timeout=10)
+                json_obj = json.loads(req.text)
+                bangumiList = json_obj['result']['list']
+                for bangumi in bangumiList:
+                    ids.append(int(bangumi['season_id']))
+                if pages is None:
+                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
+            except BaseException as e:
+                logger.error('Connection timed out')
+                logger.error(e)
+                continue
+            page = page + 1
+
 
-    for id in range(start, end):
-        ids.append(id)
+def threadSave():
+    getIds()
+    logger.info(len(ids))
     try:
         global executors
         executors = futures.ThreadPoolExecutor(32)
diff --git a/PixivSearch/logging.conf b/PixivSearch/logging.conf
index 005e412..dc3325b 100644
--- a/PixivSearch/logging.conf
+++ b/PixivSearch/logging.conf
@@ -29,5 +29,5 @@ formatter=fmt
 args=('pixiv.log','a','utf-8',False)
 
 [formatter_fmt]
-format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
+format=%(asctime)s - %(name)s - %(levelname)s - %(module)s:line:%(lineno)d - %(message)s
 datefmt=%Y-%m-%d %H:%M:%S
\ No newline at end of file
diff --git a/PixivSearch/pixiv.py b/PixivSearch/pixiv.py
index 1e3822d..a3312b5 100644
--- a/PixivSearch/pixiv.py
+++ b/PixivSearch/pixiv.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#coding:utf-8
+# coding:utf-8
 from concurrent import futures
 import threading
 import json
@@ -9,9 +9,9 @@ import sys
 from datetime import datetime
 import os
 import zipfile
-import logging.config
 
 from PixivSearch.model import config
+from PixivSearch.settings import logger
 
 headers = {
     'X-Requested-With': 'XMLHttpRequest',
@@ -19,10 +19,9 @@ headers = {
                   'Chrome/56.0.2924.87 Safari/537.36'
 }
 
-lock = threading.Lock() # global resource lock for multithreading
+lock = threading.Lock()  # global resource lock for multithreading
 total = 1
-logging.config.fileConfig('PixivSearch/logging.conf')
-logger = logging.getLogger('file')
+
 
 def get_cookies():
     _cookies = {}
@@ -33,58 +32,65 @@ def get_cookies():
     return _cookies
 
 
-
-
 def crawl(url):
     global total
     req = requests.get(url, headers=headers, cookies=get_cookies()).text
-    tag=BeautifulSoup(req, 'lxml').select('#js-mount-point-search-result-list')[0].attrs['data-items']
-    imageNodes=json.loads(tag)
+    tag = BeautifulSoup(req, 'lxml').select('#js-mount-point-search-result-list')[0].attrs['data-items']
+    imageNodes = json.loads(tag)
     for imageNode in imageNodes:
         with lock:
             nodes.append(imageNode)
 
+
 def get_urls(search, page):
     fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
     return [fmt.format(search, p) for p in range(1, page)]
 
+
 def get_Img(params):
-    params[1]['imgUrl']='https://i.pximg.net/img-original/img/'+params[1]['url'][-46:-15]
+    params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15]
 
-    headers['referer']='https://www.pixiv.net/member_illust.php?mode=medium&illust_id='+params[1]['illustId']
+    headers['referer'] = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + params[1]['illustId']
 
-    suffix=".jpg"
+    suffix = ".jpg"
     logger.info('Start downloading image: %s%s' % (params[1]['imgUrl'], suffix))
-    s=requests.get(params[1]['imgUrl']+suffix, headers=headers, cookies=get_cookies())
-    if(s.status_code==404):
-        suffix='.png'
-        s=requests.get(params[1]['imgUrl']+suffix,headers=headers, cookies=get_cookies())
-        if(s.status_code==404):
+    s = requests.get(params[1]['imgUrl'] + suffix, headers=headers, cookies=get_cookies())
+    if (s.status_code == 404):
+        suffix = '.png'
+        s = requests.get(params[1]['imgUrl'] + suffix, headers=headers, cookies=get_cookies())
+        if (s.status_code == 404):
             logger.error('Unable to download image: %s' % (params[1]['illustTitle']))
             return
-    logger.info('Downloading image "%s" to %s'%(params[1]['illustTitle'],os.getcwd().replace('\\','/')+'/'+imgPath+params[1]['illustId']+suffix))
-    f = open(imgPath+params[1]['illustId']+suffix, 'wb') # binary files must be opened with mode 'b'
-    f.write(s.content) # binary files must be written from response.content
+    logger.info('Downloading image "%s" to %s' % (
+        params[1]['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + params[1]['illustId'] + suffix))
+    f = open(imgPath + params[1]['illustId'] + suffix, 'wb')  # binary files must be opened with mode 'b'
+    f.write(s.content)  # binary files must be written from response.content
     f.close()
-    params[1]['localName']=params[1]['illustId']+suffix
-    logger.info('Rank %d, bookmarks %d, title: %s, tags: %s, posted (%s) ago, link: %s'%(params[0],params[1]['bookmarkCount'],params[1]['illustTitle'], ','.join(params[1]['tags']),'',params[1]['imgUrl']))
+    params[1]['localName'] = params[1]['illustId'] + suffix
+    logger.info('Rank %d, bookmarks %d, title: %s, tags: %s, posted (%s) ago, link: %s' % (
+        params[0], params[1]['bookmarkCount'], params[1]['illustTitle'], ','.join(params[1]['tags']), '',
+        params[1]['imgUrl']))
 
-def zip(inputFile,outFile):
-    f = zipfile.ZipFile(outFile,'w',zipfile.ZIP_DEFLATED)
+
+def zip(inputFile, outFile):
+    f = zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED)
     for dirpath, dirnames, filenames in os.walk(inputFile):
         for filename in filenames:
-            f.write(os.path.join(dirpath,filename),filename)
+            f.write(os.path.join(dirpath, filename), filename)
     f.close()
 
-fsize=''
+
+fsize = ''
+
+
 def get_nodes(param):
-    global nodes,fsize,imgPath
-    nodes=[]
+    global nodes, fsize, imgPath
+    nodes = []
     start = datetime.now()
-    urls = get_urls(param[1], int(param[2])+1)
+    urls = get_urls(param[1], int(param[2]) + 1)
     logger.info('Start fetching image data from pixiv')
     with futures.ThreadPoolExecutor(32) as executor:
         executor.map(crawl, urls)
@@ -92,43 +98,42 @@ def get_nodes(param):
 
     # for url in urls:
     #     crawl(url)
-    length=len(nodes)
-    logger.info('Fetched %d images'%(length))
+    length = len(nodes)
+    logger.info('Fetched %d images' % (length))
     logger.info('Sorting images by bookmark count')
-    nodes=sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])] # sort by bookmark count, descending
-    if(param[4]!=None and param[4]=='img'):
-        imgPath='PixivSearch/static/images/'
+    nodes = sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])]  # sort by bookmark count, descending
+    if (param[4] != None and param[4] == 'img'):
+        imgPath = 'PixivSearch/static/images/'
         for file in os.listdir(imgPath):
-            os.remove(imgPath+file)
-        nodes_tup=[]
-        start_d=datetime.now()
-        for index,img in enumerate(nodes):
-            nodes_tup.append((index+1,img))
+            os.remove(imgPath + file)
+        nodes_tup = []
+        start_d = datetime.now()
+        for index, img in enumerate(nodes):
+            nodes_tup.append((index + 1, img))
             # get_Img((index+1,img))
         with futures.ThreadPoolExecutor(32) as executor:
-             executor.map(get_Img, nodes_tup)
+            executor.map(get_Img, nodes_tup)
         print('Image download took: %s' % (datetime.now() - start_d))
-        logger.info('%s images downloaded'%(len(os.listdir(imgPath))))
+        logger.info('%s images downloaded' % (len(os.listdir(imgPath))))
 
-        zipPath='PixivSearch/static/download/' + param[1] + '.zip'
+        zipPath = 'PixivSearch/static/download/' + param[1] + '.zip'
         logger.info('Packing images into: %s' % (zipPath))
-        zip(imgPath,zipPath)
-        fsize = str(round(os.path.getsize(zipPath)/float(1024*1024),2))+'MB'
-        logger.info('Archive size: %s'%(fsize))
+        zip(imgPath, zipPath)
+        fsize = str(round(os.path.getsize(zipPath) / float(1024 * 1024), 2)) + 'MB'
+        logger.info('Archive size: %s' % (fsize))
 
-    tip='Out of %d images, selected the top %s by bookmark count; took %s'%(length,param[3],datetime.now()-start)
+    tip = 'Out of %d images, selected the top %s by bookmark count; took %s' % (length, param[3], datetime.now() - start)
     logger.info(tip)
-    return [nodes,tip,fsize]
-
+    return [nodes, tip, fsize]
 
 if __name__ == "__main__":
-    if (len(sys.argv))==5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
+    if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
         try:
             get_nodes(sys.argv)
         except BaseException as e:
             repr(e)
-    else :
-        logger.error('Invalid arguments')
\ No newline at end of file
+    else:
+        logger.error('Invalid arguments')
diff --git a/PixivSearch/settings.py b/PixivSearch/settings.py
index c1f0004..717f957 100644
--- a/PixivSearch/settings.py
+++ b/PixivSearch/settings.py
@@ -80,7 +80,8 @@ DATABASES = {
         'NAME': 'bangumi',
         'USER': 'bilibili',
         'PASSWORD': '2233',
-        'HOST': '127.0.0.1',
+        # 'HOST': '127.0.0.1',
+        'HOST': 'mikuhime.xyz',
         'PORT': '3306',
     }
 }
@@ -127,7 +128,7 @@ STATICFILES_DIRS = [
     os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'),
 ]
 
-curr_dir = os.path.dirname(os.path.realpath(__file__))
-logging.config.fileConfig('%s%slogging.conf' % (curr_dir,os.sep))
+configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
+logging.config.fileConfig(configPath)
 
 logger = logging.getLogger('file')
diff --git a/PixivSearch/view.py b/PixivSearch/view.py
index e34dc73..e0659b2 100644
--- a/PixivSearch/view.py
+++ b/PixivSearch/view.py
@@ -89,9 +89,7 @@ def get(request):
 
 
 # Test endpoint
 def start(request):
-    begin = int(request.GET.get('start'))
-    end = int(request.GET.get('end'))
-    _thread.start_new_thread(threadSave, (begin, end))
+    _thread.start_new_thread(threadSave, ())
    return HttpResponse("start success")
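
Note on the retry logic above: save() retries a failed request by calling itself with no attempt limit, and getIds() retries the same page with a bare continue, so a permanently failing URL never stops retrying. Below is a minimal bounded alternative, assuming only the requests library; the name fetch_with_retry and its parameters are illustrative, not part of this patch.

import time

import requests


def fetch_with_retry(url, retries=3, timeout=10, backoff=2):
    # Retry a GET a fixed number of times with exponential backoff,
    # instead of recursing without bound; None signals final failure.
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print('attempt %d/%d for %s failed: %s' % (attempt, retries, url, e))
            time.sleep(backoff ** attempt)
    return None

With a helper like this, save() would start with req = fetch_with_retry(url) and return early when req is None, and the unbounded recursion disappears.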
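threadSave() and get_nodes() fan work out the same way: collect the work items, then hand them to futures.ThreadPoolExecutor(32) via executor.map, with a threading.Lock guarding any list the workers share (see crawl()). A self-contained sketch of that pattern follows; fetch_one and the example URLs are placeholders, not code from this repository.

from concurrent import futures

import requests


def fetch_one(url):
    # One blocking HTTP request per worker; the pool runs up to 32
    # of these concurrently, matching ThreadPoolExecutor(32) above.
    return url, requests.get(url, timeout=10).status_code


urls = ['https://example.com/page/%d' % p for p in range(1, 4)]

# The with-block waits for all workers to finish before exiting,
# and executor.map yields results in the order of its inputs.
with futures.ThreadPoolExecutor(32) as executor:
    for url, status in executor.map(fetch_one, urls):
        print(url, status)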