#!/usr/bin/env python
# coding:utf-8
"""Pixiv search crawler.

Scrapes pixiv.net search-result pages, ranks the found illustrations by
bookmark count, optionally downloads the top-N originals and packs them
into a zip under PixivSearch/static/download/.

Usage (CLI): script <word> <pages> <top_n> <mode>   (mode 'img' downloads files)
"""
from concurrent import futures
import threading
import json
import requests
from bs4 import BeautifulSoup
import sys
from datetime import datetime
import os
import zipfile
import logging.config

from PixivSearch.model import config

# Base request headers; get_Img() copies this per call instead of mutating it
# (the original code wrote 'referer' into this shared dict from 32 threads).
headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/56.0.2924.87 Safari/537.36'
}

lock = threading.Lock()  # guards the shared `nodes` list across crawler threads
total = 1

logging.config.fileConfig('PixivSearch/logging.conf')
logger = logging.getLogger('file')


def get_cookies():
    """Build a cookie dict from the semicolon-separated 'pixiv_cookie' config value.

    Returns:
        dict: cookie name -> value, read from the project config store.
    """
    _cookies = {}
    raw = config.param.objects.get(param_name='pixiv_cookie').param_value
    for row in raw.split(';'):
        k, v = row.strip().split('=', 1)
        _cookies[k] = v
    return _cookies


def crawl(url):
    """Fetch one search-result page and append its illustration nodes to `nodes`.

    The result list is embedded as JSON in the 'data-items' attribute of the
    '#js-mount-point-search-result-list' element. Missing mount point (layout
    change / empty page) is logged and skipped instead of raising IndexError,
    which would be silently swallowed by executor.map.
    """
    html = requests.get(url, headers=headers, cookies=get_cookies()).text
    mount = BeautifulSoup(html, 'lxml').select_one('#js-mount-point-search-result-list')
    if mount is None:
        logger.error('no search-result mount point on %s', url)
        return
    imageNodes = json.loads(mount.attrs['data-items'])
    with lock:
        nodes.extend(imageNodes)


def get_urls(search, page):
    """Return the search-result URLs for pages 1..page-1 (date-descending order)."""
    fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
    return [fmt.format(search, p) for p in range(1, page)]


def get_Img(params):
    """Download one original illustration to `imgPath`.

    Args:
        params: (rank, node) tuple — rank is the 1-based position after
            sorting, node is the illustration dict from the search JSON.
            Mutates node in place, adding 'imgUrl' and (on success) 'localName'.

    Tries .jpg first, falls back to .png on 404; gives up (logged) if both 404.
    """
    rank, node = params
    # node['url'] is a thumbnail URL; slice out the date/id path segment to
    # build the original-image URL (assumes pixiv's fixed URL layout).
    node['imgUrl'] = 'https://i.pximg.net/img-original/img/' + node['url'][-46:-15]
    # Per-call header copy: writing 'referer' into the shared module-level
    # dict raced between the 32 download threads.
    req_headers = dict(headers)
    req_headers['referer'] = ('https://www.pixiv.net/member_illust.php?mode=medium&illust_id='
                              + node['illustId'])
    suffix = ".jpg"
    logger.info('开始下载图片:%s%s' % (node['imgUrl'], suffix))
    s = requests.get(node['imgUrl'] + suffix, headers=req_headers, cookies=get_cookies())
    if s.status_code == 404:
        # Original may be a PNG rather than a JPG.
        suffix = '.png'
        s = requests.get(node['imgUrl'] + suffix, headers=req_headers, cookies=get_cookies())
        if s.status_code == 404:
            logger.error('无法下载图片:%s' % (node['illustTitle']))
            return
    logger.info('下载图片:"%s"到%s' % (node['illustTitle'],
                os.getcwd().replace('\\', '/') + '/' + imgPath + node['illustId'] + suffix))
    # Binary mode is required for image content; `with` guarantees the handle
    # is closed even if the write fails (the original leaked it).
    with open(imgPath + node['illustId'] + suffix, 'wb') as f:
        f.write(s.content)
    node['localName'] = node['illustId'] + suffix
    logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (rank, node['bookmarkCount'],
                node['illustTitle'], ','.join(node['tags']), '', node['imgUrl']))


def zip(inputFile, outFile):  # NOTE: shadows builtin zip(); name kept for existing callers
    """Pack every file under inputFile into outFile (flat — no directory entries)."""
    with zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED) as zf:
        for dirpath, dirnames, filenames in os.walk(inputFile):
            for filename in filenames:
                zf.write(os.path.join(dirpath, filename), filename)


fsize = ''  # human-readable size of the last zip produced, e.g. '1.23MB'


def get_nodes(param):
    """Crawl, rank and (optionally) download search results.

    Args:
        param: argv-style list — param[1] search word, param[2] page count,
            param[3] top-N to keep, param[4] mode ('img' downloads + zips).

    Returns:
        [nodes, tip, fsize]: ranked node dicts, a summary string, and the
        zip size string ('' when mode != 'img').
    """
    global nodes, fsize, imgPath
    nodes = []
    start = datetime.now()
    urls = get_urls(param[1], int(param[2]) + 1)
    logger.info('开始从P站获取图片数据')
    with futures.ThreadPoolExecutor(32) as executor:
        executor.map(crawl, urls)
    length = len(nodes)
    logger.info('获取到%d张图片' % (length))
    logger.info('对图片收藏数进行排序')
    # Keep only the top-N by bookmark count (descending).
    nodes = sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])]
    if param[4] == 'img':
        imgPath = 'PixivSearch/static/images/'
        # Clear leftovers from the previous run.
        for file in os.listdir(imgPath):
            os.remove(imgPath + file)
        start_d = datetime.now()
        nodes_tup = [(index + 1, img) for index, img in enumerate(nodes)]
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(get_Img, nodes_tup)
        print('下载图片花费时间:%s' % (datetime.now() - start_d))
        logger.info('%s张图片下载完毕' % (len(os.listdir(imgPath))))
        zipPath = 'PixivSearch/static/download/' + param[1] + '.zip'
        logger.info('图片打包到:%s' % (zipPath))
        zip(imgPath, zipPath)
        fsize = str(round(os.path.getsize(zipPath) / float(1024 * 1024), 2)) + 'MB'
        logger.info('图包大小:%s' % (fsize))
    tip = '从%d张图片中筛选出收藏数前%s的图片,处理耗时:%s' % (length, param[3], datetime.now() - start)
    logger.info(tip)
    return [nodes, tip, fsize]


if __name__ == "__main__":
    if len(sys.argv) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
        try:
            get_nodes(sys.argv)
        except Exception as e:
            # The original `repr(e)` threw the error away and even trapped
            # KeyboardInterrupt (BaseException); log with traceback instead.
            logger.exception(e)
    else:
        logger.error('参数不合法')