|
|
@ -1,5 +1,5 @@ |
|
|
|
#!/usr/bin/env python |
|
|
|
#!/usr/bin/env python |
|
|
|
#coding:utf-8 |
|
|
|
# coding:utf-8 |
|
|
|
from concurrent import futures |
|
|
|
from concurrent import futures |
|
|
|
import threading |
|
|
|
import threading |
|
|
|
import json |
|
|
|
import json |
|
|
@ -9,9 +9,9 @@ import sys |
|
|
|
from datetime import datetime |
|
|
|
from datetime import datetime |
|
|
|
import os |
|
|
|
import os |
|
|
|
import zipfile |
|
|
|
import zipfile |
|
|
|
import logging.config |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from PixivSearch.model import config |
|
|
|
from PixivSearch.model import config |
|
|
|
|
|
|
|
from PixivSearch.settings import logger |
|
|
|
|
|
|
|
|
|
|
|
headers = { |
|
|
|
headers = { |
|
|
|
'X-Requested-With': 'XMLHttpRequest', |
|
|
|
'X-Requested-With': 'XMLHttpRequest', |
|
|
@ -19,10 +19,9 @@ headers = { |
|
|
|
'Chrome/56.0.2924.87 Safari/537.36' |
|
|
|
'Chrome/56.0.2924.87 Safari/537.36' |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
lock = threading.Lock() # 多线程全局资源锁 |
|
|
|
lock = threading.Lock() # 多线程全局资源锁 |
|
|
|
total = 1 |
|
|
|
total = 1 |
|
|
|
logging.config.fileConfig('PixivSearch/logging.conf') |
|
|
|
|
|
|
|
logger = logging.getLogger('file') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cookies(): |
|
|
|
def get_cookies(): |
|
|
|
_cookies = {} |
|
|
|
_cookies = {} |
|
|
@ -33,58 +32,65 @@ def get_cookies(): |
|
|
|
return _cookies |
|
|
|
return _cookies |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def crawl(url):
    """Fetch one Pixiv search-result page and collect its illustration records.

    The page is requested with the module-level ``headers`` and the cookies
    returned by ``get_cookies()``. The JSON stored in the ``data-items``
    attribute of the ``#js-mount-point-search-result-list`` element is decoded
    and every decoded item is appended to the module-level ``nodes`` list
    while holding ``lock``.

    Args:
        url: Address of the search-result page to download.

    Returns:
        None. Results are accumulated in the global ``nodes`` list.
    """
    page = requests.get(url, headers=headers, cookies=get_cookies()).text
    mount_points = BeautifulSoup(page, 'lxml').select('#js-mount-point-search-result-list')
    data_items = mount_points[0].attrs.get('data-items') if mount_points else None
    if data_items is None:
        # Without this guard a page lacking the element raises IndexError/KeyError
        # inside the worker thread, where nothing ever reports it.
        logger.error('no search result data found in page: %s' % url)
        return

    image_nodes = json.loads(data_items)
    # Take the lock once per page rather than once per item.
    with lock:
        nodes.extend(image_nodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_urls(search, page):
    """Build the list of Pixiv search-result URLs for a search term.

    Args:
        search: Search word; it is percent-encoded before being placed in
            the query string so characters such as ``&``, ``#`` or spaces
            cannot corrupt the URL.
        page: Exclusive upper bound of the page numbers; URLs are produced
            for pages ``1`` to ``page - 1``.

    Returns:
        A list of URL strings ordered by page number (empty when
        ``page <= 1``).
    """
    from urllib.parse import quote

    fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
    word = quote(str(search), safe='')
    return [fmt.format(word, p) for p in range(1, page)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_Img(params):
    """Download the original-size image of one ranked illustration.

    The ``.jpg`` variant is requested first; if the server answers 404 the
    ``.png`` variant is tried. The image bytes are written to
    ``imgPath + illustId + suffix`` (``imgPath`` is a module-level global).

    Args:
        params: ``(rank, node)`` pair. ``node`` is an illustration dict that
            is read for ``url``, ``illustId``, ``illustTitle``,
            ``bookmarkCount`` and ``tags``, and is updated in place with
            ``imgUrl`` and, on success, ``localName``.

    Returns:
        None. When the image cannot be fetched an error is logged and no
        file is written.
    """
    rank, node = params[0], params[1]

    # NOTE(review): the [-46:-15] slice presumably extracts the date/id path
    # segment from the thumbnail URL - confirm against the current URL layout.
    node['imgUrl'] = 'https://i.pximg.net/img-original/img/' + node['url'][-46:-15]

    # Build per-call headers: this function runs in many threads at once, and
    # writing the referer into the shared global dict lets one thread's
    # request go out with another illustration's referer.
    request_headers = dict(headers)
    request_headers['referer'] = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + node['illustId']

    suffix = ".jpg"
    logger.info('开始下载图片:%s%s' % (node['imgUrl'], suffix))

    response = requests.get(node['imgUrl'] + suffix, headers=request_headers, cookies=get_cookies())
    if response.status_code == 404:
        suffix = '.png'
        response = requests.get(node['imgUrl'] + suffix, headers=request_headers, cookies=get_cookies())
    if response.status_code != 200:
        # Any non-success answer (not only 404) must not be saved as an image.
        logger.error('无法下载图片:%s' % (node['illustTitle']))
        return

    local_name = node['illustId'] + suffix
    logger.info('下载图片:"%s"到%s' % (
        node['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + local_name))
    # Binary mode and response.content are required for media files; the
    # context manager closes the file even if the write fails.
    with open(imgPath + local_name, 'wb') as image_file:
        image_file.write(response.content)

    node['localName'] = local_name
    logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
        rank, node['bookmarkCount'], node['illustTitle'], ','.join(node['tags']), '',
        node['imgUrl']))
|
|
|
|
|
|
|
|
|
|
|
def zip(inputFile, outFile):
    """Pack every file under a directory into a deflate-compressed zip archive.

    Note: this function shadows the built-in ``zip`` inside this module; the
    name is kept because callers use it.

    Args:
        inputFile: Directory whose files (including those in subdirectories)
            are added to the archive.
        outFile: Path of the zip file to create; an existing file is
            overwritten.

    Returns:
        None.
    """
    # The context manager finalizes and closes the archive even when adding a
    # file fails part-way through.
    with zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _dirnames, filenames in os.walk(inputFile):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                # Store the path relative to inputFile so same-named files in
                # different subdirectories do not collide; for a flat
                # directory this is just the file name, as before.
                archive.write(full_path, os.path.relpath(full_path, inputFile))
|
|
|
|
|
|
|
|
|
|
|
fsize='' |
|
|
|
|
|
|
|
|
|
|
|
fsize = '' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _run_in_pool(worker, items):
    """Run ``worker`` on every item in a 32-thread pool and log each failure."""
    with futures.ThreadPoolExecutor(32) as executor:
        pending = [executor.submit(worker, item) for item in items]
        for finished in futures.as_completed(pending):
            error = finished.exception()
            if error is not None:
                # executor.map() would drop this silently because its results
                # were never consumed; report it but keep the remaining work.
                logger.error('%s failed: %r' % (worker.__name__, error))


def get_nodes(param):
    """Search Pixiv, rank the results by bookmark count and optionally download them.

    Args:
        param: Sequence shaped like ``sys.argv``:
            ``param[1]`` search word, ``param[2]`` number of result pages to
            crawl, ``param[3]`` how many top illustrations to keep,
            ``param[4]`` mode; when it equals ``'img'`` the kept images
            are downloaded into ``PixivSearch/static/images/`` (which is
            emptied first) and zipped into ``PixivSearch/static/download/``.

    Returns:
        A list ``[nodes, tip, fsize]``: the kept illustration dicts sorted by
        ``bookmarkCount`` descending, a summary message, and the zip size as
        a string such as ``'1.23MB'`` (empty string when nothing was zipped).
    """
    global nodes, fsize, imgPath
    nodes = []
    # Reset so a run without 'img' mode does not report the previous run's size.
    fsize = ''
    start = datetime.now()
    urls = get_urls(param[1], int(param[2]) + 1)
    logger.info('开始从P站获取图片数据')
    _run_in_pool(crawl, urls)

    length = len(nodes)
    logger.info('获取到%d张图片' % (length))
    logger.info('对图片收藏数进行排序')
    # Sort by bookmark count, highest first; a missing count is treated as 0
    # so that None never gets compared with an int.
    nodes = sorted(nodes, key=lambda v: v.get('bookmarkCount') or 0, reverse=True)[:int(param[3])]
    if param[4] == 'img':
        imgPath = 'PixivSearch/static/images/'
        # Clear images left over from the previous search.
        for file in os.listdir(imgPath):
            os.remove(imgPath + file)
        # Pair every illustration with its 1-based rank for get_Img.
        nodes_tup = list(enumerate(nodes, 1))
        start_d = datetime.now()
        _run_in_pool(get_Img, nodes_tup)
        print('下载图片花费时间:%s' % (datetime.now() - start_d))
        logger.info('%s张图片下载完毕' % (len(os.listdir(imgPath))))

        zipPath = 'PixivSearch/static/download/' + param[1] + '.zip'
        logger.info('图片打包到:%s' % (zipPath))
        zip(imgPath, zipPath)
        fsize = str(round(os.path.getsize(zipPath) / float(1024 * 1024), 2)) + 'MB'
        logger.info('图包大小:%s' % (fsize))

    tip = '从%d张图片中筛选出收藏数前%s的图片,处理耗时:%s' % (length, param[3], datetime.now() - start)
    logger.info(tip)
    return [nodes, tip, fsize]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Expected invocation: <script> <search word> <pages> <top N> <mode>,
    # where <pages> and <top N> must be non-negative integers.
    if len(sys.argv) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
        try:
            get_nodes(sys.argv)
        except Exception as e:
            # The bare repr(e) expression discarded the error; log it with the
            # traceback instead. Exception (not BaseException) lets Ctrl-C and
            # SystemExit end the process normally.
            logger.exception(repr(e))
    else:
        logger.error('参数不合法')