You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
pixiv/PixivSearch/pixiv.py

134 lines
4.6 KiB

#!/usr/bin/env python
#coding:utf-8
from concurrent import futures
import threading
import json
import requests
from bs4 import BeautifulSoup
import sys
from datetime import datetime
import os
import zipfile
import logging.config
from PixivSearch.model import config
headers = {
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/56.0.2924.87 Safari/537.36'
}
lock = threading.Lock() # 多线程全局资源锁
total = 1
logging.config.fileConfig('PixivSearch/logging.conf')
logger = logging.getLogger('file')
def get_cookies():
_cookies = {}
array = config.param.objects.get(param_name='pixiv_cookie').param_value.split(';')
for row in array:
k, v = row.strip().split('=', 1)
_cookies[k] = v
return _cookies
def crawl(url):
global total
req = requests.get(url, headers=headers, cookies=get_cookies()).text
tag=BeautifulSoup(req, 'lxml').select('#js-mount-point-search-result-list')[0].attrs['data-items']
imageNodes=json.loads(tag)
for imageNode in imageNodes:
with lock:
nodes.append(imageNode)
def get_urls(search, page):
fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
return [fmt.format(search, p) for p in range(1, page)]
def get_Img(params):
params[1]['imgUrl']='https://i.pximg.net/img-original/img/'+params[1]['url'][-46:-15]
headers['referer']='https://www.pixiv.net/member_illust.php?mode=medium&illust_id='+params[1]['illustId']
suffix=".jpg"
logger.info('开始下载图片:%s%s' % (params[1]['imgUrl'], suffix))
s=requests.get(params[1]['imgUrl']+suffix, headers=headers, cookies=get_cookies())
if(s.status_code==404):
suffix='.png'
s=requests.get(params[1]['imgUrl']+suffix,headers=headers, cookies=get_cookies())
if(s.status_code==404):
logger.error('无法下载图片:%s' % (params[1]['illustTitle']))
return
logger.info('下载图片:"%s"%s'%(params[1]['illustTitle'],os.getcwd().replace('\\','/')+'/'+imgPath+params[1]['illustId']+suffix))
f = open(imgPath+params[1]['illustId']+suffix, 'wb') #写入多媒体文件要 b 这个参数
f.write(s.content) #多媒体文件要是用conctent
f.close()
params[1]['localName']=params[1]['illustId']+suffix
logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s'%(params[0],params[1]['bookmarkCount'],params[1]['illustTitle'], ','.join(params[1]['tags']),'',params[1]['imgUrl']))
def zip(inputFile,outFile):
f = zipfile.ZipFile(outFile,'w',zipfile.ZIP_DEFLATED)
for dirpath, dirnames, filenames in os.walk(inputFile):
for filename in filenames:
f.write(os.path.join(dirpath,filename),filename)
f.close()
fsize=''
def get_nodes(param):
global nodes,fsize,imgPath
nodes=[]
start = datetime.now()
urls = get_urls(param[1], int(param[2])+1)
logger.info('开始从P站获取图片数据')
with futures.ThreadPoolExecutor(32) as executor:
executor.map(crawl, urls)
# for url in urls:
# crawl(url)
length=len(nodes)
logger.info('获取到%d张图片'%(length))
logger.info('对图片收藏数进行排序')
nodes=sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])] # 按star数降序排序
if(param[4]!=None and param[4]=='img'):
imgPath='PixivSearch/static/images/'
for file in os.listdir(imgPath):
os.remove(imgPath+file)
nodes_tup=[]
start_d=datetime.now()
for index,img in enumerate(nodes):
nodes_tup.append((index+1,img))
# get_Img((index+1,img))
with futures.ThreadPoolExecutor(32) as executor:
executor.map(get_Img, nodes_tup)
print('下载图片花费时间:%s' % (datetime.now() - start_d))
logger.info('%s张图片下载完毕'%(len(os.listdir(imgPath))))
zipPath='PixivSearch/static/download/' + param[1] + '.zip'
logger.info('图片打包到:%s' % (zipPath))
zip(imgPath,zipPath)
fsize = str(round(os.path.getsize(zipPath)/float(1024*1024),2))+'MB'
logger.info('图包大小:%s'%(fsize))
tip='%d张图片中筛选出收藏数前%s的图片,处理耗时:%s'%(length,param[3],datetime.now()-start)
logger.info(tip)
return [nodes,tip,fsize]
if __name__ == "__main__":
if (len(sys.argv))==5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
try:
get_nodes(sys.argv)
except BaseException as e:
repr(e)
else :
logger.error('参数不合法')