You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pixiv/PixivSearch/pixiv.py

140 lines
4.7 KiB

7 years ago
#!/usr/bin/env python
7 years ago
# coding:utf-8
7 years ago
from concurrent import futures
import threading
import json
import requests
from bs4 import BeautifulSoup
import sys
from datetime import datetime
import os
import zipfile
7 years ago
from PixivSearch.model import config
7 years ago
from PixivSearch.settings import logger
7 years ago
7 years ago
# Default HTTP headers shared by the requests issued from this module:
# flag the call as an XMLHttpRequest and present a desktop Chrome user agent.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/56.0.2924.87 Safari/537.36'
)
headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': _USER_AGENT,
}
7 years ago
lock = threading.Lock()  # global lock for resources shared between worker threads
7 years ago
# NOTE(review): no function visible in this file reads or updates this value;
# confirm it is still needed before relying on it.
total = 1
7 years ago
7 years ago
def get_cookies():
    """Build the cookie dict for pixiv requests from the stored cookie string.

    The raw value is read from the ``pixiv_cookie`` row of ``config.param``
    and is split on ``;`` into ``name=value`` segments.

    Returns:
        dict: cookie name -> cookie value. Empty segments (e.g. produced by
        a trailing ``;``) and segments without ``=`` are skipped instead of
        raising ``ValueError``.
    """
    raw = config.param.objects.get(param_name='pixiv_cookie').param_value
    _cookies = {}
    for row in raw.split(';'):
        # partition() never raises; `sep` is empty when the segment has no '='.
        k, sep, v = row.strip().partition('=')
        if not sep:
            continue
        _cookies[k] = v
    return _cookies
def crawl(url):
    """Fetch one search-result page and add its illustrations to ``nodes``.

    The page embeds its result list as JSON in the ``data-items`` attribute
    of the ``#js-mount-point-search-result-list`` element.

    Args:
        url: search-result page URL to download.

    Returns:
        None. Parsed items are appended to the module-level ``nodes`` list
        while holding ``lock``. When the element or attribute is missing the
        page is logged and skipped.
    """
    html = requests.get(url, headers=headers, cookies=get_cookies()).text
    mount_points = BeautifulSoup(html, 'lxml').select('#js-mount-point-search-result-list')
    items = mount_points[0].attrs.get('data-items') if mount_points else None
    if items is None:
        # This function runs through executor.map() whose results are never
        # read, so an IndexError/KeyError here would vanish without a trace.
        logger.error('无法解析搜索结果页:%s' % url)
        return

    imageNodes = json.loads(items)
    with lock:
        # One lock acquisition per page instead of one per item.
        nodes.extend(imageNodes)
7 years ago
7 years ago
def get_urls(search, page):
    """Return the pixiv search URLs for result pages ``1 .. page - 1``.

    Args:
        search: search keyword. It is percent-encoded so that spaces, ``&``,
            ``#`` and non-ASCII characters cannot corrupt the query string.
        page: exclusive upper bound of the page numbers (``get_nodes`` passes
            the requested page count plus one).

    Returns:
        list[str]: one URL per page, using the ``order=date_d`` ordering.
        Empty when ``page <= 1``.
    """
    # Function-scope import so the file's top-level import block is untouched.
    from urllib.parse import quote

    fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
    word = quote(str(search), safe='')
    return [fmt.format(word, p) for p in range(1, page)]
7 years ago
7 years ago
def get_Img(params):
    """Download the original image of one illustration into ``imgPath``.

    The ``.jpg`` original is tried first; on a 404 the ``.png`` variant is
    tried, and when that is a 404 as well an error is logged and nothing is
    written.

    Args:
        params: ``(rank, node)`` pair. ``node`` is the illustration dict and
            must provide ``url``, ``illustId``, ``illustTitle``,
            ``bookmarkCount`` and ``tags``. ``node['imgUrl']`` is always
            set; ``node['localName']`` is set only after a successful
            download.

    Returns:
        None.
    """
    rank, node = params[0], params[1]

    # NOTE(review): the slice presumably extracts the date/id path segment
    # from the thumbnail URL and relies on its fixed width; confirm against
    # current pixiv URLs.
    node['imgUrl'] = 'https://i.pximg.net/img-original/img/' + node['url'][-46:-15]

    # Work on a per-call copy: this function runs concurrently in a thread
    # pool, and writing the referer into the shared module-level dict lets
    # one worker overwrite another worker's referer before its request is sent.
    request_headers = dict(headers)
    request_headers['referer'] = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + node['illustId']
    cookies = get_cookies()  # looked up once and reused for the fallback request

    suffix = ".jpg"
    logger.info('开始下载图片:%s%s' % (node['imgUrl'], suffix))
    s = requests.get(node['imgUrl'] + suffix, headers=request_headers, cookies=cookies)
    if s.status_code == 404:
        suffix = '.png'
        s = requests.get(node['imgUrl'] + suffix, headers=request_headers, cookies=cookies)
        if s.status_code == 404:
            logger.error('无法下载图片:%s' % (node['illustTitle']))
            return

    local_name = node['illustId'] + suffix
    logger.info('下载图片:"%s"%s' % (
        node['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + local_name))
    # Binary mode for image bytes; `with` closes the file even if write() fails.
    with open(imgPath + local_name, 'wb') as f:
        f.write(s.content)

    node['localName'] = local_name
    logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
        rank, node['bookmarkCount'], node['illustTitle'], ','.join(node['tags']), '',
        node['imgUrl']))
7 years ago
7 years ago
def zip(inputFile, outFile):
    """Pack every file found under ``inputFile`` into a deflate-compressed zip.

    Args:
        inputFile: directory that is walked recursively.
        outFile: path of the archive to create (opened in ``'w'`` mode).

    Files are stored under their bare file name, so the directory layout is
    flattened inside the archive.

    NOTE: this function shadows the builtin ``zip`` inside this module; the
    name is kept because existing callers use it.
    """
    # The context manager closes the archive even when a write raises.
    with zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _dirnames, filenames in os.walk(inputFile):
            for filename in filenames:
                archive.write(os.path.join(dirpath, filename), filename)
7 years ago
# Size of the most recent image archive as text (a number followed by 'MB');
# assigned by get_nodes() after zipping and '' until that has happened.
fsize = ''
7 years ago
def get_nodes(param):
    """Search pixiv, rank the results by bookmark count, optionally download.

    Args:
        param: argv-style sequence of five items:
            ``param[1]`` search keyword,
            ``param[2]`` number of result pages to crawl,
            ``param[3]`` how many top-ranked illustrations to keep,
            ``param[4]`` ``'img'`` to download the kept images and zip them.

    Returns:
        list: ``[nodes, tip, fsize]`` where ``nodes`` is the kept
        illustrations sorted by ``bookmarkCount`` descending, ``tip`` is the
        summary message that was logged, and ``fsize`` is the archive size
        text (only refreshed when ``param[4] == 'img'``).

    Side effects: rebinds the module globals ``nodes``, ``fsize`` and
    ``imgPath``; in ``'img'`` mode it clears the image directory, downloads
    the images and writes ``PixivSearch/static/download/<keyword>.zip``.
    """
    global nodes, fsize, imgPath
    nodes = []
    start = datetime.now()

    urls = get_urls(param[1], int(param[2]) + 1)
    logger.info('开始从P站获取图片数据')
    with futures.ThreadPoolExecutor(32) as executor:
        # NOTE(review): the map() results are never consumed, so an exception
        # raised inside crawl() is dropped and that page is simply missing.
        executor.map(crawl, urls)

    length = len(nodes)
    logger.info('获取到%d张图片' % (length))

    logger.info('对图片收藏数进行排序')
    # Sort by bookmark count, descending. `or 0` keeps an item without a
    # bookmarkCount from raising TypeError (None vs int) on Python 3.
    nodes = sorted(nodes, key=lambda v: v.get('bookmarkCount') or 0, reverse=True)[:int(param[3])]

    if param[4] == 'img':
        imgPath = 'PixivSearch/static/images/'
        # Clear images left over from the previous search.
        for name in os.listdir(imgPath):
            stale = imgPath + name
            if os.path.isdir(stale):
                # os.remove() raises on a directory; leave it in place.
                continue
            os.remove(stale)

        start_d = datetime.now()
        # (rank, node) pairs, rank starting at 1.
        nodes_tup = list(enumerate(nodes, 1))
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(get_Img, nodes_tup)
        print('下载图片花费时间:%s' % (datetime.now() - start_d))
        logger.info('%s张图片下载完毕' % (len(os.listdir(imgPath))))

        zipPath = 'PixivSearch/static/download/' + param[1] + '.zip'
        logger.info('图片打包到:%s' % (zipPath))
        zip(imgPath, zipPath)
        fsize = str(round(os.path.getsize(zipPath) / float(1024 * 1024), 2)) + 'MB'
        logger.info('图包大小:%s' % (fsize))

    tip = '%d张图片中筛选出收藏数前%s的图片,处理耗时:%s' % (length, param[3], datetime.now() - start)
    logger.info(tip)
    return [nodes, tip, fsize]
7 years ago
if __name__ == "__main__":
    # Expected command line: <script> <keyword> <pages> <top_n> <mode>
    if len(sys.argv) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
        try:
            get_nodes(sys.argv)
        except Exception as e:
            # Report the failure instead of discarding it. Catching Exception
            # (not BaseException) lets Ctrl-C and SystemExit propagate.
            logger.error(repr(e))
    else:
        logger.error('参数不合法')