多线程爬虫

master
10295 7 years ago
parent 7312fa2315
commit d210153bdb
  1. 53
      PixivSearch/dao/bangumi.py
  2. 2
      PixivSearch/logging.conf
  3. 107
      PixivSearch/pixiv.py
  4. 7
      PixivSearch/settings.py
  5. 4
      PixivSearch/view.py

@ -1,5 +1,6 @@
import _thread
import json
import math
import os
import random
import time
@ -31,17 +32,17 @@ def check():
time.sleep(1)
def save(md):
if isStop:
return
time.sleep(random.randint(1, 3))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
req = requests.get(url)
logger.info("request_url=%s,status_code=%d" % (url,req.status_code))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
try:
req = requests.get(url, timeout=10)
except BaseException as e:
logger.error(e)
save(md)
logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
script = tag.select("script")[3].text
@ -60,17 +61,45 @@ def save(md):
logger.error("发生异常")
logger.error(e)
#asdasd
# asdasd
def get_():
    """Return the media info most recently parsed by save() (module-level state)."""
    # A plain read of a module global needs no `global` declaration.
    return current_mediaInfo
def threadSave(start, end):
ids = []
page_size = 100
pages = None
ids = None
def getIds():
    """Fill the module-level `ids` list with every season_id from the
    bilibili global season index.

    Pages through the index API; `pages` is derived from the first
    response's total `count` and cached at module level.
    """
    global ids
    global pages
    # NOTE(review): this guard rebuilds whenever ids is already non-empty and
    # skips only when ids == [] — possibly meant `len(ids) == 0`; confirm intent.
    if ids is None or len(ids) != 0:
        ids = []
        page = 1
        while pages is None or page <= pages:
            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
            logger.info(url)
            try:
                req = requests.get(url, timeout=10)
                json_obj = json.loads(req.text)
                bangumiList = json_obj['result']['list']
                for bangumi in bangumiList:
                    ids.append(int(bangumi['season_id']))
                if pages is None:
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
            except Exception as e:
                # BUG FIX: was `except BaseException` (swallowed KeyboardInterrupt
                # inside an unbounded retry loop) and `logger(e)`, which calls the
                # Logger object itself and raises TypeError inside the handler.
                logger.error('连接超时')
                logger.error(e)
                continue  # retry the same page (no backoff; may spin if the host stays down)
            page = page + 1
for id in range(start, end):
ids.append(id)
def threadSave():
getIds()
logger.info(len(ids))
try:
global executors
executors = futures.ThreadPoolExecutor(32)

@ -29,5 +29,5 @@ formatter=fmt
args=('pixiv.log','a','utf-8',False)
[formatter_fmt]
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s:line:%(lineno)d - %(message)s
datefmt=%Y-%m-%d %H:%M:%S

@ -1,5 +1,5 @@
#!/usr/bin/env python
#coding:utf-8
# coding:utf-8
from concurrent import futures
import threading
import json
@ -9,9 +9,9 @@ import sys
from datetime import datetime
import os
import zipfile
import logging.config
from PixivSearch.model import config
from PixivSearch.settings import logger
headers = {
'X-Requested-With': 'XMLHttpRequest',
@ -19,10 +19,9 @@ headers = {
'Chrome/56.0.2924.87 Safari/537.36'
}
lock = threading.Lock() # 多线程全局资源锁
lock = threading.Lock() # 多线程全局资源锁
total = 1
logging.config.fileConfig('PixivSearch/logging.conf')
logger = logging.getLogger('file')
def get_cookies():
_cookies = {}
@ -33,58 +32,65 @@ def get_cookies():
return _cookies
def crawl(url):
    """Fetch one pixiv search-result page and append its image nodes to the
    shared `nodes` list.

    The rendered diff interleaved the pre/post-commit bodies; this is the
    single post-commit version (spaced assignments), deduplicated.
    """
    global total
    req = requests.get(url, headers=headers, cookies=get_cookies()).text
    # Search results are embedded as JSON in the `data-items` attribute.
    tag = BeautifulSoup(req, 'lxml').select('#js-mount-point-search-result-list')[0].attrs['data-items']
    imageNodes = json.loads(tag)
    for imageNode in imageNodes:
        with lock:  # `nodes` is shared across the crawler thread pool
            nodes.append(imageNode)
def get_urls(search, page):
    """Build the pixiv search URLs for pages 1 .. page-1 (upper bound exclusive)."""
    fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
    urls = []
    for page_no in range(1, page):
        urls.append(fmt.format(search, page_no))
    return urls
def get_Img(params):
    """Download one ranked image.

    params is a (rank, imageNode) tuple; imageNode is the dict crawl()
    collected. Tries .jpg first, falls back to .png, gives up on double 404.
    Writes the file under the module-level `imgPath` and records the saved
    filename in imageNode['localName'].

    (The rendered diff interleaved pre/post-commit bodies; this is the
    deduplicated post-commit version.)
    """
    # assumes imageNode['url'] embeds the original-image path at [-46:-15] — TODO confirm
    params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15]
    # pixiv requires a matching referer or the image host returns 403
    headers['referer'] = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + params[1]['illustId']
    suffix = ".jpg"
    logger.info('开始下载图片:%s%s' % (params[1]['imgUrl'], suffix))
    s = requests.get(params[1]['imgUrl'] + suffix, headers=headers, cookies=get_cookies())
    if (s.status_code == 404):
        suffix = '.png'
        s = requests.get(params[1]['imgUrl'] + suffix, headers=headers, cookies=get_cookies())
        if (s.status_code == 404):
            logger.error('无法下载图片:%s' % (params[1]['illustTitle']))
            return
    logger.info('下载图片:"%s"%s' % (
        params[1]['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + params[1]['illustId'] + suffix))
    # BUG FIX: the handle was opened and never closed via finally; use `with`
    # so the file is closed even if the write raises. 写入多媒体文件要 b 这个参数
    with open(imgPath + params[1]['illustId'] + suffix, 'wb') as f:
        f.write(s.content)  # 多媒体文件要是用content
    params[1]['localName'] = params[1]['illustId'] + suffix
    logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
        params[0], params[1]['bookmarkCount'], params[1]['illustTitle'], ','.join(params[1]['tags']), '',
        params[1]['imgUrl']))
def zip(inputFile, outFile):
    """Deflate-compress every file found under inputFile into the archive
    outFile, flattening the directory tree (arcname is the bare filename).

    NOTE: shadows the builtin zip(); the name is kept for existing callers.
    """
    # BUG FIX: the archive was only closed on the happy path; the context
    # manager guarantees the zip central directory is written even on error.
    with zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED) as f:
        for dirpath, dirnames, filenames in os.walk(inputFile):
            for filename in filenames:
                f.write(os.path.join(dirpath, filename), filename)
fsize=''
fsize = ''
def get_nodes(param):
global nodes,fsize,imgPath
nodes=[]
global nodes, fsize, imgPath
nodes = []
start = datetime.now()
urls = get_urls(param[1], int(param[2])+1)
urls = get_urls(param[1], int(param[2]) + 1)
logger.info('开始从P站获取图片数据')
with futures.ThreadPoolExecutor(32) as executor:
executor.map(crawl, urls)
@ -92,43 +98,42 @@ def get_nodes(param):
# for url in urls:
# crawl(url)
length=len(nodes)
logger.info('获取到%d张图片'%(length))
length = len(nodes)
logger.info('获取到%d张图片' % (length))
logger.info('对图片收藏数进行排序')
nodes=sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])] # 按star数降序排序
if(param[4]!=None and param[4]=='img'):
imgPath='PixivSearch/static/images/'
nodes = sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])] # 按star数降序排序
if (param[4] != None and param[4] == 'img'):
imgPath = 'PixivSearch/static/images/'
for file in os.listdir(imgPath):
os.remove(imgPath+file)
nodes_tup=[]
start_d=datetime.now()
for index,img in enumerate(nodes):
nodes_tup.append((index+1,img))
os.remove(imgPath + file)
nodes_tup = []
start_d = datetime.now()
for index, img in enumerate(nodes):
nodes_tup.append((index + 1, img))
# get_Img((index+1,img))
with futures.ThreadPoolExecutor(32) as executor:
executor.map(get_Img, nodes_tup)
executor.map(get_Img, nodes_tup)
print('下载图片花费时间:%s' % (datetime.now() - start_d))
logger.info('%s张图片下载完毕'%(len(os.listdir(imgPath))))
logger.info('%s张图片下载完毕' % (len(os.listdir(imgPath))))
zipPath='PixivSearch/static/download/' + param[1] + '.zip'
zipPath = 'PixivSearch/static/download/' + param[1] + '.zip'
logger.info('图片打包到:%s' % (zipPath))
zip(imgPath,zipPath)
fsize = str(round(os.path.getsize(zipPath)/float(1024*1024),2))+'MB'
logger.info('图包大小:%s'%(fsize))
zip(imgPath, zipPath)
fsize = str(round(os.path.getsize(zipPath) / float(1024 * 1024), 2)) + 'MB'
logger.info('图包大小:%s' % (fsize))
tip='%d张图片中筛选出收藏数前%s的图片,处理耗时:%s'%(length,param[3],datetime.now()-start)
tip = '%d张图片中筛选出收藏数前%s的图片,处理耗时:%s' % (length, param[3], datetime.now() - start)
logger.info(tip)
return [nodes,tip,fsize]
return [nodes, tip, fsize]
if __name__ == "__main__":
    # CLI usage: pixiv.py <search-word> <page-count> <top-n> <mode>
    if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
        try:
            get_nodes(sys.argv)
        except BaseException as e:
            # BUG FIX: a bare `repr(e)` computed a string and threw it away,
            # silently swallowing every failure; log it instead.
            logger.error(repr(e))
    else:
        logger.error('参数不合法')

@ -80,7 +80,8 @@ DATABASES = {
'NAME': 'bangumi',
'USER': 'bilibili',
'PASSWORD': '2233',
'HOST': '127.0.0.1',
# 'HOST': '127.0.0.1',
'HOST': 'mikuhime.xyz',
'PORT': '3306',
}
}
@ -127,7 +128,7 @@ STATICFILES_DIRS = [
os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'),
]
curr_dir = os.path.dirname(os.path.realpath(__file__))
logging.config.fileConfig('%s%slogging.conf' % (curr_dir,os.sep))
configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
logging.config.fileConfig(configPath)
logger = logging.getLogger('file')

@ -89,9 +89,7 @@ def get(request):
# 测试方法
def start(request):
    """Django view: kick off the bangumi crawl on a background thread.

    (The rendered diff interleaved the old `(begin, end)` spawn with the new
    zero-arg one; this is the deduplicated post-commit version.)
    """
    # Retained for backward compatibility of the query API: still validates
    # that start/end are present and numeric, though threadSave() now
    # enumerates season ids itself and ignores them.
    begin = int(request.GET.get('start'))
    end = int(request.GET.get('end'))
    _thread.start_new_thread(threadSave, ())
    return HttpResponse("start success")

Loading…
Cancel
Save