Multi-threaded crawler

master
10295 committed 7 years ago
parent 7312fa2315
commit d210153bdb
  1. PixivSearch/dao/bangumi.py (53 changed lines)
  2. PixivSearch/logging.conf (2 changed lines)
  3. PixivSearch/pixiv.py (105 changed lines)
  4. PixivSearch/settings.py (7 changed lines)
  5. PixivSearch/view.py (4 changed lines)

PixivSearch/dao/bangumi.py

@@ -1,5 +1,6 @@
 import _thread
 import json
+import math
 import os
 import random
 import time
@@ -31,17 +32,17 @@ def check():
         time.sleep(1)


 def save(md):
     if isStop:
         return
-    time.sleep(random.randint(1, 3))
-    url = "https://www.bilibili.com/bangumi/media/md%d" % md
-    req = requests.get(url)
-    logger.info("request_url=%s,status_code=%d" % (url,req.status_code))
+    url = "https://www.bilibili.com/bangumi/media/md%d" % md
+    try:
+        req = requests.get(url, timeout=10)
+    except BaseException as e:
+        logger.error(e)
+        save(md)
+    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
     if (req.status_code == 200):
         tag = BeautifulSoup(req.text, 'lxml')
         script = tag.select("script")[3].text
@@ -60,17 +61,45 @@ def save(md):
         logger.error("发生异常")
         logger.error(e)


-#asdasd
+# asdasd
 def get_():
     global current_mediaInfo
     return current_mediaInfo


-def threadSave(start, end):
-    ids = []
-    for id in range(start, end):
-        ids.append(id)
+page_size = 100
+pages = None
+ids = None
+
+
+def getIds():
+    global ids
+    global pages
+    if ids is None or len(ids) != 0:
+        ids = []
+        page = 1
+        while pages is None or page <= pages:
+            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
+            logger.info(url)
+            try:
+                req = requests.get(url, timeout=10)
+                json_obj = json.loads(req.text)
+                bangumiList = json_obj['result']['list']
+                for bangumi in bangumiList:
+                    ids.append(int(bangumi['season_id']))
+                if pages is None:
+                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
+            except BaseException as e:
+                logger.error('连接超时')
+                logger(e)
+                continue
+            page = page + 1
+
+
+def threadSave():
+    getIds()
     logger.info(len(ids))
     try:
         global executors
         executors = futures.ThreadPoolExecutor(32)
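For orientation, below is a minimal, self-contained sketch of the pattern the new bangumi.py code follows: page through the season index API to collect every season_id, then fan the per-id work out over a 32-worker thread pool. fetch_ids, save_one and thread_save are illustrative names, not the functions in the repo, and the error handling is simplified.

    import math
    from concurrent import futures

    import requests

    PAGE_SIZE = 100
    INDEX_URL = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d'


    def fetch_ids():
        # Walk the paged index until the page count reported by the API is exhausted.
        ids, page, pages = [], 1, None
        while pages is None or page <= pages:
            try:
                result = requests.get(INDEX_URL % (page, PAGE_SIZE), timeout=10).json()['result']
            except (requests.RequestException, ValueError, KeyError):
                continue  # retry the same page, as the commit does on timeouts
            ids.extend(int(item['season_id']) for item in result['list'])
            if pages is None:
                pages = math.ceil(int(result['count']) / PAGE_SIZE)
            page += 1
        return ids


    def save_one(season_id):
        # Stand-in for save(md) in the diff: fetch and persist one media page.
        print('would crawl season', season_id)


    def thread_save():
        season_ids = fetch_ids()
        with futures.ThreadPoolExecutor(32) as executor:
            executor.map(save_one, season_ids)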

PixivSearch/logging.conf

@@ -29,5 +29,5 @@ formatter=fmt
 args=('pixiv.log','a','utf-8',False)

 [formatter_fmt]
-format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
+format=%(asctime)s - %(name)s - %(levelname)s - %(module)s:line:%(lineno)d - %(message)s
 datefmt=%Y-%m-%d %H:%M:%S
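The only functional change here is adding %(lineno)d, so each record now carries the line number it was logged from. An illustrative check, mirroring how settings.py loads this file; the example output line is approximate:

    import logging.config

    logging.config.fileConfig('PixivSearch/logging.conf')
    logger = logging.getLogger('file')
    logger.info('hello')
    # roughly: 2018-03-28 12:00:00 - file - INFO - example:line:5 - hello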

@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
#coding:utf-8 # coding:utf-8
from concurrent import futures from concurrent import futures
import threading import threading
import json import json
@@ -9,9 +9,9 @@ import sys
 from datetime import datetime
 import os
 import zipfile
-import logging.config
 from PixivSearch.model import config
+from PixivSearch.settings import logger

 headers = {
     'X-Requested-With': 'XMLHttpRequest',
@@ -19,10 +19,9 @@ headers = {
                   'Chrome/56.0.2924.87 Safari/537.36'
 }

 lock = threading.Lock()  # 多线程全局资源锁
 total = 1
-logging.config.fileConfig('PixivSearch/logging.conf')
-logger = logging.getLogger('file')


 def get_cookies():
     _cookies = {}
@@ -33,58 +32,65 @@ def get_cookies():
     return _cookies


 def crawl(url):
     global total
     req = requests.get(url, headers=headers, cookies=get_cookies()).text
-    tag=BeautifulSoup(req, 'lxml').select('#js-mount-point-search-result-list')[0].attrs['data-items']
-    imageNodes=json.loads(tag)
+    tag = BeautifulSoup(req, 'lxml').select('#js-mount-point-search-result-list')[0].attrs['data-items']
+    imageNodes = json.loads(tag)
     for imageNode in imageNodes:
         with lock:
             nodes.append(imageNode)


 def get_urls(search, page):
     fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
     return [fmt.format(search, p) for p in range(1, page)]


 def get_Img(params):
-    params[1]['imgUrl']='https://i.pximg.net/img-original/img/'+params[1]['url'][-46:-15]
-    headers['referer']='https://www.pixiv.net/member_illust.php?mode=medium&illust_id='+params[1]['illustId']
-    suffix=".jpg"
+    params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15]
+    headers['referer'] = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + params[1]['illustId']
+    suffix = ".jpg"
     logger.info('开始下载图片:%s%s' % (params[1]['imgUrl'], suffix))
-    s=requests.get(params[1]['imgUrl']+suffix, headers=headers, cookies=get_cookies())
-    if(s.status_code==404):
-        suffix='.png'
-        s=requests.get(params[1]['imgUrl']+suffix,headers=headers, cookies=get_cookies())
-        if(s.status_code==404):
+    s = requests.get(params[1]['imgUrl'] + suffix, headers=headers, cookies=get_cookies())
+    if (s.status_code == 404):
+        suffix = '.png'
+        s = requests.get(params[1]['imgUrl'] + suffix, headers=headers, cookies=get_cookies())
+        if (s.status_code == 404):
             logger.error('无法下载图片:%s' % (params[1]['illustTitle']))
             return
-    logger.info('下载图片:"%s"%s'%(params[1]['illustTitle'],os.getcwd().replace('\\','/')+'/'+imgPath+params[1]['illustId']+suffix))
-    f = open(imgPath+params[1]['illustId']+suffix, 'wb') #写入多媒体文件要 b 这个参数
-    f.write(s.content) #多媒体文件要是用conctent
+    logger.info('下载图片:"%s"%s' % (
+        params[1]['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + params[1]['illustId'] + suffix))
+    f = open(imgPath + params[1]['illustId'] + suffix, 'wb')  # 写入多媒体文件要 b 这个参数
+    f.write(s.content)  # 多媒体文件要是用conctent
     f.close()
-    params[1]['localName']=params[1]['illustId']+suffix
-    logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s'%(params[0],params[1]['bookmarkCount'],params[1]['illustTitle'], ','.join(params[1]['tags']),'',params[1]['imgUrl']))
+    params[1]['localName'] = params[1]['illustId'] + suffix
+    logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
+        params[0], params[1]['bookmarkCount'], params[1]['illustTitle'], ','.join(params[1]['tags']), '',
+        params[1]['imgUrl']))


-def zip(inputFile,outFile):
-    f = zipfile.ZipFile(outFile,'w',zipfile.ZIP_DEFLATED)
+def zip(inputFile, outFile):
+    f = zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED)
     for dirpath, dirnames, filenames in os.walk(inputFile):
         for filename in filenames:
-            f.write(os.path.join(dirpath,filename),filename)
+            f.write(os.path.join(dirpath, filename), filename)
     f.close()


-fsize=''
+fsize = ''


 def get_nodes(param):
-    global nodes,fsize,imgPath
-    nodes=[]
+    global nodes, fsize, imgPath
+    nodes = []
     start = datetime.now()
-    urls = get_urls(param[1], int(param[2])+1)
+    urls = get_urls(param[1], int(param[2]) + 1)
     logger.info('开始从P站获取图片数据')
     with futures.ThreadPoolExecutor(32) as executor:
         executor.map(crawl, urls)
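As a side note, the download logic in get_Img reduces to a try-.jpg-then-.png fallback, since Pixiv originals carry no fixed extension. A simplified sketch of just that pattern (the referer header and URL slicing from the real code are omitted; the function name is illustrative):

    import requests


    def download_original(base_url, dest, headers=None, cookies=None):
        # Try .jpg first and fall back to .png when the CDN answers 404.
        for suffix in ('.jpg', '.png'):
            resp = requests.get(base_url + suffix, headers=headers, cookies=cookies)
            if resp.status_code != 404:
                with open(dest + suffix, 'wb') as f:
                    f.write(resp.content)  # binary content, hence 'wb'
                return dest + suffix
        return None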
@@ -92,43 +98,42 @@ def get_nodes(param):
     # for url in urls:
     #     crawl(url)
-    length=len(nodes)
-    logger.info('获取到%d张图片'%(length))
+    length = len(nodes)
+    logger.info('获取到%d张图片' % (length))
     logger.info('对图片收藏数进行排序')
-    nodes=sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])]  # 按star数降序排序
-    if(param[4]!=None and param[4]=='img'):
-        imgPath='PixivSearch/static/images/'
+    nodes = sorted(nodes, key=lambda v: v.get('bookmarkCount'), reverse=True)[:int(param[3])]  # 按star数降序排序
+    if (param[4] != None and param[4] == 'img'):
+        imgPath = 'PixivSearch/static/images/'
         for file in os.listdir(imgPath):
-            os.remove(imgPath+file)
-        nodes_tup=[]
-        start_d=datetime.now()
-        for index,img in enumerate(nodes):
-            nodes_tup.append((index+1,img))
+            os.remove(imgPath + file)
+        nodes_tup = []
+        start_d = datetime.now()
+        for index, img in enumerate(nodes):
+            nodes_tup.append((index + 1, img))
             # get_Img((index+1,img))
         with futures.ThreadPoolExecutor(32) as executor:
             executor.map(get_Img, nodes_tup)
         print('下载图片花费时间:%s' % (datetime.now() - start_d))
-        logger.info('%s张图片下载完毕'%(len(os.listdir(imgPath))))
-        zipPath='PixivSearch/static/download/' + param[1] + '.zip'
+        logger.info('%s张图片下载完毕' % (len(os.listdir(imgPath))))
+        zipPath = 'PixivSearch/static/download/' + param[1] + '.zip'
         logger.info('图片打包到:%s' % (zipPath))
-        zip(imgPath,zipPath)
-        fsize = str(round(os.path.getsize(zipPath)/float(1024*1024),2))+'MB'
-        logger.info('图包大小:%s'%(fsize))
-    tip='%d张图片中筛选出收藏数前%s的图片,处理耗时:%s'%(length,param[3],datetime.now()-start)
+        zip(imgPath, zipPath)
+        fsize = str(round(os.path.getsize(zipPath) / float(1024 * 1024), 2)) + 'MB'
+        logger.info('图包大小:%s' % (fsize))
+    tip = '%d张图片中筛选出收藏数前%s的图片,处理耗时:%s' % (length, param[3], datetime.now() - start)
     logger.info(tip)
-    return [nodes,tip,fsize]
+    return [nodes, tip, fsize]


 if __name__ == "__main__":
-    if (len(sys.argv))==5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
+    if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
         try:
             get_nodes(sys.argv)
         except BaseException as e:
             repr(e)
-    else :
+    else:
         logger.error('参数不合法')

PixivSearch/settings.py

@@ -80,7 +80,8 @@ DATABASES = {
         'NAME': 'bangumi',
         'USER': 'bilibili',
         'PASSWORD': '2233',
-        'HOST': '127.0.0.1',
+        # 'HOST': '127.0.0.1',
+        'HOST': 'mikuhime.xyz',
         'PORT': '3306',
     }
 }

@@ -127,7 +128,7 @@ STATICFILES_DIRS = [
     os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'),
 ]

-curr_dir = os.path.dirname(os.path.realpath(__file__))
-logging.config.fileConfig('%s%slogging.conf' % (curr_dir,os.sep))
+configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
+logging.config.fileConfig(configPath)
 logger = logging.getLogger('file')
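The net effect of the settings.py and pixiv.py changes is one shared logger: settings.py calls fileConfig once at import time, and every other module simply imports the already-configured logger instead of configuring logging itself. A minimal sketch of that pattern, with module names taken from the diff and the log message purely illustrative:

    # PixivSearch/settings.py (excerpt): configure logging exactly once
    import logging.config
    import os

    configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
    logging.config.fileConfig(configPath)
    logger = logging.getLogger('file')

    # any other module, e.g. PixivSearch/pixiv.py
    from PixivSearch.settings import logger

    logger.info('uses the handlers and format defined in logging.conf')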

PixivSearch/view.py

@@ -89,9 +89,7 @@ def get(request):
 # 测试方法
 def start(request):
-    begin = int(request.GET.get('start'))
-    end = int(request.GET.get('end'))
-    _thread.start_new_thread(threadSave, (begin, end))
+    _thread.start_new_thread(threadSave, ())
     return HttpResponse("start success")
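For comparison, the same fire-and-forget view written with the higher-level threading API; this is a sketch only, and the import path for threadSave is assumed from the changed-file list above:

    import threading

    from django.http import HttpResponse

    from PixivSearch.dao.bangumi import threadSave  # assumed import path


    def start(request):
        # Spawn the crawl in the background and return immediately;
        # equivalent to _thread.start_new_thread(threadSave, ()).
        threading.Thread(target=threadSave, daemon=True).start()
        return HttpResponse("start success")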
