多线程爬虫

master
10295 7 years ago
parent 7312fa2315
commit d210153bdb
  1. 47
      PixivSearch/dao/bangumi.py
  2. 2
      PixivSearch/logging.conf
  3. 21
      PixivSearch/pixiv.py
  4. 7
      PixivSearch/settings.py
  5. 4
      PixivSearch/view.py

@ -1,5 +1,6 @@
import _thread
import json
import math
import os
import random
import time
@ -31,16 +32,16 @@ def check():
time.sleep(1)
def save(md):
if isStop:
return
time.sleep(random.randint(1, 3))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
req = requests.get(url)
url = "https://www.bilibili.com/bangumi/media/md%d" % md
try:
req = requests.get(url, timeout=10)
except BaseException as e:
logger.error(e)
save(md)
logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
@ -60,17 +61,45 @@ def save(md):
logger.error("发生异常")
logger.error(e)
# asdasd
def get_():
    """Return the crawler's current media-info snapshot (module global)."""
    # Reading a module global needs no `global` declaration; the original's
    # `global current_mediaInfo` was a no-op for a pure read.
    return current_mediaInfo
def threadSave(start, end):
page_size = 100
pages = None
ids = None
def getIds():
    """Populate the module-global ``ids`` list with bilibili season_ids.

    Pages through the ``season/index_global`` web API, appending every
    ``season_id`` to the global ``ids``. The global ``pages`` caches the
    total page count, derived from the first response's ``count`` field.

    NOTE(review): the rebuild condition ``len(ids) != 0`` looks inverted —
    it skips fetching only when ``ids`` is already an empty list; confirm
    whether ``== 0`` (fetch when empty) was intended.
    """
    global ids
    global pages
    if ids is None or len(ids) != 0:
        ids = []
        page = 1
        while pages is None or page <= pages:
            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
            logger.info(url)
            try:
                req = requests.get(url, timeout=10)
                json_obj = json.loads(req.text)
                bangumiList = json_obj['result']['list']
                for bangumi in bangumiList:
                    ids.append(int(bangumi['season_id']))
                if pages is None:
                    # ceil(count / page_size) — true division keeps ceil meaningful
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
            except BaseException as e:
                logger.error('连接超时')
                # Fix: `logger(e)` — a Logger instance is not callable and
                # raised TypeError inside the error path; use logger.error.
                logger.error(e)
                # NOTE(review): retrying the same page forever on persistent
                # failure never terminates — consider a bounded retry count.
                continue
            page = page + 1
    # Removed dead diff leftover: `for id in range(start, end): ids.append(id)`
    # — `start`/`end` no longer exist anywhere (threadSave() now takes no
    # arguments), so that loop could only raise NameError.
def threadSave():
getIds()
logger.info(len(ids))
try:
global executors
executors = futures.ThreadPoolExecutor(32)

@ -29,5 +29,5 @@ formatter=fmt
args=('pixiv.log','a','utf-8',False)
[formatter_fmt]
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s:line:%(lineno)d - %(message)s
datefmt=%Y-%m-%d %H:%M:%S

@ -9,9 +9,9 @@ import sys
from datetime import datetime
import os
import zipfile
import logging.config
from PixivSearch.model import config
from PixivSearch.settings import logger
headers = {
'X-Requested-With': 'XMLHttpRequest',
@ -21,8 +21,7 @@ headers = {
lock = threading.Lock() # 多线程全局资源锁
total = 1
logging.config.fileConfig('PixivSearch/logging.conf')
logger = logging.getLogger('file')
def get_cookies():
_cookies = {}
@ -33,8 +32,6 @@ def get_cookies():
return _cookies
def crawl(url):
global total
req = requests.get(url, headers=headers, cookies=get_cookies()).text
@ -44,10 +41,12 @@ def crawl(url):
with lock:
nodes.append(imageNode)
def get_urls(search, page):
    """Build the Pixiv search-result URLs for *search*, one per page.

    Returns URLs for pages 1 through ``page - 1`` (``range`` excludes the
    upper bound — presumably ``page`` is an exclusive count; confirm).
    """
    fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
    urls = []
    for page_no in range(1, page):
        urls.append(fmt.format(search, page_no))
    return urls
def get_Img(params):
params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15]
@ -64,13 +63,17 @@ def get_Img(params):
logger.error('无法下载图片:%s' % (params[1]['illustTitle']))
return
logger.info('下载图片:"%s"%s'%(params[1]['illustTitle'],os.getcwd().replace('\\','/')+'/'+imgPath+params[1]['illustId']+suffix))
logger.info('下载图片:"%s"%s' % (
params[1]['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + params[1]['illustId'] + suffix))
f = open(imgPath + params[1]['illustId'] + suffix, 'wb') # 写入多媒体文件要 b 这个参数
f.write(s.content) # 多媒体文件要是用conctent
f.close()
params[1]['localName'] = params[1]['illustId'] + suffix
logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s'%(params[0],params[1]['bookmarkCount'],params[1]['illustTitle'], ','.join(params[1]['tags']),'',params[1]['imgUrl']))
logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
params[0], params[1]['bookmarkCount'], params[1]['illustTitle'], ','.join(params[1]['tags']), '',
params[1]['imgUrl']))
def zip(inputFile, outFile):
f = zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED)
@ -79,7 +82,10 @@ def zip(inputFile,outFile):
f.write(os.path.join(dirpath, filename), filename)
f.close()
fsize = ''
def get_nodes(param):
global nodes, fsize, imgPath
nodes = []
@ -122,7 +128,6 @@ def get_nodes(param):
return [nodes, tip, fsize]
if __name__ == "__main__":
if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
try:

@ -80,7 +80,8 @@ DATABASES = {
'NAME': 'bangumi',
'USER': 'bilibili',
'PASSWORD': '2233',
'HOST': '127.0.0.1',
# 'HOST': '127.0.0.1',
'HOST': 'mikuhime.xyz',
'PORT': '3306',
}
}
@ -127,7 +128,7 @@ STATICFILES_DIRS = [
os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'),
]
curr_dir = os.path.dirname(os.path.realpath(__file__))
logging.config.fileConfig('%s%slogging.conf' % (curr_dir,os.sep))
configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
logging.config.fileConfig(configPath)
logger = logging.getLogger('file')

@ -89,9 +89,7 @@ def get(request):
# 测试方法
def start(request):
    """Kick off the background bangumi crawl and respond immediately.

    The ``start``/``end`` query parameters are still parsed (missing or
    non-numeric values raise exactly as before), but the crawler's
    ``threadSave()`` now takes no arguments and discovers ids itself.
    """
    begin = int(request.GET.get('start'))
    end = int(request.GET.get('end'))
    # Fix: the stale duplicate call `_thread.start_new_thread(threadSave,
    # (begin, end))` is a diff artifact and conflicts with the new zero-arg
    # threadSave() signature; only the no-arg launch is kept.
    _thread.start_new_thread(threadSave, ())
    return HttpResponse("start success")

Loading…
Cancel
Save