多线程爬虫

master
10295 7 years ago
parent 7312fa2315
commit d210153bdb
  1. 47
      PixivSearch/dao/bangumi.py
  2. 2
      PixivSearch/logging.conf
  3. 21
      PixivSearch/pixiv.py
  4. 7
      PixivSearch/settings.py
  5. 4
      PixivSearch/view.py

@ -1,5 +1,6 @@
import _thread import _thread
import json import json
import math
import os import os
import random import random
import time import time
@ -31,16 +32,16 @@ def check():
time.sleep(1) time.sleep(1)
def save(md): def save(md):
if isStop: if isStop:
return return
time.sleep(random.randint(1, 3))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
req = requests.get(url)
url = "https://www.bilibili.com/bangumi/media/md%d" % md
try:
req = requests.get(url, timeout=10)
except BaseException as e:
logger.error(e)
save(md)
logger.info("request_url=%s,status_code=%d" % (url, req.status_code)) logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
if (req.status_code == 200): if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml') tag = BeautifulSoup(req.text, 'lxml')
@ -60,17 +61,45 @@ def save(md):
logger.error("发生异常") logger.error("发生异常")
logger.error(e) logger.error(e)
# asdasd # asdasd
def get_(): def get_():
global current_mediaInfo global current_mediaInfo
return current_mediaInfo return current_mediaInfo
def threadSave(start, end): page_size = 100
pages = None
ids = None
def getIds():
    """Fill the module-level ``ids`` list with season ids from the bilibili index API.

    Pages of ``page_size`` entries are requested from the ``index_global``
    endpoint, starting at page 1. Each entry's ``season_id`` is appended to
    ``ids`` as an int. The total page count is derived from the ``count``
    field of the first successful response and cached in the module-level
    ``pages``. A page that fails (network error, bad JSON, missing key) is
    logged and requested again.

    Returns:
        None. Results are published through the ``ids`` and ``pages`` globals.
    """
    global ids
    global pages
    # Start from an empty list whether ``ids`` is unset or holds older results.
    if ids is None or len(ids) != 0:
        ids = []
    # NOTE(review): the original indentation was not recoverable; the fetch
    # loop is assumed to run after (not inside) the reset above -- confirm.
    page = 1
    while pages is None or page <= pages:
        url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
        logger.info(url)
        try:
            req = requests.get(url, timeout=10)
            json_obj = json.loads(req.text)
            result = json_obj['result']
            # Parse the whole page before touching shared state, so a failure
            # part-way through cannot leave duplicate ids when the page is retried.
            page_ids = [int(bangumi['season_id']) for bangumi in result['list']]
            if pages is None:
                pages = int(math.ceil(int(result['count']) / page_size))
        except Exception as e:
            # Exception rather than BaseException: KeyboardInterrupt/SystemExit
            # must still be able to stop this retry loop.
            logger.error('连接超时')
            # Was ``logger(e)``: a Logger is not callable, so the handler itself
            # raised TypeError and aborted the crawl instead of retrying.
            logger.error(e)
            # NOTE(review): the same page is retried with no limit or delay.
            continue
        ids.extend(page_ids)
        page = page + 1
for id in range(start, end): def threadSave():
ids.append(id) getIds()
logger.info(len(ids))
try: try:
global executors global executors
executors = futures.ThreadPoolExecutor(32) executors = futures.ThreadPoolExecutor(32)

@ -29,5 +29,5 @@ formatter=fmt
args=('pixiv.log','a','utf-8',False) args=('pixiv.log','a','utf-8',False)
[formatter_fmt] [formatter_fmt]
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s format=%(asctime)s - %(name)s - %(levelname)s - %(module)s:line:%(lineno)d - %(message)s
datefmt=%Y-%m-%d %H:%M:%S datefmt=%Y-%m-%d %H:%M:%S

@ -9,9 +9,9 @@ import sys
from datetime import datetime from datetime import datetime
import os import os
import zipfile import zipfile
import logging.config
from PixivSearch.model import config from PixivSearch.model import config
from PixivSearch.settings import logger
headers = { headers = {
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
@ -21,8 +21,7 @@ headers = {
lock = threading.Lock() # 多线程全局资源锁 lock = threading.Lock() # 多线程全局资源锁
total = 1 total = 1
logging.config.fileConfig('PixivSearch/logging.conf')
logger = logging.getLogger('file')
def get_cookies(): def get_cookies():
_cookies = {} _cookies = {}
@ -33,8 +32,6 @@ def get_cookies():
return _cookies return _cookies
def crawl(url): def crawl(url):
global total global total
req = requests.get(url, headers=headers, cookies=get_cookies()).text req = requests.get(url, headers=headers, cookies=get_cookies()).text
@ -44,10 +41,12 @@ def crawl(url):
with lock: with lock:
nodes.append(imageNode) nodes.append(imageNode)
def get_urls(search, page): def get_urls(search, page):
fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}' fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
return [fmt.format(search, p) for p in range(1, page)] return [fmt.format(search, p) for p in range(1, page)]
def get_Img(params): def get_Img(params):
params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15] params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15]
@ -64,13 +63,17 @@ def get_Img(params):
logger.error('无法下载图片:%s' % (params[1]['illustTitle'])) logger.error('无法下载图片:%s' % (params[1]['illustTitle']))
return return
logger.info('下载图片:"%s"%s'%(params[1]['illustTitle'],os.getcwd().replace('\\','/')+'/'+imgPath+params[1]['illustId']+suffix)) logger.info('下载图片:"%s"%s' % (
params[1]['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + params[1]['illustId'] + suffix))
f = open(imgPath + params[1]['illustId'] + suffix, 'wb') # 写入多媒体文件要 b 这个参数 f = open(imgPath + params[1]['illustId'] + suffix, 'wb') # 写入多媒体文件要 b 这个参数
f.write(s.content) # 多媒体文件要是用conctent f.write(s.content) # 多媒体文件要是用conctent
f.close() f.close()
params[1]['localName'] = params[1]['illustId'] + suffix params[1]['localName'] = params[1]['illustId'] + suffix
logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s'%(params[0],params[1]['bookmarkCount'],params[1]['illustTitle'], ','.join(params[1]['tags']),'',params[1]['imgUrl'])) logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
params[0], params[1]['bookmarkCount'], params[1]['illustTitle'], ','.join(params[1]['tags']), '',
params[1]['imgUrl']))
def zip(inputFile, outFile): def zip(inputFile, outFile):
f = zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED) f = zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED)
@ -79,7 +82,10 @@ def zip(inputFile,outFile):
f.write(os.path.join(dirpath, filename), filename) f.write(os.path.join(dirpath, filename), filename)
f.close() f.close()
fsize = '' fsize = ''
def get_nodes(param): def get_nodes(param):
global nodes, fsize, imgPath global nodes, fsize, imgPath
nodes = [] nodes = []
@ -122,7 +128,6 @@ def get_nodes(param):
return [nodes, tip, fsize] return [nodes, tip, fsize]
if __name__ == "__main__": if __name__ == "__main__":
if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit(): if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
try: try:

@ -80,7 +80,8 @@ DATABASES = {
'NAME': 'bangumi', 'NAME': 'bangumi',
'USER': 'bilibili', 'USER': 'bilibili',
'PASSWORD': '2233', 'PASSWORD': '2233',
'HOST': '127.0.0.1', # 'HOST': '127.0.0.1',
'HOST': 'mikuhime.xyz',
'PORT': '3306', 'PORT': '3306',
} }
} }
@ -127,7 +128,7 @@ STATICFILES_DIRS = [
os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'), os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'),
] ]
curr_dir = os.path.dirname(os.path.realpath(__file__)) configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
logging.config.fileConfig('%s%slogging.conf' % (curr_dir,os.sep)) logging.config.fileConfig(configPath)
logger = logging.getLogger('file') logger = logging.getLogger('file')

@ -89,9 +89,7 @@ def get(request):
# 测试方法 # 测试方法
def start(request): def start(request):
begin = int(request.GET.get('start')) _thread.start_new_thread(threadSave, ())
end = int(request.GET.get('end'))
_thread.start_new_thread(threadSave, (begin, end))
return HttpResponse("start success") return HttpResponse("start success")

Loading…
Cancel
Save