多线程爬虫

master
10295 7 years ago
parent 7312fa2315
commit d210153bdb
  1. 47
      PixivSearch/dao/bangumi.py
  2. 2
      PixivSearch/logging.conf
  3. 21
      PixivSearch/pixiv.py
  4. 7
      PixivSearch/settings.py
  5. 4
      PixivSearch/view.py

@ -1,5 +1,6 @@
import _thread
import json
import math
import os
import random
import time
@ -31,16 +32,16 @@ def check():
time.sleep(1)
def save(md):
if isStop:
return
time.sleep(random.randint(1, 3))
url = "https://www.bilibili.com/bangumi/media/md%d" % md
req = requests.get(url)
url = "https://www.bilibili.com/bangumi/media/md%d" % md
try:
req = requests.get(url, timeout=10)
except BaseException as e:
logger.error(e)
save(md)
logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
if (req.status_code == 200):
tag = BeautifulSoup(req.text, 'lxml')
@ -60,17 +61,45 @@ def save(md):
logger.error("发生异常")
logger.error(e)
# asdasd
def get_():
    """Return the crawler's current media-info snapshot (module global)."""
    # Reading a module global needs no `global` declaration; the original's
    # `global current_mediaInfo` was a no-op for a pure read.
    return current_mediaInfo
def threadSave(start, end):
page_size = 100
pages = None
ids = None
def getIds():
    """Populate the module-global ``ids`` list with bilibili season_ids.

    Pages through the ``season/index_global`` web API, appending every
    ``season_id`` to the global ``ids``. The global ``pages`` caches the
    total page count, derived from the first response's ``count`` field.

    NOTE(review): the rebuild condition ``len(ids) != 0`` looks inverted —
    it skips fetching only when ``ids`` is already an empty list; confirm
    whether ``== 0`` (fetch when empty) was intended.
    """
    global ids
    global pages
    if ids is None or len(ids) != 0:
        ids = []
        page = 1
        while pages is None or page <= pages:
            url = 'https://bangumi.bilibili.com/web_api/season/index_global?page=%d&page_size=%d' % (page, page_size)
            logger.info(url)
            try:
                req = requests.get(url, timeout=10)
                json_obj = json.loads(req.text)
                bangumiList = json_obj['result']['list']
                for bangumi in bangumiList:
                    ids.append(int(bangumi['season_id']))
                if pages is None:
                    # ceil(count / page_size) — true division keeps ceil meaningful
                    pages = int(math.ceil(int(json_obj['result']['count']) / page_size))
            except BaseException as e:
                logger.error('连接超时')
                # Fix: `logger(e)` — a Logger instance is not callable and
                # raised TypeError inside the error path; use logger.error.
                logger.error(e)
                # NOTE(review): retrying the same page forever on persistent
                # failure never terminates — consider a bounded retry count.
                continue
            page = page + 1
    # Removed dead diff leftover: `for id in range(start, end): ids.append(id)`
    # — `start`/`end` no longer exist anywhere (threadSave() now takes no
    # arguments), so that loop could only raise NameError.
def threadSave():
getIds()
logger.info(len(ids))
try:
global executors
executors = futures.ThreadPoolExecutor(32)

@ -29,5 +29,5 @@ formatter=fmt
args=('pixiv.log','a','utf-8',False)
[formatter_fmt]
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s :%(message)s
format=%(asctime)s - %(name)s - %(levelname)s - %(module)s:line:%(lineno)d - %(message)s
datefmt=%Y-%m-%d %H:%M:%S

@ -9,9 +9,9 @@ import sys
from datetime import datetime
import os
import zipfile
import logging.config
from PixivSearch.model import config
from PixivSearch.settings import logger
headers = {
'X-Requested-With': 'XMLHttpRequest',
@ -21,8 +21,7 @@ headers = {
lock = threading.Lock() # 多线程全局资源锁
total = 1
logging.config.fileConfig('PixivSearch/logging.conf')
logger = logging.getLogger('file')
def get_cookies():
_cookies = {}
@ -33,8 +32,6 @@ def get_cookies():
return _cookies
def crawl(url):
global total
req = requests.get(url, headers=headers, cookies=get_cookies()).text
@ -44,10 +41,12 @@ def crawl(url):
with lock:
nodes.append(imageNode)
def get_urls(search, page):
    """Build the Pixiv search-result URLs for *search*, one per page.

    Returns URLs for pages 1 through ``page - 1`` (``range`` excludes the
    upper bound — presumably ``page`` is an exclusive count; confirm).
    """
    fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}'
    urls = []
    for page_no in range(1, page):
        urls.append(fmt.format(search, page_no))
    return urls
def get_Img(params):
params[1]['imgUrl'] = 'https://i.pximg.net/img-original/img/' + params[1]['url'][-46:-15]
@ -64,13 +63,17 @@ def get_Img(params):
logger.error('无法下载图片:%s' % (params[1]['illustTitle']))
return
logger.info('下载图片:"%s"%s'%(params[1]['illustTitle'],os.getcwd().replace('\\','/')+'/'+imgPath+params[1]['illustId']+suffix))
logger.info('下载图片:"%s"%s' % (
params[1]['illustTitle'], os.getcwd().replace('\\', '/') + '/' + imgPath + params[1]['illustId'] + suffix))
f = open(imgPath + params[1]['illustId'] + suffix, 'wb') # 写入多媒体文件要 b 这个参数
f.write(s.content) # 多媒体文件要是用conctent
f.close()
params[1]['localName'] = params[1]['illustId'] + suffix
logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s'%(params[0],params[1]['bookmarkCount'],params[1]['illustTitle'], ','.join(params[1]['tags']),'',params[1]['imgUrl']))
logger.info('排行第%d名,收藏数%d,标题:%s,标签:%s,(%s)前投稿,链接:%s' % (
params[0], params[1]['bookmarkCount'], params[1]['illustTitle'], ','.join(params[1]['tags']), '',
params[1]['imgUrl']))
def zip(inputFile, outFile):
f = zipfile.ZipFile(outFile, 'w', zipfile.ZIP_DEFLATED)
@ -79,7 +82,10 @@ def zip(inputFile,outFile):
f.write(os.path.join(dirpath, filename), filename)
f.close()
fsize = ''
def get_nodes(param):
global nodes, fsize, imgPath
nodes = []
@ -122,7 +128,6 @@ def get_nodes(param):
return [nodes, tip, fsize]
if __name__ == "__main__":
if (len(sys.argv)) == 5 and sys.argv[2].isdigit() and sys.argv[3].isdigit():
try:

@ -80,7 +80,8 @@ DATABASES = {
'NAME': 'bangumi',
'USER': 'bilibili',
'PASSWORD': '2233',
'HOST': '127.0.0.1',
# 'HOST': '127.0.0.1',
'HOST': 'mikuhime.xyz',
'PORT': '3306',
}
}
@ -127,7 +128,7 @@ STATICFILES_DIRS = [
os.path.join(os.path.dirname(__file__), 'static').replace('\\', '/'),
]
curr_dir = os.path.dirname(os.path.realpath(__file__))
logging.config.fileConfig('%s%slogging.conf' % (curr_dir,os.sep))
configPath = '%s/logging.conf' % os.path.dirname(__file__).replace('\\', '/')
logging.config.fileConfig(configPath)
logger = logging.getLogger('file')

@ -89,9 +89,7 @@ def get(request):
# 测试方法
def start(request):
    """Kick off the background bangumi crawl and respond immediately.

    The ``start``/``end`` query parameters are still parsed (missing or
    non-numeric values raise exactly as before), but the crawler's
    ``threadSave()`` now takes no arguments and discovers ids itself.
    """
    begin = int(request.GET.get('start'))
    end = int(request.GET.get('end'))
    # Fix: the stale duplicate call `_thread.start_new_thread(threadSave,
    # (begin, end))` is a diff artifact and conflicts with the new zero-arg
    # threadSave() signature; only the no-arg launch is kept.
    _thread.start_new_thread(threadSave, ())
    return HttpResponse("start success")

Loading…
Cancel
Save