From 9b0e661f9094fd748cc0df486130bf03834d0020 Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Tue, 11 Aug 2020 22:59:27 +0800
Subject: [PATCH] Check cookie
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 33 ++++++++++++++++++++++++---------
 test.py |  4 ++--
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index 14fab93..c85c41d 100644
--- a/main.py
+++ b/main.py
@@ -316,6 +316,20 @@ def save(des, res, params):
         ))


+login_url = 'https://libcon.bupt.edu.cn/login'
+
+
+def check(res):
+    if res.status_code == 200:
+        if res.url == login_url:
+            raise Exception('Please update the cookie')
+        else:
+            return True
+    else:
+        return False
+
+
+
 # Collect papers from the Wanfang platform
 def run(max=10, last_page=100, page_size=20):
     if max > last_page * page_size:
@@ -330,7 +344,7 @@ def run(max=10, last_page=100, page_size=20):

             writeInfo(f'page url={url}')
             res = session.get(url, headers=headers)
-            if res.status_code == 200:
+            if check(res):
                 params_list = parse(res.content)
                 for params in params_list:
                     params["base_url"] = base_url
@@ -338,15 +352,15 @@ def run(max=10, last_page=100, page_size=20):
                                                                        **params)
                     writeInfo(f'download endpoint={url}')
                     res = session.get(url, headers=headers)
-                    if res.status_code == 200 and 'downloadliterature.do' in res.url:
+                    if check(res) and 'downloadliterature.do' in res.url:
                         res_html = BeautifulSoup(res.content, "html.parser")
                         downloadIframe = res_html.select_one('#downloadIframe')
                         if downloadIframe:
                             res = session.get(downloadIframe["src"])
-                            if res.status_code == 200 and 'download.ashx' in res.url:
+                            if check(res) and 'download.ashx' in res.url:
                                 writeInfo("Got the real download url={path}".format(path=res.url))
                                 res = session.get(res.url, headers=headers, stream=True)
-                                if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
+                                if check(res) and 'pdf' in res.headers['Content-Type']:
                                     des = res.headers['Content-Disposition'].split(';')
                                     if len(des) == 2 and len(des[1].split('=')) == 2:
                                         count = count + 1
@@ -354,11 +368,7 @@ def run(max=10, last_page=100, page_size=20):
                                         if count <= max:
                                             executor.submit(save, des, res, params)
                                         if count == max:
-                                            writeInfo('Collection finished, saving papers to the database')
-                                            executor.shutdown(wait=True)
-                                            writeInfo('Papers saved to the database')
-                                            split_word()
-                                            return
+                                            break
                                     else:
                                         writeError("Invalid response type")
                                 else:
@@ -369,5 +379,10 @@ def run(max=10, last_page=100, page_size=20):
                         writeError("Failed to get the real download url")
                 else:
                     writeError('error code={code}'.format(code=res.status_code))
+                    break
         else:
             writeError('error code={code}'.format(code=res.status_code))
+    writeInfo('Collection finished, saving papers to the database')
+    executor.shutdown(wait=True)
+    writeInfo('Papers saved to the database')
+    split_word()
diff --git a/test.py b/test.py
index 27636ee..fd5ef84 100644
--- a/test.py
+++ b/test.py
@@ -1,7 +1,7 @@
-from main import split_word,run,create_sim
+from main import split_word, run, create_sim

 if __name__ == '__main__':
     # Collect max papers by default
     run()
     # Specify max
-    # run(max=10)
+    # run(max=3)
\ No newline at end of file
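
The core of the patch is the new check() helper: a 200 response whose final URL is the library login page means the proxy dropped the session and the cookie must be refreshed, so the crawler raises instead of silently parsing a login form. Below is a minimal, self-contained sketch of that behaviour; check() and login_url are as in the patch, while StubResponse is only an illustrative stand-in for the requests response object, so no network access is needed.

login_url = 'https://libcon.bupt.edu.cn/login'


def check(res):
    # 200 but redirected to the login page: the session cookie has expired.
    if res.status_code == 200:
        if res.url == login_url:
            raise Exception('Please update the cookie')
        else:
            return True
    else:
        return False


class StubResponse:
    """Illustrative stand-in for requests.Response (only the fields check() reads)."""

    def __init__(self, status_code, url):
        self.status_code = status_code
        self.url = url


if __name__ == '__main__':
    assert check(StubResponse(200, 'https://example.org/download.ashx')) is True
    assert check(StubResponse(500, 'https://example.org/download.ashx')) is False
    try:
        check(StubResponse(200, login_url))  # bounced back to the login page
    except Exception as e:
        print('cookie expired:', e)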
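
The other half of the change reworks how run() finishes: the early return buried in the innermost loop is replaced with break, and the wrap-up steps (waiting for the download executor, then split_word()) now run once at the end of the function, so they also happen when paging stops early on an error. A stripped-down sketch of that control flow follows; fake_download, the counts, and the print calls are illustrative stand-ins for save(), writeInfo() and split_word() in main.py.

from concurrent.futures import ThreadPoolExecutor
import time


def fake_download(n):
    # Stand-in for save(): pretend to fetch and store one paper.
    time.sleep(0.1)
    print(f'stored paper {n}')


def run(max=3, candidates=10):
    executor = ThreadPoolExecutor(max_workers=2)
    count = 0
    for _ in range(candidates):
        count += 1
        if count <= max:
            executor.submit(fake_download, count)
        if count == max:
            break  # stop collecting, but still fall through to the cleanup below
    # Finalization now runs exactly once, however the loop ended.
    print('collection finished, waiting for downloads')
    executor.shutdown(wait=True)
    print('all papers stored')  # split_word() would run here in main.py


if __name__ == '__main__':
    run()

Keeping executor.shutdown(wait=True) after the loop means queued downloads are always drained before the word-segmentation step, whichever branch ends the loop.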