检查cookie

4 years ago · 9b0e661f90
parent 192ea09162
commit 9b0e661f90
2 changed files with 26 additions and 11 deletions
--- a/main.py
+++ b/main.py
@ -316,6 +316,20 @@ def save(des, res, params):
        ))


+login_url = 'https://libcon.bupt.edu.cn/login'  # check() compares each response's URL against this login page address
+
+
+def check(res):
+    """Return True for a 200 response, False for any other status; raise if the response URL is the login page."""
+    if res.status_code != 200:
+        return False
+    # A 200 response whose URL equals login_url is reported as an error asking for fresh cookie info.
+    if res.url == login_url:
+        raise Exception('请更新cookie信息')
+    return True
+
+
+
 # 万方平台论文采集
 def run(max=10, last_page=100, page_size=20):
    if max > last_page * page_size:
@ -330,7 +344,7 @@ def run(max=10, last_page=100, page_size=20):
        writeInfo(f'分页url={url}')
        res = session.get(url,
                          headers=headers)
-        if res.status_code == 200:
+        if check(res):
            params_list = parse(res.content)
            for params in params_list:
                params["base_url"] = base_url
@ -338,15 +352,15 @@ def run(max=10, last_page=100, page_size=20):
                    **params)
                writeInfo(f'下载接口={url}')
                res = session.get(url, headers=headers)
-                if res.status_code == 200 and 'downloadliterature.do' in res.url:
+                if check(res) and 'downloadliterature.do' in res.url:
                    res_html = BeautifulSoup(res.content, "html.parser")
                    downloadIframe = res_html.select_one('#downloadIframe')
                    if downloadIframe:
                        res = session.get(downloadIframe["src"])
-                        if res.status_code == 200 and 'download.ashx' in res.url:
+                        if check(res) and 'download.ashx' in res.url:
                            writeInfo("成功获取真实下载地址={path}".format(path=res.url))
                            res = session.get(res.url, headers=headers, stream=True)
-                            if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
+                            if check(res) and 'pdf' in res.headers['Content-Type']:
                                des = res.headers['Content-Disposition'].split(';')
                                if len(des) == 2 and len(des[1].split('=')) == 2:
                                    count = count + 1
@ -354,11 +368,7 @@ def run(max=10, last_page=100, page_size=20):
                                    if count <= max:
                                        executor.submit(save, des, res, params)
                                        if count == max:
-                                            writeInfo('采集任务已完成，论文入库中')
-                                            executor.shutdown(wait=True)
-                                            writeInfo('论文已入库')
-                                            split_word()
-                                            return
+                                            break
                                else:
                                    writeError("非法响应类型")
                            else:
@ -369,5 +379,10 @@ def run(max=10, last_page=100, page_size=20):
                        writeError("无法获取真实下载地址")
                else:
                    writeError('error code={code}'.format(code=res.status_code))
+            break
        else:
            writeError('error code={code}'.format(code=res.status_code))
+    writeInfo('采集任务已完成，论文入库中')
+    executor.shutdown(wait=True)
+    writeInfo('论文已入库')
+    split_word()
--- a/test.py
+++ b/test.py
@ -1,7 +1,7 @@
-from main import split_word,run,create_sim
+from main import split_word, run, create_sim

 if __name__ == '__main__':
    # 默认采集max篇论文
    run()
    # 指定max
-    # run(max=10)
+    # run(max=3)