From 9b0e661f9094fd748cc0df486130bf03834d0020 Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Tue, 11 Aug 2020 22:59:27 +0800
Subject: [PATCH] Check cookie
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 33 ++++++++++++++++++++++++---------
 test.py |  4 ++--
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index 14fab93..c85c41d 100644
--- a/main.py
+++ b/main.py
@@ -316,6 +316,20 @@ def save(des, res, params):
         ))


+login_url = 'https://libcon.bupt.edu.cn/login'
+
+
+def check(res):
+    if res.status_code == 200:
+        if res.url == login_url:
+            raise Exception('Please update the cookie')
+        else:
+            return True
+    else:
+        return False
+
+
+
 # Collect papers from the Wanfang platform
 def run(max=10, last_page=100, page_size=20):
     if max > last_page * page_size:
@@ -330,7 +344,7 @@ def run(max=10, last_page=100, page_size=20):

             writeInfo(f'page url={url}')
             res = session.get(url, headers=headers)
-            if res.status_code == 200:
+            if check(res):
                 params_list = parse(res.content)
                 for params in params_list:
                     params["base_url"] = base_url
@@ -338,15 +352,15 @@ def run(max=10, last_page=100, page_size=20):
                                                                        **params)
                     writeInfo(f'download endpoint={url}')
                     res = session.get(url, headers=headers)
-                    if res.status_code == 200 and 'downloadliterature.do' in res.url:
+                    if check(res) and 'downloadliterature.do' in res.url:
                         res_html = BeautifulSoup(res.content, "html.parser")
                         downloadIframe = res_html.select_one('#downloadIframe')
                         if downloadIframe:
                             res = session.get(downloadIframe["src"])
-                            if res.status_code == 200 and 'download.ashx' in res.url:
+                            if check(res) and 'download.ashx' in res.url:
                                 writeInfo("Got the real download url={path}".format(path=res.url))
                                 res = session.get(res.url, headers=headers, stream=True)
-                                if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
+                                if check(res) and 'pdf' in res.headers['Content-Type']:
                                     des = res.headers['Content-Disposition'].split(';')
                                     if len(des) == 2 and len(des[1].split('=')) == 2:
                                         count = count + 1
@@ -354,11 +368,7 @@ def run(max=10, last_page=100, page_size=20):
                                         if count <= max:
                                             executor.submit(save, des, res, params)
                                         if count == max:
-                                            writeInfo('Collection finished, saving papers to the database')
-                                            executor.shutdown(wait=True)
-                                            writeInfo('Papers saved to the database')
-                                            split_word()
-                                            return
+                                            break
                                     else:
                                         writeError("Invalid response type")
                                 else:
@@ -369,5 +379,10 @@ def run(max=10, last_page=100, page_size=20):
                         writeError("Failed to get the real download url")
                 else:
                     writeError('error code={code}'.format(code=res.status_code))
+                    break
         else:
             writeError('error code={code}'.format(code=res.status_code))
+    writeInfo('Collection finished, saving papers to the database')
+    executor.shutdown(wait=True)
+    writeInfo('Papers saved to the database')
+    split_word()
diff --git a/test.py b/test.py
index 27636ee..fd5ef84 100644
--- a/test.py
+++ b/test.py
@@ -1,7 +1,7 @@
-from main import split_word,run,create_sim
+from main import split_word, run, create_sim

 if __name__ == '__main__':
     # Collect max papers by default
     run()
     # Specify max
-    # run(max=10)
+    # run(max=3)
\ No newline at end of file
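
The core of the patch is the new check() helper: a 200 response whose final URL is the library login page means the proxy dropped the session and the cookie must be refreshed, so the crawler raises instead of silently parsing a login form. Below is a minimal, self-contained sketch of that behaviour; check() and login_url are as in the patch, while StubResponse is only an illustrative stand-in for the requests response object, so no network access is needed.

login_url = 'https://libcon.bupt.edu.cn/login'


def check(res):
    # 200 but redirected to the login page: the session cookie has expired.
    if res.status_code == 200:
        if res.url == login_url:
            raise Exception('Please update the cookie')
        else:
            return True
    else:
        return False


class StubResponse:
    """Illustrative stand-in for requests.Response (only the fields check() reads)."""

    def __init__(self, status_code, url):
        self.status_code = status_code
        self.url = url


if __name__ == '__main__':
    assert check(StubResponse(200, 'https://example.org/download.ashx')) is True
    assert check(StubResponse(500, 'https://example.org/download.ashx')) is False
    try:
        check(StubResponse(200, login_url))  # bounced back to the login page
    except Exception as e:
        print('cookie expired:', e)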
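
The other half of the change reworks how run() finishes: the early return buried in the innermost loop is replaced with break, and the wrap-up steps (waiting for the download executor, then split_word()) now run once at the end of the function, so they also happen when paging stops early on an error. A stripped-down sketch of that control flow follows; fake_download, the counts, and the print calls are illustrative stand-ins for save(), writeInfo() and split_word() in main.py.

from concurrent.futures import ThreadPoolExecutor
import time


def fake_download(n):
    # Stand-in for save(): pretend to fetch and store one paper.
    time.sleep(0.1)
    print(f'stored paper {n}')


def run(max=3, candidates=10):
    executor = ThreadPoolExecutor(max_workers=2)
    count = 0
    for _ in range(candidates):
        count += 1
        if count <= max:
            executor.submit(fake_download, count)
        if count == max:
            break  # stop collecting, but still fall through to the cleanup below
    # Finalization now runs exactly once, however the loop ended.
    print('collection finished, waiting for downloads')
    executor.shutdown(wait=True)
    print('all papers stored')  # split_word() would run here in main.py


if __name__ == '__main__':
    run()

Keeping executor.shutdown(wait=True) after the loop means queued downloads are always drained before the word-segmentation step, whichever branch ends the loop.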