diff --git a/main.py b/main.py index 773b28b..6d3d37d 100644 --- a/main.py +++ b/main.py @@ -135,7 +135,7 @@ def parse(content): return params_list -suffix = '77726476706e69737468656265737421f7b9569d2936695e790c88b8991b203a18454272' +suffix = '77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c' base_url = f'https://libcon.bupt.edu.cn/http/{suffix}' profession = "计算机软件与理论" keyword = f'(专业%3A"{profession}")' @@ -351,31 +351,36 @@ def run(max=10, last_page=100, page_size=20): **params) writeInfo(f'下载接口={url}') res = session.get(url, headers=headers) - if check(res) and 'downloadliterature.do' in res.url: + if check(res): res_html = BeautifulSoup(res.content, "html.parser") - downloadIframe = res_html.select_one('#downloadIframe') - if downloadIframe: - res = session.get(downloadIframe["src"]) - if check(res) and 'download.ashx' in res.url: - writeInfo("成功获取真实下载地址={path}".format(path=res.url)) - res = session.get(res.url, headers=headers, stream=True) - if check(res) and 'pdf' in res.headers['Content-Type']: - des = res.headers['Content-Disposition'].split(';') - if len(des) == 2 and len(des[1].split('=')) == 2: - count = count + 1 - writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%') - if count <= max: - save(des, res, params) - if count == max: - break + if 'downloadliterature.do' in res.url: + downloadIframe = res_html.select_one('#downloadIframe') + if downloadIframe: + res = session.get(downloadIframe["src"]) + if check(res) and 'download.ashx' in res.url: + writeInfo("成功获取真实下载地址={path}".format(path=res.url)) + res = session.get(res.url, headers=headers, stream=True) + if check(res) and 'pdf' in res.headers['Content-Type']: + des = res.headers['Content-Disposition'].split(';') + if len(des) == 2 and len(des[1].split('=')) == 2: + count = count + 1 + writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%') + if count <= max: + save(des, res, params) + if count == max: + break + else: + writeError("非法响应类型") else: - writeError("非法响应类型") + writeError("无法获取文档信息") else: - writeError("无法获取文档信息") + writeError("无法获取文档真实下载地址") else: - writeError("无法获取文档真实下载地址") + writeError("无法获取真实下载地址") + elif res_html.select_one('title').text == '交易': + raise Exception(res_html.select_one('div.NotWork>span').text) else: - writeError("无法获取真实下载地址") + raise Exception('发生未知错误!!!') else: writeError('error code={code}'.format(code=res.status_code)) break