|
|
@ -135,7 +135,7 @@ def parse(content): |
|
|
|
return params_list |
|
|
|
return params_list |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
suffix = '77726476706e69737468656265737421f7b9569d2936695e790c88b8991b203a18454272' |
|
|
|
suffix = '77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c' |
|
|
|
base_url = f'https://libcon.bupt.edu.cn/http/{suffix}' |
|
|
|
base_url = f'https://libcon.bupt.edu.cn/http/{suffix}' |
|
|
|
profession = "计算机软件与理论" |
|
|
|
profession = "计算机软件与理论" |
|
|
|
keyword = f'(专业%3A"{profession}")' |
|
|
|
keyword = f'(专业%3A"{profession}")' |
|
|
@ -351,31 +351,36 @@ def run(max=10, last_page=100, page_size=20): |
|
|
|
**params) |
|
|
|
**params) |
|
|
|
writeInfo(f'下载接口={url}') |
|
|
|
writeInfo(f'下载接口={url}') |
|
|
|
res = session.get(url, headers=headers) |
|
|
|
res = session.get(url, headers=headers) |
|
|
|
if check(res) and 'downloadliterature.do' in res.url: |
|
|
|
if check(res): |
|
|
|
res_html = BeautifulSoup(res.content, "html.parser") |
|
|
|
res_html = BeautifulSoup(res.content, "html.parser") |
|
|
|
downloadIframe = res_html.select_one('#downloadIframe') |
|
|
|
if 'downloadliterature.do' in res.url: |
|
|
|
if downloadIframe: |
|
|
|
downloadIframe = res_html.select_one('#downloadIframe') |
|
|
|
res = session.get(downloadIframe["src"]) |
|
|
|
if downloadIframe: |
|
|
|
if check(res) and 'download.ashx' in res.url: |
|
|
|
res = session.get(downloadIframe["src"]) |
|
|
|
writeInfo("成功获取真实下载地址={path}".format(path=res.url)) |
|
|
|
if check(res) and 'download.ashx' in res.url: |
|
|
|
res = session.get(res.url, headers=headers, stream=True) |
|
|
|
writeInfo("成功获取真实下载地址={path}".format(path=res.url)) |
|
|
|
if check(res) and 'pdf' in res.headers['Content-Type']: |
|
|
|
res = session.get(res.url, headers=headers, stream=True) |
|
|
|
des = res.headers['Content-Disposition'].split(';') |
|
|
|
if check(res) and 'pdf' in res.headers['Content-Type']: |
|
|
|
if len(des) == 2 and len(des[1].split('=')) == 2: |
|
|
|
des = res.headers['Content-Disposition'].split(';') |
|
|
|
count = count + 1 |
|
|
|
if len(des) == 2 and len(des[1].split('=')) == 2: |
|
|
|
writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%') |
|
|
|
count = count + 1 |
|
|
|
if count <= max: |
|
|
|
writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%') |
|
|
|
save(des, res, params) |
|
|
|
if count <= max: |
|
|
|
if count == max: |
|
|
|
save(des, res, params) |
|
|
|
break |
|
|
|
if count == max: |
|
|
|
|
|
|
|
break |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
writeError("非法响应类型") |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("非法响应类型") |
|
|
|
writeError("无法获取文档信息") |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("无法获取文档信息") |
|
|
|
writeError("无法获取文档真实下载地址") |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("无法获取文档真实下载地址") |
|
|
|
writeError("无法获取真实下载地址") |
|
|
|
|
|
|
|
elif res_html.select_one('title').text == '交易': |
|
|
|
|
|
|
|
raise Exception(res_html.select_one('div.NotWork>span').text) |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError("无法获取真实下载地址") |
|
|
|
raise Exception('发生未知错误!!!') |
|
|
|
else: |
|
|
|
else: |
|
|
|
writeError('error code={code}'.format(code=res.status_code)) |
|
|
|
writeError('error code={code}'.format(code=res.status_code)) |
|
|
|
break |
|
|
|
break |
|
|
|