|
|
|
@ -316,6 +316,20 @@ def save(des, res, params): |
|
|
|
|
)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# URL of the login page. check() treats a successful response whose final URL
# equals this value as a sign that the cookie information must be updated.
login_url = 'https://libcon.bupt.edu.cn/login'


def check(res):
    """Report whether an HTTP response is usable.

    Args:
        res: Response-like object exposing ``status_code`` and ``url``.

    Returns:
        True when the status code is 200 and the response URL is not the
        login page; False for any other status code.

    Raises:
        Exception: If the status code is 200 but the response URL equals
            ``login_url``. The message asks the user to update the cookie
            information.
    """
    # Any non-200 status is reported as a plain failure, never as an error.
    if res.status_code != 200:
        return False
    # A 200 whose URL is the login page cannot be used; retrying with the
    # same cookie would not help, so abort instead of returning False.
    if res.url == login_url:
        raise Exception('请更新cookie信息')
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 万方平台论文采集 |
|
|
|
|
def run(max=10, last_page=100, page_size=20): |
|
|
|
|
if max > last_page * page_size: |
|
|
|
@ -330,7 +344,7 @@ def run(max=10, last_page=100, page_size=20): |
|
|
|
|
writeInfo(f'分页url={url}') |
|
|
|
|
res = session.get(url, |
|
|
|
|
headers=headers) |
|
|
|
|
if res.status_code == 200: |
|
|
|
|
if check(res): |
|
|
|
|
params_list = parse(res.content) |
|
|
|
|
for params in params_list: |
|
|
|
|
params["base_url"] = base_url |
|
|
|
@ -338,15 +352,15 @@ def run(max=10, last_page=100, page_size=20): |
|
|
|
|
**params) |
|
|
|
|
writeInfo(f'下载接口={url}') |
|
|
|
|
res = session.get(url, headers=headers) |
|
|
|
|
if res.status_code == 200 and 'downloadliterature.do' in res.url: |
|
|
|
|
if check(res) and 'downloadliterature.do' in res.url: |
|
|
|
|
res_html = BeautifulSoup(res.content, "html.parser") |
|
|
|
|
downloadIframe = res_html.select_one('#downloadIframe') |
|
|
|
|
if downloadIframe: |
|
|
|
|
res = session.get(downloadIframe["src"]) |
|
|
|
|
if res.status_code == 200 and 'download.ashx' in res.url: |
|
|
|
|
if check(res) and 'download.ashx' in res.url: |
|
|
|
|
writeInfo("成功获取真实下载地址={path}".format(path=res.url)) |
|
|
|
|
res = session.get(res.url, headers=headers, stream=True) |
|
|
|
|
if res.status_code == 200 and 'pdf' in res.headers['Content-Type']: |
|
|
|
|
if check(res) and 'pdf' in res.headers['Content-Type']: |
|
|
|
|
des = res.headers['Content-Disposition'].split(';') |
|
|
|
|
if len(des) == 2 and len(des[1].split('=')) == 2: |
|
|
|
|
count = count + 1 |
|
|
|
@ -354,11 +368,7 @@ def run(max=10, last_page=100, page_size=20): |
|
|
|
|
if count <= max: |
|
|
|
|
executor.submit(save, des, res, params) |
|
|
|
|
if count == max: |
|
|
|
|
writeInfo('采集任务已完成,论文入库中') |
|
|
|
|
executor.shutdown(wait=True) |
|
|
|
|
writeInfo('论文已入库') |
|
|
|
|
split_word() |
|
|
|
|
return |
|
|
|
|
break |
|
|
|
|
else: |
|
|
|
|
writeError("非法响应类型") |
|
|
|
|
else: |
|
|
|
@ -369,5 +379,10 @@ def run(max=10, last_page=100, page_size=20): |
|
|
|
|
writeError("无法获取真实下载地址") |
|
|
|
|
else: |
|
|
|
|
writeError('error code={code}'.format(code=res.status_code)) |
|
|
|
|
break |
|
|
|
|
else: |
|
|
|
|
writeError('error code={code}'.format(code=res.status_code)) |
|
|
|
|
writeInfo('采集任务已完成,论文入库中') |
|
|
|
|
executor.shutdown(wait=True) |
|
|
|
|
writeInfo('论文已入库') |
|
|
|
|
split_word() |
|
|
|
|