检查cookie

master
pan 4 years ago
parent 192ea09162
commit 9b0e661f90
  1. 33
      main.py
  2. 4
      test.py

@ -316,6 +316,20 @@ def save(des, res, params):
))
# URL of the login page; a 200 response that ends up here is treated as
# "not logged in" by check() below.
login_url = 'https://libcon.bupt.edu.cn/login'


def check(res):
    """Tell whether *res* is a usable (HTTP 200, non-login) response.

    Args:
        res: A response object exposing ``status_code`` and ``url``
            (the final URL of the request).

    Returns:
        ``True`` when the status code is 200 and the final URL is not the
        login page; ``False`` for any other status code.

    Raises:
        Exception: When the status code is 200 but the final URL is the
            login page. The message asks the user to update the cookie
            information (presumably the stored cookie is no longer valid).
    """
    if res.status_code != 200:
        return False

    # Compare without query string, fragment or trailing slash, so that
    # e.g. ".../login?service=..." is still recognised as the login page
    # instead of being mistaken for a valid content response.
    landed_on = res.url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    if landed_on == login_url.rstrip('/'):
        raise Exception('请更新cookie信息')
    return True
# 万方平台论文采集
def run(max=10, last_page=100, page_size=20):
if max > last_page * page_size:
@ -330,7 +344,7 @@ def run(max=10, last_page=100, page_size=20):
writeInfo(f'分页url={url}')
res = session.get(url,
headers=headers)
if res.status_code == 200:
if check(res):
params_list = parse(res.content)
for params in params_list:
params["base_url"] = base_url
@ -338,15 +352,15 @@ def run(max=10, last_page=100, page_size=20):
**params)
writeInfo(f'下载接口={url}')
res = session.get(url, headers=headers)
if res.status_code == 200 and 'downloadliterature.do' in res.url:
if check(res) and 'downloadliterature.do' in res.url:
res_html = BeautifulSoup(res.content, "html.parser")
downloadIframe = res_html.select_one('#downloadIframe')
if downloadIframe:
res = session.get(downloadIframe["src"])
if res.status_code == 200 and 'download.ashx' in res.url:
if check(res) and 'download.ashx' in res.url:
writeInfo("成功获取真实下载地址={path}".format(path=res.url))
res = session.get(res.url, headers=headers, stream=True)
if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
if check(res) and 'pdf' in res.headers['Content-Type']:
des = res.headers['Content-Disposition'].split(';')
if len(des) == 2 and len(des[1].split('=')) == 2:
count = count + 1
@ -354,11 +368,7 @@ def run(max=10, last_page=100, page_size=20):
if count <= max:
executor.submit(save, des, res, params)
if count == max:
writeInfo('采集任务已完成,论文入库中')
executor.shutdown(wait=True)
writeInfo('论文已入库')
split_word()
return
break
else:
writeError("非法响应类型")
else:
@ -369,5 +379,10 @@ def run(max=10, last_page=100, page_size=20):
writeError("无法获取真实下载地址")
else:
writeError('error code={code}'.format(code=res.status_code))
break
else:
writeError('error code={code}'.format(code=res.status_code))
writeInfo('采集任务已完成,论文入库中')
executor.shutdown(wait=True)
writeInfo('论文已入库')
split_word()

@ -1,7 +1,7 @@
from main import split_word,run,create_sim
from main import split_word, run, create_sim
if __name__ == '__main__':
# 默认采集max篇论文
run()
# 指定max
# run(max=10)
# run(max=3)
Loading…
Cancel
Save