Check cookie

master · pan committed 4 years ago
parent 192ea09162 · commit 9b0e661f90
main.py   33 changed lines
test.py    4 changed lines

main.py

@@ -316,6 +316,20 @@ def save(des, res, params):
     ))
+
+login_url = 'https://libcon.bupt.edu.cn/login'
+
+
+def check(res):
+    if res.status_code == 200:
+        if res.url == login_url:
+            raise Exception('请更新cookie信息')
+        else:
+            return True
+    else:
+        return False
+
+
 # Collect papers from the Wanfang platform
 def run(max=10, last_page=100, page_size=20):
     if max > last_page * page_size:
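
The new check() helper works because requests follows redirects by default: when the session cookie has expired, the portal redirects the request to its login page, so the final res.url equals login_url even though the final status code is 200. A minimal standalone sketch of the same pattern (the probe URL, cookie name, and cookie value are illustrative placeholders, not taken from this repo):

    import requests

    login_url = 'https://libcon.bupt.edu.cn/login'

    session = requests.Session()
    # Placeholder cookie, e.g. copied from a logged-in browser session
    session.headers.update({'Cookie': 'JSESSIONID=<your-session-id>'})

    res = session.get('https://libcon.bupt.edu.cn/', allow_redirects=True)
    if res.status_code == 200 and res.url == login_url:
        # We landed back on the login page: the cookie is stale
        raise Exception('请更新cookie信息')
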
@@ -330,7 +344,7 @@ def run(max=10, last_page=100, page_size=20):
         writeInfo(f'分页url={url}')
         res = session.get(url,
                           headers=headers)
-        if res.status_code == 200:
+        if check(res):
             params_list = parse(res.content)
             for params in params_list:
                 params["base_url"] = base_url
@@ -338,15 +352,15 @@ def run(max=10, last_page=100, page_size=20):
                                  **params)
                 writeInfo(f'下载接口={url}')
                 res = session.get(url, headers=headers)
-                if res.status_code == 200 and 'downloadliterature.do' in res.url:
+                if check(res) and 'downloadliterature.do' in res.url:
                     res_html = BeautifulSoup(res.content, "html.parser")
                     downloadIframe = res_html.select_one('#downloadIframe')
                     if downloadIframe:
                         res = session.get(downloadIframe["src"])
-                        if res.status_code == 200 and 'download.ashx' in res.url:
+                        if check(res) and 'download.ashx' in res.url:
                             writeInfo("成功获取真实下载地址={path}".format(path=res.url))
                             res = session.get(res.url, headers=headers, stream=True)
-                            if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
+                            if check(res) and 'pdf' in res.headers['Content-Type']:
                                 des = res.headers['Content-Disposition'].split(';')
                                 if len(des) == 2 and len(des[1].split('=')) == 2:
                                     count = count + 1
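
The Content-Disposition handling above assumes a header of the form attachment; filename=xxx.pdf: splitting once on ';' and once on '=' yields the filename. A small sketch of that parsing step, using a made-up header value:

    # Illustrative header value in the shape the code above expects
    content_disposition = 'attachment; filename=paper_123.pdf'

    des = content_disposition.split(';')
    if len(des) == 2 and len(des[1].split('=')) == 2:
        filename = des[1].split('=')[1].strip()
        print(filename)  # paper_123.pdf
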
@@ -354,11 +368,7 @@ def run(max=10, last_page=100, page_size=20):
                                     if count <= max:
                                         executor.submit(save, des, res, params)
                                     if count == max:
-                                        writeInfo('采集任务已完成,论文入库中')
-                                        executor.shutdown(wait=True)
-                                        writeInfo('论文已入库')
-                                        split_word()
-                                        return
+                                        break
                                 else:
                                     writeError("非法响应类型")
                             else:
@@ -369,5 +379,10 @@ def run(max=10, last_page=100, page_size=20):
                         writeError("无法获取真实下载地址")
                 else:
                     writeError('error code={code}'.format(code=res.status_code))
+            break
         else:
             writeError('error code={code}'.format(code=res.status_code))
+    writeInfo('采集任务已完成,论文入库中')
+    executor.shutdown(wait=True)
+    writeInfo('论文已入库')
+    split_word()
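
The effect of this refactor is that cleanup now runs exactly once: break only leaves the collection loop, and executor.shutdown(wait=True) then blocks until every save task already submitted to the pool has finished, before split_word() indexes the results. A minimal sketch of the break-then-shutdown pattern with a stand-in task (save_stub and limit are hypothetical names, not from this repo):

    from concurrent.futures import ThreadPoolExecutor
    import time

    def save_stub(i):
        # Stand-in for the real save(des, res, params) task
        time.sleep(0.1)

    executor = ThreadPoolExecutor(max_workers=4)
    count, limit = 0, 3
    for i in range(100):
        count += 1
        if count <= limit:
            executor.submit(save_stub, i)
        if count == limit:
            break  # leave the loop; cleanup happens once, below

    # Runs once, after the loop: wait for queued saves, then post-process
    executor.shutdown(wait=True)
    print('all saves finished')
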

test.py

@@ -1,7 +1,7 @@
-from main import split_word,run,create_sim
+from main import split_word, run, create_sim
 if __name__ == '__main__':
     # By default, collect up to max papers
     run()
     # Specify max explicitly
-    # run(max=10)
+    # run(max=3)