|
|
@ -152,7 +152,7 @@ session.cookies.update(cookies) |
|
|
|
|
|
|
|
|
|
|
|
pdf_dir = 'pdf' |
|
|
|
pdf_dir = 'pdf' |
|
|
|
html_dir = 'html' |
|
|
|
html_dir = 'html' |
|
|
|
executor = ThreadPoolExecutor(max_workers=2) |
|
|
|
# executor = ThreadPoolExecutor(max_workers=1) |
|
|
|
# 向量表 |
|
|
|
# 向量表 |
|
|
|
sys_tfidf = 'sys_tfidf' |
|
|
|
sys_tfidf = 'sys_tfidf' |
|
|
|
# 论文表 |
|
|
|
# 论文表 |
|
|
@ -329,7 +329,6 @@ def check(res): |
|
|
|
return False |
|
|
|
return False |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 万方平台论文采集 |
|
|
|
# 万方平台论文采集 |
|
|
|
def run(max=10, last_page=100, page_size=20): |
|
|
|
def run(max=10, last_page=100, page_size=20): |
|
|
|
if max > last_page * page_size: |
|
|
|
if max > last_page * page_size: |
|
|
@ -366,7 +365,7 @@ def run(max=10, last_page=100, page_size=20): |
|
|
|
count = count + 1 |
|
|
|
count = count + 1 |
|
|
|
writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%') |
|
|
|
writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%') |
|
|
|
if count <= max: |
|
|
|
if count <= max: |
|
|
|
executor.submit(save, des, res, params) |
|
|
|
save(des, res, params) |
|
|
|
if count == max: |
|
|
|
if count == max: |
|
|
|
break |
|
|
|
break |
|
|
|
else: |
|
|
|
else: |
|
|
|