Add logic to collect a specified number of papers

master
pan 4 years ago
parent d614204850
commit 5520d64b39
main.py (101 lines changed)
test.py (7 lines changed)

@@ -3,7 +3,7 @@ import re
 from concurrent.futures.thread import ThreadPoolExecutor
 from itertools import combinations
 from typing import Tuple
-from urllib.parse import unquote
+from urllib.parse import unquote, urlencode
 import jieba
 import pymysql as pymysql
@@ -12,6 +12,7 @@ from Scripts import pdf2txt
 from bs4 import BeautifulSoup
 from jieba import posseg
 from lxml import etree
+from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 from config.config import cf
@@ -134,7 +135,7 @@ def parse(content):
     return params_list
-base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
+base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421f7b9569d2936695e790c88b8991b203a18454272'
 profession = "计算机软件与理论"
 keyword = f'(专业%3A"{profession}")'
 headers = {
@@ -142,10 +143,10 @@ headers = {
 }
 db = MysqlDB()
 session = requests.Session()
+session.mount('http://', HTTPAdapter(max_retries=3))
+session.mount('https://', HTTPAdapter(max_retries=3))
 cookies = RequestsCookieJar()
-cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
-cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
-            domain='libcon.bupt.edu.cn')
+cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '7bde1fe6992c50f9', path='/', domain='.libcon.bupt.edu.cn')
 session.cookies.update(cookies)
 pdf_dir = 'pdf'
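
Note on the two new session.mount lines: requests applies a mounted HTTPAdapter to every URL whose scheme matches the prefix, so max_retries=3 gives all of the scraper's requests three connection-level retries before an exception is raised. A self-contained sketch of the same setup (example.invalid is a placeholder host, not the proxy endpoint above):

import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
# retry connection-level failures up to 3 times for both schemes
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))

try:
    res = session.get('https://example.invalid', timeout=10)  # placeholder URL
    print(res.status_code)
except requests.exceptions.ConnectionError as e:
    print('gave up after retries:', e)
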
@@ -155,6 +156,8 @@ executor = ThreadPoolExecutor(max_workers=2)
 sys_tfidf = 'sys_tfidf'
 # papers table
 sys_paper = 'sys_paper'
+# word table
+sys_word = 'sys_word'
 # similarity table
 sys_similarity = 'sys_similarity'
@@ -179,7 +182,7 @@ def split_word():
     jieba.enable_paddle()
     start = db.query(f'select min(id) from {sys_paper}')[0][0]
     end = db.query(f'select max(id) from {sys_paper}')[0][0]
-    result = db.query('select word,flag from sys_word')
+    result = db.query(f'select word,flag from {sys_word}')
     filter_word = set(Word(_[0], _[1]) for _ in result)
     new_word = set()
     count = 0
@@ -198,7 +201,7 @@ def split_word():
     if len(new_word) > 0:
         words = tuple((_.word, _.flag) for _ in new_word)
-        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
+        db.modify(f'insert into {sys_word}(word,flag) values (%s,%s)', words)
         create_doc_vector()
         create_sim()
     else:
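
For context on split_word above: it tokenizes paper text with jieba's part-of-speech tagger and filters the result against the sys_word vocabulary. A minimal sketch of the posseg API it relies on (the sample sentence is made up):

import jieba.posseg as posseg

# pair each token with its part-of-speech flag, as split_word does
for pair in posseg.cut('计算机软件与理论是一个学科'):
    print(pair.word, pair.flag)
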
@@ -210,7 +213,7 @@ def create_doc_vector():
     start = time.time()
     writeInfo('start computing document vectors')
     db.modify(f'drop table if exists {sys_tfidf}')
-    db.modify(f'''
+    create_table_sql = f'''
     create table {sys_tfidf}
     (
         id bigint NOT NULL AUTO_INCREMENT,
@@ -228,39 +231,43 @@ def create_doc_vector():
         sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
         log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
     from {sys_paper},
-         sys_word
+         {sys_word}
     group by word) as f) as f
     group by id
-    ''')
-    writeInfo(f'computing document vectors took {round(time.time() - start)}s')
+    '''
+    db.modify(create_table_sql)
+    writeInfo(f'computing document vectors with sql {create_table_sql} took {round(time.time() - start)}s')
 
 
 # compute document similarity
 def create_sim():
     ids = db.query(f'select group_concat(id) from {sys_paper}')
-    if len(ids)>0 and len(ids[0])>0:
+    if len(ids) > 0 and len(ids[0]) > 0:
         group_ids = list(combinations(ids[0][0].split(','), 2))
         for id1, id2 in group_ids:
             result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
-            tfidf1, tfidf2 = result
-            writeInfo(f'comparing similarity of document {id1} and document {id2}')
-            w1 = tfidf1[0].split(',')
-            w2 = tfidf2[0].split(',')
-            if len(w1) == len(w2):
-                a = 0
-                b = 0
-                c = 0
-                for i in range(0, len(w1)):
-                    # writeInfo(f'word {i+1}: tfidf={w1[i]} in document {id1}, tfidf={w2[i]} in document {id2}')
-                    a += float(w1[i]) * float(w2[i])
-                    b += numpy.square(float(w1[i]))
-                    c += numpy.square(float(w2[i]))
-                b = numpy.sqrt(b)
-                c = numpy.sqrt(c)
-                count = float(a / (b * c))
-                writeInfo(f'similarity of document {id1} and document {id2} = {count}')
-                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
-                          (id1, id2, count))
+            if len(result) == 2:
+                tfidf1, tfidf2 = result
+                writeInfo(f'comparing similarity of document {id1} and document {id2}')
+                w1 = tfidf1[0].split(',')
+                w2 = tfidf2[0].split(',')
+                if len(w1) == len(w2):
+                    a = 0
+                    b = 0
+                    c = 0
+                    for i in range(0, len(w1)):
+                        # writeInfo(f'word {i+1}: tfidf={w1[i]} in document {id1}, tfidf={w2[i]} in document {id2}')
+                        a += float(w1[i]) * float(w2[i])
+                        b += numpy.square(float(w1[i]))
+                        c += numpy.square(float(w2[i]))
+                    b = numpy.sqrt(b)
+                    c = numpy.sqrt(c)
+                    count = float(a / (b * c))
+                    writeInfo(f'similarity of document {id1} and document {id2} = {count}')
+                    db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+                              (id1, id2, count))
+            else:
+                writeError(f'tfidf query failed: {result}')
 
 
 # convert file format and save to the database
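
The table built by create_table_sql keeps the TF-IDF weighting (idf = log(N / df) + 1, with N the row count of sys_paper), and the guarded loop in create_sim still computes plain cosine similarity: a is the dot product of the two weight vectors, b and c their Euclidean norms. A minimal numpy sketch of the same computation, using hypothetical weight strings in the comma-separated format the sys_tfidf table stores:

import numpy

tfidf1 = '0.12,0.0,0.53,0.08'  # hypothetical tfidf row for document 1
tfidf2 = '0.10,0.31,0.47,0.0'  # hypothetical tfidf row for document 2

w1 = numpy.array([float(x) for x in tfidf1.split(',')])
w2 = numpy.array([float(x) for x in tfidf2.split(',')])

# cosine similarity: dot product over the product of Euclidean norms,
# the vectorized equivalent of the a / (b * c) accumulation above
sim = float(numpy.dot(w1, w2) / (numpy.linalg.norm(w1) * numpy.linalg.norm(w2)))
print(round(sim, 4))
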
@@ -309,17 +316,26 @@ def save(des, res, params):
 # collect papers from the Wanfang platform
-def run():
-    for page in range(1, 100):
-        res = session.get(
-            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
-            headers=headers)
+def run(max=10, last_page=100, page_size=20):
+    if max > last_page * page_size:
+        writeInfo(f'collection count cannot exceed {last_page*page_size}')
+        return
+    db.modify(f'delete from {sys_paper}')
+    db.modify(f'delete from {sys_word}')
+    db.modify(f'delete from {sys_similarity}')
+    count = 0
+    for page in range(1, last_page):
+        url = f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize={page_size}&searchWord={keyword}&isTriggerTag="
+        writeInfo(f'page url={url}')
+        res = session.get(url,
+                          headers=headers)
         if res.status_code == 200:
             params_list = parse(res.content)
             for params in params_list:
                 params["base_url"] = base_url
                 url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                     **params)
+                writeInfo(f'download endpoint={url}')
                 res = session.get(url, headers=headers)
                 if res.status_code == 200 and 'downloadliterature.do' in res.url:
                     res_html = BeautifulSoup(res.content, "html.parser")
@@ -327,12 +343,21 @@ def run():
                     if downloadIframe:
                         res = session.get(downloadIframe["src"])
                         if res.status_code == 200 and 'download.ashx' in res.url:
-                            writeInfo("got real download url{path}".format(path=res.url))
+                            writeInfo("got real download url={path}".format(path=res.url))
                             res = session.get(res.url, headers=headers, stream=True)
                             if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
                                 des = res.headers['Content-Disposition'].split(';')
                                 if len(des) == 2 and len(des[1].split('=')) == 2:
-                                    executor.submit(save, des, res, params)
+                                    count = count + 1
+                                    writeInfo(f'collection progress {count}/{max}, {round(count / max, 4) * 100}%')
+                                    if count <= max:
+                                        executor.submit(save, des, res, params)
+                                    if count == max:
+                                        writeInfo('collection finished, saving papers to the database')
+                                        executor.shutdown(wait=True)
+                                        writeInfo('papers saved to the database')
+                                        split_word()
+                                        return
                             else:
                                 writeError("invalid response type")
                         else:
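
Two small observations on the new run() signature: the parameter max shadows Python's builtin max(), and round(count / max, 4) * 100 rounds before scaling, so the logged percentage can carry float noise (e.g. round(1/7, 4) * 100 prints 14.290000000000001); rounding after scaling avoids both. A condensed, runnable sketch of the capped-collection control flow with stubs in place of the network and database calls (collect, fake_download, and max_count are hypothetical names, not the repository's API):

from concurrent.futures import ThreadPoolExecutor
import time

def fake_download(i):
    # stub standing in for save(des, res, params)
    time.sleep(0.1)
    return i

def collect(max_count=10, last_page=100, page_size=20):
    if max_count > last_page * page_size:
        print(f'collection count cannot exceed {last_page * page_size}')
        return
    executor = ThreadPoolExecutor(max_workers=2)
    count = 0
    for page in range(1, last_page):
        for _ in range(page_size):  # stands in for one page of parsed results
            count += 1
            # round after scaling so the percentage keeps two stable decimals
            print(f'progress {count}/{max_count}, {round(count / max_count * 100, 2)}%')
            executor.submit(fake_download, count)
            if count == max_count:
                # block until the queued downloads finish, then stop collecting
                executor.shutdown(wait=True)
                return

collect(max_count=5)
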

@@ -1,4 +1,7 @@
-from main import split_word
+from main import split_word,run,create_sim
 
 if __name__ == '__main__':
-    split_word()
+    # collect max papers by default
+    run()
+    # or specify max
+    # run(max=10)
