@@ -1,6 +1,7 @@
 import os
 import re
 from concurrent.futures.thread import ThreadPoolExecutor
+from itertools import combinations
 from typing import Tuple
 from urllib.parse import unquote

@@ -16,6 +17,7 @@ from requests.cookies import RequestsCookieJar
 from config.config import cf
 from config.log import writeInfo, writeError
 import time
+import numpy


 # MySQL database
@@ -149,6 +151,12 @@ session.cookies.update(cookies)
 pdf_dir = 'pdf'
 html_dir = 'html'
 executor = ThreadPoolExecutor(max_workers=2)
+# tf-idf vector table
+sys_tfidf = 'sys_tfidf'
+# paper table
+sys_paper = 'sys_paper'
+# similarity table
+sys_similarity = 'sys_similarity'


 class Word:
@@ -169,40 +177,41 @@ class Word:
 # Update the word list
 def split_word():
     jieba.enable_paddle()
-    start = db.query('select min(id) from sys_paper')[0][0]
-    end = db.query('select max(id) from sys_paper')[0][0]
+    start = db.query(f'select min(id) from {sys_paper}')[0][0]
+    end = db.query(f'select max(id) from {sys_paper}')[0][0]
     result = db.query('select word,flag from sys_word')
     filter_word = set(Word(_[0], _[1]) for _ in result)
-    new_word=set()
+    new_word = set()
     count = 0
     for i in range(start, end + 1):
-        txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0]
+        txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i))[0][0]
         words = posseg.cut(txt_content, use_paddle=True)
         for word, flag in words:
             # writeInfo(f'word={word},flag={flag}')
             if flag == 'n':
                 word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
-                w=Word(word, flag)
+                w = Word(word, flag)
                 if len(word) > 0 and w not in filter_word:
                     new_word.add(w)
             count = count + 1
     writeInfo(f'Filtered {len(new_word)} new words out of {count} tokens')

-    if len(new_word)>0:
+    if len(new_word) > 0:
         words = tuple((_.word, _.flag) for _ in new_word)
         db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
         create_doc_vector()
+        create_sim()
     else:
         writeInfo('No new words found, no need to update the word list')
-
-table_name = 'sys_tfidf'
+
 # Compute document vectors
 def create_doc_vector():
-    start=time.time()
+    start = time.time()
     writeInfo('Start computing document vectors')
-    db.modify(f'drop table if exists {table_name}')
+    db.modify(f'drop table if exists {sys_tfidf}')
     db.modify(f'''
-    create table {table_name}
+    create table {sys_tfidf}
     (
         id bigint NOT NULL AUTO_INCREMENT,
         tfidf longtext not null,
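Note on split_word above: db.query(f'select txt_content from {sys_paper} where id=%s', (i)) passes (i), which is just i in parentheses rather than a one-element tuple; if db.query hands its second argument to a DB-API cursor, (i,) would state the single-parameter intent unambiguously. The loop also assumes ids are contiguous between min(id) and max(id); if a row has been deleted, the [0][0] on an empty result raises IndexError.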
@@ -214,28 +223,44 @@ def create_doc_vector():
         f.idf,
         round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
         id
-    from sys_paper,
+    from {sys_paper},
         (select word,
                 sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
-                log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
-        from sys_paper,
+                log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
+        from {sys_paper},
              sys_word
         group by word) as f) as f
     group by id
     ''')
-    writeInfo(f'Computing document vectors took {round(time.time()-start)}s')
+    writeInfo(f'Computing document vectors took {round(time.time() - start)}s')

-# Document vector comparison
-def compare_doc_vector(ids=None):
-
-    if ids is not None:
-
-        result1 = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
-        result2 = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
-
-        for id1, tfidf1 in result1:
-            for id2,tfidf2 in result2:
-                print(f'comparing id={id1} with id={id2}')
+
+# Compute document similarity
+def create_sim():
+    ids = db.query(f'select group_concat(id) from {sys_paper}')
+    if len(ids)>0 and len(ids[0])>0:
+        group_ids = list(combinations(ids[0][0].split(','), 2))
+        for id1, id2 in group_ids:
+            result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
+            tfidf1, tfidf2 = result
+            writeInfo(f'Start comparing similarity of document {id1} and document {id2}')
+            w1 = tfidf1[0].split(',')
+            w2 = tfidf2[0].split(',')
+            if len(w1) == len(w2):
+                a = 0
+                b = 0
+                c = 0
+                for i in range(0, len(w1)):
+                    # writeInfo(f'word {i + 1}: tfidf={w1[i]} in document {id1}, tfidf={w2[i]} in document {id2}')
+                    a += float(w1[i]) * float(w2[i])
+                    b += numpy.square(float(w1[i]))
+                    c += numpy.square(float(w2[i]))
+                b = numpy.sqrt(b)
+                c = numpy.sqrt(c)
+                count = float(a / (b * c))
+                writeInfo(f'Similarity of document {id1} and document {id2} = {count}')
+                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+                          (id1, id2, count))


 # Convert file formats and save to the database
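For reference, the SQL in create_doc_vector derives tf by counting non-overlapping occurrences of each word via string-length arithmetic, df as the number of papers containing the word, and idf as log(N / df) + 1. A minimal Python sketch of the same formulas, assuming a hypothetical docs list standing in for the txt_content column:

import math

def tfidf(word, doc, docs):
    # tf: non-overlapping occurrences of word in doc, same arithmetic as
    # (LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)
    tf = (len(doc) - len(doc.replace(word, ''))) // len(word)
    # df: number of documents containing the word at least once (assumed > 0 here)
    df = sum(1 for d in docs if word in d)
    # idf: log(N / df) + 1, as in the SQL subquery
    return tf * (math.log(len(docs) / df) + 1)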
@@ -274,7 +299,7 @@ def save(des, res, params):
     # writeInfo('paper info {info}'.format(info=info))
     writeInfo(f'{params["title"]} inserted into the database')
     db.modify(
-        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+        f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
         (
             info['author'], info['create_time'], info['pdf_content'],
             info['html_content'], info['txt_content'],
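The pairwise loop in create_sim is plain cosine similarity over the equal-length tf-idf vectors stored as comma-separated strings, with itertools.combinations visiting each unordered pair of paper ids exactly once. A vectorized numpy equivalent of the a / (b * c) computation, shown as a hypothetical helper rather than part of this change:

import numpy

def cosine(tfidf1, tfidf2):
    # parse the comma-separated tfidf strings stored in the sys_tfidf table
    v1 = numpy.array([float(x) for x in tfidf1.split(',')])
    v2 = numpy.array([float(x) for x in tfidf2.split(',')])
    # dot(v1, v2) / (|v1| * |v2|), identical to the a / (b * c) loop above
    return float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))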