Implement similarity calculation and store the results in the database

master
pan 4 years ago
parent 7c4f7344a1
commit 656d83c7c1
main.py (65 changed lines)
test.py (17 changed lines)

@@ -1,6 +1,7 @@
import os
import re
from concurrent.futures.thread import ThreadPoolExecutor
from itertools import combinations
from typing import Tuple
from urllib.parse import unquote
@@ -16,6 +17,7 @@ from requests.cookies import RequestsCookieJar
from config.config import cf
from config.log import writeInfo, writeError
import time
import numpy
# MySQL database
@@ -149,6 +151,12 @@ session.cookies.update(cookies)
pdf_dir = 'pdf'
html_dir = 'html'
executor = ThreadPoolExecutor(max_workers=2)
# TF-IDF vector table
sys_tfidf = 'sys_tfidf'
# paper table
sys_paper = 'sys_paper'
# similarity table
sys_similarity = 'sys_similarity'
class Word:
@@ -169,14 +177,14 @@ class Word:
# update the word dictionary
def split_word():
    jieba.enable_paddle()
    start = db.query('select min(id) from sys_paper')[0][0]
    end = db.query('select max(id) from sys_paper')[0][0]
    start = db.query(f'select min(id) from {sys_paper}')[0][0]
    end = db.query(f'select max(id) from {sys_paper}')[0][0]
    result = db.query('select word,flag from sys_word')
    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word = set()
    count = 0
    for i in range(start, end + 1):
        txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0]
        txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i))[0][0]
        words = posseg.cut(txt_content, use_paddle=True)
        for word, flag in words:
            # writeInfo(f'word={word},flag={flag}')
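
Aside: split_word above relies on jieba's paddle mode for part-of-speech segmentation. A minimal standalone sketch of that step (an illustration only, assuming jieba and its optional paddlepaddle dependency are installed; the sample sentence is made up, not taken from sys_paper):

import jieba
from jieba import posseg

# Paddle mode must be enabled once per process before cutting.
jieba.enable_paddle()

sample = '使用TF-IDF向量计算论文相似度'  # illustrative text only
for word, flag in posseg.cut(sample, use_paddle=True):
    # posseg.cut yields (word, part-of-speech flag) pairs, which split_word
    # filters against the existing sys_word entries
    print(word, flag)
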
@@ -192,17 +200,18 @@ def split_word():
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
        create_doc_vector()
        create_sim()
    else:
        writeInfo('no new words found, no need to update the word dictionary')
table_name = 'sys_tfidf'
# compute document vectors
def create_doc_vector():
    start = time.time()
    writeInfo('start computing document vectors')
    db.modify(f'drop table if exists {table_name}')
    db.modify(f'drop table if exists {sys_tfidf}')
    db.modify(f'''
create table {table_name}
create table {sys_tfidf}
(
id bigint NOT NULL AUTO_INCREMENT,
tfidf longtext not null,
@@ -214,28 +223,44 @@ def create_doc_vector():
f.idf,
round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
id
from sys_paper,
from {sys_paper},
(select word,
sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
from sys_paper,
log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
from {sys_paper},
sys_word
group by word) as f) as f
group by id
''')
    writeInfo(f'computing document vectors took {round(time.time() - start)}s')
# compare document vectors
def compare_doc_vector(ids=None):
    if ids is not None:
        result1 = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
        result2 = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
        for id1, tfidf1 in result1:
            for id2,tfidf2 in result2:
                print(f'comparing id={id1} and id={id2}')
# compute document similarity
def create_sim():
    ids = db.query(f'select group_concat(id) from {sys_paper}')
    if len(ids)>0 and len(ids[0])>0:
        group_ids = list(combinations(ids[0][0].split(','), 2))
        for id1, id2 in group_ids:
            result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
            tfidf1, tfidf2 = result
            writeInfo(f'start comparing the similarity of document {id1} and document {id2}')
            w1 = tfidf1[0].split(',')
            w2 = tfidf2[0].split(',')
            if len(w1) == len(w2):
                a = 0
                b = 0
                c = 0
                for i in range(0, len(w1)):
                    # writeInfo(f'word {i+1}: tfidf={w1[i]} in document {id1}, tfidf={w2[i]} in document {id2}')
                    a += float(w1[i]) * float(w2[i])
                    b += numpy.square(float(w1[i]))
                    c += numpy.square(float(w2[i]))
                b = numpy.sqrt(b)
                c = numpy.sqrt(c)
                count = float(a / (b * c))
                writeInfo(f'similarity between document {id1} and document {id2} = {count}')
                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
                          (id1, id2, count))
# convert the file format and save it to the database
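
Aside: create_sim above walks every pair of paper ids (itertools.combinations), loads their TF-IDF vectors from sys_tfidf (built by the SQL in create_doc_vector, where tf counts word occurrences via the LENGTH/REPLACE trick and idf is log(N/df) + 1 over the papers in sys_paper), and stores their cosine similarity in sys_similarity. Its element-wise loop is equivalent to the vectorized sketch below; this is only an illustration under the assumption that the tfidf column holds comma-separated floats, not code from the commit:

import numpy

# Hypothetical helper, not part of the commit: cosine similarity of two
# TF-IDF vectors stored as comma-separated strings (as in the tfidf column).
def cosine_similarity(tfidf_a, tfidf_b):
    v1 = numpy.array([float(x) for x in tfidf_a.split(',')])
    v2 = numpy.array([float(x) for x in tfidf_b.split(',')])
    if v1.shape != v2.shape:
        raise ValueError('both documents must be scored over the same vocabulary')
    # dot product of the vectors divided by the product of their norms
    return float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))

# Example with made-up vectors: cosine_similarity('0.1,0.0,0.3', '0.2,0.1,0.3') ≈ 0.93
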
@@ -274,7 +299,7 @@ def save(des, res, params):
    # writeInfo('paper info {info}'.format(info=info))
    writeInfo(f'{params["title"]} inserted into the database')
    db.modify(
        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],

@@ -1,17 +1,4 @@
import sys
from typing import Tuple
import jieba
from Scripts import pdf2txt
from bs4 import BeautifulSoup
from jieba import posseg
from config.log import writeInfo
from main import MysqlDB, run, split_word, Word, create_doc_vector
db=MysqlDB()
from main import split_word
if __name__ == '__main__':
    # split_word()
    create_doc_vector()
    # c({'3'})
    split_word()