Implement similarity computation and store the results in the database

master
pan · 4 years ago
parent 7c4f7344a1
commit 656d83c7c1

Changed files:
  1. main.py (75)
  2. test.py (17)

main.py

@@ -1,6 +1,7 @@
 import os
 import re
 from concurrent.futures.thread import ThreadPoolExecutor
+from itertools import combinations
 from typing import Tuple
 from urllib.parse import unquote
@@ -16,6 +17,7 @@ from requests.cookies import RequestsCookieJar
 from config.config import cf
 from config.log import writeInfo, writeError
 import time
+import numpy
 # MySQL database
@@ -149,6 +151,12 @@ session.cookies.update(cookies)
 pdf_dir = 'pdf'
 html_dir = 'html'
 executor = ThreadPoolExecutor(max_workers=2)
+# vector table
+sys_tfidf = 'sys_tfidf'
+# paper table
+sys_paper = 'sys_paper'
+# similarity table
+sys_similarity = 'sys_similarity'
 class Word:
@@ -169,40 +177,41 @@ class Word:
 # update the word dictionary
 def split_word():
     jieba.enable_paddle()
-    start = db.query('select min(id) from sys_paper')[0][0]
-    end = db.query('select max(id) from sys_paper')[0][0]
+    start = db.query(f'select min(id) from {sys_paper}')[0][0]
+    end = db.query(f'select max(id) from {sys_paper}')[0][0]
     result = db.query('select word,flag from sys_word')
     filter_word = set(Word(_[0], _[1]) for _ in result)
-    new_word=set()
+    new_word = set()
     count = 0
     for i in range(start, end + 1):
-        txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0]
+        txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i))[0][0]
         words = posseg.cut(txt_content, use_paddle=True)
         for word, flag in words:
             # writeInfo(f'word={word},flag={flag}')
             if flag == 'n':
                 word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
-                w=Word(word, flag)
+                w = Word(word, flag)
                 if len(word) > 0 and w not in filter_word:
                     new_word.add(w)
             count = count + 1
     writeInfo(f'Filtered {len(new_word)} new words out of {count} words')
-    if len(new_word)>0:
+    if len(new_word) > 0:
         words = tuple((_.word, _.flag) for _ in new_word)
         db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
         create_doc_vector()
+        create_sim()
     else:
         writeInfo('No new words found; no need to update the word dictionary')
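Note (not part of the commit): split_word keeps only paddle-mode nouns and strips everything but CJK and ASCII letters before adding them to the dictionary. A minimal standalone sketch of that filter, assuming jieba's paddlepaddle backend is installed:

import re
import jieba
from jieba import posseg

jieba.enable_paddle()

# Keep only nouns (flag 'n'), reduced to CJK/ASCII letters,
# mirroring the filter inside split_word.
def extract_nouns(text):
    nouns = set()
    for word, flag in posseg.cut(text, use_paddle=True):
        if flag == 'n':
            word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
            if word:
                nouns.add(word)
    return nouns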
-table_name = 'sys_tfidf'
+# compute document vectors
 def create_doc_vector():
-    start=time.time()
+    start = time.time()
     writeInfo('Start computing document vectors')
-    db.modify(f'drop table if exists {table_name}')
+    db.modify(f'drop table if exists {sys_tfidf}')
     db.modify(f'''
-    create table {table_name}
+    create table {sys_tfidf}
     (
         id bigint NOT NULL AUTO_INCREMENT,
         tfidf longtext not null,
@@ -214,28 +223,44 @@ def create_doc_vector():
             f.idf,
             round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
             id
-    from sys_paper,
+    from {sys_paper},
         (select word,
                 sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
-                log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
-         from sys_paper,
+                log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
+         from {sys_paper},
              sys_word
         group by word) as f) as f
     group by id
     ''')
-    writeInfo(f'Computing document vectors took {round(time.time()-start)}s')
+    writeInfo(f'Computing document vectors took {round(time.time() - start)}s')
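Note (not part of the commit): the SQL above computes tf with the LENGTH/REPLACE substring-count trick and idf as log(N/df) + 1. A plain-Python sketch of the same computation, assuming `documents` is a list of plain strings and `words` is the sys_word list:

import math

# Sketch of the tf/idf that the SQL above computes.
def tf_idf_vectors(documents, words):
    vectors = []
    for doc in documents:
        vector = []
        for word in words:
            # tf: occurrence count, same idea as
            # (LENGTH(txt) - LENGTH(REPLACE(txt, word, ''))) / LENGTH(word)
            tf = (len(doc) - len(doc.replace(word, ''))) // len(word)
            # df: number of documents containing the word at least once
            df = sum(1 for d in documents if word in d)
            # idf with the same +1 offset as the SQL
            idf = math.log(len(documents) / df) + 1 if df else 0
            vector.append(tf * idf)
        vectors.append(vector)
    return vectors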
-# document vector comparison
-def compare_doc_vector(ids=None):
-    if ids is not None:
-        result1 = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
-        result2 = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
-        for id1, tfidf1 in result1:
-            for id2,tfidf2 in result2:
-                print(f'comparing id={id1} with id={id2}')
+# compute document similarity
+def create_sim():
+    ids = db.query(f'select group_concat(id) from {sys_paper}')
+    if len(ids)>0 and len(ids[0])>0:
+        group_ids = list(combinations(ids[0][0].split(','), 2))
+        for id1, id2 in group_ids:
+            result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
+            tfidf1, tfidf2 = result
+            writeInfo(f'Start comparing the similarity of documents {id1} and {id2}')
+            w1 = tfidf1[0].split(',')
+            w2 = tfidf2[0].split(',')
+            if len(w1) == len(w2):
+                a = 0
+                b = 0
+                c = 0
+                for i in range(0, len(w1)):
+                    # writeInfo(f'word {i + 1}: tfidf={w1[i]} in document {id1}, tfidf={w2[i]} in document {id2}')
+                    a += float(w1[i]) * float(w2[i])
+                    b += numpy.square(float(w1[i]))
+                    c += numpy.square(float(w2[i]))
+                b = numpy.sqrt(b)
+                c = numpy.sqrt(c)
+                count = float(a / (b * c))
+                writeInfo(f'Similarity of documents {id1} and {id2} = {count}')
+                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+                          (id1, id2, count))
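Note (not part of the commit): the inner loop of create_sim is a cosine similarity, dot(v1, v2) / (|v1| * |v2|). One caveat in the diff: `tfidf1, tfidf2 = result` assumes the two rows come back in (id1, id2) order, which `where id in (...)` does not guarantee without an `order by id`. A vectorized numpy sketch of the same step, where w1 and w2 are the comma-separated tfidf strings read from sys_tfidf:

import numpy

# Sketch: vectorized equivalent of create_sim's element-by-element loop.
def cosine_similarity(w1, w2):
    v1 = numpy.array(w1.split(','), dtype=float)
    v2 = numpy.array(w2.split(','), dtype=float)
    return float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))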
 # convert file formats and save to the database
@@ -274,7 +299,7 @@ def save(des, res, params):
     # writeInfo('paper info {info}'.format(info=info))
     writeInfo(f'{params["title"]} inserted into the database')
     db.modify(
-        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+        f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
         (
             info['author'], info['create_time'], info['pdf_content'],
             info['html_content'], info['txt_content'],

test.py

@@ -1,17 +1,4 @@
-import sys
-from typing import Tuple
-import jieba
-from Scripts import pdf2txt
-from bs4 import BeautifulSoup
-from jieba import posseg
-from config.log import writeInfo
-from main import MysqlDB, run, split_word, Word, create_doc_vector
-db=MysqlDB()
+from main import split_word

 if __name__ == '__main__':
-    # split_word()
-    create_doc_vector()
-    # c({'3'})
+    split_word()
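Note (not part of the commit): after this change test.py drives the whole pipeline: split_word() updates the dictionary and, when new words appear, calls create_doc_vector() and then create_sim(). A hypothetical way to inspect the stored scores afterwards, reusing main's MysqlDB helper and the column names from the replace statement above:

from main import MysqlDB

db = MysqlDB()
# Top 10 most similar pairs; column names taken from the insert in main.py.
for src, dst, sim in db.query(
        'select source_paper, target_paper, similarity '
        'from sys_similarity order by similarity desc limit 10'):
    print(f'{src} <-> {dst}: {sim}')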