实现相似度计算并入库

4 years ago · 656d83c7c1
parent 7c4f7344a1
commit 656d83c7c1
2 changed files with 52 additions and 40 deletions
--- a/main.py
+++ b/main.py
@ -1,6 +1,7 @@
 import os
 import re
 from concurrent.futures.thread import ThreadPoolExecutor
+from itertools import combinations
 from typing import Tuple
 from urllib.parse import unquote

@ -16,6 +17,7 @@ from requests.cookies import RequestsCookieJar
 from config.config import cf
 from config.log import writeInfo, writeError
 import time
+import numpy


 # mysql数据库
@ -149,6 +151,12 @@ session.cookies.update(cookies)
 pdf_dir = 'pdf'
 html_dir = 'html'
 executor = ThreadPoolExecutor(max_workers=2)
+# Vector (tf-idf) table
+sys_tfidf = 'sys_tfidf'
+# Paper table
+sys_paper = 'sys_paper'
+# Similarity table
+sys_similarity = 'sys_similarity'


 class Word:
@ -169,14 +177,14 @@ class Word:
 # 更新词库
 def split_word():
    jieba.enable_paddle()
-    start = db.query('select min(id) from sys_paper')[0][0]
-    end = db.query('select max(id) from sys_paper')[0][0]
+    start = db.query(f'select min(id) from {sys_paper}')[0][0]
+    end = db.query(f'select max(id) from {sys_paper}')[0][0]
    result = db.query('select word,flag from sys_word')
    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word = set()
    count = 0
    for i in range(start, end + 1):
-        txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0]
+        txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i))[0][0]
        words = posseg.cut(txt_content, use_paddle=True)
        for word, flag in words:
            # writeInfo(f'word={word},flag={flag}')
@ -192,17 +200,18 @@ def split_word():
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
        create_doc_vector()
+        create_sim()
    else:
        writeInfo('没有发现新词汇，不需要更新词库')

-table_name = 'sys_tfidf'

+# 计算文档向量
 def create_doc_vector():
    start = time.time()
    writeInfo('开始计算文档向量')
-    db.modify(f'drop table if exists {table_name}')
+    db.modify(f'drop table if exists {sys_tfidf}')
    db.modify(f'''
-    create table {table_name}
+    create table {sys_tfidf}
    (
        id    bigint   NOT NULL AUTO_INCREMENT,
        tfidf longtext not null,
@ -214,28 +223,44 @@ def create_doc_vector():
                 f.idf,
                 round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
                 id
-          from sys_paper,
+          from {sys_paper},
               (select word,
                       sum(if(locate(word, txt_content) > 0, 1, 0))                                             as df,
-                       log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
-                from sys_paper,
+                       log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
+                from {sys_paper},
                     sys_word
                group by word) as f) as f
    group by id
    ''')
    writeInfo(f'计算文档向量花费{round(time.time() - start)}s')

-# 文档向量计算
-def compare_doc_vector(ids=None):

-    if ids is not None:
-
-        result1 = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
-        result2 = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
-
-        for id1, tfidf1 in result1:
-            for id2,tfidf2 in result2:
-                print(f'id={id1}和id={id2}比较')
+# Compute document similarity
+def create_sim():
+    """Compute the cosine similarity of every pair of papers and store it.
+
+    Reads the paper ids from the paper table, loads each pair's
+    comma-separated tf-idf vector from the vector table and writes one row
+    per pair into the similarity table with ``replace into``. Pairs whose
+    vectors are missing or differ in length are logged and skipped.
+    """
+    # Fetch one row per id instead of a single GROUP_CONCAT string: MySQL
+    # truncates GROUP_CONCAT output at group_concat_max_len and returns NULL
+    # for an empty table, either of which would corrupt or crash the id list.
+    paper_ids = [row[0] for row in db.query(f'select id from {sys_paper} order by id')]
+    # NOTE(review): each pair still issues its own query, as before; loading
+    # all vectors up front would be faster but changes the memory profile.
+    for id1, id2 in combinations(paper_ids, 2):
+        result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
+        if len(result) != 2:
+            # A paper without a vector row would otherwise break the unpacking.
+            writeInfo(f'文档{id1}或文档{id2}缺少tfidf向量，跳过')
+            continue
+        # Row order is unspecified, which is fine: cosine similarity is symmetric.
+        tfidf1, tfidf2 = result
+        writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
+        v1 = numpy.array([float(_) for _ in tfidf1[0].split(',')])
+        v2 = numpy.array([float(_) for _ in tfidf2[0].split(',')])
+        if len(v1) != len(v2):
+            writeInfo(f'文档{id1}和文档{id2}的向量长度不一致，跳过')
+            continue
+        norm = numpy.linalg.norm(v1) * numpy.linalg.norm(v2)
+        # An all-zero vector has no direction; store 0 instead of nan.
+        similarity = float(numpy.dot(v1, v2) / norm) if norm > 0 else 0.0
+        writeInfo(f'文档{id1}和文档{id2}的相似度={similarity}')
+        db.modify(
+            f'replace into {sys_similarity} (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+            (id1, id2, similarity))


 # 文件格式转换保存到数据库
@ -274,7 +299,7 @@ def save(des, res, params):
    # writeInfo('论文信息{info}'.format(info=info))
    writeInfo(f'{params["title"]} 插入数据库')
    db.modify(
-        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+        f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],
--- a/test.py
+++ b/test.py
@ -1,17 +1,4 @@
-import sys
-from typing import Tuple
-
-import jieba
-from Scripts import pdf2txt
-from bs4 import BeautifulSoup
-from jieba import posseg
-from config.log import writeInfo
-
-from main import MysqlDB, run, split_word, Word, create_doc_vector
-
-db=MysqlDB()
+from main import split_word

 if __name__ == '__main__':
-    # split_word()
-    create_doc_vector()
-    # c({'3'})
+    split_word()