From 656d83c7c1b275c0aa56ced5ef6ae5b68937b74a Mon Sep 17 00:00:00 2001 From: pan <1029559041@qq.com> Date: Mon, 10 Aug 2020 16:08:49 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E7=9B=B8=E4=BC=BC=E5=BA=A6?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E5=B9=B6=E5=85=A5=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 75 ++++++++++++++++++++++++++++++++++++++------------------- test.py | 17 ++----------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/main.py b/main.py index c7a9b67..358742a 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ import os import re from concurrent.futures.thread import ThreadPoolExecutor +from itertools import combinations from typing import Tuple from urllib.parse import unquote @@ -16,6 +17,7 @@ from requests.cookies import RequestsCookieJar from config.config import cf from config.log import writeInfo, writeError import time +import numpy # mysql数据库 @@ -149,6 +151,12 @@ session.cookies.update(cookies) pdf_dir = 'pdf' html_dir = 'html' executor = ThreadPoolExecutor(max_workers=2) +# 向量表 +sys_tfidf = 'sys_tfidf' +# 论文表 +sys_paper = 'sys_paper' +# 相似度表 +sys_similarity = 'sys_similarity' class Word: @@ -169,40 +177,41 @@ class Word: # 更新词库 def split_word(): jieba.enable_paddle() - start = db.query('select min(id) from sys_paper')[0][0] - end = db.query('select max(id) from sys_paper')[0][0] + start = db.query(f'select min(id) from {sys_paper}')[0][0] + end = db.query(f'select max(id) from {sys_paper}')[0][0] result = db.query('select word,flag from sys_word') filter_word = set(Word(_[0], _[1]) for _ in result) - new_word=set() + new_word = set() count = 0 for i in range(start, end + 1): - txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0] + txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i))[0][0] words = posseg.cut(txt_content, use_paddle=True) for word, flag in words: # writeInfo(f'word={word},flag={flag}') if flag == 'n': word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word) - w=Word(word, flag) + w = Word(word, flag) if len(word) > 0 and w not in filter_word: new_word.add(w) count = count + 1 writeInfo(f'从{count}个词语中过滤出{len(new_word)}个新词汇') - if len(new_word)>0: + if len(new_word) > 0: words = tuple((_.word, _.flag) for _ in new_word) db.modify('insert into sys_word(word,flag) values (%s,%s)', words) create_doc_vector() + create_sim() else: writeInfo('没有发现新词汇,不需要更新词库') -table_name = 'sys_tfidf' +# 计算文档向量 def create_doc_vector(): - start=time.time() + start = time.time() writeInfo('开始计算文档向量') - db.modify(f'drop table if exists {table_name}') + db.modify(f'drop table if exists {sys_tfidf}') db.modify(f''' - create table {table_name} + create table {sys_tfidf} ( id bigint NOT NULL AUTO_INCREMENT, tfidf longtext not null, @@ -214,28 +223,44 @@ def create_doc_vector(): f.idf, round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf, id - from sys_paper, + from {sys_paper}, (select word, sum(if(locate(word, txt_content) > 0, 1, 0)) as df, - log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf - from sys_paper, + log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf + from {sys_paper}, sys_word group by word) as f) as f group by id ''') - writeInfo(f'计算文档向量花费{round(time.time()-start)}s') + writeInfo(f'计算文档向量花费{round(time.time() - start)}s') -# 文档向量计算 -def compare_doc_vector(ids=None): - if ids is not None: - - result1 = db.query(f'select * from {table_name} where id in ({",".join(ids)})') - result2 = db.query(f'select * from {table_name} where id not in ({",".join(ids)})') - - for id1, tfidf1 in result1: - for id2,tfidf2 in result2: - print(f'id={id1}和id={id2}比较') +# 计算文档相似度 +def create_sim(): + ids = db.query(f'select group_concat(id) from {sys_paper}') + if len(ids)>0 and len(ids[0])>0: + group_ids = list(combinations(ids[0][0].split(','), 2)) + for id1, id2 in group_ids: + result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})') + tfidf1, tfidf2 = result + writeInfo(f'开始比较文档{id1}和文档{id2}的相似度') + w1 = tfidf1[0].split(',') + w2 = tfidf2[0].split(',') + if len(w1) == len(w2): + a = 0 + b = 0 + c = 0 + for i in range(0, len(w1)): + # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}') + a += float(w1[i]) * float(w2[i]) + b += numpy.square(float(w1[i])) + c += numpy.square(float(w2[i])) + b = numpy.sqrt(b) + c = numpy.sqrt(c) + count = float(a / (b * c)) + writeInfo(f'文档{id1}和文档{id2}的相似度={count}') + db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)', + (id1, id2, count)) # 文件格式转换保存到数据库 @@ -274,7 +299,7 @@ def save(des, res, params): # writeInfo('论文信息{info}'.format(info=info)) writeInfo(f'{params["title"]} 插入数据库') db.modify( - f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', + f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', ( info['author'], info['create_time'], info['pdf_content'], info['html_content'], info['txt_content'], diff --git a/test.py b/test.py index 4cdeaf2..8bdacd5 100644 --- a/test.py +++ b/test.py @@ -1,17 +1,4 @@ -import sys -from typing import Tuple - -import jieba -from Scripts import pdf2txt -from bs4 import BeautifulSoup -from jieba import posseg -from config.log import writeInfo - -from main import MysqlDB, run, split_word, Word, create_doc_vector - -db=MysqlDB() +from main import split_word if __name__ == '__main__': - # split_word() - create_doc_vector() - # c({'3'}) \ No newline at end of file + split_word() \ No newline at end of file