From 5520d64b394304126839491edba484dcf7ebfa66 Mon Sep 17 00:00:00 2001
From: pan <1029559041@qq.com>
Date: Tue, 11 Aug 2020 22:26:17 +0800
Subject: [PATCH] Add logic for collecting a specified number of papers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 101 +++++++++++++++++++++++++++++++++++---------------------
 test.py |   7 ++--
 2 files changed, 68 insertions(+), 40 deletions(-)
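Note: the create_doc_vector() change below keeps the TF-IDF computation in SQL
(document frequency via LOCATE, idf = log(N/df) + 1). For reference, a minimal
Python sketch of that weighting scheme, using hypothetical in-memory stand-ins
for the sys_paper rows (not part of this patch):

    import math

    def tfidf(word, doc, docs):
        # term frequency: occurrences of `word` in this document's text
        tf = doc.count(word)
        # document frequency: how many documents contain the word at least once,
        # mirroring sum(if(locate(word, txt_content) > 0, 1, 0)) in the SQL
        df = sum(1 for d in docs if word in d)
        # smoothed inverse document frequency: log(N / df) + 1 (natural log,
        # matching MySQL's LOG())
        idf = math.log(len(docs) / df) + 1
        return tf * idf

    docs = ['a b a', 'a c', 'b c']
    print(tfidf('a', docs[0], docs))  # 2 * (log(3 / 2) + 1) ≈ 2.81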
diff --git a/main.py b/main.py
index 358742a..bbf8ace 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@ import re
 from concurrent.futures.thread import ThreadPoolExecutor
 from itertools import combinations
 from typing import Tuple
-from urllib.parse import unquote
+from urllib.parse import unquote, urlencode
 
 import jieba
 import pymysql as pymysql
@@ -12,6 +12,7 @@ from Scripts import pdf2txt
 from bs4 import BeautifulSoup
 from jieba import posseg
 from lxml import etree
+from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 
 from config.config import cf
@@ -134,7 +135,7 @@ def parse(content):
     return params_list
 
 
-base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
+base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421f7b9569d2936695e790c88b8991b203a18454272'
 profession = "计算机软件与理论"
 keyword = f'(专业%3A"{profession}")'
 headers = {
@@ -142,10 +143,10 @@ headers = {
 }
 db = MysqlDB()
 session = requests.Session()
+session.mount('http://', HTTPAdapter(max_retries=3))
+session.mount('https://', HTTPAdapter(max_retries=3))
 cookies = RequestsCookieJar()
-cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
-cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
-            domain='libcon.bupt.edu.cn')
+cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '7bde1fe6992c50f9', path='/', domain='.libcon.bupt.edu.cn')
 session.cookies.update(cookies)
 
 pdf_dir = 'pdf'
@@ -155,6 +156,8 @@ executor = ThreadPoolExecutor(max_workers=2)
 sys_tfidf = 'sys_tfidf'
 # papers table
 sys_paper = 'sys_paper'
+# word bank table
+sys_word = 'sys_word'
 # similarity table
 sys_similarity = 'sys_similarity'
 
@@ -179,7 +182,7 @@ def split_word():
     jieba.enable_paddle()
     start = db.query(f'select min(id) from {sys_paper}')[0][0]
     end = db.query(f'select max(id) from {sys_paper}')[0][0]
-    result = db.query('select word,flag from sys_word')
+    result = db.query(f'select word,flag from {sys_word}')
     filter_word = set(Word(_[0], _[1]) for _ in result)
     new_word = set()
     count = 0
@@ -198,7 +201,7 @@ def split_word():
 
     if len(new_word) > 0:
         words = tuple((_.word, _.flag) for _ in new_word)
-        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
+        db.modify(f'insert into {sys_word}(word,flag) values (%s,%s)', words)
         create_doc_vector()
         create_sim()
     else:
@@ -210,7 +213,7 @@ def create_doc_vector():
     start = time.time()
     writeInfo('Start computing document vectors')
     db.modify(f'drop table if exists {sys_tfidf}')
-    db.modify(f'''
+    create_table_sql = f'''
     create table {sys_tfidf}
     (
         id bigint NOT NULL AUTO_INCREMENT,
@@ -228,39 +231,43 @@
     sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
     log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
     from {sys_paper},
-    sys_word
+    {sys_word}
     group by word) as f) as f
     group by id
-    ''')
-    writeInfo(f'Computing document vectors took {round(time.time() - start)}s')
+    '''
+    db.modify(create_table_sql)
+    writeInfo(f'Computed document vectors, executed SQL {create_table_sql}, took {round(time.time() - start)}s')
 
 
 # Compute document similarity
 def create_sim():
     ids = db.query(f'select group_concat(id) from {sys_paper}')
-    if len(ids)>0 and len(ids[0])>0:
+    if len(ids) > 0 and len(ids[0]) > 0:
         group_ids = list(combinations(ids[0][0].split(','), 2))
         for id1, id2 in group_ids:
             result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
-            tfidf1, tfidf2 = result
-            writeInfo(f'Start comparing similarity of document {id1} and document {id2}')
-            w1 = tfidf1[0].split(',')
-            w2 = tfidf2[0].split(',')
-            if len(w1) == len(w2):
-                a = 0
-                b = 0
-                c = 0
-                for i in range(0, len(w1)):
-                    # writeInfo(f'Word {i + 1} has tfidf={w1[i]} in document {id1} and tfidf={w2[i]} in document {id2}')
-                    a += float(w1[i]) * float(w2[i])
-                    b += numpy.square(float(w1[i]))
-                    c += numpy.square(float(w2[i]))
-                b = numpy.sqrt(b)
-                c = numpy.sqrt(c)
-                count = float(a / (b * c))
-                writeInfo(f'Similarity of document {id1} and document {id2} = {count}')
-                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
-                          (id1, id2, count))
+            if len(result) == 2:
+                tfidf1, tfidf2 = result
+                writeInfo(f'Start comparing similarity of document {id1} and document {id2}')
+                w1 = tfidf1[0].split(',')
+                w2 = tfidf2[0].split(',')
+                if len(w1) == len(w2):
+                    a = 0
+                    b = 0
+                    c = 0
+                    for i in range(0, len(w1)):
+                        # writeInfo(f'Word {i + 1} has tfidf={w1[i]} in document {id1} and tfidf={w2[i]} in document {id2}')
+                        a += float(w1[i]) * float(w2[i])
+                        b += numpy.square(float(w1[i]))
+                        c += numpy.square(float(w2[i]))
+                    b = numpy.sqrt(b)
+                    c = numpy.sqrt(c)
+                    count = float(a / (b * c))
+                    writeInfo(f'Similarity of document {id1} and document {id2} = {count}')
+                    db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+                              (id1, id2, count))
+            else:
+                writeError(f'Failed to query tfidf: {result}')
 
 
 # Convert file formats and save to the database
@@ -309,17 +316,26 @@ def save(des, res, params):
 
 
 # Collect papers from the Wanfang platform
-def run():
-    for page in range(1, 100):
-        res = session.get(
-            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
-            headers=headers)
+def run(max=10, last_page=100, page_size=20):
+    if max > last_page * page_size:
+        writeInfo(f'The number of papers to collect cannot exceed {last_page * page_size}')
+        return
+    db.modify(f'delete from {sys_paper}')
+    db.modify(f'delete from {sys_word}')
+    db.modify(f'delete from {sys_similarity}')
+    count = 0
+    for page in range(1, last_page + 1):
+        url = f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize={page_size}&searchWord={keyword}&isTriggerTag="
+        writeInfo(f'Page url={url}')
+        res = session.get(url,
+                          headers=headers)
     if res.status_code == 200:
         params_list = parse(res.content)
         for params in params_list:
             params["base_url"] = base_url
             url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                 **params)
+            writeInfo(f'Download endpoint={url}')
             res = session.get(url, headers=headers)
             if res.status_code == 200 and 'downloadliterature.do' in res.url:
                 res_html = BeautifulSoup(res.content, "html.parser")
                 downloadIframe = res_html.select_one("#downloadIframe")
                 if downloadIframe:
                     res = session.get(downloadIframe["src"])
                     if res.status_code == 200 and 'download.ashx' in res.url:
-                        writeInfo("Got real download address {path}".format(path=res.url))
+                        writeInfo("Got real download address={path}".format(path=res.url))
                         res = session.get(res.url, headers=headers, stream=True)
                         if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
                             des = res.headers['Content-Disposition'].split(';')
                             if len(des) == 2 and len(des[1].split('=')) == 2:
-                                executor.submit(save, des, res, params)
+                                count = count + 1
+                                writeInfo(f'Collection progress {count}/{max}, {round(count / max * 100, 2)}%')
+                                if count <= max:
+                                    executor.submit(save, des, res, params)
+                                if count == max:
+                                    writeInfo('Collection task finished, saving papers to the database')
+                                    executor.shutdown(wait=True)
+                                    writeInfo('Papers saved to the database')
+                                    split_word()
+                                    return
                         else:
                             writeError("Illegal response type")
                     else:
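For reference, the explicit loop in create_sim() above computes the cosine
similarity of two comma-separated tfidf vectors read from sys_tfidf. An
equivalent numpy sketch (cosine_similarity is a hypothetical helper, assuming
equal-length vectors, not part of this patch):

    import numpy

    def cosine_similarity(tfidf1, tfidf2):
        # parse the comma-separated tfidf strings stored in sys_tfidf
        v1 = numpy.array(tfidf1.split(','), dtype=float)
        v2 = numpy.array(tfidf2.split(','), dtype=float)
        # dot(v1, v2) / (|v1| * |v2|)
        return float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))

    print(cosine_similarity('0.1,0.2,0.3', '0.1,0.2,0.25'))  # ≈ 0.996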
diff --git a/test.py b/test.py
index 8bdacd5..27636ee 100644
--- a/test.py
+++ b/test.py
@@ -1,4 +1,7 @@
-from main import split_word
+from main import split_word, run, create_sim
 
 if __name__ == '__main__':
-    split_word()
\ No newline at end of file
+    # collect the default number of papers (max=10)
+    run()
+    # or specify max explicitly:
+    # run(max=10)
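The HTTPAdapter(max_retries=3) mounted in main.py above only retries failed
connection attempts. If retrying on HTTP 5xx responses with backoff is also
wanted, urllib3's Retry policy can be passed instead of the plain integer; a
minimal sketch, assuming requests with its bundled urllib3 (not part of this
patch):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    # retry up to 3 times with exponential backoff, on connection errors
    # and on 5xx responses from the gateway
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))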