Add logic to collect a specified number of papers (增加采集指定论文数的逻辑)

master
pan 4 years ago
parent d614204850
commit 5520d64b39
main.py (101 changed lines)
test.py (7 changed lines)

main.py

@@ -3,7 +3,7 @@ import re
 from concurrent.futures.thread import ThreadPoolExecutor
 from itertools import combinations
 from typing import Tuple
-from urllib.parse import unquote
+from urllib.parse import unquote, urlencode

 import jieba
 import pymysql as pymysql
@@ -12,6 +12,7 @@ from Scripts import pdf2txt
 from bs4 import BeautifulSoup
 from jieba import posseg
 from lxml import etree
+from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar

 from config.config import cf
@@ -134,7 +135,7 @@ def parse(content):
     return params_list


-base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
+base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421f7b9569d2936695e790c88b8991b203a18454272'
 profession = "计算机软件与理论"
 keyword = f'(专业%3A"{profession}")'
 headers = {
@@ -142,10 +143,10 @@ headers = {
 }
 db = MysqlDB()
 session = requests.Session()
+session.mount('http://', HTTPAdapter(max_retries=3))
+session.mount('https://', HTTPAdapter(max_retries=3))
 cookies = RequestsCookieJar()
-cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
-cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
-            domain='libcon.bupt.edu.cn')
+cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '7bde1fe6992c50f9', path='/', domain='.libcon.bupt.edu.cn')
 session.cookies.update(cookies)

 pdf_dir = 'pdf'
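The two new `session.mount(...)` lines make the shared `requests.Session` retry failed connection attempts up to three times; per the requests documentation, an integer `max_retries` only covers connection-level failures (DNS lookups, socket connects, connection timeouts), never requests whose data already reached the server. A minimal sketch of the same idea with an explicit urllib3 `Retry` policy (the backoff factor and status list are illustrative choices, not part of this commit):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # Illustrative policy: 3 attempts with exponential backoff, retrying on
    # common transient server errors as well as connection failures.
    retry_policy = Retry(total=3, backoff_factor=0.5,
                         status_forcelist=(500, 502, 503, 504))
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry_policy))
    session.mount('https://', HTTPAdapter(max_retries=retry_policy))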
@@ -155,6 +156,8 @@ executor = ThreadPoolExecutor(max_workers=2)
 sys_tfidf = 'sys_tfidf'
 # 论文表
 sys_paper = 'sys_paper'
+# 词库表
+sys_word = 'sys_word'
 # 相似度表
 sys_similarity = 'sys_similarity'
@@ -179,7 +182,7 @@ def split_word():
     jieba.enable_paddle()
     start = db.query(f'select min(id) from {sys_paper}')[0][0]
     end = db.query(f'select max(id) from {sys_paper}')[0][0]
-    result = db.query('select word,flag from sys_word')
+    result = db.query(f'select word,flag from {sys_word}')
     filter_word = set(Word(_[0], _[1]) for _ in result)
     new_word = set()
     count = 0
@@ -198,7 +201,7 @@ def split_word():
     if len(new_word) > 0:
         words = tuple((_.word, _.flag) for _ in new_word)
-        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
+        db.modify(f'insert into {sys_word}(word,flag) values (%s,%s)', words)
         create_doc_vector()
         create_sim()
     else:
@@ -210,7 +213,7 @@ def create_doc_vector():
     start = time.time()
     writeInfo('开始计算文档向量')
     db.modify(f'drop table if exists {sys_tfidf}')
-    db.modify(f'''
+    create_table_sql = f'''
     create table {sys_tfidf}
     (
         id bigint NOT NULL AUTO_INCREMENT,
@@ -228,39 +231,43 @@ def create_doc_vector():
                    sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
                    log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
             from {sys_paper},
-                 sys_word
+                 {sys_word}
             group by word) as f) as f
     group by id
-    ''')
-    writeInfo(f'计算文档向量花费{round(time.time() - start)}s')
+    '''
+    db.modify(create_table_sql)
+    writeInfo(f'计算文档向量执行sql{create_table_sql}花费{round(time.time() - start)}s')


 # 计算文档相似度
 def create_sim():
     ids = db.query(f'select group_concat(id) from {sys_paper}')
-    if len(ids)>0 and len(ids[0])>0:
+    if len(ids) > 0 and len(ids[0]) > 0:
         group_ids = list(combinations(ids[0][0].split(','), 2))
         for id1, id2 in group_ids:
             result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
-            tfidf1, tfidf2 = result
-            writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
-            w1 = tfidf1[0].split(',')
-            w2 = tfidf2[0].split(',')
-            if len(w1) == len(w2):
-                a = 0
-                b = 0
-                c = 0
-                for i in range(0, len(w1)):
-                    # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}')
-                    a += float(w1[i]) * float(w2[i])
-                    b += numpy.square(float(w1[i]))
-                    c += numpy.square(float(w2[i]))
-                b = numpy.sqrt(b)
-                c = numpy.sqrt(c)
-                count = float(a / (b * c))
-                writeInfo(f'文档{id1}和文档{id2}的相似度={count}')
-                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
-                          (id1, id2, count))
+            if len(result) == 2:
+                tfidf1, tfidf2 = result
+                writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
+                w1 = tfidf1[0].split(',')
+                w2 = tfidf2[0].split(',')
+                if len(w1) == len(w2):
+                    a = 0
+                    b = 0
+                    c = 0
+                    for i in range(0, len(w1)):
+                        # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}')
+                        a += float(w1[i]) * float(w2[i])
+                        b += numpy.square(float(w1[i]))
+                        c += numpy.square(float(w2[i]))
+                    b = numpy.sqrt(b)
+                    c = numpy.sqrt(c)
+                    count = float(a / (b * c))
+                    writeInfo(f'文档{id1}和文档{id2}的相似度={count}')
+                    db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+                              (id1, id2, count))
+            else:
+                writeError(f'查询tfidf失败{result}')


 # 文件格式转换保存到数据库
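For reference, `create_doc_vector` builds one tf-idf weight per (document, word) pair in SQL, with idf = log(N / df) + 1, and `create_sim` then compares the comma-separated weight strings pairwise with plain cosine similarity. A minimal numpy sketch of that comparison step (function and variable names are illustrative, not from the code):

    import numpy

    def cosine_similarity(tfidf1: str, tfidf2: str) -> float:
        # Each argument mirrors one row of the sys_tfidf.tfidf column:
        # a comma-separated list of tf-idf weights over the shared word list.
        v1 = numpy.array([float(x) for x in tfidf1.split(',')])
        v2 = numpy.array([float(x) for x in tfidf2.split(',')])
        if v1.shape != v2.shape:
            raise ValueError('documents were vectorised against different word lists')
        # cos(v1, v2) = (v1 . v2) / (|v1| * |v2|), the same a / (b * c) as the loop above
        return float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))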
@@ -309,17 +316,26 @@ def save(des, res, params):


 # 万方平台论文采集
-def run():
-    for page in range(1, 100):
-        res = session.get(
-            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
-            headers=headers)
+def run(max=10, last_page=100, page_size=20):
+    if max > last_page * page_size:
+        writeInfo(f'采集数不能超过{last_page*page_size}')
+        return
+    db.modify(f'delete from {sys_paper}')
+    db.modify(f'delete from {sys_word}')
+    db.modify(f'delete from {sys_similarity}')
+    count = 0
+    for page in range(1, last_page):
+        url = f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize={page_size}&searchWord={keyword}&isTriggerTag="
+        writeInfo(f'分页url={url}')
+        res = session.get(url,
+                          headers=headers)
         if res.status_code == 200:
             params_list = parse(res.content)
             for params in params_list:
                 params["base_url"] = base_url
                 url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                     **params)
+                writeInfo(f'下载接口={url}')
                 res = session.get(url, headers=headers)
                 if res.status_code == 200 and 'downloadliterature.do' in res.url:
                     res_html = BeautifulSoup(res.content, "html.parser")
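The commit also adds `urlencode` to the imports while the paging URL above is still assembled by hand around the pre-encoded `keyword`. A hedged sketch of building the same searchList.do query with `urlencode` (the helper name is illustrative, and `keyword` would need to be passed un-encoded so it is not percent-encoded twice):

    from urllib.parse import urlencode

    def build_search_url(base_url: str, keyword: str, page: int, page_size: int = 20) -> str:
        # Same query parameters as the hand-built searchList.do URL above.
        query = urlencode({
            'searchType': 'degree',
            'showType': 'detail',
            'page': page,
            'pageSize': page_size,
            'searchWord': keyword,
            'isTriggerTag': '',
        })
        return f'{base_url}/search/searchList.do?{query}'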
@@ -327,12 +343,21 @@ def run():
                     if downloadIframe:
                         res = session.get(downloadIframe["src"])
                         if res.status_code == 200 and 'download.ashx' in res.url:
-                            writeInfo("成功获取真实下载地址{path}".format(path=res.url))
+                            writeInfo("成功获取真实下载地址={path}".format(path=res.url))
                             res = session.get(res.url, headers=headers, stream=True)
                             if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
                                 des = res.headers['Content-Disposition'].split(';')
                                 if len(des) == 2 and len(des[1].split('=')) == 2:
-                                    executor.submit(save, des, res, params)
+                                    count = count + 1
+                                    writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%')
+                                    if count <= max:
+                                        executor.submit(save, des, res, params)
+                                    if count == max:
+                                        writeInfo('采集任务已完成,论文入库中')
+                                        executor.shutdown(wait=True)
+                                        writeInfo('论文已入库')
+                                        split_word()
+                                        return
                             else:
                                 writeError("非法响应类型")
                         else:
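The effect of the new branches: every PDF hit is counted, `save` is only submitted while `count <= max`, and once the target is reached the pool is drained with `executor.shutdown(wait=True)` before `split_word()` runs, so word splitting starts only after all queued downloads are stored. A stripped-down sketch of that cap-and-shutdown pattern (the `items`/`handle` names are placeholders, not from the commit):

    from concurrent.futures import ThreadPoolExecutor

    def collect(items, handle, max_items=10):
        executor = ThreadPoolExecutor(max_workers=2)
        count = 0
        for item in items:
            count += 1
            if count <= max_items:
                executor.submit(handle, item)   # stand-in for save(des, res, params)
            if count == max_items:
                # Wait for the queued work to finish before any post-processing.
                executor.shutdown(wait=True)
                return count
        executor.shutdown(wait=True)
        return count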

test.py

@@ -1,4 +1,7 @@
-from main import split_word
+from main import split_word,run,create_sim

 if __name__ == '__main__':
-    split_word()
+    # 默认采集max篇论文
+    run()
+    # 指定max
+    # run(max=10)
