@@ -3,7 +3,7 @@ import re
 from concurrent.futures.thread import ThreadPoolExecutor
 from itertools import combinations
 from typing import Tuple
-from urllib.parse import unquote
+from urllib.parse import unquote, urlencode
 
 import jieba
 import pymysql as pymysql
@@ -12,6 +12,7 @@ from Scripts import pdf2txt
 from bs4 import BeautifulSoup
 from jieba import posseg
 from lxml import etree
+from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 
 from config.config import cf
@@ -134,7 +135,7 @@ def parse(content):
     return params_list
 
 
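+# NOTE: the opaque path segment of this proxied base URL appears to rotate together with the VPN ticket cookie below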
-base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
+base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421f7b9569d2936695e790c88b8991b203a18454272'
 profession = "计算机软件与理论"
 keyword = f'(专业%3A"{profession}")'
 headers = {
@@ -142,10 +143,10 @@ headers = {
 }
 db = MysqlDB()
 session = requests.Session()
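+# retry transient connection failures (DNS lookups, connects, connect timeouts) up to 3 times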
+session.mount('http://', HTTPAdapter(max_retries=3))
+session.mount('https://', HTTPAdapter(max_retries=3))
 cookies = RequestsCookieJar()
-cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
+cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '7bde1fe6992c50f9', path='/', domain='.libcon.bupt.edu.cn')
 cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
             domain='libcon.bupt.edu.cn')
 session.cookies.update(cookies)
 
 pdf_dir = 'pdf'
@@ -155,6 +156,8 @@ executor = ThreadPoolExecutor(max_workers=2)
 sys_tfidf = 'sys_tfidf'
 # 论文表
 sys_paper = 'sys_paper'
+# 词库表
+sys_word = 'sys_word'
 # 相似度表
 sys_similarity = 'sys_similarity'
 
@@ -179,7 +182,7 @@ def split_word():
     jieba.enable_paddle()
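+    # paddle mode gives more accurate segmentation and POS tags (requires the paddlepaddle package)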
     start = db.query(f'select min(id) from {sys_paper}')[0][0]
     end = db.query(f'select max(id) from {sys_paper}')[0][0]
-    result = db.query('select word,flag from sys_word')
+    result = db.query(f'select word,flag from {sys_word}')
     filter_word = set(Word(_[0], _[1]) for _ in result)
     new_word = set()
     count = 0
@@ -198,7 +201,7 @@ def split_word():
 
     if len(new_word) > 0:
         words = tuple((_.word, _.flag) for _ in new_word)
-        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
+        db.modify(f'insert into {sys_word}(word,flag) values (%s,%s)', words)
         create_doc_vector()
         create_sim()
     else:
@@ -210,7 +213,7 @@ def create_doc_vector():
     start = time.time()
     writeInfo('开始计算文档向量')
     db.modify(f'drop table if exists {sys_tfidf}')
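+    # keep the DDL in a variable so the executed SQL can be logged alongside the timing below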
-    db.modify(f'''
+    create_table_sql = f'''
     create table {sys_tfidf}
     (
         id bigint NOT NULL AUTO_INCREMENT,
@@ -228,39 +231,43 @@ def create_doc_vector():
                 sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
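+                -- smoothed idf: log(total papers / papers containing the word) + 1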
                 log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
          from {sys_paper},
-              sys_word
+              {sys_word}
          group by word) as f) as f
     group by id
-    ''')
-    writeInfo(f'计算文档向量花费{round(time.time() - start)}s')
+    '''
+    db.modify(create_table_sql)
+    writeInfo(f'计算文档向量执行sql{create_table_sql}花费{round(time.time() - start)}s')
 
 
 # 计算文档相似度
 def create_sim():
     ids = db.query(f'select group_concat(id) from {sys_paper}')
-    if len(ids)>0 and len(ids[0])>0:
+    if len(ids) > 0 and len(ids[0]) > 0:
         group_ids = list(combinations(ids[0][0].split(','), 2))
         for id1, id2 in group_ids:
             result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
-            tfidf1, tfidf2 = result
-            writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
-            w1 = tfidf1[0].split(',')
-            w2 = tfidf2[0].split(',')
-            if len(w1) == len(w2):
-                a = 0
-                b = 0
-                c = 0
-                for i in range(0, len(w1)):
-                    # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}')
-                    a += float(w1[i]) * float(w2[i])
-                    b += numpy.square(float(w1[i]))
-                    c += numpy.square(float(w2[i]))
-                b = numpy.sqrt(b)
-                c = numpy.sqrt(c)
-                count = float(a / (b * c))
-                writeInfo(f'文档{id1}和文档{id2}的相似度={count}')
-                db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
-                          (id1, id2, count))
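+            # expect exactly two tf-idf rows; cosine similarity = dot product a over norms sqrt(b) * sqrt(c)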
+            if len(result) == 2:
+                writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
+                tfidf1, tfidf2 = result
+                w1 = tfidf1[0].split(',')
+                w2 = tfidf2[0].split(',')
+                if len(w1) == len(w2):
+                    a = 0
+                    b = 0
+                    c = 0
+                    for i in range(0, len(w1)):
+                        # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}')
+                        a += float(w1[i]) * float(w2[i])
+                        b += numpy.square(float(w1[i]))
+                        c += numpy.square(float(w2[i]))
+                    b = numpy.sqrt(b)
+                    c = numpy.sqrt(c)
+                    count = float(a / (b * c))
+                    writeInfo(f'文档{id1}和文档{id2}的相似度={count}')
+                    db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
+                              (id1, id2, count))
+            else:
+                writeError(f'查询tfidf失败{result}')
 
 
 # 文件格式转换保存到数据库
@@ -309,17 +316,26 @@ def save(des, res, params):
 
 
 # 万方平台论文采集
-def run():
-    for page in range(1, 100):
-        res = session.get(
-            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
-            headers=headers)
+def run(max=10, last_page=100, page_size=20):
+    if max > last_page * page_size:
+        writeInfo(f'采集数不能超过{last_page*page_size}')
+        return
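+    # wipe previous papers, words and similarity rows so each crawl starts from a clean slate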
+    db.modify(f'delete from {sys_paper}')
+    db.modify(f'delete from {sys_word}')
+    db.modify(f'delete from {sys_similarity}')
+    count = 0
+    for page in range(1, last_page):
+        url = f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize={page_size}&searchWord={keyword}&isTriggerTag="
+        writeInfo(f'分页url={url}')
+        res = session.get(url,
+                          headers=headers)
         if res.status_code == 200:
             params_list = parse(res.content)
             for params in params_list:
                 params["base_url"] = base_url
                 url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                     **params)
+                writeInfo(f'下载接口={url}')
                 res = session.get(url, headers=headers)
                 if res.status_code == 200 and 'downloadliterature.do' in res.url:
                     res_html = BeautifulSoup(res.content, "html.parser")
@@ -327,12 +343,21 @@
                     if downloadIframe:
                         res = session.get(downloadIframe["src"])
                         if res.status_code == 200 and 'download.ashx' in res.url:
-                            writeInfo("成功获取真实下载地址{path}".format(path=res.url))
+                            writeInfo("成功获取真实下载地址={path}".format(path=res.url))
                             res = session.get(res.url, headers=headers, stream=True)
                             if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
                                 des = res.headers['Content-Disposition'].split(';')
                                 if len(des) == 2 and len(des[1].split('=')) == 2:
-                                    executor.submit(save, des, res, params)
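+                                    # count the hit and stop once the requested number of papers has been submitted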
+                                    count = count + 1
+                                    writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%')
+                                    if count <= max:
+                                        executor.submit(save, des, res, params)
+                                    if count == max:
+                                        writeInfo('采集任务已完成,论文入库中')
+                                        executor.shutdown(wait=True)
+                                        writeInfo('论文已入库')
+                                        split_word()
+                                        return
                                 else:
                                     writeError("非法响应类型")
                             else: