cloudnote_sim/main.py

import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import combinations
from typing import Tuple
from urllib.parse import unquote, urlencode

import jieba
import numpy
import pymysql
import requests
from Scripts import pdf2txt
from bs4 import BeautifulSoup
from jieba import posseg
from lxml import etree
from requests.adapters import HTTPAdapter
from requests.cookies import RequestsCookieJar

from config.config import cf
from config.log import writeInfo, writeError

# MySQL database helper
class MysqlDB:

    # Open a new connection using the settings from the config file
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute an insert/update statement
    def modify(self, sql, params=()):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create new records: a tuple of tuples means a batch insert
                if isinstance(params, tuple) and len(params) > 0 and isinstance(params[0], tuple):
                    cursor.executemany(sql, params)
                else:
                    cursor.execute(sql, params)
                # connection is not autocommit by default, so you must commit to save
                # your changes.
                # Fetch the last insert id, then commit the transaction
                sql = ''' select LAST_INSERT_ID() '''
                num = cursor.execute(sql)
                if num > 0:
                    id = cursor.fetchall()[0]
                connection.commit()
                return id
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

    # Execute a query and return all rows
    def query(self, sql, params=()):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                return cursor.fetchall()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

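# Usage sketch (not executed here): modify() switches between execute and
# executemany based on the shape of `params`, so a tuple of tuples performs a
# batch insert while a flat tuple runs a single statement. Table and column
# names below are placeholders, not part of this project:
#
#   db = MysqlDB()
#   db.modify('insert into t(col) values (%s)', ('one value',))           # single row
#   db.modify('insert into t(col) values (%s)', (('row1',), ('row2',)))   # executemany batch
#   rows = db.query('select col from t where id=%s', (1,))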

# Parse a search-result page into download parameters plus paper metadata
def parse(content):
    res_html = BeautifulSoup(content, "html.parser")
    # Paper download entries
    ResultCont = res_html.select('div.ResultCont')
    params_list = []
    for result in ResultCont:
        # Paper title
        title = str(result.select_one('div.title>a:nth-child(3)').text).strip()
        # Degree awarded
        resultResouceType = str(result.select_one('span.resultResouceType').text).strip()
        # Author
        author = str(result.select_one('div.author>a').text).strip()
        # School
        source = etree.HTML(result.select_one('div.Source').prettify()).xpath('//comment()[2]')
        if len(source) > 0:
            school = source[0].tail.strip()
        else:
            school = ''
        # Year
        year = str(result.select_one('span.blockspan').text).strip()
        # Keywords
        tag = ''
        for a in result.select('div.Keyword>a'):
            tag += f",{a.text}"
        if len(tag) > 0:
            tag = tag[1:]
        # Abstract
        if result.select_one('div.summary'):
            summary = result.select_one('div.summary').text
        else:
            summary = ''
        info = {
            "title": title,
            "resultResouceType": resultResouceType,
            "author": author,
            "school": school,
            "year": "".join(filter(str.isdigit, year)),
            "tag": tag,
            "summary": summary
        }
        writeInfo('正在获取论文《{title}》的真实下载地址'.format(title=title))
        onClick = result.select_one('a.result_opera_down')['onclick']
        prefix = 'downLoadPermissions'
        suffix = ",'0'"
        match = re.findall('{prefix}.*{suffix}'.format(prefix=prefix, suffix=suffix), onClick)
        if len(match) > 0:
            match = match[0]
            # Download parameters
            params_str = match[len(prefix) + 1:].split(",'")
            param_keys = ["page_cnt", "language", "resourceType", "source", "resourceId", "resourceTitle", "isoa"]
            params_obj = {}
            if len(params_str) == len(param_keys):
                for index, key in enumerate(param_keys):
                    params_obj[key] = params_str[index].replace("'", "")
                params_list.append({**params_obj, **info})
            else:
                writeError('匹配下载参数失败')
        else:
            writeError('匹配下载参数失败')
    return params_list

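# Note on the return value: each element of params_list is a single dict that
# merges the download parameters scraped from the onclick handler (page_cnt,
# language, resourceType, source, resourceId, resourceTitle, isoa) with the
# paper metadata (title, resultResouceType, author, school, year, tag, summary).
# This merged shape is what run() and save() below expect.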

suffix = '77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
base_url = f'https://libcon.bupt.edu.cn/http/{suffix}'

profession = "计算机软件与理论"
keyword = f'(专业%3A"{profession}")'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
db = MysqlDB()
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))

cookies = RequestsCookieJar()
cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '7bde1fe6992c50f9', path='/', domain='.libcon.bupt.edu.cn')

session.cookies.update(cookies)
pdf_dir = 'pdf'
html_dir = 'html'

# executor = ThreadPoolExecutor(max_workers=1)
# TF-IDF vector table
sys_tfidf = 'sys_tfidf'
# Paper table
sys_paper = 'sys_paper'
# Word dictionary table
sys_word = 'sys_word'
# Similarity table
sys_similarity = 'sys_similarity'
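# Configuration note: the four names above are the MySQL tables this script
# reads and writes, and the wengine_vpn ticket cookie is a session credential
# for the library VPN proxy (libcon.bupt.edu.cn). check() below raises
# '请更新cookie信息' when the proxy redirects to its login page, which is the
# signal that this hard-coded ticket has expired and must be replaced.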


class Word:
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    # Two words are equal if their text matches, regardless of the POS flag
    def __eq__(self, other: object) -> bool:
        if isinstance(other, self.__class__):
            return self.word == other.word
        else:
            return False

    def __hash__(self) -> int:
        return hash(self.word)

# Update the word dictionary from the stored paper texts
def split_word():
    jieba.enable_paddle()
    start = db.query(f'select min(id) from {sys_paper}')[0][0]
    end = db.query(f'select max(id) from {sys_paper}')[0][0]
    result = db.query(f'select word,flag from {sys_word}')

    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word = set()

    count = 0
    for i in range(start, end + 1):
        txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i,))[0][0]

        words = posseg.cut(txt_content, use_paddle=True)
        for word, flag in words:
            # writeInfo(f'word={word},flag={flag}')
            if flag == 'n':
                # Keep nouns only, stripping everything that is not a Chinese character or a letter
                word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
                w = Word(word, flag)

                if len(word) > 0 and w not in filter_word:
                    new_word.add(w)
            count = count + 1
    writeInfo(f'{count}个词语中过滤出{len(new_word)}个新词汇')
    if len(new_word) > 0:
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify(f'insert into {sys_word}(word,flag) values (%s,%s)', words)

        create_doc_vector()
        create_sim()
    else:
        writeInfo('没有发现新词汇,不需要更新词库')

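# Illustrative note (example values are made up): posseg.cut in paddle mode
# yields (word, flag) pairs such as ('算法', 'n') or ('提出', 'v'); split_word()
# keeps only the noun entries (flag == 'n'), normalises them with the regex
# above, and inserts the ones not already present in sys_word before rebuilding
# the vectors and similarities.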

# Compute the document vectors
def create_doc_vector():
    start = time.time()

    writeInfo('开始计算文档向量')
    db.modify(f'drop table if exists {sys_tfidf}')
    create_table_sql = f'''
    create table {sys_tfidf}
    (
        id bigint NOT NULL AUTO_INCREMENT,
        tfidf longtext not null,
        primary key (id)
    ) as
    select id, group_concat(tf * idf order by word) as tfidf
    from (select f.word,
                 df,
                 f.idf,
                 round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
                 id
          from {sys_paper},
               (select word,
                       sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
                       log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
                from {sys_paper},
                     {sys_word}
                group by word) as f) as f
    group by id
    '''
    db.modify(create_table_sql)
    writeInfo(f'计算文档向量执行sql{create_table_sql}花费{round(time.time() - start)}s')

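# How the SQL above builds each vector (same definitions, spelled out):
#   tf(w, d)  = occurrences of word w in document d, computed as
#               (LENGTH(txt) - LENGTH(REPLACE(txt, w, ''))) / LENGTH(w)
#   df(w)     = number of documents whose text contains w at least once
#   idf(w)    = log(N / df(w)) + 1, with N the total row count of sys_paper
#   tfidf row = group_concat(tf * idf ORDER BY word), i.e. one comma-separated
#               component per dictionary word in a fixed word order, so every
#               document vector has the same length and can be compared later.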

# Compute pairwise document similarity
def create_sim():
    ids = db.query(f'select group_concat(id) from {sys_paper}')
    if len(ids) > 0 and len(ids[0]) > 0:
        group_ids = list(combinations(ids[0][0].split(','), 2))
        for id1, id2 in group_ids:
            result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
            if len(result) == 2:
                tfidf1, tfidf2 = result
                writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
                w1 = tfidf1[0].split(',')
                w2 = tfidf2[0].split(',')
                if len(w1) == len(w2):
                    a = 0
                    b = 0
                    c = 0
                    for i in range(0, len(w1)):
                        # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}')
                        a += float(w1[i]) * float(w2[i])
                        b += numpy.square(float(w1[i]))
                        c += numpy.square(float(w2[i]))
                    b = numpy.sqrt(b)
                    c = numpy.sqrt(c)
                    count = float(a / (b * c))
                    writeInfo(f'文档{id1}和文档{id2}的相似度={count}')
                    db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
                              (id1, id2, count))
            else:
                writeError(f'查询tfidf失败{result}')

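# The loop above is the cosine similarity between the two TF-IDF vectors:
#   sim(d1, d2) = sum(a_i * b_i) / (sqrt(sum(a_i^2)) * sqrt(sum(b_i^2)))
# A vectorised sketch of the same computation (assuming w1/w2 are the
# comma-split string lists read from sys_tfidf, as above):
#   v1 = numpy.array(w1, dtype=float)
#   v2 = numpy.array(w2, dtype=float)
#   sim = float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))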


# Convert the downloaded file and save it to the database
def save(des, res, params):
    des = des[1].split('=')
    file_name = unquote(des[1], 'utf-8').replace('"', '')
    if not os.path.exists(pdf_dir):
        os.mkdir(pdf_dir)
    writeInfo(f'{params["title"]} PDF文件大小{len(res.content)}字节')
    with open(f'{pdf_dir}/{file_name}', 'wb') as file:
        file.write(res.content)
    if not os.path.exists(html_dir):
        os.mkdir(html_dir)
    html_file = f'{html_dir}/{file_name.replace("pdf", "html")}'
    writeInfo(f'{params["title"]} BEGIN PDF转HTML')
    pdf2txt.main(['-o', html_file, '-Y', 'exact', f'{pdf_dir}/{file_name}'])
    writeInfo(f'{params["title"]} END PDF转HTML')
    with open(html_file, 'rb') as file:
        html_content = file.read()
    parse_html = BeautifulSoup(html_content, "html.parser")
    txt_content = parse_html.text.replace('\n', '').replace(' ', '')
    info = {
        "title": params['title'],
        "type": params['resultResouceType'],
        "author": params['author'],
        "profession": profession,
        "school": params['school'],
        "year": params['year'],
        "summary": params['summary'],
        "tag": params['tag'],
        "pdf_content": res.content,
        "html_content": html_content,
        "txt_content": txt_content,
        "create_time": time.time()
    }
    # writeInfo('论文信息{info}'.format(info=info))
    writeInfo(f'{params["title"]} 插入数据库')
    db.modify(
        f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],
            info['profession'], info['school'], info['summary'], info['tag'],
            info['title'], info['type'], info['year']
        ))

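# Conversion note: pdf2txt here appears to be pdfminer's command-line converter
# invoked programmatically; '-o' sets the output file and '-Y exact' selects the
# exact layout mode for HTML output. The resulting HTML is then flattened to
# plain text with BeautifulSoup, so sys_paper ends up storing pdf, html, and txt
# variants of each dissertation.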


login_url = 'https://libcon.bupt.edu.cn/login'


# A 200 response that was redirected to the VPN login page means the cookie has expired
def check(res):
    if res.status_code == 200:
        if res.url == login_url:
            raise Exception('请更新cookie信息')
        else:
            return True
    else:
        return False


# Collect dissertations from the Wanfang platform
def run(max=10, last_page=100, page_size=20):
    if max > last_page * page_size:
        writeInfo(f'采集数不能超过{last_page * page_size}')
        return
    db.modify(f'delete from {sys_paper}')
    db.modify(f'delete from {sys_word}')
    db.modify(f'delete from {sys_similarity}')
    count = 0
    for page in range(1, last_page):
        url = f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize={page_size}&searchWord={keyword}&isTriggerTag="
        writeInfo(f'分页url={url}')
        res = session.get(url, headers=headers)

        if check(res):
            params_list = parse(res.content)
            for params in params_list:
                params["base_url"] = base_url
                url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                    **params)
                writeInfo(f'下载接口={url}')

                res = session.get(url, headers=headers)
                if check(res):
                    res_html = BeautifulSoup(res.content, "html.parser")
                    if 'downloadliterature.do' in res.url:
                        downloadIframe = res_html.select_one('#downloadIframe')
                        if downloadIframe:
                            res = session.get(downloadIframe["src"])
                            if check(res) and 'download.ashx' in res.url:
                                writeInfo("成功获取真实下载地址={path}".format(path=res.url))
                                res = session.get(res.url, headers=headers, stream=True)
                                if check(res) and 'pdf' in res.headers['Content-Type']:
                                    des = res.headers['Content-Disposition'].split(';')
                                    if len(des) == 2 and len(des[1].split('=')) == 2:
                                        count = count + 1
                                        writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%')
                                        if count <= max:
                                            save(des, res, params)
                                        if count == max:
                                            break
                                    else:
                                        writeError("非法响应类型")
                                else:
                                    writeError("无法获取文档信息")
                            else:
                                writeError("无法获取文档真实下载地址")
                        else:
                            writeError("无法获取真实下载地址")
                    elif res_html.select_one('title').text == '交易':
                        raise Exception(res_html.select_one('div.NotWork>span').text)
                    else:
                        raise Exception('发生未知错误!!!')
                else:
                    writeError('error code={code}'.format(code=res.status_code))

            break

        else:
            writeError('error code={code}'.format(code=res.status_code))

    writeInfo('采集任务已完成')


split_word()
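# Usage note: as written, running this module executes split_word() only, i.e.
# it rebuilds the word dictionary, TF-IDF vectors, and similarity table from
# papers already stored in MySQL. To collect new papers from Wanfang first, one
# would call run() (with the desired max/last_page/page_size) before
# split_word(); run() is not invoked automatically here.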