cloudnote_sim/main.py

import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import combinations
from typing import Tuple
from urllib.parse import unquote, urlencode

import jieba
import numpy
import pymysql
import requests
from Scripts import pdf2txt
from bs4 import BeautifulSoup
from jieba import posseg
from lxml import etree
from requests.adapters import HTTPAdapter
from requests.cookies import RequestsCookieJar

from config.config import cf
from config.log import writeInfo, writeError

# MySQL database helper
class MysqlDB:

    # Open a new connection using the settings from the config file
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute an insert/update statement
    def modify(self, sql, params=()):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                # Create new records: a tuple of tuples means a batch insert
                if isinstance(params, tuple) and len(params) > 0 and isinstance(params[0], tuple):
                    cursor.executemany(sql, params)
                else:
                    cursor.execute(sql, params)
                # connection is not autocommit by default, so you must commit to save
                # your changes.
                # Fetch the last insert id, then commit the transaction
                sql = ''' select LAST_INSERT_ID() '''
                num = cursor.execute(sql)
                if num > 0:
                    id = cursor.fetchall()[0]
                connection.commit()
                return id
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

    # Execute a query and return all rows
    def query(self, sql, params=()):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                return cursor.fetchall()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

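# Usage sketch (not executed here): modify() switches between execute and
# executemany based on the shape of `params`, so a tuple of tuples performs a
# batch insert while a flat tuple runs a single statement. Table and column
# names below are placeholders, not part of this project:
#
#   db = MysqlDB()
#   db.modify('insert into t(col) values (%s)', ('one value',))           # single row
#   db.modify('insert into t(col) values (%s)', (('row1',), ('row2',)))   # executemany batch
#   rows = db.query('select col from t where id=%s', (1,))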

# Parse a search-result page into download parameters plus paper metadata
def parse(content):
    res_html = BeautifulSoup(content, "html.parser")
    # Paper download entries
    ResultCont = res_html.select('div.ResultCont')
    params_list = []
    for result in ResultCont:
        # Paper title
        title = str(result.select_one('div.title>a:nth-child(3)').text).strip()
        # Degree awarded
        resultResouceType = str(result.select_one('span.resultResouceType').text).strip()
        # Author
        author = str(result.select_one('div.author>a').text).strip()
        # School
        source = etree.HTML(result.select_one('div.Source').prettify()).xpath('//comment()[2]')
        if len(source) > 0:
            school = source[0].tail.strip()
        else:
            school = ''
        # Year
        year = str(result.select_one('span.blockspan').text).strip()
        # Keywords
        tag = ''
        for a in result.select('div.Keyword>a'):
            tag += f",{a.text}"
        if len(tag) > 0:
            tag = tag[1:]
        # Abstract
        if result.select_one('div.summary'):
            summary = result.select_one('div.summary').text
        else:
            summary = ''
        info = {
            "title": title,
            "resultResouceType": resultResouceType,
            "author": author,
            "school": school,
            "year": "".join(filter(str.isdigit, year)),
            "tag": tag,
            "summary": summary
        }
        writeInfo('正在获取论文《{title}》的真实下载地址'.format(title=title))
        onClick = result.select_one('a.result_opera_down')['onclick']
        prefix = 'downLoadPermissions'
        suffix = ",'0'"
        match = re.findall('{prefix}.*{suffix}'.format(prefix=prefix, suffix=suffix), onClick)
        if len(match) > 0:
            match = match[0]
            # Download parameters
            params_str = match[len(prefix) + 1:].split(",'")
            param_keys = ["page_cnt", "language", "resourceType", "source", "resourceId", "resourceTitle", "isoa"]
            params_obj = {}
            if len(params_str) == len(param_keys):
                for index, key in enumerate(param_keys):
                    params_obj[key] = params_str[index].replace("'", "")
                params_list.append({**params_obj, **info})
            else:
                writeError('匹配下载参数失败')
        else:
            writeError('匹配下载参数失败')
    return params_list

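# Note on the return value: each element of params_list is a single dict that
# merges the download parameters scraped from the onclick handler (page_cnt,
# language, resourceType, source, resourceId, resourceTitle, isoa) with the
# paper metadata (title, resultResouceType, author, school, year, tag, summary).
# This merged shape is what run() and save() below expect.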

suffix = '77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
base_url = f'https://libcon.bupt.edu.cn/http/{suffix}'

profession = "计算机软件与理论"
keyword = f'(专业%3A"{profession}")'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
db = MysqlDB()
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))

cookies = RequestsCookieJar()
cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '7bde1fe6992c50f9', path='/', domain='.libcon.bupt.edu.cn')

session.cookies.update(cookies)
pdf_dir = 'pdf'
html_dir = 'html'

# executor = ThreadPoolExecutor(max_workers=1)
# TF-IDF vector table
sys_tfidf = 'sys_tfidf'
# Paper table
sys_paper = 'sys_paper'
# Word dictionary table
sys_word = 'sys_word'
# Similarity table
sys_similarity = 'sys_similarity'
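# Configuration note: the four names above are the MySQL tables this script
# reads and writes, and the wengine_vpn ticket cookie is a session credential
# for the library VPN proxy (libcon.bupt.edu.cn). check() below raises
# '请更新cookie信息' when the proxy redirects to its login page, which is the
# signal that this hard-coded ticket has expired and must be replaced.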


class Word:
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    # Two words are equal if their text matches, regardless of the POS flag
    def __eq__(self, other: object) -> bool:
        if isinstance(other, self.__class__):
            return self.word == other.word
        else:
            return False

    def __hash__(self) -> int:
        return hash(self.word)

# Update the word dictionary from the stored paper texts
def split_word():
    jieba.enable_paddle()
    start = db.query(f'select min(id) from {sys_paper}')[0][0]
    end = db.query(f'select max(id) from {sys_paper}')[0][0]
    result = db.query(f'select word,flag from {sys_word}')

    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word = set()

    count = 0
    for i in range(start, end + 1):
        txt_content = db.query(f'select txt_content from {sys_paper} where id=%s', (i,))[0][0]

        words = posseg.cut(txt_content, use_paddle=True)
        for word, flag in words:
            # writeInfo(f'word={word},flag={flag}')
            if flag == 'n':
                # Keep nouns only, stripping everything that is not a Chinese character or a letter
                word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
                w = Word(word, flag)

                if len(word) > 0 and w not in filter_word:
                    new_word.add(w)
            count = count + 1
    writeInfo(f'{count}个词语中过滤出{len(new_word)}个新词汇')
    if len(new_word) > 0:
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify(f'insert into {sys_word}(word,flag) values (%s,%s)', words)

        create_doc_vector()
        create_sim()
    else:
        writeInfo('没有发现新词汇,不需要更新词库')

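# Illustrative note (example values are made up): posseg.cut in paddle mode
# yields (word, flag) pairs such as ('算法', 'n') or ('提出', 'v'); split_word()
# keeps only the noun entries (flag == 'n'), normalises them with the regex
# above, and inserts the ones not already present in sys_word before rebuilding
# the vectors and similarities.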

# Compute the document vectors
def create_doc_vector():
    start = time.time()

    writeInfo('开始计算文档向量')
    db.modify(f'drop table if exists {sys_tfidf}')
    create_table_sql = f'''
    create table {sys_tfidf}
    (
        id bigint NOT NULL AUTO_INCREMENT,
        tfidf longtext not null,
        primary key (id)
    ) as
    select id, group_concat(tf * idf order by word) as tfidf
    from (select f.word,
                 df,
                 f.idf,
                 round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
                 id
          from {sys_paper},
               (select word,
                       sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
                       log((select count(*) from {sys_paper}) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
                from {sys_paper},
                     {sys_word}
                group by word) as f) as f
    group by id
    '''
    db.modify(create_table_sql)
    writeInfo(f'计算文档向量执行sql{create_table_sql}花费{round(time.time() - start)}s')

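# How the SQL above builds each vector (same definitions, spelled out):
#   tf(w, d)  = occurrences of word w in document d, computed as
#               (LENGTH(txt) - LENGTH(REPLACE(txt, w, ''))) / LENGTH(w)
#   df(w)     = number of documents whose text contains w at least once
#   idf(w)    = log(N / df(w)) + 1, with N the total row count of sys_paper
#   tfidf row = group_concat(tf * idf ORDER BY word), i.e. one comma-separated
#               component per dictionary word in a fixed word order, so every
#               document vector has the same length and can be compared later.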

# Compute pairwise document similarity
def create_sim():
    ids = db.query(f'select group_concat(id) from {sys_paper}')
    if len(ids) > 0 and len(ids[0]) > 0:
        group_ids = list(combinations(ids[0][0].split(','), 2))
        for id1, id2 in group_ids:
            result = db.query(f'select tfidf from {sys_tfidf} where id in ({id1},{id2})')
            if len(result) == 2:
                tfidf1, tfidf2 = result
                writeInfo(f'开始比较文档{id1}和文档{id2}的相似度')
                w1 = tfidf1[0].split(',')
                w2 = tfidf2[0].split(',')
                if len(w1) == len(w2):
                    a = 0
                    b = 0
                    c = 0
                    for i in range(0, len(w1)):
                        # writeInfo(f'第{i+1}个词汇在文档{id1}中tfidf={w1[i]},在文档{id2}中tfidf={w2[i]}')
                        a += float(w1[i]) * float(w2[i])
                        b += numpy.square(float(w1[i]))
                        c += numpy.square(float(w2[i]))
                    b = numpy.sqrt(b)
                    c = numpy.sqrt(c)
                    count = float(a / (b * c))
                    writeInfo(f'文档{id1}和文档{id2}的相似度={count}')
                    db.modify('replace into sys_similarity (source_paper, target_paper, similarity) VALUE (%s,%s,%s)',
                              (id1, id2, count))
            else:
                writeError(f'查询tfidf失败{result}')

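# The loop above is the cosine similarity between the two TF-IDF vectors:
#   sim(d1, d2) = sum(a_i * b_i) / (sqrt(sum(a_i^2)) * sqrt(sum(b_i^2)))
# A vectorised sketch of the same computation (assuming w1/w2 are the
# comma-split string lists read from sys_tfidf, as above):
#   v1 = numpy.array(w1, dtype=float)
#   v2 = numpy.array(w2, dtype=float)
#   sim = float(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))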


# Convert the downloaded file and save it to the database
def save(des, res, params):
    des = des[1].split('=')
    file_name = unquote(des[1], 'utf-8').replace('"', '')
    if not os.path.exists(pdf_dir):
        os.mkdir(pdf_dir)
    writeInfo(f'{params["title"]} PDF文件大小{len(res.content)}字节')
    with open(f'{pdf_dir}/{file_name}', 'wb') as file:
        file.write(res.content)
    if not os.path.exists(html_dir):
        os.mkdir(html_dir)
    html_file = f'{html_dir}/{file_name.replace("pdf", "html")}'
    writeInfo(f'{params["title"]} BEGIN PDF转HTML')
    pdf2txt.main(['-o', html_file, '-Y', 'exact', f'{pdf_dir}/{file_name}'])
    writeInfo(f'{params["title"]} END PDF转HTML')
    with open(html_file, 'rb') as file:
        html_content = file.read()
    parse_html = BeautifulSoup(html_content, "html.parser")
    txt_content = parse_html.text.replace('\n', '').replace(' ', '')
    info = {
        "title": params['title'],
        "type": params['resultResouceType'],
        "author": params['author'],
        "profession": profession,
        "school": params['school'],
        "year": params['year'],
        "summary": params['summary'],
        "tag": params['tag'],
        "pdf_content": res.content,
        "html_content": html_content,
        "txt_content": txt_content,
        "create_time": time.time()
    }
    # writeInfo('论文信息{info}'.format(info=info))
    writeInfo(f'{params["title"]} 插入数据库')
    db.modify(
        f'insert into {sys_paper} (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],
            info['profession'], info['school'], info['summary'], info['tag'],
            info['title'], info['type'], info['year']
        ))

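# Conversion note: pdf2txt here appears to be pdfminer's command-line converter
# invoked programmatically; '-o' sets the output file and '-Y exact' selects the
# exact layout mode for HTML output. The resulting HTML is then flattened to
# plain text with BeautifulSoup, so sys_paper ends up storing pdf, html, and txt
# variants of each dissertation.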


login_url = 'https://libcon.bupt.edu.cn/login'


# A 200 response that was redirected to the VPN login page means the cookie has expired
def check(res):
    if res.status_code == 200:
        if res.url == login_url:
            raise Exception('请更新cookie信息')
        else:
            return True
    else:
        return False


# Collect dissertations from the Wanfang platform
def run(max=10, last_page=100, page_size=20):
    if max > last_page * page_size:
        writeInfo(f'采集数不能超过{last_page * page_size}')
        return
    db.modify(f'delete from {sys_paper}')
    db.modify(f'delete from {sys_word}')
    db.modify(f'delete from {sys_similarity}')
    count = 0
    for page in range(1, last_page):
        url = f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize={page_size}&searchWord={keyword}&isTriggerTag="
        writeInfo(f'分页url={url}')
        res = session.get(url, headers=headers)

        if check(res):
            params_list = parse(res.content)
            for params in params_list:
                params["base_url"] = base_url
                url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                    **params)
                writeInfo(f'下载接口={url}')

                res = session.get(url, headers=headers)
                if check(res):
                    res_html = BeautifulSoup(res.content, "html.parser")
                    if 'downloadliterature.do' in res.url:
                        downloadIframe = res_html.select_one('#downloadIframe')
                        if downloadIframe:
                            res = session.get(downloadIframe["src"])
                            if check(res) and 'download.ashx' in res.url:
                                writeInfo("成功获取真实下载地址={path}".format(path=res.url))
                                res = session.get(res.url, headers=headers, stream=True)
                                if check(res) and 'pdf' in res.headers['Content-Type']:
                                    des = res.headers['Content-Disposition'].split(';')
                                    if len(des) == 2 and len(des[1].split('=')) == 2:
                                        count = count + 1
                                        writeInfo(f'当前采集进度{count}/{max},{round(count / max, 4) * 100}%')
                                        if count <= max:
                                            save(des, res, params)
                                        if count == max:
                                            break
                                    else:
                                        writeError("非法响应类型")
                                else:
                                    writeError("无法获取文档信息")
                            else:
                                writeError("无法获取文档真实下载地址")
                        else:
                            writeError("无法获取真实下载地址")
                    elif res_html.select_one('title').text == '交易':
                        raise Exception(res_html.select_one('div.NotWork>span').text)
                    else:
                        raise Exception('发生未知错误!!!')
                else:
                    writeError('error code={code}'.format(code=res.status_code))

            break

        else:
            writeError('error code={code}'.format(code=res.status_code))

    writeInfo('采集任务已完成')


split_word()
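# Usage note: as written, running this module executes split_word() only, i.e.
# it rebuilds the word dictionary, TF-IDF vectors, and similarity table from
# papers already stored in MySQL. To collect new papers from Wanfang first, one
# would call run() (with the desired max/last_page/page_size) before
# split_word(); run() is not invoked automatically here.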