commit
7c4f7344a1
@ -0,0 +1,219 @@ |
|||||||
|
# Created by .ignore support plugin (hsz.mobi) |
||||||
|
### Python template |
||||||
|
# Byte-compiled / optimized / DLL files |
||||||
|
__pycache__/ |
||||||
|
*.py[cod] |
||||||
|
*$py.class |
||||||
|
|
||||||
|
# C extensions |
||||||
|
*.so |
||||||
|
|
||||||
|
# Distribution / packaging |
||||||
|
.Python |
||||||
|
build/ |
||||||
|
develop-eggs/ |
||||||
|
dist/ |
||||||
|
downloads/ |
||||||
|
eggs/ |
||||||
|
.eggs/ |
||||||
|
lib/ |
||||||
|
lib64/ |
||||||
|
parts/ |
||||||
|
sdist/ |
||||||
|
var/ |
||||||
|
wheels/ |
||||||
|
share/python-wheels/ |
||||||
|
*.egg-info/ |
||||||
|
.installed.cfg |
||||||
|
*.egg |
||||||
|
MANIFEST |
||||||
|
|
||||||
|
# PyInstaller |
||||||
|
# Usually these files are written by a python script from a template |
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||||
|
*.manifest |
||||||
|
*.spec |
||||||
|
|
||||||
|
# Installer logs |
||||||
|
pip-log.txt |
||||||
|
pip-delete-this-directory.txt |
||||||
|
|
||||||
|
# Unit test / coverage reports |
||||||
|
htmlcov/ |
||||||
|
.tox/ |
||||||
|
.nox/ |
||||||
|
.coverage |
||||||
|
.coverage.* |
||||||
|
.cache |
||||||
|
nosetests.xml |
||||||
|
coverage.xml |
||||||
|
*.cover |
||||||
|
*.py,cover |
||||||
|
.hypothesis/ |
||||||
|
.pytest_cache/ |
||||||
|
cover/ |
||||||
|
|
||||||
|
# Translations |
||||||
|
*.mo |
||||||
|
*.pot |
||||||
|
|
||||||
|
# Django stuff: |
||||||
|
*.log |
||||||
|
local_settings.py |
||||||
|
db.sqlite3 |
||||||
|
db.sqlite3-journal |
||||||
|
|
||||||
|
# Flask stuff: |
||||||
|
instance/ |
||||||
|
.webassets-cache |
||||||
|
|
||||||
|
# Scrapy stuff: |
||||||
|
.scrapy |
||||||
|
|
||||||
|
# Sphinx documentation |
||||||
|
docs/_build/ |
||||||
|
|
||||||
|
# PyBuilder |
||||||
|
.pybuilder/ |
||||||
|
target/ |
||||||
|
|
||||||
|
# Jupyter Notebook |
||||||
|
.ipynb_checkpoints |
||||||
|
|
||||||
|
# IPython |
||||||
|
profile_default/ |
||||||
|
ipython_config.py |
||||||
|
|
||||||
|
# pyenv |
||||||
|
# For a library or package, you might want to ignore these files since the code is |
||||||
|
# intended to run in multiple environments; otherwise, check them in: |
||||||
|
# .python-version |
||||||
|
|
||||||
|
# pipenv |
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||||
|
# install all needed dependencies. |
||||||
|
#Pipfile.lock |
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
||||||
|
__pypackages__/ |
||||||
|
|
||||||
|
# Celery stuff |
||||||
|
celerybeat-schedule |
||||||
|
celerybeat.pid |
||||||
|
|
||||||
|
# SageMath parsed files |
||||||
|
*.sage.py |
||||||
|
|
||||||
|
# Environments |
||||||
|
.env |
||||||
|
.venv |
||||||
|
env/ |
||||||
|
venv/ |
||||||
|
ENV/ |
||||||
|
env.bak/ |
||||||
|
venv.bak/ |
||||||
|
|
||||||
|
# Spyder project settings |
||||||
|
.spyderproject |
||||||
|
.spyproject |
||||||
|
|
||||||
|
# Rope project settings |
||||||
|
.ropeproject |
||||||
|
|
||||||
|
# mkdocs documentation |
||||||
|
/site |
||||||
|
|
||||||
|
# mypy |
||||||
|
.mypy_cache/ |
||||||
|
.dmypy.json |
||||||
|
dmypy.json |
||||||
|
|
||||||
|
# Pyre type checker |
||||||
|
.pyre/ |
||||||
|
|
||||||
|
# pytype static type analyzer |
||||||
|
.pytype/ |
||||||
|
|
||||||
|
# Cython debug symbols |
||||||
|
cython_debug/ |
||||||
|
|
||||||
|
### JetBrains template |
||||||
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider |
||||||
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 |
||||||
|
|
||||||
|
# User-specific stuff |
||||||
|
.idea/**/workspace.xml |
||||||
|
.idea/**/tasks.xml |
||||||
|
.idea/**/usage.statistics.xml |
||||||
|
.idea/**/dictionaries |
||||||
|
.idea/**/shelf |
||||||
|
|
||||||
|
# Generated files |
||||||
|
.idea/**/contentModel.xml |
||||||
|
|
||||||
|
# Sensitive or high-churn files |
||||||
|
.idea/**/dataSources/ |
||||||
|
.idea/**/dataSources.ids |
||||||
|
.idea/**/dataSources.local.xml |
||||||
|
.idea/**/sqlDataSources.xml |
||||||
|
.idea/**/dynamic.xml |
||||||
|
.idea/**/uiDesigner.xml |
||||||
|
.idea/**/dbnavigator.xml |
||||||
|
|
||||||
|
# Gradle |
||||||
|
.idea/**/gradle.xml |
||||||
|
.idea/**/libraries |
||||||
|
|
||||||
|
# Gradle and Maven with auto-import |
||||||
|
# When using Gradle or Maven with auto-import, you should exclude module files, |
||||||
|
# since they will be recreated, and may cause churn. Uncomment if using |
||||||
|
# auto-import. |
||||||
|
# .idea/artifacts |
||||||
|
# .idea/compiler.xml |
||||||
|
# .idea/jarRepositories.xml |
||||||
|
# .idea/modules.xml |
||||||
|
# .idea/*.iml |
||||||
|
# .idea/modules |
||||||
|
# *.iml |
||||||
|
# *.ipr |
||||||
|
|
||||||
|
# CMake |
||||||
|
cmake-build-*/ |
||||||
|
|
||||||
|
# Mongo Explorer plugin |
||||||
|
.idea/**/mongoSettings.xml |
||||||
|
|
||||||
|
# File-based project format |
||||||
|
*.iws |
||||||
|
|
||||||
|
# IntelliJ |
||||||
|
out/ |
||||||
|
|
||||||
|
# mpeltonen/sbt-idea plugin |
||||||
|
.idea_modules/ |
||||||
|
|
||||||
|
# JIRA plugin |
||||||
|
atlassian-ide-plugin.xml |
||||||
|
|
||||||
|
# Cursive Clojure plugin |
||||||
|
.idea/replstate.xml |
||||||
|
|
||||||
|
# Crashlytics plugin (for Android Studio and IntelliJ) |
||||||
|
com_crashlytics_export_strings.xml |
||||||
|
crashlytics.properties |
||||||
|
crashlytics-build.properties |
||||||
|
fabric.properties |
||||||
|
|
||||||
|
# Editor-based Rest Client |
||||||
|
.idea/httpRequests |
||||||
|
|
||||||
|
# Android studio 3.1+ serialized cache file |
||||||
|
.idea/caches/build_file_checksums.ser |
||||||
|
|
||||||
|
.idea |
||||||
|
venv |
||||||
|
html |
||||||
|
pdf |
||||||
|
log |
@ -0,0 +1,17 @@ |
|||||||
|
[file] |
||||||
|
;日志文件名 |
||||||
|
logFile = log.txt |
||||||
|
;mysql数据库配置 |
||||||
|
[mysql] |
||||||
|
#数据库服务端地址 |
||||||
|
host=localhost |
||||||
|
#用户 |
||||||
|
user=sukura |
||||||
|
#密码 |
||||||
|
passwd=123456 |
||||||
|
#数据库名 |
||||||
|
db=note |
||||||
|
#端口 |
||||||
|
port=3306 |
||||||
|
#连接编码 |
||||||
|
charset=utf8 |
@ -0,0 +1,44 @@ |
|||||||
|
import configparser |
||||||
|
import logging |
||||||
|
from logging.handlers import TimedRotatingFileHandler |
||||||
|
import os |
||||||
|
# Project root: the parent of the directory containing this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(BASE_DIR)
# Shared ConfigParser instance; other modules import `cf` to read settings.
cf = configparser.ConfigParser()
# config.ini is expected at the project root.
config_path=BASE_DIR+"/config.ini"
if not os.path.exists(config_path):
    # Fail fast at import time when the config file is missing
    # (message: "config file <path> does not exist").
    raise Exception("配置文件:%s不存在" % config_path)
cf.read(config_path,encoding='utf-8')
# Log file name from the [file] section.
# NOTE(review): not referenced by init(), which hard-codes "all.log" — confirm intent.
logFile = cf.get('file', 'logFile')
# Root logger shared by the whole project; handlers are attached by init().
logger = logging.getLogger()
||||||
|
|
||||||
|
class NoParsingFilter(logging.Filter):
    """Logging filter that drops records emitted by pdfminer loggers."""

    def filter(self, record):
        # Keep the record unless its logger name mentions pdfminer.
        comes_from_pdfminer = 'pdfminer' in record.name
        return not comes_from_pdfminer
||||||
|
|
||||||
|
def getHandle():
    """Return the logger's existing StreamHandler, or a fresh one if none is attached."""
    existing = next(
        (h for h in logger.handlers if isinstance(h, logging.StreamHandler)),
        None,
    )
    return existing if existing is not None else logging.StreamHandler()
||||||
|
|
||||||
|
def init():
    """Configure the shared root logger.

    Attaches a console StreamHandler plus an hourly TimedRotatingFileHandler
    writing to BASE_DIR/log/all.log, both using the same timestamped format
    and both filtered to drop pdfminer log records.
    """
    logger.setLevel(logging.INFO)
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')

    # Console handler: getHandle() reuses an already-attached StreamHandler
    # so repeated init() calls do not stack duplicate console handlers.
    streamHandler = getHandle()
    streamHandler.setFormatter(log_format)
    streamHandler.addFilter(NoParsingFilter())
    logger.addHandler(streamHandler)

    # Log directory.  makedirs(exist_ok=True) replaces the original
    # exists()-then-mkdir() pair: it is race-free and avoids repeating the
    # path expression (the original also tested BASE_DIR+"/log/" directly).
    logpath = BASE_DIR + "/log/"
    os.makedirs(logpath, exist_ok=True)

    # File handler: rotate to a new UTF-8 log file every hour.
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1, encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)
    timedRotatingFileHandler.addFilter(NoParsingFilter())
    logger.addHandler(timedRotatingFileHandler)
||||||
|
|
||||||
|
|
@ -0,0 +1,21 @@ |
|||||||
|
import time |
||||||
|
|
||||||
|
from config.config import init |
||||||
|
from config.config import logger |
||||||
|
|
||||||
|
# Timestamp (whole seconds) captured at import; used to report elapsed runtime.
start = int(time.time())
# Configure the shared root logger (console + rotating file handlers).
init()
||||||
|
|
||||||
|
def getRunTimeInt():
    """Return the number of whole seconds elapsed since this module was imported."""
    now = int(time.time())
    return now - start
||||||
|
|
||||||
|
def getRunTime():
    """Return a human-readable (Chinese) note of how long the program has run."""
    elapsed = int(time.time()) - start
    return '程序已经执行%d秒' % elapsed
||||||
|
|
||||||
|
|
||||||
|
def writeInfo(msg):
    """Log *msg* at INFO level, suffixed with the elapsed-runtime note."""
    line = '%s\t(%s)' % (msg, getRunTime())
    logger.info(line)
||||||
|
|
||||||
|
|
||||||
|
def writeError(msg):
    """Log *msg* at ERROR level, suffixed with the elapsed-runtime note."""
    line = '%s\t(%s)' % (msg, getRunTime())
    logger.error(line)
@ -0,0 +1,322 @@ |
|||||||
|
import os |
||||||
|
import re |
||||||
|
from concurrent.futures.thread import ThreadPoolExecutor |
||||||
|
from typing import Tuple |
||||||
|
from urllib.parse import unquote |
||||||
|
|
||||||
|
import jieba |
||||||
|
import pymysql as pymysql |
||||||
|
import requests |
||||||
|
from Scripts import pdf2txt |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from jieba import posseg |
||||||
|
from lxml import etree |
||||||
|
from requests.cookies import RequestsCookieJar |
||||||
|
|
||||||
|
from config.config import cf |
||||||
|
from config.log import writeInfo, writeError |
||||||
|
import time |
||||||
|
|
||||||
|
|
||||||
|
# MySQL database access helper.  Connection parameters come from the
# [mysql] section of config.ini (read into `cf` at import time).
# Each operation opens and closes its own connection.
class MysqlDB:
    # Open a new connection using the configured host/user/passwd/db/port/charset.
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute a write statement (INSERT/UPDATE/DDL).  Returns the row produced
    # by SELECT LAST_INSERT_ID() on success, or None when the statement raised
    # (errors are logged and swallowed — callers rely on that behaviour).
    def modify(self, sql, params=()):
        connection = self.connect()
        try:

            with connection.cursor() as cursor:
                # Create a new record.  A tuple of tuples means a batch
                # insert via executemany; anything else is a single execute.
                # NOTE(review): isinstance against typing.Tuple works only for
                # the bare alias — confirm on the target Python version.
                if isinstance(params, Tuple) and len(params) > 0 and isinstance(params[0], Tuple):
                    cursor.executemany(sql, params)
                else:
                    cursor.execute(sql, params)
                # connection is not autocommit by default. So you must commit to save
                # your changes.
                # Fetch the auto-generated id of the last inserted row.
                sql = ''' select LAST_INSERT_ID() '''
                num = cursor.execute(sql)
                if num > 0:
                    id = cursor.fetchall()[0]
            # Commit the transaction.
            connection.commit()
            return id
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

    # Run a SELECT and return all rows as a tuple of tuples, or None when the
    # query raised (errors are logged and swallowed).
    def query(self, sql, params=()):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                return cursor.fetchall()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()
||||||
|
|
||||||
|
|
||||||
|
# Parse one search-result page and collect, for every paper listed, its
# metadata plus the parameters needed to request the real download URL.
# Returns a list of dicts (download params merged with metadata).
def parse(content):
    res_html = BeautifulSoup(content, "html.parser")
    # One div.ResultCont per paper in the result list.
    ResultCont = res_html.select('div.ResultCont')
    params_list = []
    for result in ResultCont:
        # Paper title.
        title = str(result.select_one('div.title>a:nth-child(3)').text).strip()
        # Degree / resource type awarded.
        resultResouceType = str(result.select_one('span.resultResouceType').text).strip()
        # Author.
        author = str(result.select_one('div.author>a').text).strip()
        # School: taken from the text following the second HTML comment
        # inside div.Source (lxml exposes it as the comment's .tail).
        source = etree.HTML(result.select_one('div.Source').prettify()).xpath('//comment()[2]')
        if len(source) > 0:
            school = source[0].tail.strip()
        else:
            school = ''
        # Year text (digits filtered out below).
        year = str(result.select_one('span.blockspan').text).strip()
        # Keywords, accumulated comma-separated (leading comma trimmed after).
        tag = ''
        for a in result.select('div.Keyword>a'):
            tag += f",{a.text}"
        if len(tag) > 0:
            tag = tag[1:]
        # Abstract (may be absent on some entries).
        if result.select_one('div.summary'):
            summary = result.select_one('div.summary').text
        else:
            summary = ''
        info = {
            "title": title,
            "resultResouceType": resultResouceType,
            "author": author,
            "school": school,
            # Keep only the digits of the year text.
            "year": "".join(filter(str.isdigit, year)),
            "tag": tag,
            "summary": summary
        }

        writeInfo('正在获取论文《{title}》的真实下载地址'.format(title=title))
        # The download parameters live in the onclick handler, shaped like
        # downLoadPermissions('page_cnt','language',...,'0').
        onClick = result.select_one('a.result_opera_down')['onclick']
        prefix = 'downLoadPermissions'
        suffix = ",'0'"
        match = re.findall('{prefix}.*{suffix}'.format(prefix=prefix, suffix=suffix), onClick)
        if len(match) > 0:
            match = match[0]
            # Split the quoted argument list into individual values.
            params_str = match[len(prefix) + 1:].split(",'")
            param_keys = ["page_cnt", "language", "resourceType", "source", "resourceId", "resourceTitle", "isoa"]
            params_obj = {}
            if len(params_str) == len(param_keys):
                for index, key in enumerate(param_keys):
                    params_obj[key] = params_str[index].replace("'", "")
                # Merge download params with the paper metadata.
                params_list.append({**params_obj, **info})
            else:
                writeError('匹配下载参数失败')
        else:
            writeError('匹配下载参数失败')
    return params_list
||||||
|
|
||||||
|
|
||||||
|
# Campus-VPN proxied base URL of the thesis search platform.
base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
# Major searched for ("computer software and theory"); interpolated into the query.
profession = "计算机软件与理论"
keyword = f'(专业%3A"{profession}")'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
db = MysqlDB()
session = requests.Session()
# NOTE(review): hard-coded VPN ticket and remember_token are credentials and
# will expire — consider moving them to config.ini instead of source control.
cookies = RequestsCookieJar()
cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
            domain='libcon.bupt.edu.cn')
session.cookies.update(cookies)

# Local output directories and the worker pool used for PDF conversion in save().
pdf_dir = 'pdf'
html_dir = 'html'
executor = ThreadPoolExecutor(max_workers=2)
||||||
|
|
||||||
|
|
||||||
|
class Word:
    """A segmented word together with its part-of-speech flag.

    Identity (equality and hashing) is based on the word text only, so a
    set of Word objects deduplicates by text regardless of flag.
    """

    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __eq__(self, other: object) -> bool:
        # Equal when the other object is a Word with the same text;
        # anything else compares unequal.
        return isinstance(other, self.__class__) and self.word == other.word

    def __hash__(self) -> int:
        # Must agree with __eq__, so the hash is derived from the text alone.
        return hash(self.word)
||||||
|
|
||||||
|
|
||||||
|
# Refresh the word table: segment every paper's text with jieba (paddle
# mode), keep previously-unseen nouns, and rebuild the document vectors
# whenever new words were added.
def split_word():
    jieba.enable_paddle()
    start = db.query('select min(id) from sys_paper')[0][0]
    end = db.query('select max(id) from sys_paper')[0][0]
    result = db.query('select word,flag from sys_word')
    # Words already stored; Word compares/hashes on the text only.
    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word=set()
    count = 0
    # NOTE(review): iterating min(id)..max(id) assumes ids are contiguous;
    # gaps would make the inner query return an empty result — confirm.
    for i in range(start, end + 1):
        txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0]
        words = posseg.cut(txt_content, use_paddle=True)
        for word, flag in words:
            # writeInfo(f'word={word},flag={flag}')
            # Keep nouns only.
            if flag == 'n':
                # Strip every character that is not a CJK ideograph or an ASCII letter.
                word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
                w=Word(word, flag)
                if len(word) > 0 and w not in filter_word:
                    new_word.add(w)
            count = count + 1
    writeInfo(f'从{count}个词语中过滤出{len(new_word)}个新词汇')

    if len(new_word)>0:
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
        # Word list changed, so document vectors must be recomputed.
        create_doc_vector()
    else:
        writeInfo('没有发现新词汇,不需要更新词库')
||||||
|
|
||||||
|
# Name of the table holding per-document TF-IDF vectors (rebuilt from scratch
# by create_doc_vector()).
table_name = 'sys_tfidf'
||||||
|
|
||||||
|
# Rebuild the TF-IDF document-vector table from sys_paper and sys_word,
# entirely inside MySQL.  Each output row pairs a paper id with a
# comma-separated list of tf*idf values ordered by word, so vectors are
# positionally comparable across papers.
def create_doc_vector():
    start=time.time()
    writeInfo('开始计算文档向量')
    # Drop and recreate the vector table from scratch each run.
    db.modify(f'drop table if exists {table_name}')
    # NOTE(review): tf is approximated via LENGTH/REPLACE substring counting,
    # and the inner query cross-joins sys_paper with sys_word — this is
    # O(papers x words) and can be very slow on large corpora.
    db.modify(f'''
    create table {table_name}
    (
        id bigint NOT NULL AUTO_INCREMENT,
        tfidf longtext not null,
        primary key (id)
    ) as
    select id, group_concat(tf * idf order by word) as tfidf
    from (select f.word,
                 df,
                 f.idf,
                 round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
                 id
          from sys_paper,
               (select word,
                       sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
                       log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
                from sys_paper,
                     sys_word
                group by word) as f) as f
    group by id
    ''')
    writeInfo(f'计算文档向量花费{round(time.time()-start)}s')
||||||
|
|
||||||
|
# Document-vector comparison (pairing only; the similarity metric itself is
# not implemented yet).
def compare_doc_vector(ids=None):
    """Pair each selected document with every non-selected one and report the pair.

    :param ids: iterable of document-id strings; None means do nothing.
    """
    # NOTE(review): ids are interpolated into SQL directly — only safe for
    # trusted numeric id strings.
    if ids is None:
        return
    selected = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
    others = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
    for id1, tfidf1 in selected:
        for id2, tfidf2 in others:
            print(f'id={id1}和id={id2}比较')
||||||
|
|
||||||
|
|
||||||
|
# Convert a downloaded PDF to HTML and plain text, then insert the paper
# (metadata plus all three content forms) into sys_paper.  Runs on the
# module-level thread pool, invoked from run().
def save(des, res, params):
    # des is the Content-Disposition header split on ';';
    # des[1] is expected to look like 'filename="..."'.
    des = des[1].split('=')
    file_name = unquote(des[1], 'utf-8').replace('"', '')
    if not os.path.exists(pdf_dir):
        os.mkdir(pdf_dir)
    writeInfo(f'{params["title"]} PDF文件大小{len(res.content)}字节')
    with open(f'{pdf_dir}/{file_name}', 'wb') as file:
        file.write(res.content)
    if not os.path.exists(html_dir):
        os.mkdir(html_dir)
    # NOTE(review): replaces every occurrence of "pdf" in the name, not just
    # the extension — a title containing "pdf" would be mangled.
    html_file = f'{html_dir}/{file_name.replace("pdf", "html")}'
    writeInfo(f'{params["title"]} BEGIN PDF转HTML')
    # pdfminer's pdf2txt CLI entry point, driven programmatically.
    pdf2txt.main(['-o', html_file, '-Y', 'exact', f'{pdf_dir}/{file_name}'])
    writeInfo(f'{params["title"]} END PDF转HTML')
    with open(html_file, 'rb') as file:
        html_content = file.read()
    parse_html = BeautifulSoup(html_content, "html.parser")
    # Plain text: HTML text content with newlines and spaces stripped.
    txt_content = parse_html.text.replace('\n', '').replace(' ', '')
    info = {
        "title": params['title'],
        "type": params['resultResouceType'],
        "author": params['author'],
        "profession": profession,
        "school": params['school'],
        "year": params['year'],
        "summary": params['summary'],
        "tag": params['tag'],
        "pdf_content": res.content,
        "html_content": html_content,
        "txt_content": txt_content,
        "create_time": time.time()
    }
    # writeInfo('论文信息{info}'.format(info=info))
    writeInfo(f'{params["title"]} 插入数据库')
    db.modify(
        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],
            info['profession'], info['school'], info['summary'], info['tag']
            , info['title'], info['type'], info['year']
        ))
||||||
|
|
||||||
|
|
||||||
|
# Crawl the thesis platform: page through search results, resolve each
# paper's real download URL through two redirect hops, and hand successful
# PDF responses to save() on the worker pool.
def run():
    for page in range(1, 100):
        # One page of search results (20 entries per page).
        res = session.get(
            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
            headers=headers)
        if res.status_code == 200:
            params_list = parse(res.content)
            for params in params_list:
                params["base_url"] = base_url
                # First hop: the downLoad.do endpoint built from the params
                # scraped out of the result page's onclick handler.
                url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                    **params)
                res = session.get(url, headers=headers)
                if res.status_code == 200 and 'downloadliterature.do' in res.url:
                    res_html = BeautifulSoup(res.content, "html.parser")
                    downloadIframe = res_html.select_one('#downloadIframe')
                    if downloadIframe:
                        # Second hop: the iframe source leads to download.ashx,
                        # which is the real file URL.
                        res = session.get(downloadIframe["src"])
                        if res.status_code == 200 and 'download.ashx' in res.url:
                            writeInfo("成功获取真实下载地址{path}".format(path=res.url))
                            res = session.get(res.url, headers=headers, stream=True)
                            if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
                                # The file name arrives via Content-Disposition.
                                des = res.headers['Content-Disposition'].split(';')
                                if len(des) == 2 and len(des[1].split('=')) == 2:
                                    # Convert + persist asynchronously.
                                    executor.submit(save, des, res, params)
                                else:
                                    writeError("非法响应类型")
                            else:
                                writeError("无法获取文档信息")
                        else:
                            writeError("无法获取文档真实下载地址")
                    else:
                        writeError("无法获取真实下载地址")
                else:
                    writeError('error code={code}'.format(code=res.status_code))
        else:
            writeError('error code={code}'.format(code=res.status_code))
@ -0,0 +1,17 @@ |
|||||||
|
beautifulsoup4==4.9.1 |
||||||
|
certifi==2020.6.20 |
||||||
|
cffi==1.14.1 |
||||||
|
chardet==3.0.4 |
||||||
|
cryptography==3.0 |
||||||
|
idna==2.10 |
||||||
|
jieba==0.42.1 |
||||||
|
lxml==4.5.2 |
||||||
|
pdfminer.six==20200726 |
||||||
|
pycparser==2.20 |
||||||
|
pycryptodome==3.9.8 |
||||||
|
PyMySQL==0.10.0 |
||||||
|
requests==2.24.0 |
||||||
|
six==1.15.0 |
||||||
|
sortedcontainers==2.2.2 |
||||||
|
soupsieve==2.0.1 |
||||||
|
urllib3==1.25.10 |
@ -0,0 +1,17 @@ |
|||||||
|
import sys |
||||||
|
from typing import Tuple |
||||||
|
|
||||||
|
import jieba |
||||||
|
from Scripts import pdf2txt |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from jieba import posseg |
||||||
|
from config.log import writeInfo |
||||||
|
|
||||||
|
from main import MysqlDB, run, split_word, Word, create_doc_vector |
||||||
|
|
||||||
|
# Shared DB handle for manual experiments in this script
# (unused by the current entry point below).
db=MysqlDB()

if __name__ == '__main__':
    # split_word()
    # Rebuild the TF-IDF document-vector table from the current word list.
    create_doc_vector()
    # c({'3'})
Loading…
Reference in new issue