commit 7c4f7344a1508df32ac43c3fd8c3c432725cad17 Author: pan <1029559041@qq.com> Date: Sun Aug 9 07:25:16 2020 +0800 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..79f98aa --- /dev/null +++ b/.gitignore @@ -0,0 +1,219 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
# --- Reconstructed from a newline-collapsed git patch dump. ---
#
# Tail of .gitignore (verbatim entries):
#   # .idea/artifacts  # .idea/compiler.xml  # .idea/jarRepositories.xml
#   # .idea/modules.xml  # .idea/*.iml  # .idea/modules  # *.iml  # *.ipr
#   cmake-build-*/   .idea/**/mongoSettings.xml   *.iws   out/
#   .idea_modules/   atlassian-ide-plugin.xml   .idea/replstate.xml
#   com_crashlytics_export_strings.xml   crashlytics.properties
#   crashlytics-build.properties   fabric.properties   .idea/httpRequests
#   .idea/caches/build_file_checksums.ser
#   .idea   venv   html   pdf   log
#
# config.ini (verbatim):
#   [file]
#   logFile = log.txt
#   [mysql]
#   host=localhost
#   user=sukura
#   passwd=123456
#   db=note
#   port=3306
#   charset=utf8

# ======================= config/config.py =======================
import configparser
import logging
import os
from logging.handlers import TimedRotatingFileHandler

# Project root: parent of the directory that contains this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(BASE_DIR)

cf = configparser.ConfigParser()
config_path = BASE_DIR + "/config.ini"
if not os.path.exists(config_path):
    raise Exception("配置文件:%s不存在" % config_path)
cf.read(config_path, encoding='utf-8')
logFile = cf.get('file', 'logFile')
logger = logging.getLogger()


class NoParsingFilter(logging.Filter):
    """Drop log records emitted by pdfminer — its DEBUG/INFO output is too noisy."""

    def filter(self, record):
        return 'pdfminer' not in record.name


def getHandle():
    """Return the root logger's existing StreamHandler, or a fresh one.

    Reusing an already-attached handler avoids duplicated console output
    when init() runs more than once.
    """
    for handler in logger.handlers:
        if isinstance(handler, logging.StreamHandler):
            return handler
    return logging.StreamHandler()


def init():
    """Configure the root logger: console handler + hourly-rotating file handler."""
    logger.setLevel(logging.INFO)
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s",
                                   datefmt='%Y-%m-%d %H:%M:%S')
    # Console output.
    streamHandler = getHandle()
    streamHandler.setFormatter(log_format)
    streamHandler.addFilter(NoParsingFilter())
    logger.addHandler(streamHandler)

    logpath = BASE_DIR + "/log/"
    print(logpath)
    if not os.path.exists(logpath):
        os.mkdir(logpath)

    # Rotate the log file every hour.
    timedRotatingFileHandler = TimedRotatingFileHandler(
        filename=logpath + "all.log", when='H', interval=1, encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)
    timedRotatingFileHandler.addFilter(NoParsingFilter())
    logger.addHandler(timedRotatingFileHandler)


# ======================= config/log.py =======================
import time

from config.config import init
from config.config import logger

# Process start time; init() wires up the logging handlers on import.
start = int(time.time())
init()


def getRunTimeInt():
    """Seconds elapsed since this module was imported."""
    return int(time.time()) - start


def getRunTime():
    """Human-readable elapsed-time string (Chinese: 'program has run N seconds')."""
    return '程序已经执行%d秒' % (int(time.time()) - start)


def writeInfo(msg):
    """Log *msg* at INFO level, annotated with the elapsed run time."""
    logger.info('%s\t(%s)' % (msg, getRunTime()))


def writeError(msg):
    """Log *msg* at ERROR level, annotated with the elapsed run time."""
    logger.error('%s\t(%s)' % (msg, getRunTime()))


# ======================= main.py (part 1) =======================
import os
import re
import time
from concurrent.futures.thread import ThreadPoolExecutor
from typing import Tuple
from urllib.parse import unquote

import jieba
import pymysql
import requests
from Scripts import pdf2txt
from bs4 import BeautifulSoup
from jieba import posseg
from lxml import etree
from requests.cookies import RequestsCookieJar

from config.config import cf
from config.log import writeInfo, writeError


# mysql数据库 — thin wrapper around pymysql; one short-lived connection per call.
class MysqlDB:

    def connect(self):
        """Open a new connection from the [mysql] section of config.ini."""
        section = 'mysql'
        return pymysql.connect(host=cf.get(section, 'host'),
                               user=cf.get(section, 'user'),
                               passwd=cf.get(section, 'passwd'),
                               db=cf.get(section, 'db'),
                               port=int(cf.get(section, 'port')),
                               charset=cf.get(section, 'charset'))

    def modify(self, sql, params=()):
        """Execute an INSERT/UPDATE/DDL statement and commit.

        A tuple-of-tuples ``params`` triggers ``executemany``.  Returns the
        row from ``SELECT LAST_INSERT_ID()`` (a 1-tuple) or None on error.

        Fix vs. original: the transaction is committed unconditionally —
        previously the commit only ran when the LAST_INSERT_ID fetch
        returned rows, so a failed fetch silently discarded the write.
        The original also shadowed the caller's ``sql`` argument.
        """
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                if isinstance(params, tuple) and len(params) > 0 and isinstance(params[0], tuple):
                    cursor.executemany(sql, params)
                else:
                    cursor.execute(sql, params)
                last_id = None
                if cursor.execute(''' select LAST_INSERT_ID() ''') > 0:
                    last_id = cursor.fetchall()[0]
                # pymysql connections are not autocommit by default.
                connection.commit()
                return last_id
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

    def query(self, sql, params=()):
        """Run a SELECT and return all rows (None when the query raised)."""
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                return cursor.fetchall()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()


def parse(content):
    """Parse one Wanfang search-result page into a list of download-param dicts.

    Each dict merges the bibliographic info scraped from the result card with
    the arguments of its ``downLoadPermissions(...)`` onclick handler, which
    carry the parameters needed to build the real download URL.
    """
    res_html = BeautifulSoup(content, "html.parser")
    # 论文下载标签 — one div.ResultCont per paper.
    ResultCont = res_html.select('div.ResultCont')
    params_list = []
    for result in ResultCont:
        # 论文标题
        title = str(result.select_one('div.title>a:nth-child(3)').text).strip()
        # 授予学位 — NOTE: the "Resouce" typo is load-bearing; downstream code
        # reads this exact key, so it must not be "corrected" here alone.
        resultResouceType = str(result.select_one('span.resultResouceType').text).strip()
        # 作者
        author = str(result.select_one('div.author>a').text).strip()
        # 学校 — the school name sits after the second HTML comment in div.Source.
        source = etree.HTML(result.select_one('div.Source').prettify()).xpath('//comment()[2]')
        school = source[0].tail.strip() if len(source) > 0 else ''
        # 年份
        year = str(result.select_one('span.blockspan').text).strip()
        # 关键词 — comma-joined keyword anchors.
        tag = ','.join(a.text for a in result.select('div.Keyword>a'))
        # 摘要
        summary_node = result.select_one('div.summary')
        summary = summary_node.text if summary_node else ''
        info = {
            "title": title,
            "resultResouceType": resultResouceType,
            "author": author,
            "school": school,
            "year": "".join(filter(str.isdigit, year)),
            "tag": tag,
            "summary": summary
        }

        writeInfo('正在获取论文《{title}》的真实下载地址'.format(title=title))
        onClick = result.select_one('a.result_opera_down')['onclick']
        prefix = 'downLoadPermissions'
        suffix = ",'0'"
        match = re.findall('{prefix}.*{suffix}'.format(prefix=prefix, suffix=suffix), onClick)
        if len(match) > 0:
            # Strip "downLoadPermissions(" and split the quoted argument list.
            params_str = match[0][len(prefix) + 1:].split(",'")
            param_keys = ["page_cnt", "language", "resourceType", "source",
                          "resourceId", "resourceTitle", "isoa"]
            if len(params_str) == len(param_keys):
                params_obj = {key: params_str[index].replace("'", "")
                              for index, key in enumerate(param_keys)}
                params_list.append({**params_obj, **info})
            else:
                writeError('匹配下载参数失败')
        else:
            writeError('匹配下载参数失败')
    return params_list


# Shared crawler state: VPN-proxied base URL, target profession, HTTP session
# pre-loaded with VPN session cookies, and a small pool for PDF conversion.
base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
profession = "计算机软件与理论"
keyword = f'(专业%3A"{profession}")'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
db = MysqlDB()
session = requests.Session()
cookies = RequestsCookieJar()
cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
            domain='libcon.bupt.edu.cn')
session.cookies.update(cookies)

pdf_dir = 'pdf'
html_dir = 'html'
executor = ThreadPoolExecutor(max_workers=2)


class Word:
    """A segmented word with its part-of-speech flag.

    Equality and hashing use only the word text, so sets of Word dedupe by
    word regardless of flag.
    """

    def __init__(self, word, flag):
        self.word = word  # the word text
        self.flag = flag  # jieba part-of-speech flag

    def __eq__(self, other: object) -> bool:
        if isinstance(other, self.__class__):
            return self.word == other.word
        return False

    def __hash__(self) -> int:
        return hash(self.word)
# ============ main.py (part 2): vocabulary, TF-IDF, download pipeline ============

# 更新词库 — scan every stored paper, jieba-segment its text in paddle mode,
# collect noun words not yet in sys_word, insert them, and rebuild the
# TF-IDF table when anything new was found.
def split_word():
    jieba.enable_paddle()
    start = db.query('select min(id) from sys_paper')[0][0]
    end = db.query('select max(id) from sys_paper')[0][0]
    result = db.query('select word,flag from sys_word')
    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word = set()
    count = 0
    # NOTE(review): assumes sys_paper ids are contiguous in [start, end];
    # a gap makes the query return no rows and raises IndexError.
    for i in range(start, end + 1):
        # Fix vs. original: pass the parameter as a real 1-tuple (was `(i)`,
        # which is just a parenthesized int).
        txt_content = db.query('select txt_content from sys_paper where id=%s', (i,))[0][0]
        for word, flag in posseg.cut(txt_content, use_paddle=True):
            if flag == 'n':
                # Keep only CJK ideographs and ASCII letters.
                word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
                w = Word(word, flag)
                if len(word) > 0 and w not in filter_word:
                    new_word.add(w)
                count = count + 1
    writeInfo(f'从{count}个词语中过滤出{len(new_word)}个新词汇')

    if len(new_word) > 0:
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
        create_doc_vector()
    else:
        writeInfo('没有发现新词汇,不需要更新词库')


table_name = 'sys_tfidf'


def create_doc_vector():
    """Rebuild sys_tfidf: one comma-joined TF-IDF vector string per paper.

    TF is computed by substring-count arithmetic over txt_content; IDF is
    the classic log(N/df) + 1.  The whole computation runs inside MySQL.
    """
    start = time.time()
    writeInfo('开始计算文档向量')
    db.modify(f'drop table if exists {table_name}')
    db.modify(f'''
    create table {table_name}
    (
        id bigint NOT NULL AUTO_INCREMENT,
        tfidf longtext not null,
        primary key (id)
    ) as
    select id, group_concat(tf * idf order by word) as tfidf
    from (select f.word,
                 df,
                 f.idf,
                 round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
                 id
          from sys_paper,
               (select word,
                       sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
                       log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
                from sys_paper,
                     sys_word
                group by word) as f) as f
    group by id
    ''')
    writeInfo(f'计算文档向量花费{round(time.time() - start)}s')


# 文档向量计算 — compare the vectors of the given ids against all others.
def compare_doc_vector(ids=None):
    # NOTE(review): ids are interpolated straight into the SQL text — only
    # safe for trusted numeric ids; parameterize before exposing to users.
    if ids is not None:
        result1 = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
        result2 = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
        for id1, tfidf1 in result1:
            for id2, tfidf2 in result2:
                print(f'id={id1}和id={id2}比较')


# 文件格式转换保存到数据库 — persist one downloaded paper: write the PDF,
# convert it to HTML and plain text, then insert everything into sys_paper.
def save(des, res, params):
    """
    des    -- Content-Disposition header split on ';' (filename in des[1])
    res    -- completed PDF download response
    params -- metadata dict produced by parse()
    """
    des = des[1].split('=')
    file_name = unquote(des[1], 'utf-8').replace('"', '')
    if not os.path.exists(pdf_dir):
        os.mkdir(pdf_dir)
    writeInfo(f'{params["title"]} PDF文件大小{len(res.content)}字节')
    with open(f'{pdf_dir}/{file_name}', 'wb') as file:
        file.write(res.content)
    if not os.path.exists(html_dir):
        os.mkdir(html_dir)
    # NOTE(review): replace("pdf", "html") rewrites *every* occurrence of
    # "pdf" in the name, not just the extension — a title containing "pdf"
    # would be mangled.
    html_file = f'{html_dir}/{file_name.replace("pdf", "html")}'
    writeInfo(f'{params["title"]} BEGIN PDF转HTML')
    pdf2txt.main(['-o', html_file, '-Y', 'exact', f'{pdf_dir}/{file_name}'])
    writeInfo(f'{params["title"]} END PDF转HTML')
    with open(html_file, 'rb') as file:
        html_content = file.read()
    parse_html = BeautifulSoup(html_content, "html.parser")
    txt_content = parse_html.text.replace('\n', '').replace(' ', '')
    info = {
        "title": params['title'],
        "type": params['resultResouceType'],
        "author": params['author'],
        "profession": profession,
        "school": params['school'],
        "year": params['year'],
        "summary": params['summary'],
        "tag": params['tag'],
        "pdf_content": res.content,
        "html_content": html_content,
        "txt_content": txt_content,
        "create_time": time.time()
    }
    writeInfo(f'{params["title"]} 插入数据库')
    db.modify(
        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],
            info['profession'], info['school'], info['summary'], info['tag']
            , info['title'], info['type'], info['year']
        ))


# 万方平台论文采集 — crawl the search pages, resolve each paper's real PDF
# URL, download it, and hand the response to save() on the worker pool.
# Rewritten from five levels of nesting to guard clauses; the error message
# emitted at each failure point is unchanged.
def run():
    for page in range(1, 100):
        res = session.get(
            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
            headers=headers)
        if res.status_code != 200:
            writeError('error code={code}'.format(code=res.status_code))
            continue
        for params in parse(res.content):
            params["base_url"] = base_url
            url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                **params)
            res = session.get(url, headers=headers)
            if not (res.status_code == 200 and 'downloadliterature.do' in res.url):
                writeError('error code={code}'.format(code=res.status_code))
                continue
            res_html = BeautifulSoup(res.content, "html.parser")
            downloadIframe = res_html.select_one('#downloadIframe')
            if not downloadIframe:
                writeError("无法获取真实下载地址")
                continue
            res = session.get(downloadIframe["src"])
            if not (res.status_code == 200 and 'download.ashx' in res.url):
                writeError("无法获取文档真实下载地址")
                continue
            writeInfo("成功获取真实下载地址{path}".format(path=res.url))
            res = session.get(res.url, headers=headers, stream=True)
            if not (res.status_code == 200 and 'pdf' in res.headers['Content-Type']):
                writeError("无法获取文档信息")
                continue
            des = res.headers['Content-Disposition'].split(';')
            if len(des) == 2 and len(des[1].split('=')) == 2:
                executor.submit(save, des, res, params)
            else:
                writeError("非法响应类型")


# ======================= requirements.txt (verbatim) =======================
# beautifulsoup4==4.9.1   certifi==2020.6.20   cffi==1.14.1   chardet==3.0.4
# cryptography==3.0       idna==2.10           jieba==0.42.1  lxml==4.5.2
# pdfminer.six==20200726  pycparser==2.20      pycryptodome==3.9.8
# PyMySQL==0.10.0         requests==2.24.0     six==1.15.0
# sortedcontainers==2.2.2 soupsieve==2.0.1     urllib3==1.25.10

# ======================= test.py =======================
import sys
from typing import Tuple

import jieba
from Scripts import pdf2txt
from bs4 import BeautifulSoup
from jieba import posseg
from config.log import writeInfo

from main import MysqlDB, run, split_word, Word, create_doc_vector

db = MysqlDB()

if __name__ == '__main__':
    # split_word()
    create_doc_vector()
    # c({'3'})