commit
7c4f7344a1
@ -0,0 +1,219 @@ |
|||||||
|
# Created by .ignore support plugin (hsz.mobi) |
||||||
|
### Python template |
||||||
|
# Byte-compiled / optimized / DLL files |
||||||
|
__pycache__/ |
||||||
|
*.py[cod] |
||||||
|
*$py.class |
||||||
|
|
||||||
|
# C extensions |
||||||
|
*.so |
||||||
|
|
||||||
|
# Distribution / packaging |
||||||
|
.Python |
||||||
|
build/ |
||||||
|
develop-eggs/ |
||||||
|
dist/ |
||||||
|
downloads/ |
||||||
|
eggs/ |
||||||
|
.eggs/ |
||||||
|
lib/ |
||||||
|
lib64/ |
||||||
|
parts/ |
||||||
|
sdist/ |
||||||
|
var/ |
||||||
|
wheels/ |
||||||
|
share/python-wheels/ |
||||||
|
*.egg-info/ |
||||||
|
.installed.cfg |
||||||
|
*.egg |
||||||
|
MANIFEST |
||||||
|
|
||||||
|
# PyInstaller |
||||||
|
# Usually these files are written by a python script from a template |
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||||
|
*.manifest |
||||||
|
*.spec |
||||||
|
|
||||||
|
# Installer logs |
||||||
|
pip-log.txt |
||||||
|
pip-delete-this-directory.txt |
||||||
|
|
||||||
|
# Unit test / coverage reports |
||||||
|
htmlcov/ |
||||||
|
.tox/ |
||||||
|
.nox/ |
||||||
|
.coverage |
||||||
|
.coverage.* |
||||||
|
.cache |
||||||
|
nosetests.xml |
||||||
|
coverage.xml |
||||||
|
*.cover |
||||||
|
*.py,cover |
||||||
|
.hypothesis/ |
||||||
|
.pytest_cache/ |
||||||
|
cover/ |
||||||
|
|
||||||
|
# Translations |
||||||
|
*.mo |
||||||
|
*.pot |
||||||
|
|
||||||
|
# Django stuff: |
||||||
|
*.log |
||||||
|
local_settings.py |
||||||
|
db.sqlite3 |
||||||
|
db.sqlite3-journal |
||||||
|
|
||||||
|
# Flask stuff: |
||||||
|
instance/ |
||||||
|
.webassets-cache |
||||||
|
|
||||||
|
# Scrapy stuff: |
||||||
|
.scrapy |
||||||
|
|
||||||
|
# Sphinx documentation |
||||||
|
docs/_build/ |
||||||
|
|
||||||
|
# PyBuilder |
||||||
|
.pybuilder/ |
||||||
|
target/ |
||||||
|
|
||||||
|
# Jupyter Notebook |
||||||
|
.ipynb_checkpoints |
||||||
|
|
||||||
|
# IPython |
||||||
|
profile_default/ |
||||||
|
ipython_config.py |
||||||
|
|
||||||
|
# pyenv |
||||||
|
# For a library or package, you might want to ignore these files since the code is |
||||||
|
# intended to run in multiple environments; otherwise, check them in: |
||||||
|
# .python-version |
||||||
|
|
||||||
|
# pipenv |
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||||
|
# install all needed dependencies. |
||||||
|
#Pipfile.lock |
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
||||||
|
__pypackages__/ |
||||||
|
|
||||||
|
# Celery stuff |
||||||
|
celerybeat-schedule |
||||||
|
celerybeat.pid |
||||||
|
|
||||||
|
# SageMath parsed files |
||||||
|
*.sage.py |
||||||
|
|
||||||
|
# Environments |
||||||
|
.env |
||||||
|
.venv |
||||||
|
env/ |
||||||
|
venv/ |
||||||
|
ENV/ |
||||||
|
env.bak/ |
||||||
|
venv.bak/ |
||||||
|
|
||||||
|
# Spyder project settings |
||||||
|
.spyderproject |
||||||
|
.spyproject |
||||||
|
|
||||||
|
# Rope project settings |
||||||
|
.ropeproject |
||||||
|
|
||||||
|
# mkdocs documentation |
||||||
|
/site |
||||||
|
|
||||||
|
# mypy |
||||||
|
.mypy_cache/ |
||||||
|
.dmypy.json |
||||||
|
dmypy.json |
||||||
|
|
||||||
|
# Pyre type checker |
||||||
|
.pyre/ |
||||||
|
|
||||||
|
# pytype static type analyzer |
||||||
|
.pytype/ |
||||||
|
|
||||||
|
# Cython debug symbols |
||||||
|
cython_debug/ |
||||||
|
|
||||||
|
### JetBrains template |
||||||
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider |
||||||
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 |
||||||
|
|
||||||
|
# User-specific stuff |
||||||
|
.idea/**/workspace.xml |
||||||
|
.idea/**/tasks.xml |
||||||
|
.idea/**/usage.statistics.xml |
||||||
|
.idea/**/dictionaries |
||||||
|
.idea/**/shelf |
||||||
|
|
||||||
|
# Generated files |
||||||
|
.idea/**/contentModel.xml |
||||||
|
|
||||||
|
# Sensitive or high-churn files |
||||||
|
.idea/**/dataSources/ |
||||||
|
.idea/**/dataSources.ids |
||||||
|
.idea/**/dataSources.local.xml |
||||||
|
.idea/**/sqlDataSources.xml |
||||||
|
.idea/**/dynamic.xml |
||||||
|
.idea/**/uiDesigner.xml |
||||||
|
.idea/**/dbnavigator.xml |
||||||
|
|
||||||
|
# Gradle |
||||||
|
.idea/**/gradle.xml |
||||||
|
.idea/**/libraries |
||||||
|
|
||||||
|
# Gradle and Maven with auto-import |
||||||
|
# When using Gradle or Maven with auto-import, you should exclude module files, |
||||||
|
# since they will be recreated, and may cause churn. Uncomment if using |
||||||
|
# auto-import. |
||||||
|
# .idea/artifacts |
||||||
|
# .idea/compiler.xml |
||||||
|
# .idea/jarRepositories.xml |
||||||
|
# .idea/modules.xml |
||||||
|
# .idea/*.iml |
||||||
|
# .idea/modules |
||||||
|
# *.iml |
||||||
|
# *.ipr |
||||||
|
|
||||||
|
# CMake |
||||||
|
cmake-build-*/ |
||||||
|
|
||||||
|
# Mongo Explorer plugin |
||||||
|
.idea/**/mongoSettings.xml |
||||||
|
|
||||||
|
# File-based project format |
||||||
|
*.iws |
||||||
|
|
||||||
|
# IntelliJ |
||||||
|
out/ |
||||||
|
|
||||||
|
# mpeltonen/sbt-idea plugin |
||||||
|
.idea_modules/ |
||||||
|
|
||||||
|
# JIRA plugin |
||||||
|
atlassian-ide-plugin.xml |
||||||
|
|
||||||
|
# Cursive Clojure plugin |
||||||
|
.idea/replstate.xml |
||||||
|
|
||||||
|
# Crashlytics plugin (for Android Studio and IntelliJ) |
||||||
|
com_crashlytics_export_strings.xml |
||||||
|
crashlytics.properties |
||||||
|
crashlytics-build.properties |
||||||
|
fabric.properties |
||||||
|
|
||||||
|
# Editor-based Rest Client |
||||||
|
.idea/httpRequests |
||||||
|
|
||||||
|
# Android studio 3.1+ serialized cache file |
||||||
|
.idea/caches/build_file_checksums.ser |
||||||
|
|
||||||
|
.idea |
||||||
|
venv |
||||||
|
html |
||||||
|
pdf |
||||||
|
log |
@ -0,0 +1,17 @@ |
|||||||
|
[file] |
||||||
|
;日志文件名 |
||||||
|
logFile = log.txt |
||||||
|
;mysql数据库配置 |
||||||
|
[mysql] |
||||||
|
#数据库服务端地址 |
||||||
|
host=localhost |
||||||
|
#用户 |
||||||
|
user=sukura |
||||||
|
#密码 |
||||||
|
passwd=123456 |
||||||
|
#数据库名 |
||||||
|
db=note |
||||||
|
#端口 |
||||||
|
port=3306 |
||||||
|
#连接编码 |
||||||
|
charset=utf8 |
@ -0,0 +1,44 @@ |
|||||||
|
import configparser |
||||||
|
import logging |
||||||
|
from logging.handlers import TimedRotatingFileHandler |
||||||
|
import os |
||||||
|
# Project root: the parent of the directory containing this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(BASE_DIR)
# Shared ConfigParser instance; other modules import `cf` to read settings.
cf = configparser.ConfigParser()
# config.ini is expected at the project root.
config_path=BASE_DIR+"/config.ini"
if not os.path.exists(config_path):
    # Fail fast at import time when the config file is missing
    # (message: "config file <path> does not exist").
    raise Exception("配置文件:%s不存在" % config_path)
cf.read(config_path,encoding='utf-8')
# Log file name from the [file] section.
# NOTE(review): not referenced by init(), which hard-codes "all.log" — confirm intent.
logFile = cf.get('file', 'logFile')
# Root logger shared by the whole project; handlers are attached by init().
logger = logging.getLogger()
||||||
|
|
||||||
|
class NoParsingFilter(logging.Filter):
    """Logging filter that drops records emitted by pdfminer loggers."""

    def filter(self, record):
        # Keep the record unless its logger name mentions pdfminer.
        comes_from_pdfminer = 'pdfminer' in record.name
        return not comes_from_pdfminer
||||||
|
|
||||||
|
def getHandle():
    """Return the logger's existing StreamHandler, or a fresh one if none is attached."""
    existing = next(
        (h for h in logger.handlers if isinstance(h, logging.StreamHandler)),
        None,
    )
    return existing if existing is not None else logging.StreamHandler()
||||||
|
|
||||||
|
def init():
    """Configure the shared root logger.

    Attaches a console StreamHandler plus an hourly TimedRotatingFileHandler
    writing to BASE_DIR/log/all.log, both using the same timestamped format
    and both filtered to drop pdfminer log records.
    """
    logger.setLevel(logging.INFO)
    log_format = logging.Formatter(fmt="%(asctime)s %(levelname)s : %(message)s", datefmt='%Y-%m-%d %H:%M:%S')

    # Console handler: getHandle() reuses an already-attached StreamHandler
    # so repeated init() calls do not stack duplicate console handlers.
    streamHandler = getHandle()
    streamHandler.setFormatter(log_format)
    streamHandler.addFilter(NoParsingFilter())
    logger.addHandler(streamHandler)

    # Log directory.  makedirs(exist_ok=True) replaces the original
    # exists()-then-mkdir() pair: it is race-free and avoids repeating the
    # path expression (the original also tested BASE_DIR+"/log/" directly).
    logpath = BASE_DIR + "/log/"
    os.makedirs(logpath, exist_ok=True)

    # File handler: rotate to a new UTF-8 log file every hour.
    timedRotatingFileHandler = TimedRotatingFileHandler(filename=logpath + "all.log", when='H', interval=1, encoding='utf-8')
    timedRotatingFileHandler.setFormatter(log_format)
    timedRotatingFileHandler.addFilter(NoParsingFilter())
    logger.addHandler(timedRotatingFileHandler)
||||||
|
|
||||||
|
|
@ -0,0 +1,21 @@ |
|||||||
|
import time |
||||||
|
|
||||||
|
from config.config import init |
||||||
|
from config.config import logger |
||||||
|
|
||||||
|
# Timestamp (whole seconds) captured at import; used to report elapsed runtime.
start = int(time.time())
# Configure the shared root logger (console + rotating file handlers).
init()
||||||
|
|
||||||
|
def getRunTimeInt():
    """Return the number of whole seconds elapsed since this module was imported."""
    now = int(time.time())
    return now - start
||||||
|
|
||||||
|
def getRunTime():
    """Return a human-readable (Chinese) note of how long the program has run."""
    elapsed = int(time.time()) - start
    return '程序已经执行%d秒' % elapsed
||||||
|
|
||||||
|
|
||||||
|
def writeInfo(msg):
    """Log *msg* at INFO level, suffixed with the elapsed-runtime note."""
    line = '%s\t(%s)' % (msg, getRunTime())
    logger.info(line)
||||||
|
|
||||||
|
|
||||||
|
def writeError(msg):
    """Log *msg* at ERROR level, suffixed with the elapsed-runtime note."""
    line = '%s\t(%s)' % (msg, getRunTime())
    logger.error(line)
@ -0,0 +1,322 @@ |
|||||||
|
import os |
||||||
|
import re |
||||||
|
from concurrent.futures.thread import ThreadPoolExecutor |
||||||
|
from typing import Tuple |
||||||
|
from urllib.parse import unquote |
||||||
|
|
||||||
|
import jieba |
||||||
|
import pymysql as pymysql |
||||||
|
import requests |
||||||
|
from Scripts import pdf2txt |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from jieba import posseg |
||||||
|
from lxml import etree |
||||||
|
from requests.cookies import RequestsCookieJar |
||||||
|
|
||||||
|
from config.config import cf |
||||||
|
from config.log import writeInfo, writeError |
||||||
|
import time |
||||||
|
|
||||||
|
|
||||||
|
# MySQL database access helper.  Connection parameters come from the
# [mysql] section of config.ini (read into `cf` at import time).
# Each operation opens and closes its own connection.
class MysqlDB:
    # Open a new connection using the configured host/user/passwd/db/port/charset.
    def connect(self):
        mysql = 'mysql'
        host = cf.get(mysql, 'host')
        user = cf.get(mysql, 'user')
        passwd = cf.get(mysql, 'passwd')
        db = cf.get(mysql, 'db')
        port = int(cf.get(mysql, 'port'))
        charset = cf.get(mysql, 'charset')
        return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=port, charset=charset)

    # Execute a write statement (INSERT/UPDATE/DDL).  Returns the row produced
    # by SELECT LAST_INSERT_ID() on success, or None when the statement raised
    # (errors are logged and swallowed — callers rely on that behaviour).
    def modify(self, sql, params=()):
        connection = self.connect()
        try:

            with connection.cursor() as cursor:
                # Create a new record.  A tuple of tuples means a batch
                # insert via executemany; anything else is a single execute.
                # NOTE(review): isinstance against typing.Tuple works only for
                # the bare alias — confirm on the target Python version.
                if isinstance(params, Tuple) and len(params) > 0 and isinstance(params[0], Tuple):
                    cursor.executemany(sql, params)
                else:
                    cursor.execute(sql, params)
                # connection is not autocommit by default. So you must commit to save
                # your changes.
                # Fetch the auto-generated id of the last inserted row.
                sql = ''' select LAST_INSERT_ID() '''
                num = cursor.execute(sql)
                if num > 0:
                    id = cursor.fetchall()[0]
            # Commit the transaction.
            connection.commit()
            return id
        except Exception as e:
            writeError(e)
        finally:
            connection.close()

    # Run a SELECT and return all rows as a tuple of tuples, or None when the
    # query raised (errors are logged and swallowed).
    def query(self, sql, params=()):
        connection = self.connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                return cursor.fetchall()
        except Exception as e:
            writeError(e)
        finally:
            connection.close()
||||||
|
|
||||||
|
|
||||||
|
# Parse one search-result page and collect, for every paper listed, its
# metadata plus the parameters needed to request the real download URL.
# Returns a list of dicts (download params merged with metadata).
def parse(content):
    res_html = BeautifulSoup(content, "html.parser")
    # One div.ResultCont per paper in the result list.
    ResultCont = res_html.select('div.ResultCont')
    params_list = []
    for result in ResultCont:
        # Paper title.
        title = str(result.select_one('div.title>a:nth-child(3)').text).strip()
        # Degree / resource type awarded.
        resultResouceType = str(result.select_one('span.resultResouceType').text).strip()
        # Author.
        author = str(result.select_one('div.author>a').text).strip()
        # School: taken from the text following the second HTML comment
        # inside div.Source (lxml exposes it as the comment's .tail).
        source = etree.HTML(result.select_one('div.Source').prettify()).xpath('//comment()[2]')
        if len(source) > 0:
            school = source[0].tail.strip()
        else:
            school = ''
        # Year text (digits filtered out below).
        year = str(result.select_one('span.blockspan').text).strip()
        # Keywords, accumulated comma-separated (leading comma trimmed after).
        tag = ''
        for a in result.select('div.Keyword>a'):
            tag += f",{a.text}"
        if len(tag) > 0:
            tag = tag[1:]
        # Abstract (may be absent on some entries).
        if result.select_one('div.summary'):
            summary = result.select_one('div.summary').text
        else:
            summary = ''
        info = {
            "title": title,
            "resultResouceType": resultResouceType,
            "author": author,
            "school": school,
            # Keep only the digits of the year text.
            "year": "".join(filter(str.isdigit, year)),
            "tag": tag,
            "summary": summary
        }

        writeInfo('正在获取论文《{title}》的真实下载地址'.format(title=title))
        # The download parameters live in the onclick handler, shaped like
        # downLoadPermissions('page_cnt','language',...,'0').
        onClick = result.select_one('a.result_opera_down')['onclick']
        prefix = 'downLoadPermissions'
        suffix = ",'0'"
        match = re.findall('{prefix}.*{suffix}'.format(prefix=prefix, suffix=suffix), onClick)
        if len(match) > 0:
            match = match[0]
            # Split the quoted argument list into individual values.
            params_str = match[len(prefix) + 1:].split(",'")
            param_keys = ["page_cnt", "language", "resourceType", "source", "resourceId", "resourceTitle", "isoa"]
            params_obj = {}
            if len(params_str) == len(param_keys):
                for index, key in enumerate(param_keys):
                    params_obj[key] = params_str[index].replace("'", "")
                # Merge download params with the paper metadata.
                params_list.append({**params_obj, **info})
            else:
                writeError('匹配下载参数失败')
        else:
            writeError('匹配下载参数失败')
    return params_list
||||||
|
|
||||||
|
|
||||||
|
# Campus-VPN proxied base URL of the thesis search platform.
base_url = 'https://libcon.bupt.edu.cn/http/77726476706e69737468656265737421e7e056d2303166567f068ea89941227bfcd3ca21bd0c'
# Major searched for ("computer software and theory"); interpolated into the query.
profession = "计算机软件与理论"
keyword = f'(专业%3A"{profession}")'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
db = MysqlDB()
session = requests.Session()
# NOTE(review): hard-coded VPN ticket and remember_token are credentials and
# will expire — consider moving them to config.ini instead of source control.
cookies = RequestsCookieJar()
cookies.set('wengine_vpn_ticketlibcon_bupt_edu_cn', '85f585f32fdf27ca', path='/', domain='.libcon.bupt.edu.cn')
cookies.set('remember_token', 'yeysYCGGzNIrcDDUNXOVyNXODUZzJpROJgLmsObLAXrsmibvEqcwqzRkDYlODYWA', path='/',
            domain='libcon.bupt.edu.cn')
session.cookies.update(cookies)

# Local output directories and the worker pool used for PDF conversion in save().
pdf_dir = 'pdf'
html_dir = 'html'
executor = ThreadPoolExecutor(max_workers=2)
||||||
|
|
||||||
|
|
||||||
|
class Word:
    """A segmented word together with its part-of-speech flag.

    Identity (equality and hashing) is based on the word text only, so a
    set of Word objects deduplicates by text regardless of flag.
    """

    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __eq__(self, other: object) -> bool:
        # Equal when the other object is a Word with the same text;
        # anything else compares unequal.
        return isinstance(other, self.__class__) and self.word == other.word

    def __hash__(self) -> int:
        # Must agree with __eq__, so the hash is derived from the text alone.
        return hash(self.word)
||||||
|
|
||||||
|
|
||||||
|
# Refresh the word table: segment every paper's text with jieba (paddle
# mode), keep previously-unseen nouns, and rebuild the document vectors
# whenever new words were added.
def split_word():
    jieba.enable_paddle()
    start = db.query('select min(id) from sys_paper')[0][0]
    end = db.query('select max(id) from sys_paper')[0][0]
    result = db.query('select word,flag from sys_word')
    # Words already stored; Word compares/hashes on the text only.
    filter_word = set(Word(_[0], _[1]) for _ in result)
    new_word=set()
    count = 0
    # NOTE(review): iterating min(id)..max(id) assumes ids are contiguous;
    # gaps would make the inner query return an empty result — confirm.
    for i in range(start, end + 1):
        txt_content = db.query('select txt_content from sys_paper where id=%s', (i))[0][0]
        words = posseg.cut(txt_content, use_paddle=True)
        for word, flag in words:
            # writeInfo(f'word={word},flag={flag}')
            # Keep nouns only.
            if flag == 'n':
                # Strip every character that is not a CJK ideograph or an ASCII letter.
                word = re.sub(u"([^\u4e00-\u9fa5a-zA-Z])", "", word)
                w=Word(word, flag)
                if len(word) > 0 and w not in filter_word:
                    new_word.add(w)
            count = count + 1
    writeInfo(f'从{count}个词语中过滤出{len(new_word)}个新词汇')

    if len(new_word)>0:
        words = tuple((_.word, _.flag) for _ in new_word)
        db.modify('insert into sys_word(word,flag) values (%s,%s)', words)
        # Word list changed, so document vectors must be recomputed.
        create_doc_vector()
    else:
        writeInfo('没有发现新词汇,不需要更新词库')
||||||
|
|
||||||
|
# Name of the table holding per-document TF-IDF vectors (rebuilt from scratch
# by create_doc_vector()).
table_name = 'sys_tfidf'
||||||
|
|
||||||
|
# Rebuild the TF-IDF document-vector table from sys_paper and sys_word,
# entirely inside MySQL.  Each output row pairs a paper id with a
# comma-separated list of tf*idf values ordered by word, so vectors are
# positionally comparable across papers.
def create_doc_vector():
    start=time.time()
    writeInfo('开始计算文档向量')
    # Drop and recreate the vector table from scratch each run.
    db.modify(f'drop table if exists {table_name}')
    # NOTE(review): tf is approximated via LENGTH/REPLACE substring counting,
    # and the inner query cross-joins sys_paper with sys_word — this is
    # O(papers x words) and can be very slow on large corpora.
    db.modify(f'''
    create table {table_name}
    (
        id bigint NOT NULL AUTO_INCREMENT,
        tfidf longtext not null,
        primary key (id)
    ) as
    select id, group_concat(tf * idf order by word) as tfidf
    from (select f.word,
                 df,
                 f.idf,
                 round((LENGTH(txt_content) - LENGTH(REPLACE(txt_content, word, ''))) / LENGTH(word)) AS tf,
                 id
          from sys_paper,
               (select word,
                       sum(if(locate(word, txt_content) > 0, 1, 0)) as df,
                       log((select count(*) from sys_paper) / sum(if(locate(word, txt_content) > 0, 1, 0))) + 1 as idf
                from sys_paper,
                     sys_word
                group by word) as f) as f
    group by id
    ''')
    writeInfo(f'计算文档向量花费{round(time.time()-start)}s')
||||||
|
|
||||||
|
# Document-vector comparison (pairing only; the similarity metric itself is
# not implemented yet).
def compare_doc_vector(ids=None):
    """Pair each selected document with every non-selected one and report the pair.

    :param ids: iterable of document-id strings; None means do nothing.
    """
    # NOTE(review): ids are interpolated into SQL directly — only safe for
    # trusted numeric id strings.
    if ids is None:
        return
    selected = db.query(f'select * from {table_name} where id in ({",".join(ids)})')
    others = db.query(f'select * from {table_name} where id not in ({",".join(ids)})')
    for id1, tfidf1 in selected:
        for id2, tfidf2 in others:
            print(f'id={id1}和id={id2}比较')
||||||
|
|
||||||
|
|
||||||
|
# Convert a downloaded PDF to HTML and plain text, then insert the paper
# (metadata plus all three content forms) into sys_paper.  Runs on the
# module-level thread pool, invoked from run().
def save(des, res, params):
    # des is the Content-Disposition header split on ';';
    # des[1] is expected to look like 'filename="..."'.
    des = des[1].split('=')
    file_name = unquote(des[1], 'utf-8').replace('"', '')
    if not os.path.exists(pdf_dir):
        os.mkdir(pdf_dir)
    writeInfo(f'{params["title"]} PDF文件大小{len(res.content)}字节')
    with open(f'{pdf_dir}/{file_name}', 'wb') as file:
        file.write(res.content)
    if not os.path.exists(html_dir):
        os.mkdir(html_dir)
    # NOTE(review): replaces every occurrence of "pdf" in the name, not just
    # the extension — a title containing "pdf" would be mangled.
    html_file = f'{html_dir}/{file_name.replace("pdf", "html")}'
    writeInfo(f'{params["title"]} BEGIN PDF转HTML')
    # pdfminer's pdf2txt CLI entry point, driven programmatically.
    pdf2txt.main(['-o', html_file, '-Y', 'exact', f'{pdf_dir}/{file_name}'])
    writeInfo(f'{params["title"]} END PDF转HTML')
    with open(html_file, 'rb') as file:
        html_content = file.read()
    parse_html = BeautifulSoup(html_content, "html.parser")
    # Plain text: HTML text content with newlines and spaces stripped.
    txt_content = parse_html.text.replace('\n', '').replace(' ', '')
    info = {
        "title": params['title'],
        "type": params['resultResouceType'],
        "author": params['author'],
        "profession": profession,
        "school": params['school'],
        "year": params['year'],
        "summary": params['summary'],
        "tag": params['tag'],
        "pdf_content": res.content,
        "html_content": html_content,
        "txt_content": txt_content,
        "create_time": time.time()
    }
    # writeInfo('论文信息{info}'.format(info=info))
    writeInfo(f'{params["title"]} 插入数据库')
    db.modify(
        f'insert into sys_paper (author, create_time, pdf_content,html_content,txt_content, profession, school, summary, tag, title, type, year) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        (
            info['author'], info['create_time'], info['pdf_content'],
            info['html_content'], info['txt_content'],
            info['profession'], info['school'], info['summary'], info['tag']
            , info['title'], info['type'], info['year']
        ))
||||||
|
|
||||||
|
|
||||||
|
# Crawl the thesis platform: page through search results, resolve each
# paper's real download URL through two redirect hops, and hand successful
# PDF responses to save() on the worker pool.
def run():
    for page in range(1, 100):
        # One page of search results (20 entries per page).
        res = session.get(
            f"{base_url}/search/searchList.do?searchType=degree&showType=detail&page={page}&pageSize=20&searchWord={keyword}&isTriggerTag=",
            headers=headers)
        if res.status_code == 200:
            params_list = parse(res.content)
            for params in params_list:
                params["base_url"] = base_url
                # First hop: the downLoad.do endpoint built from the params
                # scraped out of the result page's onclick handler.
                url = '{base_url}/search/downLoad.do?page_cnt={page_cnt}&language={language}&resourceType={resourceType}&source={source}&resourceId={resourceId}&resourceTitle={resourceTitle}&isoa={isoa}&type={resourceType}&first=null'.format(
                    **params)
                res = session.get(url, headers=headers)
                if res.status_code == 200 and 'downloadliterature.do' in res.url:
                    res_html = BeautifulSoup(res.content, "html.parser")
                    downloadIframe = res_html.select_one('#downloadIframe')
                    if downloadIframe:
                        # Second hop: the iframe source leads to download.ashx,
                        # which is the real file URL.
                        res = session.get(downloadIframe["src"])
                        if res.status_code == 200 and 'download.ashx' in res.url:
                            writeInfo("成功获取真实下载地址{path}".format(path=res.url))
                            res = session.get(res.url, headers=headers, stream=True)
                            if res.status_code == 200 and 'pdf' in res.headers['Content-Type']:
                                # The file name arrives via Content-Disposition.
                                des = res.headers['Content-Disposition'].split(';')
                                if len(des) == 2 and len(des[1].split('=')) == 2:
                                    # Convert + persist asynchronously.
                                    executor.submit(save, des, res, params)
                                else:
                                    writeError("非法响应类型")
                            else:
                                writeError("无法获取文档信息")
                        else:
                            writeError("无法获取文档真实下载地址")
                    else:
                        writeError("无法获取真实下载地址")
                else:
                    writeError('error code={code}'.format(code=res.status_code))
        else:
            writeError('error code={code}'.format(code=res.status_code))
@ -0,0 +1,17 @@ |
|||||||
|
beautifulsoup4==4.9.1 |
||||||
|
certifi==2020.6.20 |
||||||
|
cffi==1.14.1 |
||||||
|
chardet==3.0.4 |
||||||
|
cryptography==3.0 |
||||||
|
idna==2.10 |
||||||
|
jieba==0.42.1 |
||||||
|
lxml==4.5.2 |
||||||
|
pdfminer.six==20200726 |
||||||
|
pycparser==2.20 |
||||||
|
pycryptodome==3.9.8 |
||||||
|
PyMySQL==0.10.0 |
||||||
|
requests==2.24.0 |
||||||
|
six==1.15.0 |
||||||
|
sortedcontainers==2.2.2 |
||||||
|
soupsieve==2.0.1 |
||||||
|
urllib3==1.25.10 |
@ -0,0 +1,17 @@ |
|||||||
|
import sys |
||||||
|
from typing import Tuple |
||||||
|
|
||||||
|
import jieba |
||||||
|
from Scripts import pdf2txt |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from jieba import posseg |
||||||
|
from config.log import writeInfo |
||||||
|
|
||||||
|
from main import MysqlDB, run, split_word, Word, create_doc_vector |
||||||
|
|
||||||
|
# Shared DB handle for manual experiments in this script
# (unused by the current entry point below).
db=MysqlDB()

if __name__ == '__main__':
    # split_word()
    # Rebuild the TF-IDF document-vector table from the current word list.
    create_doc_vector()
    # c({'3'})
Loading…
Reference in new issue