自动清理旧数据

解决频繁爬取导致的“Max retries exceeded with URL in requests”错误
master
pan 5 years ago
parent 5dbd2c792b
commit e05b64c18d
  1. 16
      Test.py
  2. 38
      job_spider.py

@ -0,0 +1,16 @@
import requests
from bs4 import BeautifulSoup
# Scratch script: download a single 51job job-detail page and extract the
# header fields (title, company, salary, location, ...) into the dict `tmp`.

# Request headers that present the script as an AJAX call from desktop Chrome.
HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}

# requests never times out by default, so a stalled connection would block the
# script forever; bound the wait explicitly.
resp = requests.get(
    'https://jobs.51job.com/guangzhou/119982727.html?s=01&t=0',
    headers=HEADERS,
    timeout=10,
)
# Fail right here on an HTTP error instead of parsing an error page and
# hitting an opaque IndexError in the selectors below.
resp.raise_for_status()
html = resp.content

# Container element that holds the job header block.
bs_tmp = BeautifulSoup(html, 'lxml').select(
    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
bs_tmp1 = bs_tmp.select('h1')[0]                # element used for the "岗位" field
bs_tmp2 = bs_tmp.select('strong')[0]            # element used for the "薪资" field
bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]  # link whose title is the "公司" field
# The summary line is '|'-separated; drop non-breaking spaces before splitting.
bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')

# The dict below reads the first five fields of the summary line; report a
# clear error when the page does not provide them.
if len(bs_tmp4) < 5:
    raise ValueError("expected at least 5 '|'-separated fields, got {!r}".format(bs_tmp4))

tmp = {
    "岗位": bs_tmp1.text,
    "公司": bs_tmp3['title'],
    "薪资": bs_tmp2.text,
    '位置': bs_tmp4[0],
    '工作经验': bs_tmp4[1],
    '学历': bs_tmp4[2],
    '招聘人数': bs_tmp4[3],
    '发布时间': bs_tmp4[4],
}

@ -4,6 +4,9 @@ from gevent import monkey
from gevent.pool import Pool
monkey.patch_all(select=False)
from requests.adapters import HTTPAdapter
from urllib3 import Retry
import time
import os
import logging
@ -12,7 +15,6 @@ from queue import Queue
from bs4 import BeautifulSoup
def get_logger():
"""
创建日志实例
@ -43,6 +45,11 @@ LOG_LEVEL = logging.INFO # 日志等级
POOL_MAXSIZE = 8 # 线程池最大容量
logger = get_logger()
# Module-level HTTP session; the spider's session.get(...) calls go through it.
session = requests.Session()
# Retry policy: up to 3 retries on connection errors, with a backoff factor of 0.5.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# Install the retrying adapter for both URL schemes.
session.mount('http://', adapter)
session.mount('https://', adapter)
class JobSpider:
@ -65,7 +72,7 @@ class JobSpider:
urls = [START_URL.format(p) for p in range(1, 200)] # #resultList > div:nth-child(53)
for url in urls:
logger.info("爬取链接:{}\n{}".format(url, urls.index(url) + 1))
html = requests.get(url, headers=HEADERS).content
html = session.get(url, headers=HEADERS).content
bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
for b in bs:
try:
@ -83,9 +90,9 @@ class JobSpider:
while True:
# 从队列中取 url
url = self.desc_url_queue.get()
resp = requests.get(url, headers=HEADERS)
resp = session.get(url, headers=HEADERS)
if resp.status_code == 200:
logger.info("爬取链接:{}\n{} 条岗位详情".format(url,self.count))
logger.info("爬取链接:{}\n{} 条岗位详情".format(url, self.count))
html = resp.content
self.desc_url_queue.task_done()
self.count += 1
@ -99,15 +106,15 @@ class JobSpider:
bs_tmp2 = bs_tmp.select('strong')[0]
bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
'工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
f.write((str(tmp) + '\n').encode('utf-8'))
bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
f.write(s)
if len(bs_tmp4) == 5:
with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
'工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
f.write((str(tmp) + '\n').encode('utf-8'))
bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
f.write(s)
except Exception as e:
logger.error(e)
logger.warning(url)
@ -135,7 +142,7 @@ class JobSpider:
self.execute_more_tasks(self.post_require)
self.desc_url_queue.join() # 主线程阻塞,等待队列清空
def clearDir(self,rootdir):
def clearDir(self, rootdir):
filelist = os.listdir(rootdir)
for f in filelist:
filepath = os.path.join(rootdir, f)
@ -145,8 +152,9 @@ class JobSpider:
elif os.path.isdir(filepath):
shutil.rmtree(filepath, True)
if __name__ == "__main__":
spider = JobSpider()
start = time.time()
spider.run()
logger.info("总耗时 {}".format(time.time() - start))
logger.info("总耗时 {}".format(time.time() - start))

Loading…
Cancel
Save