|
|
|
@ -4,6 +4,9 @@ from gevent import monkey |
|
|
|
|
from gevent.pool import Pool |
|
|
|
|
|
|
|
|
|
monkey.patch_all(select=False) |
|
|
|
|
|
|
|
|
|
from requests.adapters import HTTPAdapter |
|
|
|
|
from urllib3 import Retry |
|
|
|
|
import time |
|
|
|
|
import os |
|
|
|
|
import logging |
|
|
|
@ -12,7 +15,6 @@ from queue import Queue |
|
|
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_logger(): |
|
|
|
|
""" |
|
|
|
|
创建日志实例 |
|
|
|
@ -43,6 +45,11 @@ LOG_LEVEL = logging.INFO # 日志等级 |
|
|
|
|
POOL_MAXSIZE = 8 # 线程池最大容量 |
|
|
|
|
|
|
|
|
|
logger = get_logger() |
|
|
|
|
session = requests.Session() |
|
|
|
|
retry = Retry(connect=3, backoff_factor=0.5) |
|
|
|
|
adapter = HTTPAdapter(max_retries=retry) |
|
|
|
|
session.mount('http://', adapter) |
|
|
|
|
session.mount('https://', adapter) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JobSpider: |
|
|
|
@ -65,7 +72,7 @@ class JobSpider: |
|
|
|
|
urls = [START_URL.format(p) for p in range(1, 200)] # #resultList > div:nth-child(53) |
|
|
|
|
for url in urls: |
|
|
|
|
logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1)) |
|
|
|
|
html = requests.get(url, headers=HEADERS).content |
|
|
|
|
html = session.get(url, headers=HEADERS).content |
|
|
|
|
bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el") |
|
|
|
|
for b in bs: |
|
|
|
|
try: |
|
|
|
@ -83,9 +90,9 @@ class JobSpider: |
|
|
|
|
while True: |
|
|
|
|
# 从队列中取 url |
|
|
|
|
url = self.desc_url_queue.get() |
|
|
|
|
resp = requests.get(url, headers=HEADERS) |
|
|
|
|
resp = session.get(url, headers=HEADERS) |
|
|
|
|
if resp.status_code == 200: |
|
|
|
|
logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url,self.count)) |
|
|
|
|
logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count)) |
|
|
|
|
html = resp.content |
|
|
|
|
self.desc_url_queue.task_done() |
|
|
|
|
self.count += 1 |
|
|
|
@ -99,15 +106,15 @@ class JobSpider: |
|
|
|
|
bs_tmp2 = bs_tmp.select('strong')[0] |
|
|
|
|
bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0] |
|
|
|
|
bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|') |
|
|
|
|
|
|
|
|
|
with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f: |
|
|
|
|
tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0], |
|
|
|
|
'工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]} |
|
|
|
|
f.write((str(tmp) + '\n').encode('utf-8')) |
|
|
|
|
bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text |
|
|
|
|
s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip() |
|
|
|
|
with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f: |
|
|
|
|
f.write(s) |
|
|
|
|
if len(bs_tmp4) == 5: |
|
|
|
|
with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f: |
|
|
|
|
tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0], |
|
|
|
|
'工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]} |
|
|
|
|
f.write((str(tmp) + '\n').encode('utf-8')) |
|
|
|
|
bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text |
|
|
|
|
s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip() |
|
|
|
|
with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f: |
|
|
|
|
f.write(s) |
|
|
|
|
except Exception as e: |
|
|
|
|
logger.error(e) |
|
|
|
|
logger.warning(url) |
|
|
|
@ -135,7 +142,7 @@ class JobSpider: |
|
|
|
|
self.execute_more_tasks(self.post_require) |
|
|
|
|
self.desc_url_queue.join() # 主线程阻塞,等待队列清空 |
|
|
|
|
|
|
|
|
|
def clearDir(self,rootdir): |
|
|
|
|
def clearDir(self, rootdir): |
|
|
|
|
filelist = os.listdir(rootdir) |
|
|
|
|
for f in filelist: |
|
|
|
|
filepath = os.path.join(rootdir, f) |
|
|
|
@ -145,8 +152,9 @@ class JobSpider: |
|
|
|
|
elif os.path.isdir(filepath): |
|
|
|
|
shutil.rmtree(filepath, True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
spider = JobSpider() |
|
|
|
|
start = time.time() |
|
|
|
|
spider.run() |
|
|
|
|
logger.info("总耗时 {} 秒".format(time.time() - start)) |
|
|
|
|
logger.info("总耗时 {} 秒".format(time.time() - start)) |
|
|
|
|