master
pan 5 years ago
parent dc9a49f501
commit 5dbd2c792b
1 changed file: job_spider.py (30 lines changed)

@@ -1,3 +1,5 @@
+import shutil
 from gevent import monkey
 from gevent.pool import Pool
@@ -9,8 +11,6 @@ import requests
 from queue import Queue
 from bs4 import BeautifulSoup
-# enable multithreading
-monkey.patch_all()
 def get_logger():
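Review note: this hunk removes the `monkey.patch_all()` call while, as far as this diff shows, the module still drives `requests` through a gevent `Pool`. Without the patch, blocking socket calls no longer yield to other greenlets, so the pool effectively runs one request at a time. A minimal sketch of the usual gevent pattern, for illustration only (placeholder URL, not the repository's code):

# Illustrative sketch, not part of this commit: gevent's monkey patching is
# normally applied before any network imports so that blocking socket calls
# inside requests cooperate with the greenlet pool.
from gevent import monkey
monkey.patch_all()              # must run before `import requests`

import requests
from gevent.pool import Pool

def fetch(url):
    # with patch_all() applied, this blocking call yields to other greenlets
    return requests.get(url, timeout=10).status_code

if __name__ == "__main__":
    pool = Pool(5)
    print(pool.map(fetch, ["https://example.com"] * 3))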
@@ -49,6 +49,7 @@ class JobSpider:
     """
     Job site spider class
     """
+    job_dir = 'data'
     def __init__(self):
         self.count = 1  # track which record is currently being scraped
@@ -64,7 +65,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("Scraping link: {}\n{}".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content.decode("gbk")
+            html = requests.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
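Review note: the unchanged context line above logs `urls.index(url) + 1`, which rescans the whole list on every iteration just to obtain a page counter; `enumerate()` yields the same index directly. A self-contained sketch under assumed placeholder values for `START_URL` and the logger (not the repository's code):

# Sketch only: enumerate() replaces the O(n) urls.index(url) lookup.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

START_URL = "https://example.com/jobs?page={}"      # placeholder URL, assumption
urls = [START_URL.format(p) for p in range(1, 200)]

for page_no, url in enumerate(urls, start=1):
    logger.info("Scraping link: %s (page %d)", url, page_no)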
@@ -84,8 +85,8 @@ class JobSpider:
             url = self.desc_url_queue.get()
             resp = requests.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("Scraping job detail #{}".format(self.count))
-                html = resp.content.decode("gbk")
+                logger.info("Scraping link: {}\njob detail #{}".format(url, self.count))
+                html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
             else:
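Review note: both this hunk and the previous one drop the explicit `.decode("gbk")` and hand raw `resp.content` bytes to BeautifulSoup, which then has to sniff the encoding itself. If the site serves GBK pages, a mis-detected charset can garble the Chinese text. One defensive middle ground, sketched here with a placeholder URL and not taken from this repository, is to let requests detect the encoding from the body and work with `resp.text`:

# Sketch only: let requests detect the page encoding instead of hard-coding
# gbk or passing undecoded bytes downstream.
import requests
from bs4 import BeautifulSoup

resp = requests.get("https://example.com", timeout=10)   # placeholder URL
resp.encoding = resp.apparent_encoding                    # charset guessed from the body
soup = BeautifulSoup(resp.text, "lxml")
print(soup.title)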
@@ -98,13 +99,14 @@ class JobSpider:
                 bs_tmp2 = bs_tmp.select('strong')[0]
                 bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                 bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-                with open('data/岗位信息.txt', 'ab+') as f:
+                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                     tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                            '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                     f.write((str(tmp) + '\n').encode('utf-8'))
                 bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                 s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                     f.write(s)
             except Exception as e:
                 logger.error(e)
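Review note: the unchanged lines in this hunk serialize each job record with `str(tmp)` and append it as UTF-8 bytes, which is hard to parse back later. A JSON-lines variant keeps the file machine-readable and preserves the Chinese keys via `ensure_ascii=False`. Sketch only, with sample data rather than the spider's parsed fields:

# Sketch only: one JSON object per line instead of str(dict).
import json
import os

job_dir = "data"                                    # mirrors the job_dir class attribute
os.makedirs(job_dir, exist_ok=True)
record = {"岗位": "Python开发", "公司": "示例公司"}   # sample data, illustrative only

with open(os.path.join(job_dir, "岗位信息.txt"), "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")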
@@ -121,6 +123,11 @@ class JobSpider:
             self.pool.apply_async(target)
     def run(self):
+        if os.path.exists(self.job_dir):
+            self.clearDir(self.job_dir)
+        else:
+            os.mkdir(self.job_dir)
         """
         Scrape data with multiple threads
         """
@@ -128,6 +135,15 @@ class JobSpider:
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # block the main thread until the queue is drained
+    def clearDir(self, rootdir):
+        filelist = os.listdir(rootdir)
+        for f in filelist:
+            filepath = os.path.join(rootdir, f)
+            if os.path.isfile(filepath):
+                os.remove(filepath)
+                print(filepath + " removed!")
+            elif os.path.isdir(filepath):
+                shutil.rmtree(filepath, True)
 if __name__ == "__main__":
     spider = JobSpider()
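Review note: the new directory handling is split between `run()` (an exists check plus `os.mkdir`) and the added `clearDir()` helper, and the added statements now sit in front of the `"""Scrape data with multiple threads"""` string, which therefore no longer acts as the method's docstring. The same "start from an empty data directory" step can be done in two standard-library calls; a sketch, not the repository's method:

# Sketch only: reset the output directory with shutil/os directly.
import os
import shutil

def reset_dir(rootdir):
    # remove the tree if it exists (ignoring errors), then recreate it
    shutil.rmtree(rootdir, ignore_errors=True)
    os.makedirs(rootdir, exist_ok=True)

reset_dir("data")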
