@@ -1,3 +1,5 @@
+import shutil
+
 from gevent import monkey
 from gevent.pool import Pool
 
@@ -9,8 +11,6 @@ import requests
 from queue import Queue
 from bs4 import BeautifulSoup
 
-# enable multithreading
-monkey.patch_all()
 
 
 def get_logger():
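
Aside on the deletion above: gevent's own docs advise calling monkey.patch_all() as early as possible, before network modules such as requests are imported, so that their sockets become cooperative. This hunk only removes the module-level call; where (or whether) it is re-added is not visible in the diff. A minimal sketch of the usual patch-early pattern, for reference:

    from gevent import monkey

    monkey.patch_all()  # patch sockets/threading before importing network libraries

    import requests  # now uses cooperative, non-blocking sockets under gevent
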
@@ -49,6 +49,7 @@ class JobSpider:
     """
     Job site spider class
     """
+    job_dir = 'data'
 
     def __init__(self):
         self.count = 1  # index of the record currently being crawled
@@ -64,7 +65,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("Crawling URL: {}\npage {}".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content.decode("gbk")
+            html = requests.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
@@ -84,8 +85,8 @@ class JobSpider:
             url = self.desc_url_queue.get()
             resp = requests.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("Crawling job detail No. {}".format(self.count))
-                html = resp.content.decode("gbk")
+                logger.info("Crawling URL: {}\njob detail No. {}".format(url, self.count))
+                html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
             else:
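
Note on the two .decode("gbk") removals in the hunks above: BeautifulSoup now receives raw bytes and has to sniff the document encoding itself, which usually works but adds overhead and can misdetect. If explicit decoding is still wanted without hard-coding GBK, requests can detect the charset; a minimal sketch (the URL is a hypothetical stand-in, not the spider's START_URL):

    import requests

    resp = requests.get("http://example.com/jobs?page=1", timeout=10)
    # apparent_encoding runs charset detection over the body; fall back to GBK,
    # which these pages are assumed to declare.
    resp.encoding = resp.apparent_encoding or "gbk"
    html = resp.text  # decoded str, so BeautifulSoup no longer has to guess
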
@@ -98,13 +99,14 @@ class JobSpider:
                 bs_tmp2 = bs_tmp.select('strong')[0]
                 bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                 bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-                with open('data/岗位信息.txt', 'ab+') as f:
+                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
                     tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
                            '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
                     f.write((str(tmp) + '\n').encode('utf-8'))
+
                 bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
                 s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join("data", "岗位描述.txt"), "a", encoding="utf-8") as f:
+                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
                     f.write(s)
             except Exception as e:
                 logger.error(e)
@@ -121,6 +123,11 @@ class JobSpider:
             self.pool.apply_async(target)
 
     def run(self):
+        if os.path.exists(self.job_dir):
+            self.clearDir(self.job_dir)
+        else:
+            os.mkdir(self.job_dir)
+
         """
         Crawl data with multiple threads
         """
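
The added run() preamble resets the output directory: clearDir() (added in the next hunk) empties it if it exists, otherwise os.mkdir() creates it. A shorter equivalent, assuming a full wipe on every run is the intent:

    import os
    import shutil

    job_dir = "data"

    # Remove the whole tree if present, then recreate it empty.
    # ignore_errors=True mirrors the shutil.rmtree(filepath, True) call in clearDir.
    shutil.rmtree(job_dir, ignore_errors=True)
    os.makedirs(job_dir, exist_ok=True)
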
@@ -128,9 +135,18 @@ class JobSpider:
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # block the main thread until the queue is drained
 
+    def clearDir(self, rootdir):
+        filelist = os.listdir(rootdir)
+        for f in filelist:
+            filepath = os.path.join(rootdir, f)
+            if os.path.isfile(filepath):
+                os.remove(filepath)
+                print(filepath + " removed!")
+            elif os.path.isdir(filepath):
+                shutil.rmtree(filepath, True)
 
 if __name__ == "__main__":
     spider = JobSpider()
     start = time.time()
     spider.run()
     logger.info("Total time: {} seconds".format(time.time() - start))
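
For context on the join() choreography at the end of run(): the spider fills desc_url_queue with detail-page URLs, execute_more_tasks() spawns pool workers that drain it, and desc_url_queue.join() blocks until every item has been marked done. A self-contained sketch of that producer/consumer shape (worker count and names are illustrative, not taken from this patch):

    from gevent import monkey

    monkey.patch_all()  # must precede the imports below so blocking calls cooperate

    from queue import Queue

    from gevent.pool import Pool

    task_queue = Queue()
    for n in range(10):  # stand-in for the scraped detail URLs
        task_queue.put(n)


    def worker():
        while True:
            item = task_queue.get()  # blocks until an item is available
            print("processed", item)
            task_queue.task_done()  # pairs with join() below


    pool = Pool(5)  # stand-in for THREAD_NUMBER
    for _ in range(5):
        pool.apply_async(worker)

    task_queue.join()  # returns once every put() has a matching task_done()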