diff --git a/Test.py b/Test.py
new file mode 100644
index 0000000..74ddfcc
--- /dev/null
+++ b/Test.py
@@ -0,0 +1,20 @@
+import requests
+from bs4 import BeautifulSoup
+
+HEADERS = {
+    "X-Requested-With": "XMLHttpRequest",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
+                  " (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
+}
+
+# Fetch a single job-detail page and parse the header block of the posting.
+html = requests.get('https://jobs.51job.com/guangzhou/119982727.html?s=01&t=0', headers=HEADERS).content
+bs_tmp = BeautifulSoup(html, 'lxml').select(
+    'body > div.tCompanyPage > div.tCompany_center.clearfix > div.tHeader.tHjob > div > div.cn')[0]
+bs_tmp1 = bs_tmp.select('h1')[0]  # job title
+bs_tmp2 = bs_tmp.select('strong')[0]  # salary
+bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]  # company link
+# The info line holds location|experience|education|headcount|posting date, separated by '|'.
+bs_tmp4 = bs_tmp.select('p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
+tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
+       '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
\ No newline at end of file
diff --git a/job_spider.py b/job_spider.py
index a52b57d..684b772 100644
--- a/job_spider.py
+++ b/job_spider.py
@@ -4,6 +4,9 @@ from gevent import monkey
 from gevent.pool import Pool
 
 monkey.patch_all(select=False)
+
+from requests.adapters import HTTPAdapter
+from urllib3 import Retry
 import time
 import os
 import logging
@@ -12,7 +15,6 @@ from queue import Queue
 from bs4 import BeautifulSoup
 
-
 def get_logger():
     """
     创建日志实例
@@ -43,6 +45,11 @@ LOG_LEVEL = logging.INFO  # 日志等级
 POOL_MAXSIZE = 8  # 线程池最大容量
 
 logger = get_logger()
+session = requests.Session()
+retry = Retry(connect=3, backoff_factor=0.5)
+adapter = HTTPAdapter(max_retries=retry)
+session.mount('http://', adapter)
+session.mount('https://', adapter)
 
 
 class JobSpider:
@@ -65,7 +72,7 @@ class JobSpider:
         urls = [START_URL.format(p) for p in range(1, 200)]  # #resultList > div:nth-child(53)
         for url in urls:
             logger.info("爬取链接:{}\n第 {} 页".format(url, urls.index(url) + 1))
-            html = requests.get(url, headers=HEADERS).content
+            html = session.get(url, headers=HEADERS).content
             bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all("div", class_="el")
             for b in bs:
                 try:
@@ -83,9 +90,9 @@ class JobSpider:
         while True:
             # 从队列中取 url
             url = self.desc_url_queue.get()
-            resp = requests.get(url, headers=HEADERS)
+            resp = session.get(url, headers=HEADERS)
             if resp.status_code == 200:
-                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url,self.count))
+                logger.info("爬取链接:{}\n第 {} 条岗位详情".format(url, self.count))
                 html = resp.content
                 self.desc_url_queue.task_done()
                 self.count += 1
@@ -99,15 +106,15 @@
                 bs_tmp2 = bs_tmp.select('strong')[0]
                 bs_tmp3 = bs_tmp.select('p.cname > a.catn')[0]
                 bs_tmp4 = bs_tmp.select(' p.msg.ltype')[0].text.replace(u'\xa0', '').split('|')
-
-                with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
-                    tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
-                           '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
-                    f.write((str(tmp) + '\n').encode('utf-8'))
-                bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
-                s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
-                with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
-                    f.write(s)
+                if len(bs_tmp4) == 5:
+                    with open(os.path.join(self.job_dir, "岗位信息.txt"), 'ab+') as f:
+                        tmp = {"岗位": bs_tmp1.text, "公司": bs_tmp3['title'], "薪资": bs_tmp2.text, '位置': bs_tmp4[0],
+                               '工作经验': bs_tmp4[1], '学历': bs_tmp4[2], '招聘人数': bs_tmp4[3], '发布时间': bs_tmp4[4]}
+                        f.write((str(tmp) + '\n').encode('utf-8'))
+                    bs = BeautifulSoup(html, "lxml").find("div", class_="bmsg job_msg inbox").text
+                    s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace("\t", "").strip()
+                    with open(os.path.join(self.job_dir, "岗位描述.txt"), "a", encoding="utf-8") as f:
+                        f.write(s)
             except Exception as e:
                 logger.error(e)
                 logger.warning(url)
@@ -135,7 +142,7 @@ class JobSpider:
         self.execute_more_tasks(self.post_require)
         self.desc_url_queue.join()  # 主线程阻塞,等待队列清空
 
-    def clearDir(self,rootdir):
+    def clearDir(self, rootdir):
         filelist = os.listdir(rootdir)
         for f in filelist:
             filepath = os.path.join(rootdir, f)
@@ -145,8 +152,9 @@ class JobSpider:
             elif os.path.isdir(filepath):
                 shutil.rmtree(filepath, True)
 
+
 if __name__ == "__main__":
     spider = JobSpider()
     start = time.time()
     spider.run()
-    logger.info("总耗时 {} 秒".format(time.time() - start))
\ No newline at end of file
+    logger.info("总耗时 {} 秒".format(time.time() - start))
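
The job_spider.py half of this patch replaces bare requests.get() calls with a module-level requests.Session that has urllib3's Retry policy mounted through an HTTPAdapter, so transient connection failures are retried instead of killing a worker. Below is a minimal standalone sketch of that same pattern; the example URL and the timeout value are illustrative assumptions, not part of the patch:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3 import Retry

    session = requests.Session()
    # Retry up to 3 times on connection errors, with exponential backoff
    # between attempts scaled by 0.5 seconds.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)   # route all http:// requests through the adapter
    session.mount('https://', adapter)  # likewise for https://

    # Illustrative usage: example.com stands in for a real target URL.
    resp = session.get('https://example.com', timeout=10)
    print(resp.status_code)

A shared Session also reuses TCP connections across requests, which helps here because the spider fetches thousands of detail pages from the same host.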