forked from xingag/spider_python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider_boss.py
186 lines (137 loc) · 5.07 KB
/
spider_boss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: spider_boss.py
@time: 2018/10/12 10:17
@description:使用selenium爬取boss直聘网并写入到csv文件中
"""
from selenium import webdriver
import re
from lxml import etree
import requests
import time
import string_utils
import csv
# Page counter used only in progress messages. It is module-level state shared
# by every BossSpider and is never reset, so it keeps counting across runs.
current_page = 1


class BossSpider(object):
    """Selenium crawler that appends Boss Zhipin job postings to ``positions.csv``.

    ``run()`` loads the search-result page in ``self.url``, opens every linked
    detail page in a second browser window, extracts the posting fields and
    writes one CSV row per posting.  Parsed rows are also kept in
    ``self.positions``.

    The spider owns a browser and an open file; ``run()`` releases both when it
    finishes (successfully or not), so an instance is meant to be run once.
    """

    # Path of the chromedriver binary handed to Selenium.
    driver_path = "/usr/local/bin/chromedriver"

    # CSV column order; these are also the keys of every position dict.
    fieldnames = ['company_name', 'name', 'salary', 'city', 'work_years', 'education', 'desc']

    def __init__(self):
        # NOTE(review): ``executable_path=`` and ``find_element_by_xpath`` (used
        # in ``run``) are Selenium 3 style APIs -- confirm the pinned Selenium
        # version still provides them.
        self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path)

        # Prefix for the relative detail-page links found on the list page.
        self.domain = 'https://www.zhipin.com'

        # Search-result page the crawl starts from.
        self.url = 'https://www.zhipin.com/job_detail/?query=python&scity=100010000&industry=&position='

        self.positions = []

        # Results are appended to the CSV so earlier runs are preserved.
        self._fp, self.writer = self._open_csv('positions.csv')

    @staticmethod
    def _open_csv(path):
        """Open *path* for appending and return ``(file, csv.DictWriter)``.

        The header row is written only when the file is still empty, so
        re-running the spider does not insert a header in the middle of the
        data.
        """
        fp = open(path, 'a', newline='', encoding='utf-8')
        writer = csv.DictWriter(fp, BossSpider.fieldnames)
        # Seeking to the end (whence=2) returns the current size of the file.
        if fp.seek(0, 2) == 0:
            writer.writeheader()
        return fp, writer

    def close(self):
        """Close the CSV file and quit the browser; safe to call repeatedly."""
        fp = getattr(self, '_fp', None)
        driver = getattr(self, 'driver', None)
        try:
            if fp is not None and not fp.closed:
                fp.close()
        finally:
            if driver is not None and not getattr(self, '_driver_quit', False):
                # Set the flag first so a failing quit() is not retried forever.
                self._driver_quit = True
                driver.quit()

    def run(self, max_pages=1):
        """Crawl the result list page by page, then release all resources.

        :param max_pages: maximum number of list pages to crawl.  Defaults to
            1 because the site answers heavier crawling with a captcha.  Pass
            ``None`` to continue until the "next page" button is disabled.
        :return: None
        """
        global current_page

        pages_done = 0
        try:
            self.driver.get(self.url)

            while True:
                print('爬取第%d页数据' % current_page)
                current_page = current_page + 1

                # Parse the list page that is currently displayed.
                self.parse_current_page(self.driver.page_source)
                pages_done += 1

                # Stop before touching the pager once the page budget is used.
                if max_pages is not None and pages_done >= max_pages:
                    break

                next_bt = self.driver.find_element_by_xpath("//a[@ka='page-next']")
                # get_attribute may return None when the attribute is absent.
                if 'disabled' in (next_bt.get_attribute("class") or ''):
                    # Last page reached.
                    break

                next_bt.click()
                # Give the next list page a moment to load.
                time.sleep(1)
        finally:
            # Always close the CSV file and the browser, even after an error.
            self.close()

    def parse_current_page(self, source):
        """Extract the detail-page links from a list page and crawl each one.

        :param source: HTML source of the list page
        :return: None
        """
        html = etree.HTML(source)

        # Relative link of every posting's detail page.
        detail_paths = html.xpath('//div[@class="info-primary"]//a/@href')

        for detail_path in detail_paths:
            self.request_detail_page(self.domain + detail_path)
            # Pause between detail pages.
            time.sleep(1)

    def request_detail_page(self, detail_url):
        """Open a detail page in a new window, parse it, and return to the list.

        :param detail_url: absolute URL of the posting's detail page
        :return: None
        """
        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript source, so quotes in a scraped link cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", detail_url)
        self.driver.switch_to.window(self.driver.window_handles[1])

        try:
            self.parse_detail_page(self.driver.page_source)
        finally:
            # Close the detail window and go back to the list even when
            # parsing fails, so later pages do not pile up extra windows.
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, page_source_detail):
        """Extract one posting from a detail page and store it.

        A page that lacks any of the expected fields (for example an
        anti-crawler verification page) is skipped with a message instead of
        raising ``IndexError``.

        :param page_source_detail: HTML source of the detail page
        :return: None
        """
        html = etree.HTML(page_source_detail)

        names = html.xpath('//h1/text()')
        company_names = html.xpath('//h3[@class="name"]/a[@ka="job-detail-company"]/text()')
        salaries = html.xpath("//div[@class='name']/span[@class='badge']/text()")
        # Text nodes read below as city, work years and education, in order.
        infos = html.xpath("//div[@class='job-primary detail-box']/div[@class='info-primary']/p/text()")
        desc_pre = html.xpath('//div[@class="job-sec"]/div[@class="text"]/text()')

        if not (names and company_names and salaries) or len(infos) < 3:
            print('详情页缺少必要字段,已跳过')
            return

        # Join the description lines, trim them, then drop special symbols.
        desc = string_utils.remove_special_word('\n'.join(desc_pre).strip())

        position = {
            'company_name': company_names[0].strip(),
            'name': names[0],
            'salary': salaries[0].strip(),
            'city': infos[0],
            'work_years': infos[1],
            'education': infos[2],
            'desc': desc
        }

        print('爬取一条数据成功')
        print("==" * 40)

        self.write_to_csv(position)
        self.positions.append(position)

    def write_to_csv(self, position):
        """Write one posting as a row of the CSV file.

        :param position: dict keyed by ``BossSpider.fieldnames``
        :return: None
        """
        self.writer.writerow(position)
def main():
    """Run one crawl and print the postings that were collected."""
    crawler = BossSpider()
    crawler.run()

    # Report completion, then dump the collected postings for inspection.
    print('恭喜!爬取数据完成~')
    print(crawler.positions)


if __name__ == '__main__':
    main()