-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjobs.py
105 lines (81 loc) · 2.85 KB
/
jobs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from browsers import load_chrome, parse_html
# Value of the ``class`` attribute that get_job_header() searches for when it
# looks up the header <div> of a parsed job page.
HEADER_CLASS = 'gc-card__header gc-job-detail__header'
def load_tabs(browser, jobs):
    """Open every job URL in its own named browser window.

    Args:
        browser: Driver object exposing ``execute_script(script, *args)``
            (presumably a Selenium WebDriver -- confirm against
            ``load_chrome``).
        jobs: Iterable of mappings; each must provide its URL under ``'loc'``.

    Returns:
        The window names (``'tab1'``, ``'tab2'``, ...) in the order of
        ``jobs``.
    """
    tabs = []
    for idx, job in enumerate(jobs, start=1):
        name = f'tab{idx}'
        # Hand the URL and window name over as script arguments instead of
        # splicing them into the JavaScript source, so a quote or backslash
        # in the URL can neither break nor alter the script.
        browser.execute_script(
            'window.open(arguments[0], arguments[1]);', job['loc'], name
        )
        tabs.append(name)
    return tabs
def get_job_header(page):
    """Look up the job header ``<div>`` on a parsed page.

    Args:
        page: Parsed page object exposing ``find(name, attrs)`` (presumably
            a BeautifulSoup document -- confirm against ``parse_html``).

    Returns:
        Whatever ``page.find`` yields for a ``div`` whose class equals
        ``HEADER_CLASS``; ``process_job`` treats ``None`` as "not found".
    """
    return page.find('div', {'class': HEADER_CLASS})
def get_job_title(header):
    """Return the text of the ``<h1>`` element found inside *header*.

    Args:
        header: Header element exposing ``find(name)``.

    Returns:
        The ``text`` attribute of the element returned by
        ``header.find('h1')``.
    """
    return header.find('h1').text
def get_job_locations_by_header(header):
    """Collect one ``[country, state, city]`` entry per address in *header*.

    Every ``div`` with ``itemprop='address'`` is inspected for nested divs
    tagged ``addressLocality`` (city), ``addressRegion`` (state) and
    ``addressCountry`` (country).

    Args:
        header: Header element exposing ``find_all(name, itemprop=...)``.

    Returns:
        A list of ``[country, state, city]`` lists. A component holds the
        stripped text of its div, or the falsy lookup result (e.g. ``None``)
        when the div is missing.
    """
    def stripped_text(address, itemprop):
        # Text of the nested div, or the falsy lookup result when absent.
        node = address.find('div', itemprop=itemprop)
        return node.text.strip() if node else node

    locations = []
    for address in header.find_all('div', itemprop='address'):
        city = stripped_text(address, 'addressLocality')
        state = stripped_text(address, 'addressRegion')
        country = stripped_text(address, 'addressCountry')
        locations.append([country, state, city])
    return locations
def get_job_locations_by_page(page):
    """Parse the locations listed in bold after the page's application note.

    The text of the first ``<b>...</b>`` following the note is split on
    ``;`` into locations and each location on ``,`` into its components.

    Args:
        page: The page markup as a string.

    Returns:
        A list of ``[country, state, city]`` lists. ``city`` is the first
        component and ``country`` the last; ``state`` is the second component
        when there are at least three, otherwise ``None``. A location with a
        single component therefore uses it as both city and country. Returns
        an empty list when the note or its bold section is missing.
    """
    locations_message = 'Note: By applying to this position your application'
    if locations_message not in page:
        return []

    after_note = page.split(locations_message)[1]
    bold_chunks = after_note.split('<b>')
    if len(bold_chunks) < 2:
        # The note is not followed by a <b> element: nothing to parse.
        return []
    listing = bold_chunks[1].split('</b>')[0]

    locations = []
    for entry in listing.split(';'):
        if not entry.strip():
            # Blank piece, e.g. the remainder after a trailing ';'.
            continue
        parts = [part.strip() for part in entry.split(',')]
        city = parts[0]
        country = parts[-1]
        # Only entries with three or more components carry a state; indexing
        # parts[1] unconditionally would fail on single-component entries.
        state = parts[1] if len(parts) > 2 else None
        locations.append([country, state, city])
    return locations
def get_job_locations(page, header):
    """Gather the job's locations from the header and from the page text.

    Args:
        page: Parsed page object; it is converted with ``str()`` before the
            text-based lookup.
        header: Header element of the same page.

    Returns:
        The header-derived locations followed by the page-derived ones, each
        a ``[country, state, city]`` list.
    """
    from_header = get_job_locations_by_header(header)
    from_page = get_job_locations_by_page(str(page))
    from_header.extend(from_page)
    return from_header
def is_job_open(page):
    """Report whether the page announces that applications are closed.

    Args:
        page: Page object or string; it is converted with ``str()`` before
            the search.

    Returns:
        ``'Closed'`` when the closed-applications notice occurs in the page
        text, ``'Open'`` otherwise.
    """
    is_closed = 'Applications are currently closed for this role.' in str(page)
    return 'Closed' if is_closed else 'Open'
def process_job(browser):
    """Scrape the job shown in the browser's current window.

    The page is re-parsed until a header with a non-empty title is found.

    Args:
        browser: Driver object passed to ``parse_html``; its ``current_url``
            attribute is recorded with the job.

    Returns:
        A dict with the keys ``'title'``, ``'locations'``, ``'valid'``
        (``'Open'`` or ``'Closed'``) and ``'url'``.
    """
    while True:
        page = parse_html(browser)
        header = get_job_header(page)
        if header is None:
            # Header not present on this parse; parse again.
            continue
        job = {
            'title': get_job_title(header),
            'locations': get_job_locations(page, header),
            'valid': is_job_open(page),
            'url': browser.current_url,
        }
        if len(job['title']) > 0:
            break
    print(f"Loaded: {job['title']} - {job['locations']}")
    return job
def process_jobs(browser, tabs):
    """Scrape the job in each named window, one after another.

    Args:
        browser: Driver object exposing ``switch_to.window(name)``.
        tabs: Window names to visit, in order.

    Returns:
        A list with the ``process_job`` result for every entry of ``tabs``,
        in the same order.
    """
    def scrape(tab):
        # Bring the window to the foreground, then parse what it shows.
        browser.switch_to.window(tab)
        return process_job(browser)

    return [scrape(tab) for tab in tabs]
def parse_jobs(jobs):
    """Open all job pages in a fresh browser and scrape them.

    Args:
        jobs: Iterable of mappings; each must provide its URL under ``'loc'``.

    Returns:
        A list with one parsed job dict per entry of ``jobs``.
    """
    browser = load_chrome()
    try:
        tabs = load_tabs(browser, jobs)
        return process_jobs(browser, tabs)
    finally:
        # Shut the browser down even when loading or scraping raises, so a
        # failed run does not leave a browser process behind.
        browser.quit()