puf.py
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import html5lib  # imported only to make sure BeautifulSoup's 'html5lib' parser is installed
# NOTES
# You have to solve a reCAPTCHA to enter the site the first time.
# The scraper collects up to 250 resumes per field (5 pages of results). You can
# increase that by changing range(5) in the pagination loop below.
# Fill this list with the search fields
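# Requirements (assuming pip): pip install beautifulsoup4 selenium html5lib
# You also need a ChromeDriver that matches your Chrome version; recent
# Selenium releases can fetch one automatically via Selenium Manager.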
myfields = ["python", "java", "dot net"]
baseurl = "https://resumes.indeed.com/"
os.makedirs("Resumes", exist_ok=True)  # make sure the output directory exists
driver = webdriver.Chrome()
driver.get(baseurl)
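# Optional alternative setup (a sketch, not part of the original script):
# starting Chrome with a persistent user profile may reduce how often the
# reCAPTCHA is shown across runs. The profile path below is a placeholder.
# options = webdriver.ChromeOptions()
# options.add_argument("--user-data-dir=/tmp/indeed-profile")  # hypothetical path
# driver = webdriver.Chrome(options=options)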
for ele, field in enumerate(myfields):
    count = 0
    # Clear the search box before typing the next field
    myfield = driver.find_element(By.XPATH, '//*[@id="input-q"]')
    myfield.send_keys(Keys.CONTROL + "a")
    myfield.send_keys(Keys.DELETE)
    myfield.send_keys(field)
    # The search button sits at a different XPath on the landing page than on
    # the results page, hence the two branches
    if ele == 0:
        driver.find_element(By.XPATH, '//*[@id="content"]/div/div[2]/div/div[1]/div[2]/div/form/div[3]/button').click()
    else:
        driver.find_element(By.XPATH, '//*[@id="content"]/div/div[2]/div/div[2]/div[1]/div[1]/div/form/div[3]/button').click()
    # For the reCAPTCHA - Indeed asks for it only once (on the first field)
    try:
        # Wait up to 180 seconds, or until the first resume link is present
        element = WebDriverWait(driver, 180).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="content"]/div/div[2]/div/div[2]/div[2]/div[1]/div/div[1]/span[1]/a'))
        )
    finally:
        html = driver.page_source
        soup = BeautifulSoup(html, 'html5lib')
    pages = soup.find('div', {'class': 'rezemp-ResumeSearchPage-Pagination icl-Grid-col icl-u-xs-span12 icl-u-md-span7'})
    print(len(pages))  # number of child elements in the pagination bar
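    # A possible refinement (an assumption, not in the original): derive the
    # page count from the pagination bar instead of hardcoding range(5).
    # Note that `pages` can be None when a query returns a single results page.
    # num_pages = len(pages.find_all('a')) if pages else 1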
    for index in range(5):
        divs = []
        if index == 0:
            pass  # already on the first results page
        elif index == 1:
            driver.find_element(By.XPATH, '//*[@id="content"]/div/div[2]/div/div[2]/div[5]/span[14]').click()
        else:
            # From the second page on the "Next" control sits one position
            # further along, presumably because a "Previous" control appears
            driver.find_element(By.XPATH, '//*[@id="content"]/div/div[2]/div/div[2]/div[5]/span[15]').click()
        # Re-parse the page after each pagination click so the soup matches
        # the results currently on screen
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        for div in soup.find('div', {'class': 'icl-Grid-col icl-u-xs-span12 icl-u-md-span7 icl-Body'}):
            divs.append(div)
        for i in range(1, len(divs) + 1):
            window_before = driver.window_handles[0]
            # Each resume link opens in a new tab
            driver.find_element(By.XPATH, '//*[@id="content"]/div/div[2]/div/div[2]/div[2]/div[{}]/div/div[1]/span[1]/a'.format(i)).click()
            window_after = driver.window_handles[1]
            driver.switch_to.window(window_after)
            hm = driver.find_element(By.XPATH, '//*[@id="content"]/div/div[3]/div/div[2]/div/div')
            mytext = hm.text
            with open("Resumes/{0}-{1}.txt".format(field, i + count), "w", encoding="utf-8") as textfile:
                textfile.write(mytext)
            driver.close()
            driver.switch_to.window(window_before)
        count += len(divs)
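# A more defensive variant of the tab handling above (a sketch, not in the
# original): wait until the new tab has actually opened before switching,
# instead of indexing window_handles[1] right after the click.
# WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
# driver.switch_to.window(driver.window_handles[-1])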
# Close the browser once all fields have been scraped
driver.quit()