Skip to content

Commit

Permalink
edited scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
Prinz Magtulis authored and Prinz Magtulis committed Sep 23, 2022
1 parent eb34dcf commit b5d3dfa
Showing 1 changed file with 18 additions and 22 deletions.
40 changes: 18 additions & 22 deletions autoscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from webdriver_manager.core.utils import ChromeType
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Expand Down Expand Up @@ -51,27 +50,24 @@

# One dict per scraped ".mb10" element, appended in page order.
dataset = []

# Absolute XPath of the link that is clicked at the end of every pass
# (presumably the next-page control -- confirm against the target site).
next_link_xpath = "/html/body/section/div/div/div/div[2]/div/div/div/a"

while True:
    try:
        # Give the page up to 4 seconds to render the link before scraping.
        WebDriverWait(driver, 4).until(
            EC.presence_of_element_located((By.XPATH, next_link_xpath))
        )
    except TimeoutException:
        # The link never appeared: close the browser and stop. Without the
        # break, the loop would carry on and call find_elements on a
        # session that has already been quit.
        driver.quit()
        break

    all_div = driver.find_elements(By.CSS_SELECTOR, ".mb10")

    # The cap is tested once per pass, before this pass's rows are added,
    # so the final length of dataset can exceed 3000.
    if len(dataset) >= 3000:
        break

    for div in all_div:
        # Fetch the <span> children once; indexes 2 and 3 are both read.
        spans = div.find_elements(By.TAG_NAME, 'span')
        dataset.append({
            'agency': div.find_element(By.TAG_NAME, 'span').text,
            'date': div.find_element(By.TAG_NAME, 'p').get_attribute('title'),
            'title': div.find_element(By.TAG_NAME, 'h4').text,
            'status': div.find_element(By.TAG_NAME, 'label').text,
            'purpose': spans[2].text,
            'period_covered': spans[3].text,
            'link': div.find_element(By.TAG_NAME, 'a').get_attribute('href'),
        })

    # Advance; the next pass waits for the link to be present again.
    driver.find_element(By.XPATH, next_link_xpath).click()
# Scraping loop: on every pass, wait for the link, collect every ".mb10"
# element into `dataset` (initialised before the loop), then click the link.
# The loop header is written out here so the `break` statements below are
# syntactically valid on their own.
while True:
    # Absolute XPath of the link clicked at the end of every pass
    # (presumably the next-page control -- confirm against the target site).
    next_link_xpath = "/html/body/section/div/div/div/div[2]/div/div/div/a"

    try:
        # Give the page up to 4 seconds to render the link before scraping.
        WebDriverWait(driver, 4).until(
            EC.presence_of_element_located((By.XPATH, next_link_xpath))
        )
    except TimeoutException:
        # The link never appeared. Stop here and keep what was collected,
        # instead of letting the timeout propagate and abort the script.
        break

    all_div = driver.find_elements(By.CSS_SELECTOR, ".mb10")

    # The cap is tested once per pass, before this pass's rows are added,
    # so the final length of dataset can exceed 2000.
    if len(dataset) >= 2000:
        break

    for div in all_div:
        # Fetch the <span> children once; indexes 2 and 3 are both read.
        spans = div.find_elements(By.TAG_NAME, 'span')
        dataset.append({
            'agency': div.find_element(By.TAG_NAME, 'span').text,
            'date': div.find_element(By.TAG_NAME, 'p').get_attribute('title'),
            'title': div.find_element(By.TAG_NAME, 'h4').text,
            'status': div.find_element(By.TAG_NAME, 'label').text,
            'purpose': spans[2].text,
            'period_covered': spans[3].text,
            'link': div.find_element(By.TAG_NAME, 'a').get_attribute('href'),
        })

    # Advance; the next pass waits for the link to be present again.
    driver.find_element(By.XPATH, next_link_xpath).click()


# In[7]:
Expand Down

0 comments on commit b5d3dfa

Please sign in to comment.