From 0cbd725f8847a73d8ef7b1fe54b8cfe18f1bc433 Mon Sep 17 00:00:00 2001
From: Tilak Patel <ptilak142@gmail.com>
Date: Mon, 14 Oct 2024 18:05:31 -0400
Subject: [PATCH 1/2] add extraction chains

---
 src/aihawk_job_manager.py           |  35 ++---
 src/extractors/__init__.py          |   0
 src/extractors/extraction_chains.py | 210 ++++++++++++++++++++++++++++
 3 files changed, 222 insertions(+), 23 deletions(-)
 create mode 100644 src/extractors/__init__.py
 create mode 100644 src/extractors/extraction_chains.py

diff --git a/src/aihawk_job_manager.py b/src/aihawk_job_manager.py
index ccb06e31d..6ce5ac521 100644
--- a/src/aihawk_job_manager.py
+++ b/src/aihawk_job_manager.py
@@ -16,6 +16,7 @@
 from src.job import Job
 from src.aihawk_easy_applier import AIHawkEasyApplier
 from loguru import logger
+from src.extractors.extraction_chains import EXTRACTORS
 
 
 class EnvironmentKeys:
@@ -46,6 +47,7 @@ def __init__(self, driver):
         self.easy_applier_component = None
         self.job_application_profile = None
         self.seen_jobs = []
+        self.extractor = None
         logger.debug("AIHawkJobManager initialized successfully")
 
     def set_parameters(self, parameters):
@@ -253,29 +255,16 @@ def get_jobs_from_page(self):
             return []
 
     def apply_jobs(self):
-        try:
-            # Check if no matching jobs are found on the current page
-            no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
-            if 'No matching jobs found' in no_jobs_element.text or 'unfortunately, things aren' in self.driver.page_source.lower():
-                logger.debug("No matching jobs found on this page, skipping")
-                return
-        except NoSuchElementException:
-            pass
-
-        # Find the job results container and job elements
-        job_results = self.driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
-
-        # utils.scroll_slow(self.driver, job_results)
-        # utils.scroll_slow(self.driver, job_results, step=300, reverse=True)
-
-        job_list_elements = job_results.find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')
-
-        if not job_list_elements:
-            utils.printyellow("No job class elements found on page, moving to next page.")
-            logger.debug("No job class elements found on page, skipping")
-            return
-
-        job_list = [Job(*self.extract_job_information_from_tile(job_element)) for job_element in job_list_elements]
+        job_list = []
+        if self.extractor is not None: # we found a working extractor
+            job_list = self.extractor.get_job_list(self.driver)
+        else:
+            for e in EXTRACTORS:
+                extracted_jobs = e.get_job_list(self.driver)
+                if len(extracted_jobs) > 0:
+                    job_list = extracted_jobs # break when we find a valid extractor
+                    self.extractor = e
+                    break
 
         for job in job_list:
             logger.debug(f"Starting applicant count search for job: {job.title} at {job.company}")
diff --git a/src/extractors/__init__.py b/src/extractors/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/extractors/extraction_chains.py b/src/extractors/extraction_chains.py
new file mode 100644
index 000000000..4906b98ab
--- /dev/null
+++ b/src/extractors/extraction_chains.py
@@ -0,0 +1,210 @@
+from abc import ABC, abstractmethod
+from src.job import Job
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from loguru import logger
+
+# Class names used by more than one extractor.
+NO_RESULTS_BANNER_CLASS = "jobs-search-two-pane__no-results-banner--expand"
+LIST_CONTAINER_CLASS = "scaffold-layout__list-container"
+LIST_ITEM_CLASS = "jobs-search-results__list-item"
+
+
+def _page_has_no_results(driver) -> bool:
+    """Return True when the page reports that the search matched no jobs."""
+    try:
+        banner = driver.find_element(By.CLASS_NAME, NO_RESULTS_BANNER_CLASS)
+    except NoSuchElementException:
+        return False
+    return (
+        "No matching jobs found" in banner.text
+        or "unfortunately, things aren" in driver.page_source.lower()
+    )
+
+
+def _find_job_tiles(driver) -> list:
+    """Return the tiles of the first results container, or [] if there is none."""
+    containers = driver.find_elements(By.CLASS_NAME, LIST_CONTAINER_CLASS)
+    if not containers:
+        # Indexing [0] unconditionally would raise IndexError and keep the
+        # caller from falling back to the next extractor.
+        return []
+    return containers[0].find_elements(By.CLASS_NAME, LIST_ITEM_CLASS)
+
+
+def _href_of(element) -> str:
+    """Return the element's href attribute, or "" when it has none."""
+    return element.get_attribute("href") or ""
+
+
+# An interface that defines different extraction strategies for the linkedin jobs page.
+class Extractor(ABC):
+    @abstractmethod
+    def get_job_list(self, driver) -> list[Job]:
+        """Return the jobs on the current page, or [] when none were extracted."""
+
+    def _jobs_from_tiles(self, job_tiles) -> list[Job]:
+        """Parse every tile into a Job and drop the ones without a link."""
+        jobs = [
+            Job(*self.extract_job_information_from_tile(tile)) for tile in job_tiles
+        ]
+        return [job for job in jobs if job.link]
+
+
+# Page scan shared by the extractors that differ only in how a tile is parsed.
+class _TileListExtractor(Extractor):
+    def get_job_list(self, driver) -> list[Job]:
+        if _page_has_no_results(driver):
+            logger.debug("No matching jobs found on this page, skipping")
+            return []
+
+        job_tiles = _find_job_tiles(driver)
+        if not job_tiles:
+            logger.debug("No job class elements found on page, skipping")
+            # Return a list, never None: the caller takes len() of the result.
+            return []
+
+        return self._jobs_from_tiles(job_tiles)
+
+
+# Tiles marked up with the job-card-list__title / job-card-container__* classes.
+class Extractor1(_TileListExtractor):
+    def extract_job_information_from_tile(self, job_tile):
+        """Return (title, company, location, link, apply_method) for one tile.
+
+        Missing fields come back as "", except the apply method, which falls
+        back to "Applied".
+        """
+        logger.debug("Extracting job information from tile")
+        job_title, company, job_location, apply_method, link = "", "", "", "", ""
+        try:
+            title = job_tile.find_element(By.CLASS_NAME, "job-card-list__title")
+            job_title = title.find_element(By.TAG_NAME, "strong").text
+            link = _href_of(title).split("?")[0]
+            company = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__primary-description"
+            ).text
+            logger.debug(f"Job information extracted: {job_title} at {company}")
+        except NoSuchElementException:
+            logger.warning("Some job information (title, link, or company) is missing.")
+        try:
+            job_location = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__metadata-item"
+            ).text
+        except NoSuchElementException:
+            logger.warning("Job location is missing.")
+        try:
+            apply_method = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__apply-method"
+            ).text
+        except NoSuchElementException:
+            apply_method = "Applied"
+            logger.warning("Apply method not found, assuming 'Applied'.")
+
+        return job_title, company, job_location, link, apply_method
+
+
+# Same tile classes as Extractor1, but waits for the list container to load and
+# matches the list items with a full CSS selector.
+class Extractor2(Extractor):
+    def get_job_list(self, driver) -> list[Job]:
+        try:
+            # until() returns the located element, so no second lookup is needed.
+            job_list_container = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.CLASS_NAME, LIST_CONTAINER_CLASS))
+            )
+            job_list_elements = job_list_container.find_elements(
+                By.CSS_SELECTOR,
+                "li.ember-view.jobs-search-results__list-item.occludable-update.p0.relative.scaffold-layout__list-item",
+            )
+            logger.debug(f"Number of job elements found: {len(job_list_elements)}")
+            return self._jobs_from_tiles(job_list_elements)
+        except Exception as e:
+            # Deliberately broad: any failure means this layout did not match, and
+            # the caller then tries the next extractor. Log it instead of hiding it.
+            logger.debug(f"Extractor2 could not read the job list: {e}")
+            return []
+
+    def extract_job_information_from_tile(self, job_tile):
+        """Return (title, company, location, link, apply_method) for one tile.
+
+        Parsing stops at the first missing element; fields not reached stay "".
+        """
+        job_title, company, job_location, apply_method, link = "", "", "", "", ""
+        try:
+            title = job_tile.find_element(By.CSS_SELECTOR, "a.job-card-list__title")
+            job_title = title.text
+            link = _href_of(title).split("?")[0]
+            company = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__primary-description"
+            ).text
+            job_location = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__metadata-item"
+            ).text
+            apply_method = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__apply-method"
+            ).text
+        except NoSuchElementException as e:
+            logger.warning(f"Error extracting job information: {e}")
+
+        return job_title, company, job_location, link, apply_method
+
+
+# Tiles marked up with the job-card-job-posting-card-wrapper__* and
+# artdeco-entity-lockup__* classes.
+class Extractor3(_TileListExtractor):
+    def extract_job_information_from_tile(self, job_tile):
+        """Return (title, company, location, link, apply_method) for one tile.
+
+        Title, link, company and location are parsed in that order and stop at
+        the first missing element; a missing apply method only leaves that
+        field as "".
+        """
+        job_title = link = company = job_location = apply_method = ""
+        logger.debug("Extracting job information from tile")
+
+        try:
+            # Extract job title
+            job_title = (
+                job_tile.find_element(
+                    By.CLASS_NAME, "job-card-job-posting-card-wrapper__title"
+                )
+                .find_element(By.TAG_NAME, "strong")
+                .text
+            )
+
+            # Extract job link
+            link = _href_of(job_tile.find_element(By.CLASS_NAME, "app-aware-link"))
+
+            # Extract company name
+            company = job_tile.find_element(
+                By.CLASS_NAME, "artdeco-entity-lockup__subtitle"
+            ).text
+
+            # Extract job location
+            job_location = job_tile.find_element(
+                By.CLASS_NAME, "artdeco-entity-lockup__caption"
+            ).text
+
+            # Apply method (if it exists)
+            try:
+                apply_method = job_tile.find_element(
+                    By.CLASS_NAME, "job-card-job-posting-card-wrapper__footer-item"
+                ).text
+            except NoSuchElementException:
+                logger.warning("Apply method not found for this job tile.")
+
+            logger.debug(
+                f"Job information extracted: {job_title} at {company}, located in {job_location}"
+            )
+        except NoSuchElementException:
+            logger.warning(
+                "Some job information (title, link, or company) could not be parsed."
+            )
+
+        return job_title, company, job_location, link, apply_method
+
+
+EXTRACTORS = [Extractor1(), Extractor2(), Extractor3()]

From 86f6b2c538757f3ac9cc1bf5dcd2e16014d8727b Mon Sep 17 00:00:00 2001
From: Tilak Patel <ptilak142@gmail.com>
Date: Mon, 14 Oct 2024 18:15:25 -0400
Subject: [PATCH 2/2] fix test

---
 tests/test_aihawk_job_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_aihawk_job_manager.py b/tests/test_aihawk_job_manager.py
index 41e981e4b..bda1fc0c8 100644
--- a/tests/test_aihawk_job_manager.py
+++ b/tests/test_aihawk_job_manager.py
@@ -109,7 +109,7 @@ def test_apply_jobs_no_jobs(mocker, job_manager):
     job_manager.apply_jobs()
 
     # Ensure it attempted to find the job results list
-    assert job_manager.driver.find_element.call_count == 1
+    assert job_manager.driver.find_element.call_count > 0
 
 
 def test_apply_jobs_with_jobs(mocker, job_manager):