Merge pull request #458 from MrTpat/execution-chains

feder-cr · Oct 15, 2024 · dceae26 · dceae26
2 parents e3cb3dc + 86f6b2c
commit dceae26
Show file tree

Hide file tree

Showing 4 changed files with 243 additions and 24 deletions.
diff --git a/src/aihawk_job_manager.py b/src/aihawk_job_manager.py
@@ -16,6 +16,7 @@
 from src.job import Job
 from src.aihawk_easy_applier import AIHawkEasyApplier
 from loguru import logger
+from src.extractors.extraction_chains import EXTRACTORS
 
 
 class EnvironmentKeys:
@@ -46,6 +47,7 @@ def __init__(self, driver):
         self.easy_applier_component = None
         self.job_application_profile = None
         self.seen_jobs = []
+        self.extractor = None
         logger.debug("AIHawkJobManager initialized successfully")
 
     def set_parameters(self, parameters):
@@ -253,29 +255,16 @@ def get_jobs_from_page(self):
             return []
 
     def apply_jobs(self):
-        try:
-            # Check if no matching jobs are found on the current page
-            no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
-            if 'No matching jobs found' in no_jobs_element.text or 'unfortunately, things aren' in self.driver.page_source.lower():
-                logger.debug("No matching jobs found on this page, skipping")
-                return
-        except NoSuchElementException:
-            pass
-
-        # Find the job results container and job elements
-        job_results = self.driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
-
-        # utils.scroll_slow(self.driver, job_results)
-        # utils.scroll_slow(self.driver, job_results, step=300, reverse=True)
-
-        job_list_elements = job_results.find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')
-
-        if not job_list_elements:
-            utils.printyellow("No job class elements found on page, moving to next page.")
-            logger.debug("No job class elements found on page, skipping")
-            return
-
-        job_list = [Job(*self.extract_job_information_from_tile(job_element)) for job_element in job_list_elements]
+        job_list = []
+        if self.extractor is not None: # we found a working extractor
+            job_list = self.extractor.get_job_list(self.driver)
+        else:
+            for e in EXTRACTORS:
+                extracted_jobs = e.get_job_list(self.driver)
+                if len(extracted_jobs) > 0:
+                    job_list = extracted_jobs # break when we find a valid extractor
+                    self.extractor = e
+                    break
 
         for job in job_list:
             logger.debug(f"Starting applicant count search for job: {job.title} at {job.company}")

diff --git a/src/extractors/__init__.py b/src/extractors/__init__.py
diff --git a/src/extractors/extraction_chains.py b/src/extractors/extraction_chains.py
@@ -0,0 +1,230 @@
+from abc import ABC, abstractmethod
+from src.job import Job
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from loguru import logger
+
+
+# Interface for the different strategies used to extract jobs from the LinkedIn jobs page.
+class Extractor(ABC):
+    """Strategy interface: one way of reading the job list off the current page."""
+
+    @abstractmethod
+    def get_job_list(self, driver) -> list[Job]:
+        """Return the jobs found through ``driver``; subclasses supply the lookup."""
+
+
+# Extraction strategy for tiles that use the job-card-list__title / job-card-container__* classes.
+class Extractor1(Extractor):
+    """Extractor for tiles marked up with ``job-card-list__title`` and
+    ``job-card-container__*`` classes.
+    """
+
+    def get_job_list(self, driver) -> list[Job]:
+        """Collect the jobs listed on the page currently loaded in ``driver``.
+
+        Always returns a list. It is empty when the no-results banner is shown,
+        when the list container or its tiles cannot be found, or when no tile
+        yields a link.
+        """
+        try:
+            no_jobs_element = driver.find_element(
+                By.CLASS_NAME, "jobs-search-two-pane__no-results-banner--expand"
+            )
+            if (
+                "No matching jobs found" in no_jobs_element.text
+                or "unfortunately, things aren" in driver.page_source.lower()
+            ):
+                logger.debug("No matching jobs found on this page, skipping")
+                return []
+        except NoSuchElementException:
+            # No banner element on the page: carry on and look for job tiles.
+            pass
+
+        containers = driver.find_elements(
+            By.CLASS_NAME, "scaffold-layout__list-container"
+        )
+        if not containers:
+            # Indexing [0] on an empty result would raise IndexError.
+            logger.debug("Job list container not found on page, skipping")
+            return []
+
+        job_list_elements = containers[0].find_elements(
+            By.CLASS_NAME, "jobs-search-results__list-item"
+        )
+        if not job_list_elements:
+            logger.debug("No job class elements found on page, skipping")
+            # Return an empty list rather than None so callers can use len().
+            return []
+
+        job_list = [
+            Job(*self.extract_job_information_from_tile(job_element))
+            for job_element in job_list_elements
+        ]
+        # Tiles without a link cannot be opened, so they are dropped.
+        return [job for job in job_list if job.link]
+
+    def extract_job_information_from_tile(self, job_tile):
+        """Read title, company, location, link and apply method from one tile.
+
+        Returns a ``(job_title, company, job_location, link, apply_method)``
+        tuple. Fields that could not be read stay ``""``, except
+        ``apply_method``, which falls back to ``"Applied"``.
+        """
+        logger.debug("Extracting job information from tile")
+        job_title, company, job_location, apply_method, link = "", "", "", "", ""
+        try:
+            # Logged at debug level instead of printed to stdout.
+            logger.debug("Job tile HTML: {}", job_tile.get_attribute("outerHTML"))
+            title_element = job_tile.find_element(
+                By.CLASS_NAME, "job-card-list__title"
+            )
+            job_title = title_element.find_element(By.TAG_NAME, "strong").text
+            # get_attribute returns None when href is absent; treat as no link.
+            link = (title_element.get_attribute("href") or "").split("?")[0]
+            company = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__primary-description"
+            ).text
+            logger.debug(f"Job information extracted: {job_title} at {company}")
+        except NoSuchElementException:
+            logger.warning("Some job information (title, link, or company) is missing.")
+        try:
+            job_location = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__metadata-item"
+            ).text
+        except NoSuchElementException:
+            logger.warning("Job location is missing.")
+        try:
+            apply_method = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__apply-method"
+            ).text
+        except NoSuchElementException:
+            apply_method = "Applied"
+            logger.warning("Apply method not found, assuming 'Applied'.")
+
+        return job_title, company, job_location, link, apply_method
+
+
+class Extractor2(Extractor):
+    """Extractor that waits for the list container and reads tiles via CSS selectors."""
+
+    def get_job_list(self, driver) -> list[Job]:
+        """Collect the jobs listed on the page currently loaded in ``driver``.
+
+        Waits up to 10 seconds for the list container to be present. Always
+        returns a list; any failure is logged and reported as an empty list.
+        """
+        try:
+            # The wait returns the located element, so no second lookup is needed.
+            job_list_container = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located(
+                    (By.CLASS_NAME, "scaffold-layout__list-container")
+                )
+            )
+
+            # Find all job items within the container
+            job_list_elements = job_list_container.find_elements(
+                By.CSS_SELECTOR,
+                "li.ember-view.jobs-search-results__list-item.occludable-update.p0.relative.scaffold-layout__list-item",
+            )
+            logger.debug("Number of job elements found: {}", len(job_list_elements))
+
+            if not job_list_elements:
+                logger.debug("No job elements found on page")
+                return []
+
+            job_list = [
+                Job(*self.extract_job_information_from_tile(job_element))
+                for job_element in job_list_elements
+            ]
+            # Tiles without a link cannot be opened, so they are dropped.
+            return [job for job in job_list if job.link]
+        except Exception as e:
+            # Deliberately best-effort: report "no jobs" instead of raising,
+            # but log the cause so failures are not silent.
+            logger.warning("Extractor2 could not extract jobs: {}", e)
+            return []
+
+    def extract_job_information_from_tile(self, job_tile):
+        """Read title, company, location, link and apply method from one tile.
+
+        Returns a ``(job_title, company, job_location, link, apply_method)``
+        tuple. Lookups run in order inside one ``try``; the first missing
+        element stops extraction and the remaining fields stay ``""``.
+        """
+        job_title, company, job_location, apply_method, link = "", "", "", "", ""
+        try:
+            title_element = job_tile.find_element(
+                By.CSS_SELECTOR, "a.job-card-list__title"
+            )
+            job_title = title_element.text
+            # get_attribute returns None when href is absent; treat as no link.
+            link = (title_element.get_attribute("href") or "").split("?")[0]
+            company = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__primary-description"
+            ).text
+            job_location = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__metadata-item"
+            ).text
+            apply_method = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__apply-method"
+            ).text
+        except NoSuchElementException as e:
+            logger.warning("Error extracting job information: {}", e)
+
+        return job_title, company, job_location, link, apply_method
+
+
+
+# Extraction strategy for tiles that use the job-card-job-posting-card-wrapper__* / artdeco-entity-lockup__* classes.
+class Extractor3(Extractor):
+    """Extractor for tiles marked up with ``job-card-job-posting-card-wrapper__*``
+    and ``artdeco-entity-lockup__*`` classes.
+    """
+
+    def get_job_list(self, driver) -> list[Job]:
+        """Collect the jobs listed on the page currently loaded in ``driver``.
+
+        Always returns a list. It is empty when the no-results banner is shown,
+        when the list container or its tiles cannot be found, or when no tile
+        yields a link.
+        """
+        try:
+            no_jobs_element = driver.find_element(
+                By.CLASS_NAME, "jobs-search-two-pane__no-results-banner--expand"
+            )
+            if (
+                "No matching jobs found" in no_jobs_element.text
+                or "unfortunately, things aren" in driver.page_source.lower()
+            ):
+                logger.debug("No matching jobs found on this page, skipping")
+                return []
+        except NoSuchElementException:
+            # No banner element on the page: carry on and look for job tiles.
+            pass
+
+        containers = driver.find_elements(
+            By.CLASS_NAME, "scaffold-layout__list-container"
+        )
+        if not containers:
+            # Indexing [0] on an empty result would raise IndexError.
+            logger.debug("Job list container not found on page, skipping")
+            return []
+
+        job_list_elements = containers[0].find_elements(
+            By.CLASS_NAME, "jobs-search-results__list-item"
+        )
+        if not job_list_elements:
+            logger.debug("No job class elements found on page, skipping")
+            # Return an empty list rather than None so callers can use len().
+            return []
+
+        job_list = [
+            Job(*self.extract_job_information_from_tile(job_element))
+            for job_element in job_list_elements
+        ]
+        # Tiles without a link cannot be opened, so they are dropped.
+        return [job for job in job_list if job.link]
+
+    def extract_job_information_from_tile(self, job_tile):
+        """Read title, company, location, link and apply method from one tile.
+
+        Returns a ``(job_title, company, job_location, link, apply_method)``
+        tuple. A missing title, link, company or location element stops
+        extraction and leaves the remaining fields ``""``; a missing apply
+        method only leaves ``apply_method`` as ``""``.
+        """
+        job_title = link = company = job_location = apply_method = ""
+        logger.debug("Extracting job information from tile")
+
+        try:
+            # Logged at debug level instead of printed to stdout.
+            logger.debug("Job tile HTML: {}", job_tile.get_attribute("outerHTML"))
+
+            # Extract job title
+            job_title = (
+                job_tile.find_element(
+                    By.CLASS_NAME, "job-card-job-posting-card-wrapper__title"
+                )
+                .find_element(By.TAG_NAME, "strong")
+                .text
+            )
+
+            # Extract job link; get_attribute returns None when href is absent,
+            # which would break the len(link) filter in get_job_list.
+            link = (
+                job_tile.find_element(By.CLASS_NAME, "app-aware-link").get_attribute(
+                    "href"
+                )
+                or ""
+            )
+
+            # Extract company name
+            company = job_tile.find_element(
+                By.CLASS_NAME, "artdeco-entity-lockup__subtitle"
+            ).text
+
+            # Extract job location
+            job_location = job_tile.find_element(
+                By.CLASS_NAME, "artdeco-entity-lockup__caption"
+            ).text
+
+            # Apply method (if it exists)
+            try:
+                apply_method = job_tile.find_element(
+                    By.CLASS_NAME, "job-card-job-posting-card-wrapper__footer-item"
+                ).text
+            except NoSuchElementException:
+                logger.warning("Apply method not found for this job tile.")
+
+            logger.debug(
+                f"Job information extracted: {job_title} at {company}, located in {job_location}"
+            )
+        except NoSuchElementException:
+            logger.warning(
+                "Some job information (title, link, or company) could not be parsed."
+            )
+
+        return job_title, company, job_location, link, apply_method
+
+
+# Extraction strategies, listed in the order they appear in this module.
+EXTRACTORS = [
+    Extractor1(),
+    Extractor2(),
+    Extractor3(),
+]
diff --git a/tests/test_aihawk_job_manager.py b/tests/test_aihawk_job_manager.py
@@ -109,7 +109,7 @@ def test_apply_jobs_no_jobs(mocker, job_manager):
     job_manager.apply_jobs()
 
     # Ensure it attempted to find the job results list
-    assert job_manager.driver.find_element.call_count == 1
+    assert job_manager.driver.find_element.call_count > 0
 
 
 def test_apply_jobs_with_jobs(mocker, job_manager):