From 0cbd725f8847a73d8ef7b1fe54b8cfe18f1bc433 Mon Sep 17 00:00:00 2001
From: Tilak Patel <ptilak142@gmail.com>
Date: Mon, 14 Oct 2024 18:05:31 -0400
Subject: [PATCH 1/2] add extraction chains
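
Move the LinkedIn job-tile scraping out of AIHawkJobManager.apply_jobs and into
a chain of Extractor strategies in src/extractors/extraction_chains.py.
apply_jobs now walks EXTRACTORS until one strategy returns jobs and caches the
first one that works, so extraction keeps working when LinkedIn serves a
different results-page layout.

Roughly, the chain is consumed like the sketch below (illustrative only; the
helper name pick_extractor is hypothetical, the real call site is
AIHawkJobManager.apply_jobs):

    from src.extractors.extraction_chains import EXTRACTORS

    def pick_extractor(driver):
        # Hypothetical helper: probe each strategy until one returns jobs;
        # the caller caches the returned extractor for later pages.
        for extractor in EXTRACTORS:
            jobs = extractor.get_job_list(driver)
            if jobs:
                return extractor, jobs
        return None, []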

---
 src/aihawk_job_manager.py           |  35 ++---
 src/extractors/__init__.py          |   0
 src/extractors/extraction_chains.py | 230 ++++++++++++++++++++++++++++
 3 files changed, 242 insertions(+), 23 deletions(-)
 create mode 100644 src/extractors/__init__.py
 create mode 100644 src/extractors/extraction_chains.py

diff --git a/src/aihawk_job_manager.py b/src/aihawk_job_manager.py
index ccb06e31d..6ce5ac521 100644
--- a/src/aihawk_job_manager.py
+++ b/src/aihawk_job_manager.py
@@ -16,6 +16,7 @@
 from src.job import Job
 from src.aihawk_easy_applier import AIHawkEasyApplier
 from loguru import logger
+from src.extractors.extraction_chains import EXTRACTORS
 
 
 class EnvironmentKeys:
@@ -46,6 +47,7 @@ def __init__(self, driver):
         self.easy_applier_component = None
         self.job_application_profile = None
         self.seen_jobs = []
+        self.extractor = None
         logger.debug("AIHawkJobManager initialized successfully")
 
     def set_parameters(self, parameters):
@@ -253,29 +255,16 @@ def get_jobs_from_page(self):
             return []
 
     def apply_jobs(self):
-        try:
-            # Check if no matching jobs are found on the current page
-            no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
-            if 'No matching jobs found' in no_jobs_element.text or 'unfortunately, things aren' in self.driver.page_source.lower():
-                logger.debug("No matching jobs found on this page, skipping")
-                return
-        except NoSuchElementException:
-            pass
-    
-        # Find the job results container and job elements
-        job_results = self.driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
-        
-        # utils.scroll_slow(self.driver, job_results)
-        # utils.scroll_slow(self.driver, job_results, step=300, reverse=True)
-
-        job_list_elements = job_results.find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')
-    
-        if not job_list_elements:
-            utils.printyellow("No job class elements found on page, moving to next page.")
-            logger.debug("No job class elements found on page, skipping")
-            return
-
-        job_list = [Job(*self.extract_job_information_from_tile(job_element)) for job_element in job_list_elements]
+        job_list = []
+        if self.extractor is not None:  # reuse the extractor that already worked
+            job_list = self.extractor.get_job_list(self.driver)
+        else:
+            for extractor in EXTRACTORS:  # try each strategy until one yields jobs
+                extracted_jobs = extractor.get_job_list(self.driver)
+                if len(extracted_jobs) > 0:
+                    job_list = extracted_jobs
+                    self.extractor = extractor  # remember the working extractor
+                    break
 
         for job in job_list:
             logger.debug(f"Starting applicant count search for job: {job.title} at {job.company}")
diff --git a/src/extractors/__init__.py b/src/extractors/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/extractors/extraction_chains.py b/src/extractors/extraction_chains.py
new file mode 100644
index 000000000..4906b98ab
--- /dev/null
+++ b/src/extractors/extraction_chains.py
@@ -0,0 +1,230 @@
+from abc import ABC, abstractmethod
+from src.job import Job
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from loguru import logger
+
+
+# An interface that defines the different extraction strategies for the LinkedIn jobs search page.
+class Extractor(ABC):
+    @abstractmethod
+    def get_job_list(self, driver) -> list[Job]:
+        pass
+
+
+# Extractor1: the extraction logic that previously lived inline in AIHawkJobManager.apply_jobs.
+class Extractor1(Extractor):
+    def get_job_list(self, driver) -> list[Job]:
+        try:
+            no_jobs_element = driver.find_element(
+                By.CLASS_NAME, "jobs-search-two-pane__no-results-banner--expand"
+            )
+            if (
+                "No matching jobs found" in no_jobs_element.text
+                or "unfortunately, things aren" in driver.page_source.lower()
+            ):
+                logger.debug("No matching jobs found on this page, skipping")
+                return []
+        except NoSuchElementException:
+            pass
+
+        job_list_elements = driver.find_elements(
+            By.CLASS_NAME, "scaffold-layout__list-container"
+        )[0].find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
+
+        if not job_list_elements:
+            logger.debug("No job class elements found on page, skipping")
+            return []
+
+        job_list = [
+            Job(*self.extract_job_information_from_tile(job_element))
+            for job_element in job_list_elements
+        ]
+        return list(filter(lambda j: len(j.link) > 0, job_list))
+
+    def extract_job_information_from_tile(self, job_tile):
+        logger.debug("Extracting job information from tile")
+        job_title, company, job_location, apply_method, link = "", "", "", "", ""
+        try:
+            logger.debug(job_tile.get_attribute("outerHTML"))
+            job_title = (
+                job_tile.find_element(By.CLASS_NAME, "job-card-list__title")
+                .find_element(By.TAG_NAME, "strong")
+                .text
+            )
+
+            link = (
+                job_tile.find_element(By.CLASS_NAME, "job-card-list__title")
+                .get_attribute("href")
+                .split("?")[0]
+            )
+            company = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__primary-description"
+            ).text
+            logger.debug(f"Job information extracted: {job_title} at {company}")
+        except NoSuchElementException:
+            logger.warning("Some job information (title, link, or company) is missing.")
+        try:
+            job_location = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__metadata-item"
+            ).text
+        except NoSuchElementException:
+            logger.warning("Job location is missing.")
+        try:
+            apply_method = job_tile.find_element(
+                By.CLASS_NAME, "job-card-container__apply-method"
+            ).text
+        except NoSuchElementException:
+            apply_method = "Applied"
+            logger.warning("Apply method not found, assuming 'Applied'.")
+
+        return job_title, company, job_location, link, apply_method
+
+
+class Extractor2(Extractor):
+    def get_job_list(self, driver) -> list[Job]:
+        try:
+            # Wait for the job list container to be present
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located(
+                    (By.CLASS_NAME, "scaffold-layout__list-container")
+                )
+            )
+
+            # Find the job list container
+            job_list_container = driver.find_element(
+                By.CLASS_NAME, "scaffold-layout__list-container"
+            )
+
+            # Find all job items within the container
+            job_list_elements = job_list_container.find_elements(
+                By.CSS_SELECTOR,
+                "li.ember-view.jobs-search-results__list-item.occludable-update.p0.relative.scaffold-layout__list-item",
+            )
+
+            print(f"Number of job elements found: {len(job_list_elements)}")
+
+            if not job_list_elements:
+                raise Exception("No job elements found on page")
+
+            job_list = [
+                Job(*self.extract_job_information_from_tile(job_element))
+                for job_element in job_list_elements
+            ]
+            return list(filter(lambda j: len(j.link) > 0, job_list))
+        except Exception as e:
+            logger.debug(f"Extractor2 could not extract any jobs from this page: {e}")
+            return []
+
+    def extract_job_information_from_tile(self, job_tile):
+        job_title, company, job_location, apply_method, link = "", "", "", "", ""
+        try:
+            job_title = job_tile.find_element(
+                By.CSS_SELECTOR, "a.job-card-list__title"
+            ).text
+            link = (
+                job_tile.find_element(By.CSS_SELECTOR, "a.job-card-list__title")
+                .get_attribute("href")
+                .split("?")[0]
+            )
+            company = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__primary-description"
+            ).text
+            job_location = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__metadata-item"
+            ).text
+            apply_method = job_tile.find_element(
+                By.CSS_SELECTOR, ".job-card-container__apply-method"
+            ).text
+        except NoSuchElementException as e:
+            logger.warning(f"Error extracting job information: {e}")
+
+        return job_title, company, job_location, link, apply_method
+
+
+# Extractor3: targets the newer LinkedIn job-card markup (job-card-job-posting-card-wrapper / artdeco-entity-lockup classes).
+class Extractor3(Extractor):
+    def get_job_list(self, driver) -> list[Job]:
+        try:
+            no_jobs_element = driver.find_element(
+                By.CLASS_NAME, "jobs-search-two-pane__no-results-banner--expand"
+            )
+            if (
+                "No matching jobs found" in no_jobs_element.text
+                or "unfortunately, things aren" in driver.page_source.lower()
+            ):
+                logger.debug("No matching jobs found on this page, skipping")
+                return []
+        except NoSuchElementException:
+            pass
+
+        job_list_elements = driver.find_elements(
+            By.CLASS_NAME, "scaffold-layout__list-container"
+        )[0].find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
+
+        if not job_list_elements:
+            logger.debug("No job class elements found on page, skipping")
+            return []
+
+        job_list = [
+            Job(*self.extract_job_information_from_tile(job_element))
+            for job_element in job_list_elements
+        ]
+        return list(filter(lambda j: len(j.link) > 0, job_list))
+
+    def extract_job_information_from_tile(self, job_tile):
+        job_title = link = company = job_location = apply_method = ""
+        logger.debug("Extracting job information from tile")
+
+        try:
+            logger.debug(job_tile.get_attribute("outerHTML"))
+
+            # Extract job title
+            job_title = (
+                job_tile.find_element(
+                    By.CLASS_NAME, "job-card-job-posting-card-wrapper__title"
+                )
+                .find_element(By.TAG_NAME, "strong")
+                .text
+            )
+
+            # Extract job link
+            link = (
+                job_tile.find_element(By.CLASS_NAME, "app-aware-link")
+                .get_attribute("href")
+            )
+
+            # Extract company name
+            company = job_tile.find_element(
+                By.CLASS_NAME, "artdeco-entity-lockup__subtitle"
+            ).text
+
+            # Extract job location
+            job_location = job_tile.find_element(
+                By.CLASS_NAME, "artdeco-entity-lockup__caption"
+            ).text
+
+            # Apply method (if it exists)
+            try:
+                apply_method = job_tile.find_element(
+                    By.CLASS_NAME, "job-card-job-posting-card-wrapper__footer-item"
+                ).text
+            except NoSuchElementException:
+                logger.warning("Apply method not found for this job tile.")
+
+            logger.debug(
+                f"Job information extracted: {job_title} at {company}, located in {job_location}"
+            )
+        except NoSuchElementException:
+            logger.warning(
+                "Some job information (title, link, or company) could not be parsed."
+            )
+
+
+
+        return job_title, company, job_location, link, apply_method
+
+
+EXTRACTORS = [Extractor1(), Extractor2(), Extractor3()]

From 86f6b2c538757f3ac9cc1bf5dcd2e16014d8727b Mon Sep 17 00:00:00 2001
From: Tilak Patel <ptilak142@gmail.com>
Date: Mon, 14 Oct 2024 18:15:25 -0400
Subject: [PATCH 2/2] fix test
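
With the extraction chain in place, apply_jobs no longer issues exactly one
driver.find_element call: it may probe several extractors, and each strategy
queries the driver a different number of times. Relax the no-jobs test to only
assert that the driver was queried at all.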

---
 tests/test_aihawk_job_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_aihawk_job_manager.py b/tests/test_aihawk_job_manager.py
index 41e981e4b..bda1fc0c8 100644
--- a/tests/test_aihawk_job_manager.py
+++ b/tests/test_aihawk_job_manager.py
@@ -109,7 +109,7 @@ def test_apply_jobs_no_jobs(mocker, job_manager):
     job_manager.apply_jobs()
 
     # Ensure it attempted to find the job results list
-    assert job_manager.driver.find_element.call_count == 1
+    assert job_manager.driver.find_element.call_count > 0
 
 
 def test_apply_jobs_with_jobs(mocker, job_manager):