From 2d8522deaff6f706e3baf602fd7d324c72c2ac79 Mon Sep 17 00:00:00 2001 From: sabry Date: Mon, 13 May 2024 05:14:21 +0300 Subject: [PATCH 1/7] FEAT: Add wuzzuf Site Scraper --- src/scrape_up/wuzzuf/stringdecorator.py | 39 ++++++ src/scrape_up/wuzzuf/wuzzuf.py | 161 ++++++++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 src/scrape_up/wuzzuf/stringdecorator.py create mode 100644 src/scrape_up/wuzzuf/wuzzuf.py diff --git a/src/scrape_up/wuzzuf/stringdecorator.py b/src/scrape_up/wuzzuf/stringdecorator.py new file mode 100644 index 00000000..5a6cb8eb --- /dev/null +++ b/src/scrape_up/wuzzuf/stringdecorator.py @@ -0,0 +1,39 @@ +def stringDecorator(func): + """ + The stringDecorator function wraps the decorated function and performs the following modifications: + - If the result of the decorated function is None, it returns the string "NA". + - it removes any trailing hyphen ('-') from the text. + - It strips any leading/trailing whitespace from the modified text. + + Args: + func (function): The function to be decorated. + + Returns: + function: The decorated function with the modified behavior. + + Example: + @stringDecorator + def getJobTitle(job): + return job.find("h2", {"class": "jobTitle"}) + + title = getJobTitle(job) + # If the job title is "Software Engineer -", it will be returned as "Software Engineer". + # If the job title is None, it will be returned as "NA". + # Any leading/trailing whitespace will be removed from the job title. 
+ """ + + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + + if result is None: + return "NA" + + text = result.text + + if text.endswith("-"): + text = text[:-1] + + # Strip any leading/trailing whitespace and return + return text.strip() + + return wrapper diff --git a/src/scrape_up/wuzzuf/wuzzuf.py b/src/scrape_up/wuzzuf/wuzzuf.py new file mode 100644 index 00000000..5fd8d788 --- /dev/null +++ b/src/scrape_up/wuzzuf/wuzzuf.py @@ -0,0 +1,161 @@ +import requests +from bs4 import BeautifulSoup +from time import sleep +from stringdecorator import stringDecorator + + +class JobScraper: + """ + Usage: + 1. Create an instance of the JobScraper class. + ```python + scraper = JobScraper() + ``` + + 2. Apply filters using the filterJob() method. + ```python + scraper.filterJob(title="software engineer", country="Egypt", city="Cairo", minYearsOfExperience=2, maxYearsOfExperience=5) + ``` + Customize the filters based on your requirements. + + 3. Fetch job listings using the fetchJobs() method. + ```python + jobs = scraper.fetchJobs() + ``` + The fetched jobs will be stored in the 'jobs' variable. + + 4. Save the fetched jobs to a CSV file using the FileSaver class. + ```python + saver = FileSaver() + saver.saveToFile(jobs, 'jobListings.csv') + ``` + Specify the desired file path for the CSV file. + """ + + def __init__(self): + """ + Initializes the JobScraper instance with the base URL. + """ + self.url = "https://wuzzuf.net/search/jobs/?" + + def filterJob( + self, + title=None, + country=None, + city=None, + minYearsOfExperience=None, + maxYearsOfExperience=None, + ): + """ + Filters job listings based on specified criteria. + + Args: + title (str): The job title to filter by. + country (str): The country to filter by. + city (str): The city to filter by. + minYearsOfExperience (int): The minimum years of experience to filter by. + maxYearsOfExperience (int): The maximum years of experience to filter by. 
+ """ + + if title is not None: + title.replace(" ", "+") + self.url += f"q={title}" + if country is not None: + self.url += f"&filters[country][0]={country.strip().capitalize()}" + if city is not None: + self.url += f"&filters[city][0]={city.strip().capitalize()}" + if minYearsOfExperience is not None: + self.url += f"&filters[years_of_experience_min][0]={minYearsOfExperience}" + if maxYearsOfExperience is not None: + self.url += f"&filters[years_of_experience_max][0]={maxYearsOfExperience}" + + def __fetchPageJobs(self, pageNum): + """ + Fetches job listings from a specific page. + + Args: + pageNum (int): The page number to fetch job listings from. + + Returns: + list: A list of job listings from the specified page. + + Raises: + ConnectionError: If there is an error fetching the job listings. + """ + + response = requests.get(self.url + f"&start={pageNum}") + jobSubList = [] + if response.status_code == 200: + parsedHtml = BeautifulSoup(response.content, "lxml") + jobsData = parsedHtml.find_all("div", {"class": "css-1gatmva e1v1l3u10"}) + + for jobData in jobsData: + job = { + "name": self.__getJobName(jobData), + "url": self.__getJobUrl(jobData), + "company": self.__getJobCompany(jobData), + "location": self.__getJobLocation(jobData), + "publishedTime": self.__getPublishedTime(jobData), + "properties": self.__getJobProperties(jobData), + } + jobSubList.append(job) + else: + raise ConnectionError(f"Error code: {response.status_code}") + return jobSubList + + def fetchJobs(self, maxPageNumber=1000): + """ + Fetches job listings from multiple pages. + + Returns: + list: A list of job listings from all pages. 
+ """ + + jobList = [] + for pageNum in range(maxPageNumber): + jobSubList = self.__fetchPageJobs(pageNum) + if jobSubList: + jobList.extend(jobSubList) + else: + break + sleep(1) + return jobList + + @stringDecorator + def __getJobName(self, jobData): + return jobData.find("h2", {"class": "css-m604qf"}).find("a") + + def __getJobUrl(self, jobData): + return jobData.find("h2", {"class": "css-m604qf"}).find("a")["href"] + + @stringDecorator + def __getJobCompany(self, jobData): + return jobData.find("div", {"class": "css-d7j1kk"}).find("a") + + @stringDecorator + def __getJobLocation(self, jobData): + return jobData.find("span", {"class": "css-5wys0k"}) + + @stringDecorator + def __getPublishedTime(self, jobData): + return jobData.find("div", {"class": "css-4c4ojb"}) or jobData.find( + "div", {"class": "css-do6t5g"} + ) + + def __getJobProperties(self, jobData): + jobPropertiesString = " ,".join( + [prop.text for prop in jobData.find_all("span", {"class": "eoyjyou0"})] + ) + return jobPropertiesString if jobPropertiesString else "NA" + + +def main(): + scraper = JobScraper() + scraper.filterJob(title="software engineer") + jobs = scraper.fetchJobs(maxPageNumber=1000) + + print(jobs) + + +if __name__ == "__main__": + main() From 37e3564bff6fb19cc4aca591770b7689da953e7f Mon Sep 17 00:00:00 2001 From: sabry Date: Mon, 13 May 2024 05:17:48 +0300 Subject: [PATCH 2/7] Update documentation with WuzzufJobScraper --- documentation.md | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/documentation.md b/documentation.md index dd03b34a..50304a93 100644 --- a/documentation.md +++ b/documentation.md @@ -44,7 +44,7 @@ per user.followers() - [Flyrobu](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#flyrobu) - [HealthGrades](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#healthgrades) - [IMDB](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#imdb) - +- 
[Wuzzuf](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#Wuzzuf) ### GitHub ```python @@ -733,3 +733,34 @@ boxoffice = imdb.BoxOffice() | Methods | Details | | --------------- | ----------------------------------------------------------------------------- | | `.top_movies()` | Returns the top box office movies, weekend and total gross and weeks released | + +### Wuzzuf + +The `JobScraper` class provides methods for configuring scraping parameters and fetching job listings: + +| Methods | Details | +| --------------------- | --------------------------------------------------------------------------------------------------- | +| `.filterJob()` | Apply filters such as job title, country, city, and range of years of experience. | +| `.fetchJobs()` | Fetch job listings from the website based on the applied filters, across multiple pages. | + +```python +from scrap-up import wuzzuf +``` +### How to use : +- **Create an instance of the JobScraper class:** + ```python + scraper = JobScraper() + ``` +
+ +- **Apply filters using the filterJob() method:** + ```python + scraper.filterJob(title="software engineer", country="Egypt", city="Cairo", minYearsOfExperience=2, maxYearsOfExperience=5) + ``` + Customize the filters based on your requirements. +
+- **Fetch job listings using the fetchJobs() method:** + + ```python + jobs = scraper.fetchJobs() + ``` From 6a892de61efa727b3175c0392d587d9dc5f33cb8 Mon Sep 17 00:00:00 2001 From: sabry Date: Mon, 13 May 2024 05:51:46 +0300 Subject: [PATCH 3/7] change naming convention to follow project standards --- src/scrape_up/wuzzuf/stringdecorator.py | 2 +- src/scrape_up/wuzzuf/wuzzuf.py | 204 +++++++++--------------- 2 files changed, 77 insertions(+), 129 deletions(-) diff --git a/src/scrape_up/wuzzuf/stringdecorator.py b/src/scrape_up/wuzzuf/stringdecorator.py index 5a6cb8eb..38c476c6 100644 --- a/src/scrape_up/wuzzuf/stringdecorator.py +++ b/src/scrape_up/wuzzuf/stringdecorator.py @@ -1,4 +1,4 @@ -def stringDecorator(func): +def string_decorator(func): """ The stringDecorator function wraps the decorated function and performs the following modifications: - If the result of the decorated function is None, it returns the string "NA". diff --git a/src/scrape_up/wuzzuf/wuzzuf.py b/src/scrape_up/wuzzuf/wuzzuf.py index 5fd8d788..8bfd00e9 100644 --- a/src/scrape_up/wuzzuf/wuzzuf.py +++ b/src/scrape_up/wuzzuf/wuzzuf.py @@ -1,161 +1,109 @@ import requests from bs4 import BeautifulSoup from time import sleep -from stringdecorator import stringDecorator - +from stringdecorator import string_decorator class JobScraper: """ Usage: - 1. Create an instance of the JobScraper class. + 1. Create an instance of the JobScraper class: ```python scraper = JobScraper() ``` - 2. Apply filters using the filterJob() method. + 2. Apply filters using the filter_job() method: ```python - scraper.filterJob(title="software engineer", country="Egypt", city="Cairo", minYearsOfExperience=2, maxYearsOfExperience=5) + scraper.filter_job(title="software engineer", country="Egypt", city="Cairo", min_years_of_experience=2, max_years_of_experience=5) ``` - Customize the filters based on your requirements. - 3. Fetch job listings using the fetchJobs() method. + 3. 
Fetch job listings using the fetch_jobs() method: ```python - jobs = scraper.fetchJobs() + jobs = scraper.fetch_jobs() ``` - The fetched jobs will be stored in the 'jobs' variable. - 4. Save the fetched jobs to a CSV file using the FileSaver class. - ```python - saver = FileSaver() - saver.saveToFile(jobs, 'jobListings.csv') - ``` - Specify the desired file path for the CSV file. + 4. Output or process the fetched jobs as needed. """ def __init__(self): - """ - Initializes the JobScraper instance with the base URL. - """ self.url = "https://wuzzuf.net/search/jobs/?" - def filterJob( - self, - title=None, - country=None, - city=None, - minYearsOfExperience=None, - maxYearsOfExperience=None, - ): - """ - Filters job listings based on specified criteria. - - Args: - title (str): The job title to filter by. - country (str): The country to filter by. - city (str): The city to filter by. - minYearsOfExperience (int): The minimum years of experience to filter by. - maxYearsOfExperience (int): The maximum years of experience to filter by. - """ - - if title is not None: - title.replace(" ", "+") + def filter_job(self, title=None, country=None, city=None, min_years_of_experience=None, max_years_of_experience=None): + if title: + title = title.replace(" ", "+") self.url += f"q={title}" - if country is not None: + if country: self.url += f"&filters[country][0]={country.strip().capitalize()}" - if city is not None: + if city: self.url += f"&filters[city][0]={city.strip().capitalize()}" - if minYearsOfExperience is not None: - self.url += f"&filters[years_of_experience_min][0]={minYearsOfExperience}" - if maxYearsOfExperience is not None: - self.url += f"&filters[years_of_experience_max][0]={maxYearsOfExperience}" - - def __fetchPageJobs(self, pageNum): - """ - Fetches job listings from a specific page. - - Args: - pageNum (int): The page number to fetch job listings from. - - Returns: - list: A list of job listings from the specified page. 
- - Raises: - ConnectionError: If there is an error fetching the job listings. - """ - - response = requests.get(self.url + f"&start={pageNum}") - jobSubList = [] - if response.status_code == 200: - parsedHtml = BeautifulSoup(response.content, "lxml") - jobsData = parsedHtml.find_all("div", {"class": "css-1gatmva e1v1l3u10"}) - - for jobData in jobsData: - job = { - "name": self.__getJobName(jobData), - "url": self.__getJobUrl(jobData), - "company": self.__getJobCompany(jobData), - "location": self.__getJobLocation(jobData), - "publishedTime": self.__getPublishedTime(jobData), - "properties": self.__getJobProperties(jobData), - } - jobSubList.append(job) - else: - raise ConnectionError(f"Error code: {response.status_code}") - return jobSubList - - def fetchJobs(self, maxPageNumber=1000): - """ - Fetches job listings from multiple pages. - - Returns: - list: A list of job listings from all pages. - """ - - jobList = [] - for pageNum in range(maxPageNumber): - jobSubList = self.__fetchPageJobs(pageNum) - if jobSubList: - jobList.extend(jobSubList) + if min_years_of_experience: + self.url += f"&filters[years_of_experience_min][0]={min_years_of_experience}" + if max_years_of_experience: + self.url += f"&filters[years_of_experience_max][0]={max_years_of_experience}" + def _fetch_page_jobs(self, page_num): + response = requests.get(self.url + f"&start={page_num}") + if response.status_code == 200: + parsed_html = BeautifulSoup(response.content, "lxml") + jobs_data = parsed_html.find_all("div", {"class": "css-1gatmva e1v1l3u10"}) + job_sub_list = [] + for job_data in jobs_data: + job = { + "name": self.__get_job_name(job_data), + "url": self.__get_job_url(job_data), + "company": self.__get_job_company(job_data), + "location": self.__get_job_location(job_data), + "published_time": self.__get_published_time(job_data), + "properties": self.__get_job_properties(job_data), + } + job_sub_list.append(job) + return job_sub_list else: - break - sleep(1) - return jobList - - 
@stringDecorator - def __getJobName(self, jobData): - return jobData.find("h2", {"class": "css-m604qf"}).find("a") - - def __getJobUrl(self, jobData): - return jobData.find("h2", {"class": "css-m604qf"}).find("a")["href"] - - @stringDecorator - def __getJobCompany(self, jobData): - return jobData.find("div", {"class": "css-d7j1kk"}).find("a") - - @stringDecorator - def __getJobLocation(self, jobData): - return jobData.find("span", {"class": "css-5wys0k"}) - - @stringDecorator - def __getPublishedTime(self, jobData): - return jobData.find("div", {"class": "css-4c4ojb"}) or jobData.find( - "div", {"class": "css-do6t5g"} - ) - - def __getJobProperties(self, jobData): - jobPropertiesString = " ,".join( - [prop.text for prop in jobData.find_all("span", {"class": "eoyjyou0"})] + raise ConnectionError(f"Error code: {response.status_code}") + + + def fetch_jobs(self, max_page_number=1000): + job_list = [] + try: + for page_num in range(max_page_number): + job_sub_list = self._fetch_page_jobs(page_num) + if job_sub_list: + job_list.extend(job_sub_list) + else: + break + sleep(1) + except requests.RequestException as e: + return None + return job_list + + @string_decorator + def __get_job_name(self, job_data): + return job_data.find("h2", {"class": "css-m604qf"}).find("a") + + def __get_job_url(self, job_data): + return job_data.find("h2", {"class": "css-m604qf"}).find("a")["href"] + + @string_decorator + def __get_job_company(self, job_data): + return job_data.find("div", {"class": "css-d7j1kk"}).find("a") + + @string_decorator + def __get_job_location(self, job_data): + return job_data.find("span", {"class": "css-5wys0k"}) + + @string_decorator + def __get_published_time(self, job_data): + return job_data.find("div", {"class": "css-4c4ojb"}) or job_data.find("div", {"class": "css-do6t5g"}) + + def __get_job_properties(self, job_data): + job_properties_string = " ,".join( + [prop.text for prop in job_data.find_all("span", {"class": "eoyjyou0"})] ) - return 
jobPropertiesString if jobPropertiesString else "NA" - + return job_properties_string if job_properties_string else "NA" def main(): scraper = JobScraper() - scraper.filterJob(title="software engineer") - jobs = scraper.fetchJobs(maxPageNumber=1000) - + scraper.filter_job(title="software engineer") + jobs = scraper.fetch_jobs(max_page_number=1) print(jobs) - if __name__ == "__main__": - main() + main() \ No newline at end of file From 1c0f409df0f7007ef81d3353f21e0e5b66906013 Mon Sep 17 00:00:00 2001 From: sabry Date: Mon, 13 May 2024 05:56:05 +0300 Subject: [PATCH 4/7] update ducumentation.md WuzzufScraper to follow naming standards --- documentation.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/documentation.md b/documentation.md index 50304a93..7a8446c7 100644 --- a/documentation.md +++ b/documentation.md @@ -740,8 +740,8 @@ The `JobScraper` class provides methods for configuring scraping parameters and | Methods | Details | | --------------------- | --------------------------------------------------------------------------------------------------- | -| `.filterJob()` | Apply filters such as job title, country, city, and range of years of experience. | -| `.fetchJobs()` | Fetch job listings from the website based on the applied filters, across multiple pages. | +| `.filter_job()` | Apply filters such as job title, country, city, and range of years of experience. | +| `.fetch_jobs()` | Fetch job listings from the website based on the applied filters, across multiple pages. | ```python from scrap-up import wuzzuf @@ -753,14 +753,14 @@ from scrap-up import wuzzuf ```
-- **Apply filters using the filterJob() method:** +- **Apply filters using the filter_job() method:** ```python - scraper.filterJob(title="software engineer", country="Egypt", city="Cairo", minYearsOfExperience=2, maxYearsOfExperience=5) + scraper.filter_job(title="software engineer", country="Egypt", city="Cairo", minYearsOfExperience=2, maxYearsOfExperience=5) ``` Customize the filters based on your requirements.
-- **Fetch job listings using the fetchJobs() method:** +- **Fetch job listings using the fetch_jobs() method:** ```python - jobs = scraper.fetchJobs() + jobs = scraper.fetch_jobs() ``` From b09acfeb16b0946eabbed005823a69a8e1c5c007 Mon Sep 17 00:00:00 2001 From: sabry Date: Mon, 13 May 2024 14:20:51 +0300 Subject: [PATCH 5/7] remove decorator class && change class name in wuzzuf.py --- src/scrape_up/wuzzuf/__init__.py | 3 + src/scrape_up/wuzzuf/stringdecorator.py | 39 ------- src/scrape_up/wuzzuf/wuzzuf.py | 145 +++++++++++++++--------- 3 files changed, 93 insertions(+), 94 deletions(-) create mode 100644 src/scrape_up/wuzzuf/__init__.py delete mode 100644 src/scrape_up/wuzzuf/stringdecorator.py diff --git a/src/scrape_up/wuzzuf/__init__.py b/src/scrape_up/wuzzuf/__init__.py new file mode 100644 index 00000000..69f2dc74 --- /dev/null +++ b/src/scrape_up/wuzzuf/__init__.py @@ -0,0 +1,3 @@ +from .wuzzuf import Jobs + +__all__ = ["Jobs"] \ No newline at end of file diff --git a/src/scrape_up/wuzzuf/stringdecorator.py b/src/scrape_up/wuzzuf/stringdecorator.py deleted file mode 100644 index 38c476c6..00000000 --- a/src/scrape_up/wuzzuf/stringdecorator.py +++ /dev/null @@ -1,39 +0,0 @@ -def string_decorator(func): - """ - The stringDecorator function wraps the decorated function and performs the following modifications: - - If the result of the decorated function is None, it returns the string "NA". - - it removes any trailing hyphen ('-') from the text. - - It strips any leading/trailing whitespace from the modified text. - - Args: - func (function): The function to be decorated. - - Returns: - function: The decorated function with the modified behavior. - - Example: - @stringDecorator - def getJobTitle(job): - return job.find("h2", {"class": "jobTitle"}) - - title = getJobTitle(job) - # If the job title is "Software Engineer -", it will be returned as "Software Engineer". - # If the job title is None, it will be returned as "NA". 
- # Any leading/trailing whitespace will be removed from the job title. - """ - - def wrapper(*args, **kwargs): - result = func(*args, **kwargs) - - if result is None: - return "NA" - - text = result.text - - if text.endswith("-"): - text = text[:-1] - - # Strip any leading/trailing whitespace and return - return text.strip() - - return wrapper diff --git a/src/scrape_up/wuzzuf/wuzzuf.py b/src/scrape_up/wuzzuf/wuzzuf.py index 8bfd00e9..34f7b100 100644 --- a/src/scrape_up/wuzzuf/wuzzuf.py +++ b/src/scrape_up/wuzzuf/wuzzuf.py @@ -1,33 +1,47 @@ import requests from bs4 import BeautifulSoup from time import sleep -from stringdecorator import string_decorator +import json -class JobScraper: - """ - Usage: - 1. Create an instance of the JobScraper class: - ```python - scraper = JobScraper() - ``` - - 2. Apply filters using the filter_job() method: - ```python - scraper.filter_job(title="software engineer", country="Egypt", city="Cairo", min_years_of_experience=2, max_years_of_experience=5) - ``` - - 3. Fetch job listings using the fetch_jobs() method: - ```python - jobs = scraper.fetch_jobs() - ``` - 4. Output or process the fetched jobs as needed. +class Jobs: + """ + Create an instance of the class `Jobs` + ```python + scraper = Jobs() + ``` + | Methods | Details | + | ----------------------------- | -------------------------------------------------------------------------------------------------- | + | `.filter_job()` | Apply filters to the job search using parameters like title, country, city, minimum and maximum years of experience. | + | `.fetch_jobs()` | Fetch job listings based on the applied filters, with an optional maximum number of pages to scrape. | """ def __init__(self): self.url = "https://wuzzuf.net/search/jobs/?" 
- def filter_job(self, title=None, country=None, city=None, min_years_of_experience=None, max_years_of_experience=None): + def filter_job( + self, + title=None, + country=None, + city=None, + min_years_of_experience=None, + max_years_of_experience=None, + ): + """ + Apply filters to the job search. + + Parameters: + - `title` (str): Job title to search for. + - `country` (str): Country to search for jobs in. + - `city` (str): City to search for jobs in. + - `min_years_of_experience` (int): Minimum years of experience required. + - `max_years_of_experience` (int): Maximum years of experience allowed. + + Example: + ```python + scraper.filter_job(title="software engineer", country="Egypt", city="Cairo", min_years_of_experience=2, max_years_of_experience=5) + ``` + """ if title: title = title.replace(" ", "+") self.url += f"q={title}" @@ -36,35 +50,53 @@ def filter_job(self, title=None, country=None, city=None, min_years_of_experienc if city: self.url += f"&filters[city][0]={city.strip().capitalize()}" if min_years_of_experience: - self.url += f"&filters[years_of_experience_min][0]={min_years_of_experience}" + self.url += ( + f"&filters[years_of_experience_min][0]={min_years_of_experience}" + ) if max_years_of_experience: - self.url += f"&filters[years_of_experience_max][0]={max_years_of_experience}" - def _fetch_page_jobs(self, page_num): - response = requests.get(self.url + f"&start={page_num}") - if response.status_code == 200: - parsed_html = BeautifulSoup(response.content, "lxml") - jobs_data = parsed_html.find_all("div", {"class": "css-1gatmva e1v1l3u10"}) - job_sub_list = [] - for job_data in jobs_data: - job = { - "name": self.__get_job_name(job_data), - "url": self.__get_job_url(job_data), - "company": self.__get_job_company(job_data), - "location": self.__get_job_location(job_data), - "published_time": self.__get_published_time(job_data), - "properties": self.__get_job_properties(job_data), - } - job_sub_list.append(job) - return job_sub_list - else: - raise 
ConnectionError(f"Error code: {response.status_code}") - - - def fetch_jobs(self, max_page_number=1000): + self.url += ( + f"&filters[years_of_experience_max][0]={max_years_of_experience}" + ) + + def __fetch_page_jobs(self, page_num): + response = requests.get(self.url + f"&start={page_num}") + if response.status_code == 200: + parsed_html = BeautifulSoup(response.content, "lxml") + jobs_data = parsed_html.find_all("div", {"class": "css-1gatmva e1v1l3u10"}) + job_sub_list = [] + for job_data in jobs_data: + job = { + "name": self.__get_job_name(job_data), + "url": self.__get_job_url(job_data), + "company": self.__get_job_company(job_data), + "location": self.__get_job_location(job_data), + "published_time": self.__get_published_time(job_data), + "properties": self.__get_job_properties(job_data), + } + job_sub_list.append(job) + return job_sub_list + else: + raise ConnectionError(f"Error code: {response.status_code}") + + def fetch_jobs(self, max_page_number=50): + """ + Fetch job listings based on the applied filters. + + Parameters: + - `max_page_number` (int): Maximum number of pages to scrape (default is 50). + + Returns: + - `list`: A list of dictionaries representing the fetched job listings. 
+ + Example: + ```python + jobs = scraper.fetch_jobs(max_page_number=5) + ``` + """ job_list = [] try: for page_num in range(max_page_number): - job_sub_list = self._fetch_page_jobs(page_num) + job_sub_list = self.__fetch_page_jobs(page_num) if job_sub_list: job_list.extend(job_sub_list) else: @@ -74,24 +106,24 @@ def fetch_jobs(self, max_page_number=1000): return None return job_list - @string_decorator def __get_job_name(self, job_data): - return job_data.find("h2", {"class": "css-m604qf"}).find("a") + return job_data.find("h2", {"class": "css-m604qf"}).find("a").text.strip() def __get_job_url(self, job_data): return job_data.find("h2", {"class": "css-m604qf"}).find("a")["href"] - @string_decorator def __get_job_company(self, job_data): - return job_data.find("div", {"class": "css-d7j1kk"}).find("a") + return job_data.find("div", {"class": "css-d7j1kk"}).find("a").text[:-1].strip() - @string_decorator def __get_job_location(self, job_data): - return job_data.find("span", {"class": "css-5wys0k"}) + data = job_data.find("span", {"class": "css-5wys0k"}) + return data.text.strip() if data else "NA" - @string_decorator def __get_published_time(self, job_data): - return job_data.find("div", {"class": "css-4c4ojb"}) or job_data.find("div", {"class": "css-do6t5g"}) + return ( + job_data.find("div", {"class": "css-4c4ojb"}) + or job_data.find("div", {"class": "css-do6t5g"}) + ).text.strip() def __get_job_properties(self, job_data): job_properties_string = " ,".join( @@ -99,11 +131,14 @@ def __get_job_properties(self, job_data): ) return job_properties_string if job_properties_string else "NA" + + def main(): - scraper = JobScraper() + scraper = Jobs() scraper.filter_job(title="software engineer") - jobs = scraper.fetch_jobs(max_page_number=1) + jobs = scraper.fetch_jobs(max_page_number=5) print(jobs) + if __name__ == "__main__": main() \ No newline at end of file From fb92a268e18566c197b1a4b283f491060b162ad8 Mon Sep 17 00:00:00 2001 From: sabry Date: Mon, 13 May 2024 
14:35:37 +0300 Subject: [PATCH 6/7] update dev-documentation.md with wuzzuf module --- dev-documentation.md | 13 +++++++++++++ documentation.md | 33 +--------------------------------- 2 files changed, 14 insertions(+), 32 deletions(-) diff --git a/dev-documentation.md b/dev-documentation.md index 624696a8..c76b7eb6 100644 --- a/dev-documentation.md +++ b/dev-documentation.md @@ -1616,3 +1616,16 @@ First create an object of class `Dictionary`. | `.get_word_of_the_day()` | Returns the word of the day. | | `.word_of_the_day_definition()` | Returns the definition of the word of the day. -------- +## Wuzzuf + +```python +from scrape_up import wuzzuf +jobs = wuzzuf.Jobs() +``` + +The `Jobs` class provides methods for configuring scraping parameters and fetching job listings: + +| Methods | Details | +| --------------------- | --------------------------------------------------------------------------------------------------- | +| `.filter_job()` | Apply filters such as job title, country, city, and range of years of experience. | +| `.fetch_jobs()` | Fetch job listings from the website based on the applied filters, across multiple pages.
| diff --git a/documentation.md b/documentation.md index 7a8446c7..dd03b34a 100644 --- a/documentation.md +++ b/documentation.md @@ -44,7 +44,7 @@ per user.followers() - [Flyrobu](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#flyrobu) - [HealthGrades](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#healthgrades) - [IMDB](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#imdb) -- [Wuzzuf](https://github.com/Clueless-Community/scrape-up/blob/main/documentation.md#Wuzzuf) + ### GitHub ```python @@ -733,34 +733,3 @@ boxoffice = imdb.BoxOffice() | Methods | Details | | --------------- | ----------------------------------------------------------------------------- | | `.top_movies()` | Returns the top box office movies, weekend and total gross and weeks released | - -### Wuzzuf - -The `JobScraper` class provides methods for configuring scraping parameters and fetching job listings: - -| Methods | Details | -| --------------------- | --------------------------------------------------------------------------------------------------- | -| `.filter_job()` | Apply filters such as job title, country, city, and range of years of experience. | -| `.fetch_jobs()` | Fetch job listings from the website based on the applied filters, across multiple pages. | - -```python -from scrap-up import wuzzuf -``` -### How to use : -- **Create an instance of the JobScraper class:** - ```python - scraper = JobScraper() - ``` -
- -- **Apply filters using the filter_job() method:** - ```python - scraper.filter_job(title="software engineer", country="Egypt", city="Cairo", minYearsOfExperience=2, maxYearsOfExperience=5) - ``` - Customize the filters based on your requirements. -
-- **Fetch job listings using the fetch_jobs() method:** - - ```python - jobs = scraper.fetch_jobs() - ``` From 9caa54ce690da877edbc18840f4b6af34476f1ac Mon Sep 17 00:00:00 2001 From: Nikhil Raj Date: Tue, 14 May 2024 14:29:46 +0530 Subject: [PATCH 7/7] Formatting and check. --- dev-documentation.md | 15 ++++++--------- src/scrape_up/wuzzuf/__init__.py | 2 +- src/scrape_up/wuzzuf/wuzzuf.py | 19 +++---------------- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/dev-documentation.md b/dev-documentation.md index 65db2ad3..c5c7c08a 100644 --- a/dev-documentation.md +++ b/dev-documentation.md @@ -1615,11 +1615,8 @@ First create an object of class `Dictionary`. | `.get_word_of_the_day()` | Returns the word of the day. | | `.word_of_the_day_definition()` | Returns the definition of the word of the day. | -| Methods | Details | -| ---------------- | -------------------------------------------------------------------------------------------- | -| `.get_word_of_the_day()` | Returns the word of the day. | -| `.word_of_the_day_definition()` | Returns the definition of the word of the day. --------- +--- + ## Wuzzuf ```python @@ -1629,7 +1626,7 @@ jobs = wuzzuf.Jobs() The `Jobs` class provides methods for configuring scraping parameters and fetching job listings: -| Methods | Details | -| --------------------- | --------------------------------------------------------------------------------------------------- | -| `.filter_job()` | Apply filters such as job title, country, city, and range of years of experience. | -| `.fetch_jobs()` | Fetch job listings from the website based on the applied filters, across multiple pages. | +| Methods | Details | +| --------------- | ---------------------------------------------------------------------------------------- | +| `.filter_job()` | Apply filters such as job title, country, city, and range of years of experience. 
| +| `.fetch_jobs()` | Fetch job listings from the website based on the applied filters, across multiple pages. | diff --git a/src/scrape_up/wuzzuf/__init__.py b/src/scrape_up/wuzzuf/__init__.py index 69f2dc74..1e78fe07 100644 --- a/src/scrape_up/wuzzuf/__init__.py +++ b/src/scrape_up/wuzzuf/__init__.py @@ -1,3 +1,3 @@ from .wuzzuf import Jobs -__all__ = ["Jobs"] \ No newline at end of file +__all__ = ["Jobs"] diff --git a/src/scrape_up/wuzzuf/wuzzuf.py b/src/scrape_up/wuzzuf/wuzzuf.py index 34f7b100..3d42efc3 100644 --- a/src/scrape_up/wuzzuf/wuzzuf.py +++ b/src/scrape_up/wuzzuf/wuzzuf.py @@ -1,7 +1,6 @@ import requests from bs4 import BeautifulSoup from time import sleep -import json class Jobs: @@ -76,7 +75,7 @@ def __fetch_page_jobs(self, page_num): job_sub_list.append(job) return job_sub_list else: - raise ConnectionError(f"Error code: {response.status_code}") + return None def fetch_jobs(self, max_page_number=50): """ @@ -117,7 +116,7 @@ def __get_job_company(self, job_data): def __get_job_location(self, job_data): data = job_data.find("span", {"class": "css-5wys0k"}) - return data.text.strip() if data else "NA" + return data.text.strip() if data else None def __get_published_time(self, job_data): return ( @@ -129,16 +128,4 @@ def __get_job_properties(self, job_data): job_properties_string = " ,".join( [prop.text for prop in job_data.find_all("span", {"class": "eoyjyou0"})] ) - return job_properties_string if job_properties_string else "NA" - - - -def main(): - scraper = Jobs() - scraper.filter_job(title="software engineer") - jobs = scraper.fetch_jobs(max_page_number=5) - print(jobs) - - -if __name__ == "__main__": - main() \ No newline at end of file + return job_properties_string if job_properties_string else None