
Commit

added find location alternative method
PhillipMaire committed Dec 4, 2023
1 parent e177d41 commit 55c3114
Showing 11 changed files with 1,445 additions and 184 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -157,4 +157,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.DS_Store
**/.DS_Store


notebooks/.DS_Store
notebooks/.DS_Store
scrapifurs/data/.DS_Store
8 changes: 8 additions & 0 deletions _______requirements.txt
@@ -0,0 +1,8 @@
beautifulsoup4==4.12.2
numpy==1.25.2
openai==0.27.8
pandas==2.0.3
python-dotenv==1.0.0
scrapifurs==0.1.2
selenium==4.11.2
setuptools==68.0.0
Binary file modified notebooks/.DS_Store
Binary file not shown.
329 changes: 329 additions & 0 deletions notebooks/scratch/Untitled1.ipynb

Large diffs are not rendered by default.

Binary file modified notebooks/working/.DS_Store
Binary file not shown.
1,048 changes: 926 additions & 122 deletions notebooks/working/fully_imported_inside_package_v1.ipynb

Large diffs are not rendered by default.

16 changes: 10 additions & 6 deletions requirements.txt
@@ -1,8 +1,12 @@
beautifulsoup4==4.12.2
numpy==1.25.2
openai==0.27.8
pandas==2.0.3
geopy==2.4.0
matplotlib==3.8.2
numpy==1.26.2
openai==1.3.3
pandas==2.1.3
plotly==5.18.0
python-dotenv==1.0.0
scrapifurs==0.1.2
selenium==4.11.2
setuptools==68.0.0
scipy==1.11.4
seaborn==0.13.0
selenium==4.15.2
tqdm==4.66.1
Binary file modified scrapifurs/data/.DS_Store
Binary file not shown.
15 changes: 10 additions & 5 deletions scrapifurs/job_main_page.py
@@ -56,8 +56,6 @@





def open_browser(info_dict):
#init chrome
chrome_options = Options()
@@ -204,11 +202,18 @@ def get_apply_link(driver):


def get_location(element):
location_div = element.find_element(By.CSS_SELECTOR, ".job-details-jobs-unified-top-card__primary-description div")
try:
location_div = element.find_element(By.CSS_SELECTOR, ".job-details-jobs-unified-top-card__primary-description div")
except:
print('used new method for get_location')
location_div = element.find_element(By.CSS_SELECTOR, "div.mb2")



location_parts = location_div.text.split('·')
potential_location = location_parts[1].strip().split()
if "," in potential_location[1]:
return ' '.join(potential_location[:2])
if "," in ''.join(potential_location[:1]):
return ' '.join(potential_location[:2])
else:
return ' '.join(potential_location[:1])
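
For reference, a minimal sketch of the same selector-fallback idea using an explicit Selenium exception instead of a bare except (the selectors are copied from the diff above; get_location_text is a hypothetical name, not part of the package):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def get_location_text(element):
    # Prefer the unified top-card selector; fall back to the broader "div.mb2" container.
    try:
        location_div = element.find_element(
            By.CSS_SELECTOR,
            ".job-details-jobs-unified-top-card__primary-description div")
    except NoSuchElementException:
        location_div = element.find_element(By.CSS_SELECTOR, "div.mb2")
    return location_div.text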

13 changes: 8 additions & 5 deletions scrapifurs/search_jobs_window.py
@@ -320,9 +320,9 @@ def get_job_details(driver, existing_job_ids=[]):
job_dict["time_added"].append(formatted_time_as_int)


if job_listings.click_job_by_index(job_ind): #$%^ validate clicks only new ones
if job_listings.click_job_by_index(job_ind):
job_dict["about_the_job"].append(job_listings.get_about_job_info())
time.sleep(np.random.uniform(1.5, 3)) # if its too fast i think linked in will flag
time.sleep(np.random.uniform(1.5, 3)) # if it's too fast, LinkedIn may flag it
else:
job_dict["about_the_job"].append('NA')

@@ -374,7 +374,7 @@ def scrape_job_data(info_dict, scraper_settings_list, data_class, auto_update_li
Args:
- info_dict: A dictionary containing general information for the browser setup.
- scraper_settings: A dictionary containing specific settings for scraping, including start_url.
- data_class: An instance of a DataClass containing df_main and a save_it method.
- data_class: An instance of a DataFile containing df_main and a save_it method.
"""
for scraper_settings in scraper_settings_list:
start_url = scraper_settings.get('start_url')
@@ -410,9 +410,12 @@ def scrape_job_data(info_dict, scraper_settings_list, data_class, auto_update_li
job_data = get_job_details(driver, existing_job_ids)
job_data = pd.DataFrame(job_data)
job_data['job_ids'] = job_data['job_ids'].astype('int')
data_class.df_main = update_dataframe(data_class.df_main, job_data, ['job_ids'])
if len(job_data)>0:
data_class.df_main = update_dataframe(data_class.df_main, job_data, ['job_ids'])

data_class.save_it()
data_class.save_it()
#update the file to have pay info as ints
utils.update_job_data(data_class.filename, overwrite=True)

print('_____________________________________________\n\n_____________________________________________')
time.sleep(update_every_n_secs)
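
For context, a hedged sketch of one entry in scraper_settings_list, using only the keys that scrape_job_data reads (the search URL is a placeholder, not taken from the repository):

scraper_settings_list = [{
    'start_url': 'https://www.linkedin.com/jobs/search/?keywords=data%20scientist',  # placeholder URL
    'n_pages_to_scrape': 5,           # pages to walk per pass
    'wait_sec_each_page': 5,          # seconds to wait after each page load
    'update_every_n_secs': 60 * 5,    # pause between full passes
    'existing_job_ids': [],           # job ids to skip when scraping
}]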
193 changes: 148 additions & 45 deletions scrapifurs/utils.py
@@ -22,6 +22,109 @@
from scrapifurs.search_jobs_window import DataFile


import numpy as np
import re


def update_job_data(path, overwrite=False):
"""
Processes Excel files containing job data, updating them with additional salary information.
This function can handle both single Excel files and directories containing multiple Excel files.
Parameters:
- path (str): The path to an Excel file or a directory containing Excel files.
If a directory is specified, the function processes all '.xlsx' files within it.
- overwrite (bool, optional): If set to True, the function overwrites existing salary information in the files.
If False, it skips files that already contain specified salary columns.
Defaults to False.
Behavior:
- The function adds new columns ('lower_salaries', 'upper_salaries', 'units', 'yearly_lower_salaries',
'yearly_upper_salaries') to the DataFrame based on parsed salary information.
- For hourly wages, it converts them to yearly salaries using a predefined conversion factor.
- If 'overwrite' is False and any of the specified columns already exist in a file,
that file is skipped without modification.
- Prints messages for skipped files or if the provided path is neither a file nor a directory.
"""

# Check if the path is a directory
if os.path.isdir(path):
# If it's a directory, process all .xlsx files in it
xlsx_files = glob.glob(os.path.join(path, '*.xlsx'))
for file in xlsx_files:
update_job_data(file, overwrite)
elif os.path.isfile(path):
# If it's a file, process the file
job_data = pd.read_excel(path)

# Define the columns to check for existing data
check_columns = ['lower_salaries', 'upper_salaries', 'units', 'yearly_lower_salaries', 'yearly_upper_salaries']

# Check if any of the specified columns already exist
if not job_data.columns.isin(check_columns).any() or overwrite:
# If none of the columns exist, process the file

# Process salary information
lower_salaries, upper_salaries, units = parse_salary_info(job_data['pay'])
job_data[check_columns] = pd.DataFrame([lower_salaries, upper_salaries, units, lower_salaries, upper_salaries]).T

# Function to convert hourly to yearly salary
def convert_hr_to_yr(x):
return int(x * 1700)

# Apply conversion for hourly wages
c = job_data['units'] == 'hr'
job_data.loc[c, 'yearly_lower_salaries'] = job_data.loc[c, 'lower_salaries'].apply(convert_hr_to_yr)
job_data.loc[c, 'yearly_upper_salaries'] = job_data.loc[c, 'upper_salaries'].apply(convert_hr_to_yr)

# Save the updated DataFrame back to the original Excel file
job_data.to_excel(path, index=False)
else:
print(f"File {path} already contains one or more of the specified columns and overwrite is False. Skipping.")
else:
print(f"Path {path} is neither a file nor a directory.")
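
A minimal usage sketch, assuming the paths below (the file name is illustrative, not taken from the repository):

# Add salary columns to every .xlsx file in the data folder, skipping files already processed
update_job_data('scrapifurs/data', overwrite=False)

# Recompute the salary columns for a single file
update_job_data('scrapifurs/data/jobs.xlsx', overwrite=True)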


def covert_k_to_num(str):
str = str.lower()
if 'k' in str:
num_out = int(float(str.split('k')[0]))*1000
else:
num_out = float(str)
return num_out

def parse_salary_info(in_list):

in_list = ["" if pd.isna(item) else item for item in in_list]

lower_salaries = []
upper_salaries = []
units = []

for item in in_list:
# Regular expression to find salary ranges and units
# matches = re.findall(r'\$([0-9]+(?:K)?)\s*\/\s*(yr|hr)', item)
matches = re.findall(r'\$([0-9]+(?:\.[0-9]+)?(?:K)?)\s*\/\s*(yr|hr)', item)

if matches:
# Convert salary string to numerical value
lower_salary = covert_k_to_num(matches[0][0])
if len(matches) > 1:
upper_salary = covert_k_to_num(matches[1][0])
else:
upper_salary = lower_salary

lower_salaries.append(lower_salary)
upper_salaries.append(upper_salary)
units.append(matches[0][1])
else:
lower_salaries.append(np.nan)
upper_salaries.append(np.nan)
units.append('NA')

return lower_salaries, upper_salaries, units
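
A short worked example of the parser on typical LinkedIn pay strings (inputs and values are illustrative):

pay = ["$95K/yr - $120K/yr", "$40/hr", None]
lower, upper, units = parse_salary_info(pay)
# lower -> [95000, 40.0, nan]
# upper -> [120000, 40.0, nan]
# units -> ['yr', 'hr', 'NA']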


def update_master_files(data_folder, fn_applied, fn_skipped):
files = glob.glob(f'{data_folder}/*.xlsx')

@@ -550,7 +653,7 @@ def save_it(self):

# Args:
# - info_dict: A dictionary containing start_url and other necessary information.
# - data_class: An instance of a DataClass containing df_main and a save_it method.
# - data_class: An instance of a DataFile containing df_main and a save_it method.
# - n_pages_to_scrape: Number of pages to scrape.
# - wait_sec_each_page: Time to wait on each page before scraping.
# - update_every_n_secs: How often to update the data in seconds.
@@ -583,54 +686,54 @@ def save_it(self):



def scrape_job_data(info_dict, scraper_settings_list, data_class, auto_update_link=True):
"""
Scrapes job data based on specified settings.
# def scrape_job_data(info_dict, scraper_settings_list, data_class, auto_update_link=True):
# """
# Scrapes job data based on specified settings.

Args:
- info_dict: A dictionary containing general information for the browser setup.
- scraper_settings: A dictionary containing specific settings for scraping, including start_url.
- data_class: An instance of a DataClass containing df_main and a save_it method.
"""
for scraper_settings in scraper_settings_list:
start_url = scraper_settings.get('start_url')
n_pages_to_scrape = scraper_settings.get('n_pages_to_scrape', 5)
wait_sec_each_page = scraper_settings.get('wait_sec_each_page', 5)
update_every_n_secs = scraper_settings.get('update_every_n_secs', 60*5)
existing_job_ids = scraper_settings.get('existing_job_ids', [])
# Args:
# - info_dict: A dictionary containing general information for the browser setup.
# - scraper_settings: A dictionary containing specific settings for scraping, including start_url.
# - data_class: An instance of a DataFile containing df_main and a save_it method.
# """
# for scraper_settings in scraper_settings_list:
# start_url = scraper_settings.get('start_url')
# n_pages_to_scrape = scraper_settings.get('n_pages_to_scrape', 5)
# wait_sec_each_page = scraper_settings.get('wait_sec_each_page', 5)
# update_every_n_secs = scraper_settings.get('update_every_n_secs', 60*5)
# existing_job_ids = scraper_settings.get('existing_job_ids', [])

driver = open_browser(info_dict)
driver.get(start_url)
# driver = open_browser(info_dict)
# driver.get(start_url)

if auto_update_link:
# we click the search button to update the link then print it
job_listings = JobListings(driver)
time.sleep(2)
job_listings.click_search_button()
current_url = driver.current_url
print("Current URL:", current_url)
start_url = current_url
driver.get(start_url)
# if auto_update_link:
# # we click the search button to update the link then print it
# job_listings = JobListings(driver)
# time.sleep(2)
# job_listings.click_search_button()
# current_url = driver.current_url
# print("Current URL:", current_url)
# start_url = current_url
# driver.get(start_url)


try:
while True:
for n_page in range(n_pages_to_scrape):
if n_page == 0:
# Go back to the first page
driver.get(start_url)
else:
go_to_next_page(driver)
time.sleep(wait_sec_each_page)
# Update data
job_data = get_job_details(driver, existing_job_ids)
job_data = pd.DataFrame(job_data)
job_data['job_ids'] = job_data['job_ids'].astype('int')
data_class.df_main = update_dataframe(data_class.df_main, job_data, ['job_ids'])
# try:
# while True:
# for n_page in range(n_pages_to_scrape):
# if n_page == 0:
# # Go back to the first page
# driver.get(start_url)
# else:
# go_to_next_page(driver)
# time.sleep(wait_sec_each_page)
# # Update data
# job_data = get_job_details(driver, existing_job_ids)
# job_data = pd.DataFrame(job_data)
# job_data['job_ids'] = job_data['job_ids'].astype('int')
# data_class.df_main = update_dataframe(data_class.df_main, job_data, ['job_ids'])

data_class.save_it()
# data_class.save_it()

print('_____________________________________________\n\n_____________________________________________')
time.sleep(update_every_n_secs)
finally:
driver.quit()
# print('_____________________________________________\n\n_____________________________________________')
# time.sleep(update_every_n_secs)
# finally:
# driver.quit()
