Skip to content

Commit

Permalink
Add in a json file for the Journal IF - to be updated yearly.
Browse files Browse the repository at this point in the history
  • Loading branch information
Luen committed May 14, 2024
1 parent 462f579 commit 413a469
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 57 deletions.
10 changes: 10 additions & 0 deletions journal_impact_factor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"global change biology": "11.6",
"nature climate change": "30.7",
"journal of experimental biology": "2.8",
"transactions of the american fisheries society": "1.4",
"science": "56.9",
"functional ecology": "5.2",
"scientific reports": "4.6",
"comparative biochemistry and physiology part a: molecular & integrative physiology": "2.3"
}
108 changes: 55 additions & 53 deletions journal_impact_factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def fetch_if_from_wikipedia(journal_name):
# Try to get the Wikipedia page for the journal
page = wikipedia.page(journal_name.replace(' ', '_').title()+"_(journal)") # E.g., https://en.wikipedia.org/wiki/Significance_(journal)
if page:
impact_factor = parse_if_from_wikipedia(page.html) # Note the lack of () which is intentional as pymediawiki handles the .page differently
impact_factor = parse_if_from_wikipedia(page.html)
if impact_factor is False:
return None
if impact_factor is not None:
Expand All @@ -50,7 +50,7 @@ def fetch_if_from_wikipedia(journal_name):
time.sleep(10)
page = wikipedia.page(option)
if page:
impact_factor = parse_if_from_wikipedia(page.html())
impact_factor = parse_if_from_wikipedia(page.html)
if impact_factor is False:
return None
if impact_factor is not None:
Expand All @@ -63,7 +63,7 @@ def fetch_if_from_wikipedia(journal_name):
time.sleep(10)
page = wikipedia.page(search[0])
if page:
impact_factor = parse_if_from_wikipedia(page.html())
impact_factor = parse_if_from_wikipedia(page.html)
if impact_factor is False:
return None
if impact_factor is not None:
Expand Down Expand Up @@ -106,63 +106,65 @@ async def fetch_if_from_bioxbio(journal_name):
async with async_playwright() as p:
# Launch the browser
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
async with browser:

# Construct the URL and navigate to the page
url = f"https://www.bioxbio.com/search/?q={journal_name.replace(' ', '+')}"
await page.goto(url)
page = await browser.new_page()

# Wait for the necessary elements to load
await page.wait_for_selector('div.gsc-expansionArea', timeout=10000)
# Construct the URL and navigate to the page
url = f"https://www.bioxbio.com/search/?q={journal_name.replace(' ', '+')}"
await page.goto(url)

# Get the search results
results = await page.query_selector_all('div.gsc-webResult.gsc-result')
if not results:
print("No results found")
return None
# Wait for the necessary elements to load
await page.wait_for_selector('div.gsc-expansionArea', timeout=10000)

first_result_link = await results[0].query_selector('a.gs-title')
if first_result_link:
# Click the first result link and wait for the navigation to complete
link = await first_result_link.get_attribute('href')
await page.goto(link, wait_until='load')

# Wait for the table to load on the new page
#await page.wait_for_selector('table.table-bordered', timeout=10000)

# Directly extract the Impact Factor from the table using Playwright
impact_factor_array = await page.evaluate('''() => {
const table = document.querySelector('table.table-bordered');
if (!table) {
return [];
}
const secondRow = table.querySelectorAll('tr')[1];
return secondRow ? [
secondRow.children[0].textContent.trim(),
secondRow.children[1].textContent.trim()
] : [];
}''')

if not impact_factor_array or len(impact_factor_array) != 2:
print_warn("Impact Factor table not found.")
# Get the search results
results = await page.query_selector_all('div.gsc-webResult.gsc-result')
if not results:
print("No results found")
return None

# Extracting year and impact factor
impact_factor_year = impact_factor_array[0].split(' ')[0] if impact_factor_array else None
impact_factor = impact_factor_array[1] if impact_factor_array else None
if not impact_factor:
print_warn("Impact Factor not found.")
return None
first_result_link = await results[0].query_selector('a.gs-title')
if first_result_link:
# Click the first result link and wait for the navigation to complete
link = await first_result_link.get_attribute('href')
await page.goto(link, wait_until='load')

# Wait for the table to load on the new page
#await page.wait_for_selector('table.table-bordered', timeout=10000)

# Directly extract the Impact Factor from the table using Playwright
impact_factor_array = await page.evaluate('''() => {
const table = document.querySelector('table.table-bordered');
if (!table) {
return [];
}
const secondRow = table.querySelectorAll('tr')[1];
return secondRow ? [
secondRow.children[0].textContent.trim(),
secondRow.children[1].textContent.trim()
] : [];
}''')

if not impact_factor_array or len(impact_factor_array) != 2:
print_warn("Impact Factor table not found.")
return None

# Extracting year and impact factor
impact_factor_year = impact_factor_array[0].split(' ')[0] if impact_factor_array else None
impact_factor = impact_factor_array[1] if impact_factor_array else None
if not impact_factor:
print_warn("Impact Factor not found.")
return None

current_year = int(time.strftime("%Y"))
if impact_factor_year and int(impact_factor_year) < (current_year - 2):
print_warn(f"Year is {impact_factor_year}. Impact Factor outdated.")
return None

#print(f"Impact Factor: {impact_factor}")
return impact_factor

current_year = int(time.strftime("%Y"))
if impact_factor_year and int(impact_factor_year) < (current_year - 2):
print_warn(f"Year is {impact_factor_year}. Impact Factor outdated.")
return None

#print(f"Impact Factor: {impact_factor}")
return impact_factor

print_warn("First result link not found.")
print_warn("First result link not found.")
return None
except Exception as e:
print_error(f"Error fetching from BioxBio: {e}")
Expand Down
34 changes: 30 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@

scholar_id = sys.argv[1]


def load_impact_factor_json_file(file_path):
    """Load the journal impact-factor cache from *file_path*.

    Returns the parsed dict (journal name -> impact factor string), or an
    empty dict when the file is missing or contains invalid JSON; a warning
    is printed in either failure case so the caller can proceed and rebuild
    the cache.
    """
    try:
        # JSON is defined as UTF-8; be explicit so the platform's locale
        # default encoding (e.g. cp1252 on Windows) cannot mis-decode it.
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError:
        print_warn(f"Error loading JSON from {file_path}. File may be empty or malformed.")
        return {}
    except FileNotFoundError:
        print_warn(f"File {file_path} not found. Creating a new one.")
        return {}

def save_impact_factor_json_file(file_path, data):
    """Write *data* (journal name -> impact factor dict) to *file_path* as JSON.

    Overwrites any existing file; output is pretty-printed for easy manual
    yearly updates.
    """
    # Write UTF-8 explicitly so the file round-trips correctly regardless
    # of the platform's locale default encoding.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


try:
print(f"Getting author with ID: {scholar_id}")
print("This script will take a while to complete due to the rate limits of the scraping website and APIs used")
Expand Down Expand Up @@ -42,7 +59,7 @@
standardised_authors = standardise_authors(authors)
filled_pub['bib']['authors_standardised'] = standardised_authors

if not "symposium" in journal_name or not "conference" in journal_name or not "workshop" in journal_name or not "annual meeting" in journal_name:
if not ("symposium" in journal_name or "conference" in journal_name or "workshop" in journal_name or "annual meeting" in journal_name):
# Get DOI
print(f"Getting DOI for {pub_url}")

Expand Down Expand Up @@ -75,14 +92,23 @@
filled_pub['doi_resolved_link'] = resolved_link if resolved_link else ""

# Get Impact Factor
impact_factor_json = load_impact_factor_json_file("journal_impact_factor.json")
impact_factor = None
if journal_name:
print(f"Getting impact factor for {journal_name}")
impact_factor = get_impact_factor(journal_name.lower())
print_info(f"Impact factor: {impact_factor}")
if journal_name.lower() not in impact_factor_json:
print_warn("TODO: Implement a search function if the journal name isn't exactly the same - e.g., levenshtein.") # https://github.com/Luen/google-scholar-references-py/blob/main/references.py
print(f"Getting impact factor for {journal_name}")
impact_factor = get_impact_factor(journal_name.lower())
print_info(f"Impact factor: {impact_factor}")
# Add impact factor to journal_impact_factor.json
impact_factor_json[journal_name.lower()] = impact_factor
save_impact_factor_json_file("journal_impact_factor.json", impact_factor_json)
else:
print_info(f"Impact factor found in journal_impact_factor.json")
else:
print_warn(f"Journal name not found.")
filled_pub['bib']['impact_factor'] = impact_factor

else:
print_warn(f"Skipping DOI and Impact Factor for symposium, conference, workshop, or annual meeting: {journal_name}")
filled_pub['doi'] = ""
Expand Down

0 comments on commit 413a469

Please sign in to comment.