Skip to content

Commit

Permalink
Add in a json file for the Journal IF - to be updated yearly.
Browse files Browse the repository at this point in the history
  • Loading branch information
Luen committed May 14, 2024
1 parent 462f579 commit 413a469
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 57 deletions.
10 changes: 10 additions & 0 deletions journal_impact_factor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"global change biology": "11.6",
"nature climate change": "30.7",
"journal of experimental biology": "2.8",
"transactions of the american fisheries society": "1.4",
"science": "56.9",
"functional ecology": "5.2",
"scientific reports": "4.6",
"comparative biochemistry and physiology part a: molecular & integrative physiology": "2.3"
}
108 changes: 55 additions & 53 deletions journal_impact_factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def fetch_if_from_wikipedia(journal_name):
# Try to get the Wikipedia page for the journal
page = wikipedia.page(journal_name.replace(' ', '_').title()+"_(journal)") # E.g., https://en.wikipedia.org/wiki/Significance_(journal)
if page:
impact_factor = parse_if_from_wikipedia(page.html) # Note the lack of () which is intentional as pymediawiki handles the .page differently
impact_factor = parse_if_from_wikipedia(page.html)
if impact_factor is False:
return None
if impact_factor is not None:
Expand All @@ -50,7 +50,7 @@ def fetch_if_from_wikipedia(journal_name):
time.sleep(10)
page = wikipedia.page(option)
if page:
impact_factor = parse_if_from_wikipedia(page.html())
impact_factor = parse_if_from_wikipedia(page.html)
if impact_factor is False:
return None
if impact_factor is not None:
Expand All @@ -63,7 +63,7 @@ def fetch_if_from_wikipedia(journal_name):
time.sleep(10)
page = wikipedia.page(search[0])
if page:
impact_factor = parse_if_from_wikipedia(page.html())
impact_factor = parse_if_from_wikipedia(page.html)
if impact_factor is False:
return None
if impact_factor is not None:
Expand Down Expand Up @@ -106,63 +106,65 @@ async def fetch_if_from_bioxbio(journal_name):
async with async_playwright() as p:
# Launch the browser
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
async with browser:

# Construct the URL and navigate to the page
url = f"https://www.bioxbio.com/search/?q={journal_name.replace(' ', '+')}"
await page.goto(url)
page = await browser.new_page()

# Wait for the necessary elements to load
await page.wait_for_selector('div.gsc-expansionArea', timeout=10000)
# Construct the URL and navigate to the page
url = f"https://www.bioxbio.com/search/?q={journal_name.replace(' ', '+')}"
await page.goto(url)

# Get the search results
results = await page.query_selector_all('div.gsc-webResult.gsc-result')
if not results:
print("No results found")
return None
# Wait for the necessary elements to load
await page.wait_for_selector('div.gsc-expansionArea', timeout=10000)

first_result_link = await results[0].query_selector('a.gs-title')
if first_result_link:
# Click the first result link and wait for the navigation to complete
link = await first_result_link.get_attribute('href')
await page.goto(link, wait_until='load')

# Wait for the table to load on the new page
#await page.wait_for_selector('table.table-bordered', timeout=10000)

# Directly extract the Impact Factor from the table using Playwright
impact_factor_array = await page.evaluate('''() => {
const table = document.querySelector('table.table-bordered');
if (!table) {
return [];
}
const secondRow = table.querySelectorAll('tr')[1];
return secondRow ? [
secondRow.children[0].textContent.trim(),
secondRow.children[1].textContent.trim()
] : [];
}''')

if not impact_factor_array or len(impact_factor_array) != 2:
print_warn("Impact Factor table not found.")
# Get the search results
results = await page.query_selector_all('div.gsc-webResult.gsc-result')
if not results:
print("No results found")
return None

# Extracting year and impact factor
impact_factor_year = impact_factor_array[0].split(' ')[0] if impact_factor_array else None
impact_factor = impact_factor_array[1] if impact_factor_array else None
if not impact_factor:
print_warn("Impact Factor not found.")
return None
first_result_link = await results[0].query_selector('a.gs-title')
if first_result_link:
# Click the first result link and wait for the navigation to complete
link = await first_result_link.get_attribute('href')
await page.goto(link, wait_until='load')

# Wait for the table to load on the new page
#await page.wait_for_selector('table.table-bordered', timeout=10000)

# Directly extract the Impact Factor from the table using Playwright
impact_factor_array = await page.evaluate('''() => {
const table = document.querySelector('table.table-bordered');
if (!table) {
return [];
}
const secondRow = table.querySelectorAll('tr')[1];
return secondRow ? [
secondRow.children[0].textContent.trim(),
secondRow.children[1].textContent.trim()
] : [];
}''')

if not impact_factor_array or len(impact_factor_array) != 2:
print_warn("Impact Factor table not found.")
return None

# Extracting year and impact factor
impact_factor_year = impact_factor_array[0].split(' ')[0] if impact_factor_array else None
impact_factor = impact_factor_array[1] if impact_factor_array else None
if not impact_factor:
print_warn("Impact Factor not found.")
return None

current_year = int(time.strftime("%Y"))
if impact_factor_year and int(impact_factor_year) < (current_year - 2):
print_warn(f"Year is {impact_factor_year}. Impact Factor outdated.")
return None

#print(f"Impact Factor: {impact_factor}")
return impact_factor

current_year = int(time.strftime("%Y"))
if impact_factor_year and int(impact_factor_year) < (current_year - 2):
print_warn(f"Year is {impact_factor_year}. Impact Factor outdated.")
return None

#print(f"Impact Factor: {impact_factor}")
return impact_factor

print_warn("First result link not found.")
print_warn("First result link not found.")
return None
except Exception as e:
print_error(f"Error fetching from BioxBio: {e}")
Expand Down
34 changes: 30 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@

scholar_id = sys.argv[1]


def load_impact_factor_json_file(file_path):
    """Load the journal impact-factor cache from *file_path*.

    Returns the parsed dict (journal name -> impact factor string), or an
    empty dict when the file is missing or contains invalid JSON; a warning
    is printed in either failure case so the caller can proceed and rebuild
    the cache.
    """
    try:
        # JSON is defined as UTF-8; be explicit so the platform's locale
        # default encoding (e.g. cp1252 on Windows) cannot mis-decode it.
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError:
        print_warn(f"Error loading JSON from {file_path}. File may be empty or malformed.")
        return {}
    except FileNotFoundError:
        print_warn(f"File {file_path} not found. Creating a new one.")
        return {}

def save_impact_factor_json_file(file_path, data):
    """Write *data* (journal name -> impact factor dict) to *file_path* as JSON.

    Overwrites any existing file; output is pretty-printed for easy manual
    yearly updates.
    """
    # Write UTF-8 explicitly so the file round-trips correctly regardless
    # of the platform's locale default encoding.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


try:
print(f"Getting author with ID: {scholar_id}")
print("This script will take a while to complete due to the rate limits of the scraping website and APIs used")
Expand Down Expand Up @@ -42,7 +59,7 @@
standardised_authors = standardise_authors(authors)
filled_pub['bib']['authors_standardised'] = standardised_authors

if not "symposium" in journal_name or not "conference" in journal_name or not "workshop" in journal_name or not "annual meeting" in journal_name:
if not ("symposium" in journal_name or "conference" in journal_name or "workshop" in journal_name or "annual meeting" in journal_name):
# Get DOI
print(f"Getting DOI for {pub_url}")

Expand Down Expand Up @@ -75,14 +92,23 @@
filled_pub['doi_resolved_link'] = resolved_link if resolved_link else ""

# Get Impact Factor
impact_factor_json = load_impact_factor_json_file("journal_impact_factor.json")
impact_factor = None
if journal_name:
print(f"Getting impact factor for {journal_name}")
impact_factor = get_impact_factor(journal_name.lower())
print_info(f"Impact factor: {impact_factor}")
if journal_name.lower() not in impact_factor_json:
print_warn("TODO: Implement a search function if the journal name isn't exactly the same - e.g., levenshtein.") # https://github.com/Luen/google-scholar-references-py/blob/main/references.py
print(f"Getting impact factor for {journal_name}")
impact_factor = get_impact_factor(journal_name.lower())
print_info(f"Impact factor: {impact_factor}")
# Add impact factor to journal_impact_factor.json
impact_factor_json[journal_name.lower()] = impact_factor
save_impact_factor_json_file("journal_impact_factor.json", impact_factor_json)
else:
print_info(f"Impact factor found in journal_impact_factor.json")
else:
print_warn(f"Journal name not found.")
filled_pub['bib']['impact_factor'] = impact_factor

else:
print_warn(f"Skipping DOI and Impact Factor for symposium, conference, workshop, or annual meeting: {journal_name}")
filled_pub['doi'] = ""
Expand Down

0 comments on commit 413a469

Please sign in to comment.