# Semantic_Scholar_tab.py
# Description: contains the code to create the Semantic Scholar tab in the Gradio UI.
#
# Provenance: added by patch e71485e1857d418c4c7a18b33c44ea5e2a11920f
#   ("Semantic Scholar integration", Robert, Thu, 24 Oct 2024 20:49:36 -0700).
# Wiring done by the same patch in App_Function_Libraries/Gradio_Related.py:
#   from App_Function_Libraries.Gradio_UI.Semantic_Scholar_tab import create_semantic_scholar_tab
#   ...and launch_ui() calls create_semantic_scholar_tab() right after create_arxiv_tab().
#
# Imports
#
# External Libraries
import gradio as gr
#
# Internal Libraries
from App_Function_Libraries.Third_Party.Semantic_Scholar import search_and_display, FIELDS_OF_STUDY, PUBLICATION_TYPES


#
######################################################################################################################
# Functions
def create_semantic_scholar_tab():
    """Create the Semantic Scholar tab for the Gradio UI.

    Builds the help text, the search/filter widgets, the pagination controls
    and the results pane, then wires the Search / Previous / Next buttons to
    ``search_and_display``. Must be called inside an active ``gr.Blocks``
    (or tab container) context; returns nothing.
    """
    with gr.Tab("Semantic Scholar Search"):
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("""
                ## Semantic Scholar Paper Search

                This interface allows you to search for academic papers using the Semantic Scholar API with advanced filtering options:

                ### Search Options
                - **Keywords**: Search across titles, abstracts, and other paper content
                - **Year Range**: Filter papers by publication year (e.g., "2020-2023" or "2020")
                - **Venue**: Filter by publication venue (journal or conference)
                - **Minimum Citations**: Filter papers by minimum citation count
                - **Fields of Study**: Filter papers by academic field
                - **Publication Types**: Filter by type of publication
                - **Open Access**: Option to show only papers with free PDF access

                ### Results Include
                - Paper title
                - Author list
                - Publication year and venue
                - Citation count
                - Publication types
                - Abstract
                - Links to PDF (when available) and Semantic Scholar page
                """)
            with gr.Column(scale=2):
                gr.Markdown("""
                ### Pagination
                - 10 results per page
                - Navigate through results using Previous/Next buttons
                - Current page number and total results displayed

                ### Usage Tips
                - Combine multiple filters for more specific results
                - Use specific terms for more focused results
                - Try different combinations of filters if you don't find what you're looking for
                """)
        with gr.Row():
            with gr.Column(scale=2):
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter keywords to search for papers...",
                    lines=1
                )

                # Advanced search options
                with gr.Row():
                    year_range = gr.Textbox(
                        label="Year Range",
                        placeholder="e.g., 2020-2023 or 2020",
                        lines=1
                    )
                    venue = gr.Textbox(
                        label="Venue",
                        placeholder="e.g., Nature, Science",
                        lines=1
                    )
                    min_citations = gr.Number(
                        label="Minimum Citations",
                        value=0,
                        minimum=0,
                        step=1
                    )

                with gr.Row():
                    fields_of_study = gr.Dropdown(
                        choices=FIELDS_OF_STUDY,
                        label="Fields of Study",
                        multiselect=True,
                        value=[]
                    )
                    publication_types = gr.Dropdown(
                        choices=PUBLICATION_TYPES,
                        label="Publication Types",
                        multiselect=True,
                        value=[]
                    )

                open_access_only = gr.Checkbox(
                    label="Open Access Only",
                    value=False
                )

            with gr.Column(scale=1):
                search_button = gr.Button("Search", variant="primary")

                # Pagination controls
                with gr.Row():
                    prev_button = gr.Button("← Previous")
                    current_page = gr.Number(value=0, label="Page", minimum=0, step=1)
                    # Hidden state: index of the last page, filled in by each search.
                    max_page = gr.Number(value=0, label="Max Page", visible=False)
                    next_button = gr.Button("Next →")

                total_results = gr.Textbox(
                    label="Total Results",
                    value="0",
                    interactive=False
                )

        output_text = gr.Markdown(
            label="Results",
            value="Use the search options above to find papers."
        )

        def update_page(direction, current, maximum):
            """Return the page reached by moving ``direction`` steps, clamped to [0, maximum]."""
            # The Number widgets may hand back None (cleared box) or a float;
            # normalise both so the arithmetic cannot raise and the page stays integral.
            current = int(current or 0)
            maximum = int(maximum or 0)
            return min(max(current + direction, 0), maximum)

        # Handle search and pagination
        def search_from_button(query, fields_of_study, publication_types, year_range, venue, min_citations,
                               open_access_only):
            """Wrapper to always search from page 0 when search button is clicked"""
            return search_and_display(
                query=query,
                page=0,  # Force page 0 for new searches
                fields_of_study=fields_of_study,
                publication_types=publication_types,
                year_range=year_range,
                venue=venue,
                min_citations=min_citations,
                open_access_only=open_access_only
            )

        # Component lists shared by all three handlers; defined once so the
        # argument order cannot drift between Search, Previous and Next.
        filter_inputs = [
            fields_of_study, publication_types,
            year_range, venue, min_citations, open_access_only
        ]
        # Matches search_and_display's positional order: query, page, then filters.
        paged_inputs = [search_input, current_page] + filter_inputs
        search_outputs = [output_text, current_page, max_page, total_results]

        search_button.click(
            fn=search_from_button,
            inputs=[search_input] + filter_inputs,
            outputs=search_outputs
        )

        # Previous/Next first move the page counter, then re-run the search
        # for the page that is now shown in ``current_page``.
        prev_button.click(
            fn=lambda curr, max_p: update_page(-1, curr, max_p),
            inputs=[current_page, max_page],
            outputs=current_page
        ).then(
            fn=search_and_display,
            inputs=paged_inputs,
            outputs=search_outputs
        )

        next_button.click(
            fn=lambda curr, max_p: update_page(1, curr, max_p),
            inputs=[current_page, max_page],
            outputs=current_page
        ).then(
            fn=search_and_display,
            inputs=paged_inputs,
            outputs=search_outputs
        )

#
# End of Semantic_Scholar_tab.py
######################################################################################################################
# Next file in the patch: App_Function_Libraries/Third_Party/Semantic_Scholar.py (new file mode 100644)
# Semantic_Scholar.py
# Description: This file contains the functions to interact with the Semantic Scholar API
#
# Imports
from typing import List, Dict, Any
#
####################################################################################################
#
# Functions

# Constants
FIELDS_OF_STUDY = [
    "Computer Science", "Medicine", "Chemistry", "Biology", "Materials Science",
    "Physics", "Geology", "Psychology", "Art", "History", "Geography",
    "Sociology", "Business", "Political Science", "Economics", "Philosophy",
    "Mathematics", "Engineering", "Environmental Science",
    "Agricultural and Food Sciences", "Education", "Law", "Linguistics"
]

PUBLICATION_TYPES = [
    "Review", "JournalArticle", "CaseReport", "ClinicalTrial", "Conference",
    "Dataset", "Editorial", "LettersAndComments", "MetaAnalysis", "News",
    "Study", "Book", "BookSection"
]

# Page size shared by the request (``limit``) and the page-count arithmetic,
# so the two can never disagree.
RESULTS_PER_PAGE = 10

# Seconds to wait on the API; without a timeout a stalled connection would
# block the calling UI handler indefinitely.
REQUEST_TIMEOUT = 30

API_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper"


def _get_json(url: str, params: Dict[str, Any]) -> Dict[str, Any]:
    """GET ``url`` and return the decoded JSON body.

    Network errors, HTTP error statuses and undecodable bodies are not raised;
    they are reported as ``{"error": "API Error: ..."}``.
    """
    # Imported here rather than at module level so the pure helpers in this
    # file (formatting, empty-query short-circuit) stay importable without
    # the HTTP dependency.
    import requests

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        return {"error": f"API Error: {str(e)}"}


def _normalize_year_range(year_range: str) -> str:
    """Return ``year_range`` as ``"YYYY"`` / ``"YYYY-YYYY"`` with blanks stripped.

    Returns ``""`` (meaning "no year filter") for empty input or input with
    more than one dash, which was silently ignored before as well.
    """
    parts = [part.strip() for part in (year_range or "").split("-")]
    if len(parts) > 2 or not any(parts):
        return ""
    return "-".join(parts)


def search_papers(
        query: str,
        page: int,
        fields_of_study: List[str],
        publication_types: List[str],
        year_range: str,
        venue: str,
        min_citations: int,
        open_access_only: bool,
        limit: int = RESULTS_PER_PAGE
) -> Dict[str, Any]:
    """Search for papers using the Semantic Scholar API with all available filters.

    Args:
        query: Free-text search terms. Empty/blank/None short-circuits to an
            empty result without contacting the API.
        page: Zero-based page index; converted to ``offset = page * limit``.
        fields_of_study: Field names to filter on (joined with commas).
        publication_types: Publication types to filter on (joined with commas).
        year_range: ``"2020"`` or ``"2020-2023"``; malformed input is ignored.
        venue: Venue filter, passed through unchanged.
        min_citations: Minimum citation count; 0/None disables the filter.
        open_access_only: When true, adds the ``openAccessPdf`` filter.
        limit: Results per page.

    Returns:
        The decoded API response, or on failure a dict with an ``"error"``
        message plus empty ``total`` / ``offset`` / ``data`` fields.
    """
    if not query or not query.strip():
        return {"total": 0, "offset": 0, "next": 0, "data": []}

    # UI number widgets can deliver floats or None; the API expects integers.
    page = max(int(page or 0), 0)

    params = {
        "query": query,
        "offset": page * limit,
        "limit": limit,
        "fields": "title,abstract,year,citationCount,authors,venue,openAccessPdf,url,publicationTypes,publicationDate"
    }

    # Add optional filters
    if fields_of_study:
        params["fieldsOfStudy"] = ",".join(fields_of_study)
    if publication_types:
        params["publicationTypes"] = ",".join(publication_types)
    if venue:
        params["venue"] = venue
    if min_citations:
        # int() first so a widget value of 5.0 is sent as "5", not "5.0".
        params["minCitationCount"] = str(int(min_citations))
    if open_access_only:
        # Sent as a bare flag with an empty value, as in the original code.
        params["openAccessPdf"] = ""
    year = _normalize_year_range(year_range)
    if year:
        params["year"] = year

    result = _get_json(f"{API_BASE_URL}/search", params)
    if "error" in result:
        # Keep the shape callers rely on even when the request failed.
        return {"error": result["error"], "total": 0, "offset": 0, "data": []}
    return result


def get_paper_details(paper_id):
    """Get detailed information about a specific paper.

    Returns the decoded API response, or ``{"error": "API Error: ..."}`` if
    the request fails.
    """
    params = {
        "fields": "title,abstract,year,citationCount,authors,venue,openAccessPdf,url,references,citations"
    }
    return _get_json(f"{API_BASE_URL}/{paper_id}", params)


def format_paper_info(paper: Dict[str, Any]) -> str:
    """Format one paper record from the API as a Markdown snippet.

    Every field is optional: a key that is missing *or* present with a
    null/empty value falls back to a placeholder instead of printing
    ``None`` or raising.
    """
    # ``or`` rather than a ``.get`` default: a default only applies to a
    # missing key, but the fallback is also wanted for an explicit null.
    authors = ", ".join(
        author.get("name") or "Unknown" for author in paper.get("authors") or []
    )
    year = f"Year: {paper.get('year') or 'N/A'}"
    venue = f"Venue: {paper.get('venue') or 'N/A'}"
    citations = f"Citations: {paper.get('citationCount') or 0}"
    pub_types = f"Types: {', '.join(paper.get('publicationTypes') or ['N/A'])}"
    title = paper.get("title") or "No Title"
    abstract = paper.get("abstract") or "No abstract available"

    pdf_link = ""
    pdf_url = (paper.get("openAccessPdf") or {}).get("url")
    if pdf_url:
        pdf_link = f"\nPDF: {pdf_url}"

    s2_link = f"\nSemantic Scholar: {paper.get('url') or ''}"

    formatted = f"""# {title}

Authors: {authors}
{year} | {venue} | {citations}
{pub_types}

Abstract:
{abstract}

Links:{pdf_link}{s2_link}
"""
    return formatted


def search_and_display(
        query: str,
        page: int,
        fields_of_study: List[str],
        publication_types: List[str],
        year_range: str,
        venue: str,
        min_citations: int,
        open_access_only: bool
) -> tuple[str, int, int, str]:
    """Search for papers and return formatted results with pagination info.

    Returns:
        ``(markdown, page, max_page, total)`` where ``page`` and ``max_page``
        are zero-based page indices and ``total`` is the result count as a
        string. On an API error or an empty result the message is returned
        with ``page``/``max_page`` reset to 0 and ``total`` set to ``"0"``.
    """
    page = max(int(page or 0), 0)
    result = search_papers(
        query, page, fields_of_study, publication_types,
        year_range, venue, min_citations, open_access_only,
        limit=RESULTS_PER_PAGE
    )

    if "error" in result:
        return result["error"], 0, 0, "0"

    papers = result.get("data") or []
    if not papers:
        return "No results found.", 0, 0, "0"

    total_results = int(result.get("total") or 0)
    offset = int(result.get("offset") or 0)
    max_pages = -(-total_results // RESULTS_PER_PAGE)  # Ceiling division

    formatted_results = "\n\n---\n\n".join(format_paper_info(paper) for paper in papers)

    # Add pagination information
    pagination_info = f"\n\n---\n\nShowing results {offset + 1}-{offset + len(papers)} of {total_results}"

    # max(..., 0) keeps the last-page index non-negative if ``total`` is absent.
    return formatted_results + pagination_info, page, max(max_pages - 1, 0), str(total_results)

#
# End of Semantic_Scholar.py
####################################################################################################