AI Research Agent
Utilizes arxiv, exa to write a detailed blog on a research topic that can help the user understand the topic with supported articles and papers.
Itz-Antaripa committed Dec 29, 2024
1 parent a4cf57f commit 0398626
Showing 4 changed files with 343 additions and 0 deletions.
31 changes: 31 additions & 0 deletions cookbook/examples/agents/research_agent_app/README.md
@@ -0,0 +1,31 @@
# AI Research Workflow

We've created a constrained AI Research Workflow that uses Agents to write a detailed blog on a research topic, drawing on LLM models and external tools:
- Exa
- ArXiv

### 1. Create a virtual environment

```shell
python3 -m venv ~/.venvs/aienv
source ~/.venvs/aienv/bin/activate
```

### 2. Install requirements

```shell
pip install -r cookbook/examples/agents/research_agent_app/requirements.txt
```

### 3. Export `OPENAI_API_KEY` and `EXA_API_KEY`

```shell
export OPENAI_API_KEY=sk-***
export EXA_API_KEY=***
```

### 4. Run Streamlit App

```shell
streamlit run cookbook/examples/agents/research_agent_app/app.py
```
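Since `ai_research_agent.py` calls `load_dotenv()`, the two keys can alternatively be placed in a `.env` file instead of being exported each session. A minimal sketch (the file location shown is an assumption; `load_dotenv()` searches from the current working directory upward):

```shell
# .env  (hypothetical location: repository root, or wherever you launch Streamlit)
OPENAI_API_KEY=sk-***
EXA_API_KEY=***
```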
139 changes: 139 additions & 0 deletions cookbook/examples/agents/research_agent_app/ai_research_agent.py
@@ -0,0 +1,139 @@
import os
from pathlib import Path
from typing import List
from pydantic import BaseModel, Field
from phi.agent import Agent
from dotenv import load_dotenv
from phi.model.openai import OpenAIChat
from phi.tools.arxiv_toolkit import ArxivToolkit
from phi.tools.exa import ExaTools

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# Define data models
class SearchTerms(BaseModel):
    terms: List[str] = Field(..., description="List of search terms related to a topic.")


class ArxivSearchResult(BaseModel):
    title: str = Field(..., description="Title of the article.")
    id: str = Field(..., description="The ID of the article.")
    authors: List[str] = Field(..., description="Authors of the article.")
    summary: str = Field(..., description="Summary of the article.")
    pdf_url: str = Field(..., description="URL of the PDF of the article.")
    links: List[str] = Field(..., description="Links related to the article.")
    reasoning: str = Field(..., description="Reason for selecting this article.")


class ArxivSearchResults(BaseModel):
    results: List[ArxivSearchResult] = Field(..., description="List of top search results.")


class WebSearchResult(BaseModel):
    title: str = Field(..., description="Title of the article.")
    summary: str = Field(..., description="Summary of the article.")
    links: List[str] = Field(..., description="Links related to the article.")
    reasoning: str = Field(..., description="Reason for selecting this article.")


class WebSearchResults(BaseModel):
    results: List[WebSearchResult] = Field(..., description="List of top search results.")


# Initialize tools
arxiv_toolkit = ArxivToolkit(download_dir=Path(__file__).parent.parent.parent.parent.joinpath("wip", "arxiv_pdfs"))
exa_tools = ExaTools()

# Initialize agents
search_term_generator = Agent(
    model=OpenAIChat(id="gpt-4o"),
    description="""
    You are an expert research strategist. Generate 2 specific and distinct search terms that capture different key aspects of the given topic.
    Focus on terms that:
    - Are specific enough to yield relevant results
    - Cover both technical and practical aspects of the topic
    - Are relevant to current developments
    - Are optimized for searching academic and web resources effectively
    Provide the search terms as a list of strings like ["xyz", "abc", ...]
    """,
    response_model=SearchTerms,
    structured_output=True,
)

arxiv_search_agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    description="""
    You are an expert in academic research with access to ArXiv's database.
    Your task is to:
    1. Search ArXiv for the top 10 papers related to the provided search term.
    2. Select the 3 most relevant research papers based on:
       - Direct relevance to the search term.
       - Scientific impact (e.g., citations, journal reputation).
       - Recency of publication.
    For each selected paper, the output should be a JSON structure with these details:
    - title
    - id
    - authors
    - a concise summary
    - the PDF link of the research paper
    - links related to the research paper
    - reasoning for why the paper was chosen
    Ensure the selected research papers directly address the topic and offer valuable insights.
    """,
    tools=[arxiv_toolkit],
    response_model=ArxivSearchResults,
    structured_output=True,
)

exa_search_agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    description="""
    You are a web search expert specializing in extracting high-quality information.
    Your task is to:
    1. Given a topic, search Exa for the top 10 articles about that topic.
    2. Select the 3 most relevant articles based on:
       - Source credibility.
       - Content depth and relevance.
    For each selected article, the output should have:
    - title
    - a concise summary
    - links related to the article
    - reasoning for why the article was chosen and how it contributes to understanding the topic.
    Ensure the selected articles are credible, relevant, and provide significant insights into the topic.
    """,
    tools=[exa_tools],
    response_model=WebSearchResults,
    structured_output=True,
)

research_editor = Agent(
    model=OpenAIChat(id="gpt-4o"),
    description="""
    You are a senior research editor specializing in breaking complex topics down into understandable, engaging, high-quality blogs.
    Your task is to:
    1. Create a detailed blog of at most 1000 words on the given topic.
    2. Keep the blog to a maximum of 7-8 paragraphs, written so the topic is intuitive and easy for the reader to understand.
    3. Highlight key findings and provide a clear, high-level overview of the topic.
    4. At the end, add links to the supporting articles, papers, or any other findings you think are necessary.
    The blog should give the reader a solid understanding of the topic.
    The blog should be in Markdown format.
    """,
    instructions=[
        "Analyze all materials before writing.",
        "Build a clear narrative structure.",
        "Balance technical accuracy with explainability.",
    ],
    markdown=True,
)
168 changes: 168 additions & 0 deletions cookbook/examples/agents/research_agent_app/app.py
@@ -0,0 +1,168 @@
import json
from typing import Optional
import streamlit as st
import pandas as pd
from cookbook.examples.agents.research_agent_app.ai_research_agent import (
    SearchTerms,
    search_term_generator,
    arxiv_search_agent,
    exa_search_agent,
    research_editor,
    arxiv_toolkit,
)

# Streamlit App Configuration
st.set_page_config(
    page_title="Research Workflow",
    page_icon=":orange_heart:",
)
st.title("AI Research Workflow")
st.markdown("##### :orange_heart: built by [phidata](https://github.com/phidatahq/phidata)")


def main() -> None:
    # Get topic for report
    input_topic = st.sidebar.text_input(
        ":female-scientist: Enter a topic",
        value="LLM evals in multi-agentic space",
    )
    # Button to generate report
    generate_report = st.sidebar.button("Generate Report")
    if generate_report:
        st.session_state["topic"] = input_topic

    # Checkboxes for search
    st.sidebar.markdown("## Assistants")
    search_exa = st.sidebar.checkbox("Exa Search", value=True)
    search_arxiv = st.sidebar.checkbox("ArXiv Search", value=False)
    search_pubmed = st.sidebar.checkbox("PubMed Search", disabled=True)  # noqa
    search_google_scholar = st.sidebar.checkbox("Google Scholar Search", disabled=True)  # noqa
    use_cache = st.sidebar.toggle("Use Cache", value=False, disabled=True)  # noqa
    num_search_terms = st.sidebar.number_input(
        "Number of Search Terms", value=1, min_value=1, max_value=3, help="This will increase latency."
    )

    st.sidebar.markdown("---")
    st.sidebar.markdown("## Trending Topics")
    topic = "Humanoid and Autonomous Agents"
    if st.sidebar.button(topic):
        st.session_state["topic"] = topic

    topic = "Gene Editing for Disease Treatment"
    if st.sidebar.button(topic):
        st.session_state["topic"] = topic

    topic = "Multimodal AI in healthcare"
    if st.sidebar.button(topic):
        st.session_state["topic"] = topic

    topic = "Brain Aging and Neurodegenerative Diseases"
    if st.sidebar.button(topic):
        st.session_state["topic"] = topic

    if "topic" in st.session_state:
        report_topic = st.session_state["topic"]

        search_terms: Optional[SearchTerms] = None
        with st.status("Generating Search Terms", expanded=True) as status:
            with st.container():
                search_terms_container = st.empty()
                search_generator_input = {"topic": report_topic, "num_terms": num_search_terms}
                search_terms = search_term_generator.run(json.dumps(search_generator_input))
                if search_terms:
                    search_terms_container.json(search_terms.model_dump())
            status.update(label="Search Terms Generated", state="complete", expanded=False)

        if not search_terms:
            st.write("Sorry, report generation failed. Please try again.")
            return

        exa_content: Optional[str] = None
        arxiv_content: Optional[str] = None

        if search_exa:
            with st.status("Searching Exa", expanded=True) as status:
                with st.container():
                    exa_container = st.empty()
                    try:
                        exa_search_results = exa_search_agent.run(search_terms.model_dump_json(indent=4))
                        if isinstance(exa_search_results, str):
                            raise ValueError("Unexpected string response from exa_search_agent")
                        if exa_search_results and len(exa_search_results.content.results) > 0:
                            exa_content = exa_search_results.model_dump_json(indent=4)
                            exa_container.json(exa_search_results.content.results)
                        status.update(label="Exa Search Complete", state="complete", expanded=False)
                    except Exception as e:
                        st.error(f"An error occurred during Exa search: {e}")
                        status.update(label="Exa Search Failed", state="error", expanded=True)
                        exa_content = None

        if search_arxiv:
            with st.status("Searching ArXiv (this takes a while)", expanded=True) as status:
                with st.container():
                    arxiv_container = st.empty()
                    arxiv_search_results = arxiv_search_agent.run(search_terms.model_dump_json(indent=4))
                    if arxiv_search_results and arxiv_search_results.content.results:
                        arxiv_container.json([result.model_dump() for result in arxiv_search_results.content.results])
                status.update(label="ArXiv Search Complete", state="complete", expanded=False)

            if arxiv_search_results and arxiv_search_results.content.results:
                paper_summaries = []
                for result in arxiv_search_results.content.results:
                    summary = {
                        "ID": result.id,
                        "Title": result.title,
                        "Authors": ", ".join(result.authors) if result.authors else "No authors available",
                        "Summary": result.summary[:200] + "..." if len(result.summary) > 200 else result.summary,
                    }
                    paper_summaries.append(summary)

                if paper_summaries:
                    with st.status("Displaying ArXiv Paper Summaries", expanded=True) as status:
                        with st.container():
                            st.subheader("ArXiv Paper Summaries")
                            df = pd.DataFrame(paper_summaries)
                            st.dataframe(df, use_container_width=True)
                        status.update(label="ArXiv Paper Summaries Displayed", state="complete", expanded=False)

                    arxiv_paper_ids = [summary["ID"] for summary in paper_summaries]
                    if arxiv_paper_ids:
                        with st.status("Reading ArXiv Papers", expanded=True) as status:
                            with st.container():
                                arxiv_content = arxiv_toolkit.read_arxiv_papers(arxiv_paper_ids, pages_to_read=2)
                                st.write(f"Read {len(arxiv_paper_ids)} ArXiv papers")
                            status.update(label="Reading ArXiv Papers Complete", state="complete", expanded=False)

        report_input = ""
        report_input += f"# Topic: {report_topic}\n\n"
        report_input += "## Search Terms\n\n"
        report_input += f"{search_terms}\n\n"
        if arxiv_content:
            report_input += "## ArXiv Papers\n\n"
            report_input += "<arxiv_papers>\n\n"
            report_input += f"{arxiv_content}\n\n"
            report_input += "</arxiv_papers>\n\n"
        if exa_content:
            report_input += "## Web Search Content from Exa\n\n"
            report_input += "<exa_content>\n\n"
            report_input += f"{exa_content}\n\n"
            report_input += "</exa_content>\n\n"

        # Only generate the report if we have content
        if arxiv_content or exa_content:
            with st.spinner("Generating Report"):
                final_report_container = st.empty()
                research_report = research_editor.run(report_input)
                final_report_container.markdown(research_report.content)
        else:
            st.error(
                "Report generation cancelled due to search failure. Please try again or select another search option."
            )

    st.sidebar.markdown("---")
    if st.sidebar.button("Restart"):
        st.rerun()


main()
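The prompt assembly in `main()` above builds one tagged string from the topic, search terms, and whichever search results are available. Factored into a pure function, that step can be unit-tested without Streamlit or any API keys. A minimal sketch (the helper name `build_report_input` and the standalone framing are ours, not part of this commit):

```python
from typing import Optional


def build_report_input(
    topic: str,
    search_terms: str,
    arxiv_content: Optional[str] = None,
    exa_content: Optional[str] = None,
) -> str:
    """Assemble the tagged prompt passed to the research editor agent."""
    parts = [f"# Topic: {topic}\n\n", "## Search Terms\n\n", f"{search_terms}\n\n"]
    if arxiv_content:
        # Wrap each source in XML-style tags so the editor model can locate it reliably.
        parts += ["## ArXiv Papers\n\n", "<arxiv_papers>\n\n", f"{arxiv_content}\n\n", "</arxiv_papers>\n\n"]
    if exa_content:
        parts += ["## Web Search Content from Exa\n\n", "<exa_content>\n\n", f"{exa_content}\n\n", "</exa_content>\n\n"]
    return "".join(parts)


print(build_report_input("LLM evals", '["evals"]', exa_content="{}").splitlines()[0])
# → # Topic: LLM evals
```

Keeping this logic out of the Streamlit callbacks also makes it reusable if the workflow later gains other search backends (the disabled PubMed and Google Scholar checkboxes suggest that direction).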
5 changes: 5 additions & 0 deletions cookbook/examples/agents/research_agent_app/requirements.txt
@@ -0,0 +1,5 @@
phidata
openai
streamlit
exa_py
arxiv
