Merge pull request #142 from bioimage-io/fix-web-search

Fix and enhance the web search
bioimage-io · Jun 7, 2024 · 2bdf082
2 parents 4a0ce24 + 25f3db1
commit 2bdf082
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 81 deletions.
diff --git a/bioimageio_chatbot/chatbot_extensions/web_search_extension/__init__.py b/bioimageio_chatbot/chatbot_extensions/web_search_extension/__init__.py
@@ -1,23 +1,70 @@
 
 from schema_agents import schema_tool
 from bioimageio_chatbot.utils import ChatbotExtension
-from bioimageio_chatbot.chatbot_extensions.web_search_extension.llm_web_search import search_duckduckgo
-from bioimageio_chatbot.chatbot_extensions.web_search_extension.langchain_websearch import LangchainCompressor
+from pydantic import Field
+from typing import Optional
 
-langchain_compressor = None
+import httpx
+from bs4 import BeautifulSoup
+
+from .langchain_websearch import LangchainCompressor
+
+default_langchain_compressor = None
 
 @schema_tool
-async def search_web(query: str):
+async def search_web(query: str=Field(description="space separated keywords for the duckduckgo search engine"), max_results: int = Field(description="maximum number of results to return")):
     """Search the web for information using duckduckgo."""
-    global langchain_compressor
-    langchain_compressor = langchain_compressor or LangchainCompressor(device="cpu")
-    content = await search_duckduckgo(query, langchain_compressor, max_results=5, similarity_threshold=0.5, instant_answers=True, chunk_size=500, num_results_to_process=5)
-    return content
+    from duckduckgo_search import AsyncDDGS  # imported inside the function, i.e. only when the tool is called
+    query = query.strip("\"'")  # remove quote characters wrapped around the query
+    results = await AsyncDDGS(proxy=None).atext(query, region='wt-wt', safesearch='moderate', timelimit=None,
+                            max_results=max_results)  # the caller's limit is passed straight through
+    if not results:  # nothing found: answer with a plain message instead of a list
+        return "No relevant information found."
+    docs = []
+    for d in results:
+        docs.append({"title": d['title'], "body": d['body'], "url": d['href']})  # 'href' is exposed under the key "url"
+    return docs
+
+@schema_tool
+async def browse_web_pages(query: str=Field(description="keywords or a sentence describing the information to be retrieved"), urls: list[str]=Field(description="list of web page urls to analyse"), num_results_to_process: Optional[int]=Field(5, description="number of results to process")):
+    """Read web pages and return compressed documents with most relevant information, or a plain "no result" message when nothing matches."""
+    global default_langchain_compressor
+    default_langchain_compressor = default_langchain_compressor or LangchainCompressor(device="cpu")  # created on first call, then reused
+
+    documents = await default_langchain_compressor.faiss_embedding_query_urls(query, urls,
+                                                               num_results=num_results_to_process)
+
+    if not documents:    # Nothing was extracted; no fallback search is performed here
+        print("LLM_Web_search | Could not find any page content "
+              "similar enough to be extracted.")
+        return "No relevant information found."
+    return documents
+
+
+@schema_tool
+async def read_webpage(url: str=Field(description="the web url to read")) -> str:
+    """Read the full content of a web page converted to plain text, following HTTP redirects."""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5"
+    }
+
+    async with httpx.AsyncClient(follow_redirects=True) as client:  # httpx does not follow 3xx responses unless asked to
+        response = await client.get(url, headers=headers)
+
+    soup = BeautifulSoup(response.content, features="lxml")
+    for script in soup(["script", "style"]):  # remove script/style elements so their content is not returned as text
+        script.extract()
+
+    strings = soup.stripped_strings  # already whitespace-stripped, so no further strip() is needed
+    return '\n'.join(strings)
+
 
 def get_extension():
     return ChatbotExtension(
         id="web",
         name="Search Web",
         description="Search the web for information using duckduckgo. Search by keywords and returns a list of relevant documents.",
-        tools=dict(search=search_web),
+        tools=dict(search=search_web, browse=browse_web_pages, read=read_webpage)  # registers the three tools under the keys search / browse / read
     )
diff --git a/bioimageio_chatbot/chatbot_extensions/web_search_extension/llm_web_search.py b/bioimageio_chatbot/chatbot_extensions/web_search_extension/llm_web_search.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "bioimageio-chatbot"
-version = "0.2.5"
+version = "0.2.6"
 readme = "README.md"
 description = "Your Personal Assistant in Computational BioImaging."
 dependencies = [
@@ -19,7 +19,7 @@ dependencies = [
   "langchain>=0.1.6",
   "beautifulsoup4",
   "pandas",
-  "duckduckgo-search>=5.1.0",
+  "duckduckgo-search>=6.1.5",
   "rank-bm25",
   "langchain-openai",
   "langchain-core>=0.1.31",

diff --git a/requirements.txt b/requirements.txt
@@ -13,7 +13,7 @@ langchain-community==0.0.27
 langchain-core==0.1.31
 beautifulsoup4
 pandas
-duckduckgo-search==5.1.0
+duckduckgo-search==6.1.5
 langchain-openai==0.0.8
 rank-bm25==0.2.2
 html2text==2020.1.16