Merge pull request #142 from bioimage-io/fix-web-search
Fix and enhance the web search
Showing 4 changed files with 59 additions and 81 deletions.
bioimageio_chatbot/chatbot_extensions/web_search_extension/__init__.py (65 changes: 56 additions & 9 deletions)
@@ -1,23 +1,70 @@
 from schema_agents import schema_tool
 from bioimageio_chatbot.utils import ChatbotExtension
-from bioimageio_chatbot.chatbot_extensions.web_search_extension.llm_web_search import search_duckduckgo
-from bioimageio_chatbot.chatbot_extensions.web_search_extension.langchain_websearch import LangchainCompressor
+from pydantic import Field
+from typing import Optional
 
-langchain_compressor = None
+import httpx
+from bs4 import BeautifulSoup
+
+from .langchain_websearch import LangchainCompressor
+
+default_langchain_compressor = None
 
 @schema_tool
-async def search_web(query: str):
+async def search_web(query: str=Field(description="space separated keywords for the duckduckgo search engine"), max_results: int = Field(description="maximum number of results to return")):
     """Search the web for information using duckduckgo."""
-    global langchain_compressor
-    langchain_compressor = langchain_compressor or LangchainCompressor(device="cpu")
-    content = await search_duckduckgo(query, langchain_compressor, max_results=5, similarity_threshold=0.5, instant_answers=True, chunk_size=500, num_results_to_process=5)
-    return content
+    from duckduckgo_search import AsyncDDGS
+    query = query.strip("\"'")
+    results = await AsyncDDGS(proxy=None).atext(query, region='wt-wt', safesearch='moderate', timelimit=None,
+                                                max_results=max_results)
+    if not results:
+        return "No relevant information found."
+    docs = []
+    for d in results:
+        docs.append({"title": d['title'], "body": d['body'], "url": d['href']})
+    return docs
+
+
+@schema_tool
+async def browse_web_pages(query: str=Field(description="keywords or a sentence describing the information to be retrieved"), urls: list[str]=Field(description="list of web page urls to analyse"), num_results_to_process: Optional[int]=Field(5, description="number of results to process")):
+    """Read web pages and return compressed documents with most relevant information."""
+    global default_langchain_compressor
+    default_langchain_compressor = default_langchain_compressor or LangchainCompressor(device="cpu")
+
+    documents = await default_langchain_compressor.faiss_embedding_query_urls(query, urls,
+                                                                              num_results=num_results_to_process)
+
+    if not documents:  # Fall back to old simple search rather than returning nothing
+        print("LLM_Web_search | Could not find any page content "
+              "similar enough to be extracted, using basic search fallback...")
+        return "No relevant information found."
+    return documents
+
+
+@schema_tool
+async def read_webpage(url: str=Field(description="the web url to read")) -> str:
+    """Read the full content of a web page converted to plain text."""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5"
+    }
+
+    async with httpx.AsyncClient() as client:
+        response = await client.get(url, headers=headers)
+
+    soup = BeautifulSoup(response.content, features="lxml")
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    strings = soup.stripped_strings
+    return '\n'.join([s.strip() for s in strings])
 
 
 def get_extension():
     return ChatbotExtension(
         id="web",
         name="Search Web",
         description="Search the web for information using duckduckgo. Search by keywords and returns a list of relevant documents.",
-        tools=dict(search=search_web),
+        tools=dict(search=search_web, browse=browse_web_pages, read=read_webpage)
     )
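For reference, a minimal sketch of exercising the updated tools outside the chatbot. It assumes the package and its dependencies are installed and that the @schema_tool decorator leaves the underlying coroutines directly awaitable; the query and URL values are purely illustrative, not part of the PR.

import asyncio

from bioimageio_chatbot.chatbot_extensions.web_search_extension import (
    get_extension,
    read_webpage,
    search_web,
)

async def main():
    # search_web now calls duckduckgo_search's AsyncDDGS.atext directly
    # and returns a list of {"title", "body", "url"} dicts.
    docs = await search_web(query="bioimage model zoo", max_results=3)
    print(docs)

    # read_webpage fetches one page with httpx, drops <script>/<style>
    # tags via BeautifulSoup, and returns the remaining plain text.
    text = await read_webpage(url="https://bioimage.io")
    print(text[:300])

    # The extension now registers three tools instead of one.
    print(list(get_extension().tools))  # ['search', 'browse', 'read']

asyncio.run(main())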
bioimageio_chatbot/chatbot_extensions/web_search_extension/llm_web_search.py (69 changes: 0 additions & 69 deletions)
This file was deleted.
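The deleted module had bundled DuckDuckGo search and embedding-based filtering into the single search_duckduckgo call. After this commit, that pipeline is recoverable as a two-step flow: search for candidate pages, then browse them with the query. A hedged sketch of the recombined flow, under the same direct-call assumption as above; the question string is illustrative, and the string check mirrors the tools' "No relevant information found." fallback:

import asyncio

from bioimageio_chatbot.chatbot_extensions.web_search_extension import (
    browse_web_pages,
    search_web,
)

async def search_and_compress(question: str):
    # Step 1: plain keyword search; returns dicts with title/body/url.
    docs = await search_web(query=question, max_results=5)
    if isinstance(docs, str):  # the tools return a string when nothing is found
        return docs
    # Step 2: embed the pages behind the hits and keep only the chunks most
    # similar to the question (FAISS-based, per the
    # LangchainCompressor.faiss_embedding_query_urls call in the diff above).
    urls = [d["url"] for d in docs]
    return await browse_web_pages(query=question, urls=urls,
                                  num_results_to_process=5)

print(asyncio.run(search_and_compress("pretrained cell segmentation models")))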