Skip to content

Commit

Permalink
Merge pull request #142 from bioimage-io/fix-web-search
Browse files Browse the repository at this point in the history
Fix and enhance the web search
  • Loading branch information
oeway authored Jun 7, 2024
2 parents 4a0ce24 + 25f3db1 commit 2bdf082
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 81 deletions.
Original file line number Diff line number Diff line change
@@ -1,23 +1,70 @@

from schema_agents import schema_tool
from bioimageio_chatbot.utils import ChatbotExtension
from bioimageio_chatbot.chatbot_extensions.web_search_extension.llm_web_search import search_duckduckgo
from bioimageio_chatbot.chatbot_extensions.web_search_extension.langchain_websearch import LangchainCompressor
from pydantic import Field
from typing import Optional

langchain_compressor = None
import httpx
from bs4 import BeautifulSoup

from .langchain_websearch import LangchainCompressor

default_langchain_compressor = None

@schema_tool
async def search_web(query: str):
async def search_web(query: str=Field(description="space separated keywords for the duckduckgo search engine"), max_results: int = Field(description="maximum number of results to return")):
"""Search the web for information using duckduckgo."""
global langchain_compressor
langchain_compressor = langchain_compressor or LangchainCompressor(device="cpu")
content = await search_duckduckgo(query, langchain_compressor, max_results=5, similarity_threshold=0.5, instant_answers=True, chunk_size=500, num_results_to_process=5)
return content
from duckduckgo_search import AsyncDDGS
query = query.strip("\"'")
results = await AsyncDDGS(proxy=None).atext(query, region='wt-wt', safesearch='moderate', timelimit=None,
max_results=max_results)
if not results:
return "No relevant information found."
docs = []
for d in results:
docs.append({"title": d['title'], "body": d['body'], "url": d['href']})
return docs

@schema_tool
async def browse_web_pages(query: str=Field(description="keywords or a sentence describing the information to be retrieved"), urls: list[str]=Field(description="list of web page urls to analyse"), num_results_to_process: Optional[int]=Field(5, description="number of results to process")):
    """Read web pages and return compressed documents with most relevant information."""
    # NOTE(review): the docstring is kept as a single line because
    # @schema_tool presumably exposes it as the tool description -- confirm
    # before expanding it.
    global default_langchain_compressor

    no_result_message = "No relevant information found."

    # Nothing to read: answer immediately instead of building the
    # compressor just to query an empty URL list.
    if not urls:
        return no_result_message

    # The parameter is Optional; fall back to the declared default of 5
    # rather than forwarding None to the compressor.
    if num_results_to_process is None:
        num_results_to_process = 5

    # Build the compressor lazily on first use and cache it in the module
    # global so later calls reuse the same instance.
    if default_langchain_compressor is None:
        default_langchain_compressor = LangchainCompressor(device="cpu")

    documents = await default_langchain_compressor.faiss_embedding_query_urls(
        query,
        urls,
        num_results=num_results_to_process,
    )

    if not documents:
        # No fallback search is performed here; the caller simply gets the
        # "no result" message, so the log line must not claim otherwise.
        print("LLM_Web_search | Could not find any page content "
              "similar enough to be extracted.")
        return no_result_message
    return documents


@schema_tool
async def read_webpage(url: str=Field(description="the web url to read")) -> str:
    """Read the full content of a web page converted to plain text."""
    # Browser-like request headers sent with the GET request.
    # NOTE(review): presumably chosen so sites serve the regular HTML page
    # instead of rejecting an unknown client -- confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5"
    }

    # httpx does not follow redirects unless asked to; without this a URL
    # that redirects (http -> https, missing "www", moved pages) would yield
    # the redirect stub instead of the actual page content.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        response = await client.get(url, headers=headers)

    soup = BeautifulSoup(response.content, features="lxml")

    # Drop script and style elements so their source does not end up in the
    # extracted text.
    for tag in soup(["script", "style"]):
        tag.extract()

    # stripped_strings already yields whitespace-stripped, non-empty text
    # fragments, so they can be joined directly.
    return '\n'.join(soup.stripped_strings)


def get_extension():
return ChatbotExtension(
id="web",
name="Search Web",
description="Search the web for information using duckduckgo. Search by keywords and returns a list of relevant documents.",
tools=dict(search=search_web),
tools=dict(search=search_web, browse=browse_web_pages, read=read_webpage)
)

This file was deleted.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]

[project]
name = "bioimageio-chatbot"
version = "0.2.5"
version = "0.2.6"
readme = "README.md"
description = "Your Personal Assistant in Computational BioImaging."
dependencies = [
Expand All @@ -19,7 +19,7 @@ dependencies = [
"langchain>=0.1.6",
"beautifulsoup4",
"pandas",
"duckduckgo-search>=5.1.0",
"duckduckgo-search>=6.1.5",
"rank-bm25",
"langchain-openai",
"langchain-core>=0.1.31",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ langchain-community==0.0.27
langchain-core==0.1.31
beautifulsoup4
pandas
duckduckgo-search==5.1.0
duckduckgo-search==6.1.5
langchain-openai==0.0.8
rank-bm25==0.2.2
html2text==2020.1.16
Expand Down

0 comments on commit 2bdf082

Please sign in to comment.