
Commit

chore(spidertools): Added passing in custom params and set limit for crawling
gbertb committed Dec 27, 2024
1 parent 409f909 commit 24da467
Showing 1 changed file with 13 additions and 8 deletions.
21 changes: 13 additions & 8 deletions phi/tools/spider.py
@@ -16,10 +16,12 @@ def __init__(
         self,
         max_results: Optional[int] = None,
         url: Optional[str] = None,
+        optional_params: Optional[dict] = None,
     ):
         super().__init__(name="spider")
         self.max_results = max_results
         self.url = url
+        self.optional_params = optional_params or {}
         self.register(self.search)
         self.register(self.scrape)
         self.register(self.crawl)
@@ -44,20 +46,21 @@ def scrape(self, url: str) -> str:
         """
         return self._scrape(url)

-    def crawl(self, url: str) -> str:
-        """Use this function to crawl a webpage.
+    def crawl(self, url: str, limit: Optional[int] = None) -> str:
+        """Use this function to crawl the web.
         Args:
             url (str): The URL of the webpage to crawl.
+            limit (int, optional): The maximum number of pages to crawl. Defaults to 10.
         Returns:
-            Markdown of all the pages on the URL.
+            The results of the crawl.
         """
-        return self._crawl(url)
+        return self._crawl(url, limit=limit)

     def _search(self, query: str, max_results: int = 1) -> str:
         app = ExternalSpider()
         logger.info(f"Fetching results from spider for query: {query} with max_results: {max_results}")
         try:
-            options = {"fetch_page_content": False, "num": max_results}
+            options = {"fetch_page_content": False, "num": max_results, **self.optional_params}
             results = app.search(query, options)
             return json.dumps(results)
         except Exception as e:
@@ -68,18 +71,20 @@ def _scrape(self, url: str) -> str:
         app = ExternalSpider()
         logger.info(f"Fetching content from spider for url: {url}")
         try:
-            options = {"return_format": "markdown"}
+            options = {"return_format": "markdown", **self.optional_params}
             results = app.scrape_url(url, options)
             return json.dumps(results)
         except Exception as e:
             logger.error(f"Error fetching content from spider: {e}")
             return f"Error fetching content from spider: {e}"

-    def _crawl(self, url: str) -> str:
+    def _crawl(self, url: str, limit: Optional[int] = None) -> str:
         app = ExternalSpider()
         logger.info(f"Fetching content from spider for url: {url}")
         try:
-            options = {"return_format": "markdown"}
+            if limit is None:
+                limit = 10
+            options = {"return_format": "markdown", "limit": limit, **self.optional_params}
             results = app.crawl_url(url, options)
             return json.dumps(results)
         except Exception as e:

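For reference, here is a minimal usage sketch of the two new parameters, assuming the toolkit in phi/tools/spider.py is exported as SpiderTools (as the commit scope suggests) and that the Spider API client and credentials are available at runtime. The example URL and the "metadata" option are illustrative, not taken from this diff:

from phi.tools.spider import SpiderTools

# Extra Spider options passed via optional_params are merged into every
# search/scrape/crawl request through **self.optional_params.
# "metadata" is an illustrative option name, not part of this commit.
spider = SpiderTools(optional_params={"metadata": True})

# Crawl at most 5 pages; omitting limit falls back to the new default of 10.
print(spider.crawl("https://example.com", limit=5))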