
Commit

chore(spidertools): Added passing in custom params and set limit for crawling
gbertb committed Dec 27, 2024
1 parent 409f909 commit 24da467
Showing 1 changed file with 13 additions and 8 deletions.
21 changes: 13 additions & 8 deletions phi/tools/spider.py
@@ -16,10 +16,12 @@ def __init__(
         self,
         max_results: Optional[int] = None,
         url: Optional[str] = None,
+        optional_params: Optional[dict] = None,
     ):
         super().__init__(name="spider")
         self.max_results = max_results
         self.url = url
+        self.optional_params = optional_params or {}
         self.register(self.search)
         self.register(self.scrape)
         self.register(self.crawl)
@@ -44,20 +46,21 @@ def scrape(self, url: str) -> str:
         """
         return self._scrape(url)

-    def crawl(self, url: str) -> str:
-        """Use this function to crawl a webpage.
+    def crawl(self, url: str, limit: Optional[int] = None) -> str:
+        """Use this function to crawl the web.
         Args:
             url (str): The URL of the webpage to crawl.
+            limit (int, optional): The maximum number of pages to crawl. Defaults to 10.
         Returns:
-            Markdown of all the pages on the URL.
+            The results of the crawl.
         """
-        return self._crawl(url)
+        return self._crawl(url, limit=limit)

     def _search(self, query: str, max_results: int = 1) -> str:
         app = ExternalSpider()
         logger.info(f"Fetching results from spider for query: {query} with max_results: {max_results}")
         try:
-            options = {"fetch_page_content": False, "num": max_results}
+            options = {"fetch_page_content": False, "num": max_results, **self.optional_params}
             results = app.search(query, options)
             return json.dumps(results)
         except Exception as e:
@@ -68,18 +71,20 @@ def _scrape(self, url: str) -> str:
         app = ExternalSpider()
         logger.info(f"Fetching content from spider for url: {url}")
         try:
-            options = {"return_format": "markdown"}
+            options = {"return_format": "markdown", **self.optional_params}
             results = app.scrape_url(url, options)
             return json.dumps(results)
         except Exception as e:
             logger.error(f"Error fetching content from spider: {e}")
             return f"Error fetching content from spider: {e}"

-    def _crawl(self, url: str) -> str:
+    def _crawl(self, url: str, limit: Optional[int] = None) -> str:
         app = ExternalSpider()
         logger.info(f"Fetching content from spider for url: {url}")
         try:
-            options = {"return_format": "markdown"}
+            if limit is None:
+                limit = 10
+            options = {"return_format": "markdown", "limit": limit, **self.optional_params}
             results = app.crawl_url(url, options)
             return json.dumps(results)
         except Exception as e:

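For reference, here is a minimal usage sketch of the two new parameters, assuming the toolkit in phi/tools/spider.py is exported as SpiderTools (as the commit scope suggests) and that the Spider API client and credentials are available at runtime. The example URL and the "metadata" option are illustrative, not taken from this diff:

from phi.tools.spider import SpiderTools

# Extra Spider options passed via optional_params are merged into every
# search/scrape/crawl request through **self.optional_params.
# "metadata" is an illustrative option name, not part of this commit.
spider = SpiderTools(optional_params={"metadata": True})

# Crawl at most 5 pages; omitting limit falls back to the new default of 10.
print(spider.crawl("https://example.com", limit=5))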