diff --git a/CHANGELOG.md b/CHANGELOG.md
index 07589f0..14d999c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,11 @@
## 1.0.0 (2024-12-05)
+## 1.0.0-beta.1 (2024-12-05)
+
### Features
+* added markdownify and localscraper tools ([03e49dc](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/03e49dce84ef5a1b7a59b6dfd046eb563c14d283))
* tools integration ([dc7e9a8](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/dc7e9a8fbf4e88bb79e11a9253428b2f61fa1293))
diff --git a/README.md b/README.md
index 0e6e4ae..510ea82 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,140 @@
-# langchain-scrapegraph
\ No newline at end of file
+# 🕷️🦜 langchain-scrapegraph
+
+[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Python Support](https://img.shields.io/pypi/pyversions/langchain-scrapegraph.svg)](https://pypi.org/project/langchain-scrapegraph/)
+[![Documentation](https://img.shields.io/badge/Documentation-Latest-green)](https://scrapegraphai.com/docs)
+
+Supercharge your LangChain agents with AI-powered web scraping capabilities. LangChain-ScrapeGraph provides a seamless integration between [LangChain](https://github.com/langchain-ai/langchain) and [ScrapeGraph AI](https://scrapegraphai.com), enabling your agents to extract structured data from websites using natural language.
+
+## 📦 Installation
+
+```bash
+pip install langchain-scrapegraph
+```
+
+## 🛠️ Available Tools
+
+### 📝 MarkdownifyTool
+Convert any webpage into clean, formatted markdown.
+
+```python
+from langchain_scrapegraph.tools import MarkdownifyTool
+
+tool = MarkdownifyTool()
+markdown = tool.invoke({"website_url": "https://example.com"})
+
+print(markdown)
+```
+
+### 🔍 SmartScraperTool
+Extract structured data from any webpage using natural language prompts.
+
+```python
+from langchain_scrapegraph.tools import SmartScraperTool
+
+# Initialize the tool (uses SGAI_API_KEY from environment)
+tool = SmartScraperTool()
+
+# Extract information using natural language
+result = tool.invoke({
+ "website_url": "https://www.example.com",
+ "user_prompt": "Extract the main heading and first paragraph"
+})
+
+print(result)
+```
+
+### 💻 LocalScraperTool
+Extract information from HTML content using AI.
+
+```python
+from langchain_scrapegraph.tools import LocalScraperTool
+
+tool = LocalScraperTool()
+result = tool.invoke({
+ "user_prompt": "Extract all contact information",
+ "website_html": "..."
+})
+
+print(result)
+```
+
+## 🌟 Key Features
+
+- 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains
+- 🔍 **AI-Powered Extraction**: Use natural language to describe what data to extract
+- 📊 **Structured Output**: Get clean, structured data ready for your agents
+- 🔄 **Flexible Tools**: Choose from multiple specialized scraping tools
+- ⚡ **Async Support**: Built-in support for async operations
+
+## 💡 Use Cases
+
+- 📖 **Research Agents**: Create agents that gather and analyze web data
+- 📊 **Data Collection**: Automate structured data extraction from websites
+- 📝 **Content Processing**: Convert web content into markdown for further processing
+- 🔍 **Information Extraction**: Extract specific data points using natural language
+
+## 🤖 Example Agent
+
+```python
+from langchain.agents import initialize_agent, AgentType
+from langchain_scrapegraph.tools import SmartScraperTool
+from langchain_openai import ChatOpenAI
+
+# Initialize tools
+tools = [
+    SmartScraperTool(),
+]
+
+# Create an agent
+agent = initialize_agent(
+ tools=tools,
+ llm=ChatOpenAI(temperature=0),
+ agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+ verbose=True
+)
+
+# Use the agent
+response = agent.run("""
+ Visit example.com, make a summary of the content and extract the main heading and first paragraph
+""")
+```
+
+## ⚙️ Configuration
+
+Set your ScrapeGraph API key in your environment:
+```bash
+export SGAI_API_KEY="your-api-key-here"
+```
+
+Or set it programmatically:
+```python
+import os
+os.environ["SGAI_API_KEY"] = "your-api-key-here"
+```
+
+## 📚 Documentation
+
+- [API Documentation](https://scrapegraphai.com/docs)
+- [LangChain Documentation](https://python.langchain.com/docs/get_started/introduction.html)
+- [Examples](examples/)
+
+## 💬 Support & Feedback
+
+- 📧 Email: support@scrapegraphai.com
+- 💻 GitHub Issues: [Create an issue](https://github.com/ScrapeGraphAI/langchain-scrapegraph/issues)
+- 🌟 Feature Requests: [Request a feature](https://github.com/ScrapeGraphAI/langchain-scrapegraph/issues/new)
+
+## 📄 License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 🙏 Acknowledgments
+
+This project is built on top of:
+- [LangChain](https://github.com/langchain-ai/langchain)
+- [ScrapeGraph AI](https://scrapegraphai.com)
+
+---
+
+Made with ❤️ by [ScrapeGraph AI](https://scrapegraphai.com)
diff --git a/examples/agent_example.py b/examples/agent_example.py
new file mode 100644
index 0000000..9e61fba
--- /dev/null
+++ b/examples/agent_example.py
@@ -0,0 +1,57 @@
+"""
+Remember to install the additional dependencies for this example to work:
+pip install langchain-openai langchain python-dotenv
+"""
+
+from dotenv import load_dotenv
+from langchain.agents import AgentExecutor, create_openai_functions_agent
+from langchain_core.messages import SystemMessage
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_openai import ChatOpenAI
+
+from langchain_scrapegraph.tools import (
+ GetCreditsTool,
+ LocalScraperTool,
+ SmartScraperTool,
+)
+
+load_dotenv()
+
+# Initialize the tools
+tools = [
+ SmartScraperTool(),
+ LocalScraperTool(),
+ GetCreditsTool(),
+]
+
+# Create the prompt template
+prompt = ChatPromptTemplate.from_messages(
+ [
+ SystemMessage(
+ content=(
+ "You are a helpful AI assistant that can analyze websites and extract information. "
+ "You have access to tools that can help you scrape and process web content. "
+ "Always explain what you're doing before using a tool."
+ )
+ ),
+ MessagesPlaceholder(variable_name="chat_history", optional=True),
+ ("user", "{input}"),
+ MessagesPlaceholder(variable_name="agent_scratchpad"),
+ ]
+)
+
+# Initialize the LLM
+llm = ChatOpenAI(temperature=0)
+
+# Create the agent
+agent = create_openai_functions_agent(llm, tools, prompt)
+
+# Create the executor
+agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
+
+# Example usage
+query = """Extract the main products from https://www.scrapegraphai.com/"""
+
+print("\nQuery:", query, "\n")
+response = agent_executor.invoke({"input": query})
+print("\nFinal Response:", response["output"])
diff --git a/examples/get_credits_tool.py b/examples/get_credits_tool.py
index 50d6139..0645fe9 100644
--- a/examples/get_credits_tool.py
+++ b/examples/get_credits_tool.py
@@ -1,9 +1,13 @@
+from scrapegraph_py.logger import sgai_logger
+
from langchain_scrapegraph.tools import GetCreditsTool
-# Will automatically get SGAI_API_KEY from environment, or set it manually
+sgai_logger.set_logging(level="INFO")
+
+# Will automatically get SGAI_API_KEY from environment
tool = GetCreditsTool()
-credits = tool.run()
-print("\nCredits Information:")
-print(f"Remaining Credits: {credits['remaining_credits']}")
-print(f"Total Credits Used: {credits['total_credits_used']}")
+# Use the tool
+credits = tool.invoke({})
+
+print(credits)
diff --git a/examples/localscraper_tool.py b/examples/localscraper_tool.py
new file mode 100644
index 0000000..a8df8ee
--- /dev/null
+++ b/examples/localscraper_tool.py
@@ -0,0 +1,28 @@
+from scrapegraph_py.logger import sgai_logger
+
+from langchain_scrapegraph.tools import LocalScraperTool
+
+sgai_logger.set_logging(level="INFO")
+
+# Will automatically get SGAI_API_KEY from environment
+tool = LocalScraperTool()
+
+# Example HTML content and prompt
+html_content = """
+
+
+ Company Name
+ We are a technology company focused on AI solutions.
+
+
+
+"""
+user_prompt = "Make a summary of the webpage and extract the email and phone number"
+
+# Use the tool
+result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt})
+
+print(result)
diff --git a/examples/markdownify_tool.py b/examples/markdownify_tool.py
new file mode 100644
index 0000000..32dc76f
--- /dev/null
+++ b/examples/markdownify_tool.py
@@ -0,0 +1,16 @@
+from scrapegraph_py.logger import sgai_logger
+
+from langchain_scrapegraph.tools import MarkdownifyTool
+
+sgai_logger.set_logging(level="INFO")
+
+# Will automatically get SGAI_API_KEY from environment
+tool = MarkdownifyTool()
+
+# Example website URL
+website_url = "https://www.example.com"
+
+# Use the tool
+result = tool.invoke({"website_url": website_url})
+
+print(result)
diff --git a/examples/smartscraper_tool.py b/examples/smartscraper_tool.py
index f222c69..9f31ba1 100644
--- a/examples/smartscraper_tool.py
+++ b/examples/smartscraper_tool.py
@@ -1,15 +1,17 @@
-from langchain_scrapegraph.tools import SmartscraperTool
+from scrapegraph_py.logger import sgai_logger
-# Will automatically get SGAI_API_KEY from environment, or set it manually
-tool = SmartscraperTool()
+from langchain_scrapegraph.tools import SmartScraperTool
+
+sgai_logger.set_logging(level="INFO")
+
+# Will automatically get SGAI_API_KEY from environment
+tool = SmartScraperTool()
# Example website and prompt
website_url = "https://www.example.com"
user_prompt = "Extract the main heading and first paragraph from this webpage"
-# Use the tool synchronously
-result = tool.run({"user_prompt": user_prompt, "website_url": website_url})
+# Use the tool
+result = tool.invoke({"website_url": website_url, "user_prompt": user_prompt})
-print("\nExtraction Results:")
-print(f"Main Heading: {result['main_heading']}")
-print(f"First Paragraph: {result['first_paragraph']}")
+print(result)
diff --git a/langchain_scrapegraph/tools/__init__.py b/langchain_scrapegraph/tools/__init__.py
index 76c0b8e..a61f301 100644
--- a/langchain_scrapegraph/tools/__init__.py
+++ b/langchain_scrapegraph/tools/__init__.py
@@ -1,4 +1,6 @@
from .credits import GetCreditsTool
-from .smartscraper import SmartscraperTool
+from .localscraper import LocalScraperTool
+from .markdownify import MarkdownifyTool
+from .smartscraper import SmartScraperTool
-__all__ = ["SmartscraperTool", "GetCreditsTool"]
+__all__ = ["SmartScraperTool", "GetCreditsTool", "MarkdownifyTool", "LocalScraperTool"]
diff --git a/langchain_scrapegraph/tools/credits.py b/langchain_scrapegraph/tools/credits.py
index f57ad85..d4ea94e 100644
--- a/langchain_scrapegraph/tools/credits.py
+++ b/langchain_scrapegraph/tools/credits.py
@@ -7,25 +7,72 @@
from langchain_core.tools import BaseTool
from langchain_core.utils import get_from_dict_or_env
from pydantic import model_validator
-from scrapegraph_py import SyncClient
+from scrapegraph_py import Client
class GetCreditsTool(BaseTool):
+ """Tool for checking remaining credits on your ScrapeGraph AI account.
+
+ Setup:
+ Install ``langchain-scrapegraph`` python package:
+
+ .. code-block:: bash
+
+ pip install langchain-scrapegraph
+
+ Get your API key from ScrapeGraph AI (https://scrapegraphai.com)
+ and set it as an environment variable:
+
+ .. code-block:: bash
+
+ export SGAI_API_KEY="your-api-key"
+
+ Key init args:
+ api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
+ client: Optional pre-configured ScrapeGraph client instance.
+
+ Instantiate:
+ .. code-block:: python
+
+ from langchain_scrapegraph.tools import GetCreditsTool
+
+ # Will automatically get SGAI_API_KEY from environment
+ tool = GetCreditsTool()
+
+ # Or provide API key directly
+ tool = GetCreditsTool(api_key="your-api-key")
+
+ Use the tool:
+ .. code-block:: python
+
+ result = tool.invoke({})
+
+ print(result)
+ # {
+ # "remaining_credits": 100,
+ # "total_credits_used": 50
+ # }
+
+ Async usage:
+ .. code-block:: python
+
+ result = await tool.ainvoke({})
+ """
+
name: str = "GetCredits"
description: str = (
"Get the current credits available in your ScrapeGraph AI account"
)
return_direct: bool = True
- client: Optional[SyncClient] = None
+ client: Optional[Client] = None
api_key: str
- testing: bool = False
@model_validator(mode="before")
@classmethod
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key exists in environment."""
values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY")
- values["client"] = SyncClient(api_key=values["api_key"])
+ values["client"] = Client(api_key=values["api_key"])
return values
def __init__(self, **data: Any):
diff --git a/langchain_scrapegraph/tools/localscraper.py b/langchain_scrapegraph/tools/localscraper.py
new file mode 100644
index 0000000..0b7382c
--- /dev/null
+++ b/langchain_scrapegraph/tools/localscraper.py
@@ -0,0 +1,137 @@
+from typing import Any, Dict, Optional, Type
+
+from langchain_core.callbacks import (
+ AsyncCallbackManagerForToolRun,
+ CallbackManagerForToolRun,
+)
+from langchain_core.tools import BaseTool
+from langchain_core.utils import get_from_dict_or_env
+from pydantic import BaseModel, Field, model_validator
+from scrapegraph_py import Client
+
+
+class LocalscraperInput(BaseModel):
+ user_prompt: str = Field(
+ description="Prompt describing what to extract from the webpage and how to structure the output"
+ )
+ website_html: str = Field(description="HTML of the webpage to extract data from")
+
+
+class LocalScraperTool(BaseTool):
+ """Tool for extracting structured data from locally supplied HTML content using ScrapeGraph AI.
+
+ Setup:
+ Install ``langchain-scrapegraph`` python package:
+
+ .. code-block:: bash
+
+ pip install langchain-scrapegraph
+
+ Get your API key from ScrapeGraph AI (https://scrapegraphai.com)
+ and set it as an environment variable:
+
+ .. code-block:: bash
+
+ export SGAI_API_KEY="your-api-key"
+
+ Key init args:
+ api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
+ client: Optional pre-configured ScrapeGraph client instance.
+
+ Instantiate:
+ .. code-block:: python
+
+ from langchain_scrapegraph.tools import LocalScraperTool
+
+ # Will automatically get SGAI_API_KEY from environment
+ tool = LocalScraperTool()
+
+ # Or provide API key directly
+ tool = LocalScraperTool(api_key="your-api-key")
+
+ Use the tool:
+ .. code-block:: python
+
+ html_content = '''
+
+
+ Company Name
+ We are a technology company focused on AI solutions.
+
+
+
+ '''
+
+ result = tool.invoke({
+ "user_prompt": "Extract company description and contact info",
+ "website_html": html_content
+ })
+
+ print(result)
+ # {
+ # "description": "We are a technology company focused on AI solutions",
+ # "contact": {
+ # "email": "contact@example.com",
+ # "phone": "(555) 123-4567"
+ # }
+ # }
+
+ Async usage:
+ .. code-block:: python
+
+ result = await tool.ainvoke({
+ "user_prompt": "Extract contact information",
+ "website_html": html_content
+ })
+ """
+
+ name: str = "LocalScraper"
+ description: str = (
+ "Useful when you need to extract structured data from a HTML webpage, applying also some reasoning using LLM, by providing an HTML string and an extraction prompt"
+ )
+ args_schema: Type[BaseModel] = LocalscraperInput
+ return_direct: bool = True
+ client: Optional[Client] = None
+ api_key: str
+
+ @model_validator(mode="before")
+ @classmethod
+ def validate_environment(cls, values: Dict) -> Dict:
+ """Validate that api key exists in environment."""
+ values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY")
+ values["client"] = Client(api_key=values["api_key"])
+ return values
+
+ def __init__(self, **data: Any):
+ super().__init__(**data)
+
+ def _run(
+ self,
+ user_prompt: str,
+ website_html: str,
+ run_manager: Optional[CallbackManagerForToolRun] = None,
+ ) -> dict:
+ """Extract data from the given HTML string according to ``user_prompt``.
+
+ Returns the ``"result"`` entry of the client's ``localscraper`` response.
+ Raises ``ValueError`` when no client has been set on the tool.
+ """
+ if not self.client:
+ raise ValueError("Client not initialized")
+ response = self.client.localscraper(
+ website_html=website_html,
+ user_prompt=user_prompt,
+ )
+ return response["result"]
+
+ async def _arun(
+ self,
+ user_prompt: str,
+ website_html: str,
+ run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+ ) -> dict:
+ """Async entry point; returns whatever the synchronous ``_run`` returns.
+
+ ``_run`` is called directly (not awaited), so this coroutine does not
+ yield to the event loop while that call is in progress.
+ """
+ return self._run(
+ user_prompt,
+ website_html,
+ run_manager=run_manager.get_sync() if run_manager else None,
+ )
diff --git a/langchain_scrapegraph/tools/markdownify.py b/langchain_scrapegraph/tools/markdownify.py
new file mode 100644
index 0000000..4750f5b
--- /dev/null
+++ b/langchain_scrapegraph/tools/markdownify.py
@@ -0,0 +1,109 @@
+from typing import Any, Dict, Optional, Type
+
+from langchain_core.callbacks import (
+ AsyncCallbackManagerForToolRun,
+ CallbackManagerForToolRun,
+)
+from langchain_core.tools import BaseTool
+from langchain_core.utils import get_from_dict_or_env
+from pydantic import BaseModel, Field, model_validator
+from scrapegraph_py import Client
+
+
+class MarkdownifyInput(BaseModel):
+ website_url: str = Field(description="Url of the website to convert to Markdown")
+
+
+class MarkdownifyTool(BaseTool):
+ """Tool for converting webpages to Markdown format using ScrapeGraph AI.
+
+ Setup:
+ Install ``langchain-scrapegraph`` python package:
+
+ .. code-block:: bash
+
+ pip install langchain-scrapegraph
+
+ Get your API key from ScrapeGraph AI (https://scrapegraphai.com)
+ and set it as an environment variable:
+
+ .. code-block:: bash
+
+ export SGAI_API_KEY="your-api-key"
+
+ Key init args:
+ api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
+ client: Optional pre-configured ScrapeGraph client instance.
+
+ Instantiate:
+ .. code-block:: python
+
+ from langchain_scrapegraph.tools import MarkdownifyTool
+
+ # Will automatically get SGAI_API_KEY from environment
+ tool = MarkdownifyTool()
+
+ # Or provide API key directly
+ tool = MarkdownifyTool(api_key="your-api-key")
+
+ Use the tool:
+ .. code-block:: python
+
+ result = tool.invoke({
+ "website_url": "https://example.com"
+ })
+
+ print(result)
+ # # Example Domain
+ #
+ # This domain is for use in illustrative examples...
+
+ Async usage:
+ .. code-block:: python
+
+ result = await tool.ainvoke({
+ "website_url": "https://example.com"
+ })
+ """
+
+ name: str = "Markdownify"
+ description: str = (
+ "Useful when you need to convert a webpage to Markdown, given a URL as input"
+ )
+ args_schema: Type[BaseModel] = MarkdownifyInput
+ return_direct: bool = True
+ client: Optional[Client] = None
+ api_key: str
+
+ @model_validator(mode="before")
+ @classmethod
+ def validate_environment(cls, values: Dict) -> Dict:
+ """Validate that api key exists in environment."""
+ values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY")
+ values["client"] = Client(api_key=values["api_key"])
+ return values
+
+ def __init__(self, **data: Any):
+ super().__init__(**data)
+
+ def _run(
+ self,
+ website_url: str,
+ run_manager: Optional[CallbackManagerForToolRun] = None,
+ ) -> str:
+ """Convert the webpage at ``website_url`` to Markdown.
+
+ Returns the ``"result"`` entry of the client's ``markdownify`` response.
+ Raises ``ValueError`` when no client has been set on the tool.
+ """
+ if not self.client:
+ raise ValueError("Client not initialized")
+ response = self.client.markdownify(website_url=website_url)
+ return response["result"]
+
+ async def _arun(
+ self,
+ website_url: str,
+ run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+ ) -> str:
+ """Async entry point; returns whatever the synchronous ``_run`` returns.
+
+ ``_run`` is called directly (not awaited), so this coroutine does not
+ yield to the event loop while that call is in progress.
+ """
+ return self._run(
+ website_url,
+ run_manager=run_manager.get_sync() if run_manager else None,
+ )
diff --git a/langchain_scrapegraph/tools/smartscraper.py b/langchain_scrapegraph/tools/smartscraper.py
index a292a99..a48030e 100644
--- a/langchain_scrapegraph/tools/smartscraper.py
+++ b/langchain_scrapegraph/tools/smartscraper.py
@@ -7,33 +7,86 @@
from langchain_core.tools import BaseTool
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, Field, model_validator
-from scrapegraph_py import SyncClient
+from scrapegraph_py import Client
-class SmartscraperInput(BaseModel):
+class SmartScraperInput(BaseModel):
user_prompt: str = Field(
- description="Prompt describing what to extract from the website and how to structure the output"
+ description="Prompt describing what to extract from the webpage and how to structure the output"
)
- website_url: str = Field(description="Url of the website to extract data from")
+ website_url: str = Field(description="Url of the webpage to extract data from")
-class SmartscraperTool(BaseTool):
- name: str = "Smartscraper"
+class SmartScraperTool(BaseTool):
+ """Tool for extracting structured data from websites using ScrapeGraph AI.
+
+ Setup:
+ Install ``langchain-scrapegraph`` python package:
+
+ .. code-block:: bash
+
+ pip install langchain-scrapegraph
+
+ Get your API key from ScrapeGraph AI (https://scrapegraphai.com)
+ and set it as an environment variable:
+
+ .. code-block:: bash
+
+ export SGAI_API_KEY="your-api-key"
+
+ Key init args:
+ api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
+ client: Optional pre-configured ScrapeGraph client instance.
+
+ Instantiate:
+ .. code-block:: python
+
+ from langchain_scrapegraph.tools import SmartScraperTool
+
+ # Will automatically get SGAI_API_KEY from environment
+ tool = SmartScraperTool()
+
+ # Or provide API key directly
+ tool = SmartScraperTool(api_key="your-api-key")
+
+ Use the tool:
+ .. code-block:: python
+
+ result = tool.invoke({
+ "user_prompt": "Extract the main heading and first paragraph",
+ "website_url": "https://example.com"
+ })
+
+ print(result)
+ # {
+ # "main_heading": "Example Domain",
+ # "first_paragraph": "This domain is for use in illustrative examples..."
+ # }
+
+ Async usage:
+ .. code-block:: python
+
+ result = await tool.ainvoke({
+ "user_prompt": "Extract the main heading",
+ "website_url": "https://example.com"
+ })
+ """
+
+ name: str = "SmartScraper"
description: str = (
- "Useful for when you need to extract structured data from a website, applying also some preprocessing reasoning using LLM"
+ "Useful when you need to extract structured data from a webpage, applying also some reasoning using LLM, by providing a webpage URL and an extraction prompt"
)
- args_schema: Type[BaseModel] = SmartscraperInput
+ args_schema: Type[BaseModel] = SmartScraperInput
return_direct: bool = True
- client: Optional[SyncClient] = None
+ client: Optional[Client] = None
api_key: str
- testing: bool = False
@model_validator(mode="before")
@classmethod
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key exists in environment."""
values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY")
- values["client"] = SyncClient(api_key=values["api_key"])
+ values["client"] = Client(api_key=values["api_key"])
return values
def __init__(self, **data: Any):
diff --git a/pyproject.toml b/pyproject.toml
index e638a8d..46b49c3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-scrapegraph"
-version = "1.0.0"
+version = "1.0.0b1"
description = "Library for extracting structured data from websites using ScrapeGraphAI"
authors = ["Marco Perini ", "Marco Vinciguerra ", "Lorenzo Padoan "]
license = "MIT"
@@ -36,7 +36,7 @@ packages = [{include = "langchain_scrapegraph"}]
[tool.poetry.dependencies]
python = ">=3.10,<4.0"
langchain-core = "^0.3.21"
-scrapegraph-py = "^1.5.0"
+scrapegraph-py = "^1.7.0"
[tool.poetry.group.test.dependencies]
pytest = "^8.3.4"
diff --git a/tests/integration_tests/test_tools.py b/tests/integration_tests/test_tools.py
index 97a89e6..13fbf9d 100644
--- a/tests/integration_tests/test_tools.py
+++ b/tests/integration_tests/test_tools.py
@@ -13,16 +13,21 @@
from dotenv import load_dotenv
from langchain_tests.integration_tests import ToolsIntegrationTests
-from langchain_scrapegraph.tools import GetCreditsTool, SmartscraperTool
+from langchain_scrapegraph.tools import (
+ GetCreditsTool,
+ LocalScraperTool,
+ MarkdownifyTool,
+ SmartScraperTool,
+)
# Load environment variables from .env file
load_dotenv()
-class TestSmartscraperToolIntegration(ToolsIntegrationTests):
+class TestSmartScraperToolIntegration(ToolsIntegrationTests):
@property
- def tool_constructor(self) -> Type[SmartscraperTool]:
- return SmartscraperTool
+ def tool_constructor(self) -> Type[SmartScraperTool]:
+ return SmartScraperTool
@property
def tool_constructor_params(self) -> dict:
@@ -53,4 +58,52 @@ def tool_constructor_params(self) -> dict:
@property
def tool_invoke_params_example(self) -> dict:
- return {} # GetCredits doesn't require any parameters
+ return {}
+
+
+class TestMarkdownifyToolIntegration(ToolsIntegrationTests):
+ @property
+ def tool_constructor(self) -> Type[MarkdownifyTool]:
+ return MarkdownifyTool
+
+ @property
+ def tool_constructor_params(self) -> dict:
+ api_key = os.getenv("SGAI_API_KEY")
+ if not api_key:
+ pytest.skip("SGAI_API_KEY environment variable not set")
+ return {"api_key": api_key}
+
+ @property
+ def tool_invoke_params_example(self) -> dict:
+ return {"website_url": "https://example.com"}
+
+
+class TestLocalScraperToolIntegration(ToolsIntegrationTests):
+ @property
+ def tool_constructor(self) -> Type[LocalScraperTool]:
+ return LocalScraperTool
+
+ @property
+ def tool_constructor_params(self) -> dict:
+ api_key = os.getenv("SGAI_API_KEY")
+ if not api_key:
+ pytest.skip("SGAI_API_KEY environment variable not set")
+ return {"api_key": api_key}
+
+ @property
+ def tool_invoke_params_example(self) -> dict:
+ return {
+ "user_prompt": "Make a summary and extract contact info",
+ "website_html": """
+
+
+ Company Name
+ We are a technology company focused on AI solutions.
+
+
+
+ """,
+ }
diff --git a/tests/unit_tests/mocks.py b/tests/unit_tests/mocks.py
index 2da0174..740b0d2 100644
--- a/tests/unit_tests/mocks.py
+++ b/tests/unit_tests/mocks.py
@@ -4,7 +4,7 @@
from pydantic import BaseModel, Field
-class MockSyncClient:
+class MockClient:
def __init__(self, api_key: str = None, *args, **kwargs):
"""Initialize with mock methods that return proper response structures"""
self._api_key = api_key
@@ -27,21 +27,53 @@ def get_credits(self) -> dict:
"""Mock get_credits method"""
return {"remaining_credits": 50, "total_credits_used": 543}
+ def markdownify(self, website_url: str) -> dict:
+ """Mock markdownify method"""
+ return {
+ "request_id": "test-id",
+ "status": "completed",
+ "website_url": website_url,
+ "result": "# Example Domain\n\nTest paragraph",
+ "error": "",
+ }
+
+ def localscraper(self, website_html: str, user_prompt: str) -> dict:
+ """Mock localscraper method"""
+ return {
+ "request_id": "test-id",
+ "status": "completed",
+ "user_prompt": user_prompt,
+ "result": {
+ "summary": "This is a technology company",
+ "contact": {"email": "contact@example.com", "phone": "(555) 123-4567"},
+ },
+ "error": "",
+ }
+
def close(self) -> None:
"""Mock close method"""
pass
-class MockSmartscraperInput(BaseModel):
+class MockSmartScraperInput(BaseModel):
user_prompt: str = Field(description="Test prompt")
website_url: str = Field(description="Test URL")
-class MockSmartscraperTool(BaseTool):
- name: str = "Smartscraper"
+class MockMarkdownifyInput(BaseModel):
+ website_url: str = Field(description="Test URL")
+
+
+class MockLocalScraperInput(BaseModel):
+ user_prompt: str = Field(description="Test prompt")
+ website_html: str = Field(description="Test HTML")
+
+
+class MockSmartScraperTool(BaseTool):
+ name: str = "SmartScraper"
description: str = "Test description"
- args_schema: type[BaseModel] = MockSmartscraperInput
- client: Optional[MockSyncClient] = None
+ args_schema: type[BaseModel] = MockSmartScraperInput
+ client: Optional[MockClient] = None
api_key: str
def _run(self, **kwargs: Any) -> Dict:
@@ -51,8 +83,33 @@ def _run(self, **kwargs: Any) -> Dict:
class MockGetCreditsTool(BaseTool):
name: str = "GetCredits"
description: str = "Test description"
- client: Optional[MockSyncClient] = None
+ client: Optional[MockClient] = None
api_key: str
def _run(self, **kwargs: Any) -> Dict:
return {"remaining_credits": 50, "total_credits_used": 543}
+
+
+class MockMarkdownifyTool(BaseTool):
+ name: str = "Markdownify"
+ description: str = "Test description"
+ args_schema: type[BaseModel] = MockMarkdownifyInput
+ client: Optional[MockClient] = None
+ api_key: str
+
+ def _run(self, **kwargs: Any) -> str:
+ return "# Example Domain\n\nTest paragraph"
+
+
+class MockLocalScraperTool(BaseTool):
+ name: str = "LocalScraper"
+ description: str = "Test description"
+ args_schema: type[BaseModel] = MockLocalScraperInput
+ client: Optional[MockClient] = None
+ api_key: str
+
+ def _run(self, **kwargs: Any) -> Dict:
+ return {
+ "summary": "This is a technology company",
+ "contact": {"email": "contact@example.com", "phone": "(555) 123-4567"},
+ }
diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py
index fe434e3..2ac0876 100644
--- a/tests/unit_tests/test_tools.py
+++ b/tests/unit_tests/test_tools.py
@@ -3,24 +3,29 @@
from langchain_tests.unit_tests import ToolsUnitTests
-from langchain_scrapegraph.tools import GetCreditsTool, SmartscraperTool
+from langchain_scrapegraph.tools import (
+ GetCreditsTool,
+ LocalScraperTool,
+ MarkdownifyTool,
+ SmartScraperTool,
+)
from tests.unit_tests.mocks import (
+ MockClient,
MockGetCreditsTool,
- MockSmartscraperTool,
- MockSyncClient,
+ MockLocalScraperTool,
+ MockMarkdownifyTool,
+ MockSmartScraperTool,
)
-class TestSmartscraperToolUnit(ToolsUnitTests):
+class TestSmartScraperToolUnit(ToolsUnitTests):
@property
- def tool_constructor(self) -> Type[SmartscraperTool]:
- return MockSmartscraperTool
+ def tool_constructor(self) -> Type[SmartScraperTool]:
+ return MockSmartScraperTool
@property
def tool_constructor_params(self) -> dict:
- with patch(
- "langchain_scrapegraph.tools.smartscraper.SyncClient", MockSyncClient
- ):
+ with patch("langchain_scrapegraph.tools.smartscraper.Client", MockClient):
return {"api_key": "sgai-test-api-key"}
@property
@@ -38,9 +43,42 @@ def tool_constructor(self) -> Type[GetCreditsTool]:
@property
def tool_constructor_params(self) -> dict:
- with patch("langchain_scrapegraph.tools.credits.SyncClient", MockSyncClient):
+ with patch("langchain_scrapegraph.tools.credits.Client", MockClient):
return {"api_key": "sgai-test-api-key"}
@property
def tool_invoke_params_example(self) -> dict:
return {}
+
+
+class TestMarkdownifyToolUnit(ToolsUnitTests):
+ @property
+ def tool_constructor(self) -> Type[MarkdownifyTool]:
+ return MockMarkdownifyTool
+
+ @property
+ def tool_constructor_params(self) -> dict:
+ with patch("langchain_scrapegraph.tools.markdownify.Client", MockClient):
+ return {"api_key": "sgai-test-api-key"}
+
+ @property
+ def tool_invoke_params_example(self) -> dict:
+ return {"website_url": "https://example.com"}
+
+
+class TestLocalScraperToolUnit(ToolsUnitTests):
+ @property
+ def tool_constructor(self) -> Type[LocalScraperTool]:
+ return MockLocalScraperTool
+
+ @property
+ def tool_constructor_params(self) -> dict:
+ with patch("langchain_scrapegraph.tools.localscraper.Client", MockClient):
+ return {"api_key": "sgai-test-api-key"}
+
+ @property
+ def tool_invoke_params_example(self) -> dict:
+ return {
+ "user_prompt": "Make a summary and extract contact info",
+ "website_html": "<html><body><h1>Test</h1></body></html>",
+ }