Add Zyte Web Reader (#16197)

run-llama · Sep 27, 2024 · 86bfe12 · 86bfe12
1 parent 3635f7e
commit 86bfe12
Show file tree

Hide file tree

Showing 8 changed files with 357 additions and 4 deletions.
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -1,7 +1,6 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "5747e926",
    "metadata": {},
@@ -599,13 +598,136 @@
     "    scrape_format=\"markdown\",  # The scrape result format, either `markdown`(default) or `text`\n",
     ")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f81ccdb7",
+   "metadata": {},
+   "source": [
+    "# Using ZyteWebReader"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aee6d871",
+   "metadata": {},
+   "source": [
+    "ZyteWebReader allows a user to access the content of a webpage in different modes (\"article\", \"html-text\", \"html\"). \n",
+    "It lets the user change settings such as browser rendering and JavaScript, since many sites require these options to be enabled before their relevant content can be accessed. All supported options can be found here: https://docs.zyte.com/zyte-api/usage/reference.html\n",
+    "\n",
+    "To install dependencies:\n",
+    "```shell\n",
+    "pip install zyte-api\n",
+    "```\n",
+    "\n",
+    "To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f49f22bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.readers.web import ZyteWebReader\n",
+    "\n",
+    "# Required to run it in notebook\n",
+    "# import nest_asyncio\n",
+    "# nest_asyncio.apply()\n",
+    "\n",
+    "zyte_dw_params = {\n",
+    "    \"browserHtml\": True,  # Enable browser rendering\n",
+    "    \"javascript\": True,  # Enable JavaScript\n",
+    "}\n",
+    "\n",
+    "# Initiate ZyteWebReader with your Zyte API key\n",
+    "zyte_reader = ZyteWebReader(\n",
+    "    api_key=\"Your Zyte API Key\",\n",
+    "    download_kwargs=zyte_dw_params,\n",
+    ")\n",
+    "\n",
+    "# Load documents from URLs (article text in the default mode)\n",
+    "documents = zyte_reader.load_data(\n",
+    "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74b5d21f-7f53-4412-8f11-bbc84d85a1b5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7150"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(documents[0].text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "006254a3-5af8-4a0d-8bf0-b16b9e3dce5c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zyte_reader = ZyteWebReader(\n",
+    "    api_key=\"Your API Key\",\n",
+    "    mode=\"html-text\",\n",
+    "    download_kwargs=zyte_dw_params,\n",
+    ")\n",
+    "\n",
+    "# Load documents from URLs as markdown\n",
+    "documents = zyte_reader.load_data(\n",
+    "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3bfb8e5d-7690-4a55-9052-365cbf2c9ce8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "19554"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(documents[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f642faae-198e-4fad-9742-c590991c8810",
+   "metadata": {},
+   "source": [
+    "In the default mode (\"article\"), only the article text is extracted, while in \"html-text\" mode the full text of the webpage is extracted; therefore the resulting text is significantly longer."
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "forked-llama",
    "language": "python",
-   "name": "python3"
+   "name": "forked-llama"
   },
   "language_info": {
    "codemirror_mode": {

diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py
@@ -44,6 +44,9 @@
 from llama_index.readers.web.whole_site.base import (
     WholeSiteReader,
 )
+from llama_index.readers.web.zyte_web.base import (
+    ZyteWebReader,
+)
 
 
 __all__ = [
@@ -64,4 +67,5 @@
     "TrafilaturaWebReader",
     "UnstructuredURLLoader",
     "WholeSiteReader",
+    "ZyteWebReader",
 ]
diff --git a/...index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/BUILD b/...index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/BUILD
@@ -0,0 +1,5 @@
+python_requirements(
+    name="reqs",
+)
+
+python_sources()
diff --git a/...ions/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/README.md b/...ions/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/README.md
@@ -0,0 +1,63 @@
+# ZyteWebReader
+
+## Instructions for ZyteWebReader
+
+### Setup and Installation
+
+`pip install llama-index-readers-web`
+
+1. **Install zyte-api Package**: Ensure the `zyte-api` package is installed to use the ZyteWebReader. Install it via pip with the following command:
+
+   ```bash
+   pip install zyte-api
+   ```
+
+   Additionally, if you plan to use the "html-text" mode, you'll also need to install `html2text`:
+
+   ```bash
+   pip install html2text
+   ```
+
+2. **API Key**: Secure an API key from [Zyte](https://www.zyte.com/zyte-api/) to access the Zyte services.
+
+### Using ZyteWebReader
+
+- **Initialization**: Initialize the ZyteWebReader by providing the API key, the desired mode of operation (`article`, `html-text`, or `html`), and any optional parameters for the Zyte API.
+
+  ```python
+  from llama_index.readers.web.zyte_web.base import ZyteWebReader
+
+  zyte_reader = ZyteWebReader(
+      api_key="your_api_key_here",
+      mode="article",  # or "html" or "html-text"
+      n_conn=5,  # number of concurrent connections
+      download_kwargs={"additional": "parameters"},
+  )
+  ```
+
+- **Loading Data**: To load data, use the `load_data` method with the URLs you wish to process.
+
+```python
+documents = zyte_reader.load_data(urls=["http://example.com"])
+```
+
+### Example Usage
+
+Here is an example demonstrating how to initialize the ZyteWebReader and load documents from a URL.
+
+```python
+# Initialize the ZyteWebReader with your API key and desired mode
+zyte_reader = ZyteWebReader(
+    api_key="your_api_key_here",  # Replace with your actual API key
+    mode="article",  # Choose between "article", "html-text", and "html"
+    download_kwargs={
+        "additional": "parameters"
+    },  # Optional additional parameters
+)
+
+# Load documents from Paul Graham's website
+documents = zyte_reader.load_data(urls=["http://www.paulgraham.com/"])
+
+# Display the document
+print(documents)
+```
diff --git a/...integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/__init__.py b/...integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/__init__.py
diff --git a/...dex-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py b/...dex-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py
@@ -0,0 +1,157 @@
+"""Zyte Web Reader."""
+import asyncio
+import logging
+from base64 import b64decode
+from typing import Any, Dict, List, Literal, Optional
+from pydantic import Field
+
+from llama_index.core.readers.base import BasePydanticReader
+from llama_index.core.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class ZyteWebReader(BasePydanticReader):
+    """Load text from URLs using `Zyte api`.
+
+    Args:
+        api_key: Zyte API key.
+        mode: Determines how the text is extracted for the page content.
+            It can take one of the following values: 'html', 'html-text', 'article'
+        n_conn: It is the maximum number of concurrent requests to use.
+        **download_kwargs: Any additional download arguments to pass for download.
+            See: https://docs.zyte.com/zyte-api/usage/reference.html
+
+    Example:
+        .. code-block:: python
+
+            from llama_index.readers.web import ZyteWebReader
+
+            reader = ZyteWebReader(
+               api_key="ZYTE_API_KEY",
+            )
+            docs = reader.load_data(
+                urls=["<url-1>", "<url-2>"],
+            )
+
+    Zyte-API reference:
+        https://www.zyte.com/zyte-api/
+
+    """
+
+    client_async: Optional[object] = Field(None)
+    api_key: str
+    mode: str
+    n_conn: int
+    download_kwargs: Optional[dict]
+
+    def __init__(
+        self,
+        api_key: str,
+        mode: Literal["article", "html", "html-text"] = "article",
+        n_conn: int = 15,
+        download_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Initialize with file path."""
+        super().__init__(
+            api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs
+        )
+        try:
+            from zyte_api import AsyncZyteAPI
+            from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT
+
+        except ImportError:
+            raise ImportError(
+                "zyte-api package not found, please install it with "
+                "`pip install zyte-api`"
+            )
+        if mode not in ("article", "html", "html-text"):
+            raise ValueError(
+                f"Unrecognized mode '{mode}'. Expected one of "
+                f"'article', 'html', 'html-text'."
+            )
+
+        user_agent = f"llama-index-zyte-api/{PYTHON_ZYTE_API_USER_AGENT}"
+        self.client_async = AsyncZyteAPI(
+            api_key=api_key, user_agent=user_agent, n_conn=n_conn
+        )
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "ZyteWebReader"
+
+    def _zyte_html_option(self) -> str:
+        if "browserHtml" in self.download_kwargs:
+            return "browserHtml"
+        return "httpResponseBody"
+
+    def _get_article(self, page: Dict) -> str:
+        return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"]
+
+    def _zyte_request_params(self, url: str) -> dict:
+        request_params: Dict[str, Any] = {"url": url}
+        if self.mode == "article":
+            request_params.update({"article": True})
+
+        if self.mode in ("html", "html-text"):
+            request_params.update({self._zyte_html_option(): True})
+
+        if self.download_kwargs:
+            request_params.update(self.download_kwargs)
+        return request_params
+
+    async def fetch_items(self, urls) -> List:
+        results = []
+        queries = [self._zyte_request_params(url) for url in urls]
+        async with self.client_async.session() as session:
+            for i, future in enumerate(session.iter(queries)):
+                try:
+                    result = await future
+                    results.append(result)
+                except Exception as e:
+                    url = queries[i]["url"]
+                    if self.continue_on_failure:
+                        logger.warning(
+                            f"Error {e} while fetching url {url}, "
+                            f"skipping because continue_on_failure is True"
+                        )
+                        continue
+                    else:
+                        logger.exception(
+                            f"Error fetching {url} and aborting, use "
+                            f"continue_on_failure=True to continue loading "
+                            f"urls after encountering an error."
+                        )
+                        raise
+        return results
+
+    def _get_content(self, response: Dict) -> str:
+        if self.mode == "html-text":
+            try:
+                from html2text import html2text
+
+            except ImportError:
+                raise ImportError(
+                    "html2text package not found, please install it with "
+                    "`pip install html2text`"
+                )
+        if self.mode in ("html", "html-text"):
+            content = response[self._zyte_html_option()]
+
+            if self._zyte_html_option() == "httpResponseBody":
+                content = b64decode(content).decode()
+
+            if self.mode == "html-text":
+                content = html2text(content)
+        elif self.mode == "article":
+            content = self._get_article(response)
+        return content
+
+    def load_data(self, urls) -> List[Document]:
+        docs = []
+        responses = asyncio.run(self.fetch_items(urls))
+        for response in responses:
+            content = self._get_content(response)
+            doc = Document(text=content, metadata={"url": response["url"]})
+            docs.append(doc)
+        return docs
diff --git a/...rations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/requirements.txt b/...rations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/requirements.txt
@@ -0,0 +1,2 @@
+zyte-api
+html2text