run-llama · hatianzhang · Mar 1, 2024 · Feb 29, 2024
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## [0.1.2] - 2024-02-13
+
+- Add maintainers and keywords from library.json (llamahub)
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md
@@ -0,0 +1,24 @@
+# Feishu Wiki Loader
+
+This loader traverses all Feishu documents in a Feishu wiki space.
+
+## Usage
+
+To use this loader, you need to:
+
+1. Apply for the `wiki:wiki:readonly` permission for your Feishu app.
+2. Add the Feishu app as an admin of your Feishu wiki space; see [here](https://open.feishu.cn/document/server-docs/docs/wiki-v2/wiki-qa#b5da330b) for more help.
+3. Finally, pass your Feishu space ID to this loader.
+
+```python
+from llama_index import download_loader
+
+app_id = "xxx"
+app_secret = "xxx"
+space_id = "xxx"
+FeishuWikiReader = download_loader("FeishuWikiReader")
+loader = FeishuWikiReader(app_id, app_secret)
+documents = loader.load_data(space_id=space_id)
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/...ntegrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD b/...ntegrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/...tions/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py b/...tions/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.readers.feishu_wiki.base import FeishuWikiReader
+
+__all__ = ["FeishuWikiReader"]
diff --git a/...egrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py b/...egrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py
@@ -0,0 +1,150 @@
+"""Feishu wiki reader."""
+import json
+import os
+import time
+from typing import List, Optional
+
+import requests
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+# Copyright (2023) Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class FeishuWikiReader(BaseReader):
+    """Feishu Wiki reader.
+
+    Reads pages from Feishu wiki under the space
+
+    """
+
+    host = "https://open.feishu.cn"
+    wiki_nodes_url_path = "/open-apis/wiki/v2/spaces/{}/nodes"
+    documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content"
+    tenant_access_token_internal_url_path = (
+        "/open-apis/auth/v3/tenant_access_token/internal"
+    )
+
+    def __init__(self, app_id: str, app_secret: str) -> None:
+        """
+
+        Args:
+            app_id: The unique identifier of the application is obtained after the application is created.
+            app_secret: Application key, obtained after creating the application.
+        """
+        super().__init__()
+        self.app_id = app_id
+        self.app_secret = app_secret
+
+        self.tenant_access_token = ""
+        self.expire = 0
+
+    def load_data(self, space_id: str, parent_node_token: str = None) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            space_id (str): a space id.
+            parent_node_token (str[optional]): a parent node token of the space
+        """
+        if space_id is None:
+            raise ValueError('Must specify a "space_id" in `load_kwargs`.')
+
+        document_ids = self._load_space(space_id, parent_node_token=parent_node_token)
+        document_ids = list(set(document_ids))
+
+        results = []
+        for document_id in document_ids:
+            doc = self._load_doc(document_id)
+            results.append(Document(text=doc, extra_info={"document_id": document_id}))
+        return results
+
+    def _load_space(self, space_id: str, parent_node_token: str = None) -> str:
+        if self.tenant_access_token == "" or self.expire < time.time():
+            self._update_tenant_access_token()
+        headers = {
+            "Authorization": f"Bearer {self.tenant_access_token}",
+            "Content-Type": "application/json; charset=utf-8",
+        }
+
+        url = self.host + self.wiki_spaces_url_path.format(space_id)
+        if parent_node_token:
+            url += f"?parent_node_token={parent_node_token}"
+        try:
+            response = requests.get(url, headers=headers)
+            result = response.json()
+        except Exception:
+            return []
+        if not result.get("data"):
+            return []
+        obj_token_list = []
+        for item in result["data"]["items"]:
+            obj_token_list.append(item["obj_token"])
+            if item["has_child"]:
+                child_obj_token_list = self._load_space(
+                    space_id=space_id, parent_node_token=item["node_token"]
+                )
+                if child_obj_token_list:
+                    obj_token_list.extend(child_obj_token_list)
+        return obj_token_list
+
+    def _load_doc(self, document_id: str) -> str:
+        """Load a document from Feishu Docs.
+
+        Args:
+            document_id: the document id.
+
+        Returns:
+            The document text.
+        """
+        url = self.host + self.documents_raw_content_url_path.format(document_id)
+        if self.tenant_access_token == "" or self.expire < time.time():
+            self._update_tenant_access_token()
+        headers = {
+            "Authorization": f"Bearer {self.tenant_access_token}",
+            "Content-Type": "application/json; charset=utf-8",
+        }
+        try:
+            response = requests.get(url, headers=headers)
+            result = response.json()
+        except Exception:
+            return None
+        if not result.get("data"):
+            return None
+        return result["data"]["content"]
+
+    def _update_tenant_access_token(self) -> None:
+        """For update tenant_access_token."""
+        url = self.host + self.tenant_access_token_internal_url_path
+        headers = {"Content-Type": "application/json; charset=utf-8"}
+        data = {"app_id": self.app_id, "app_secret": self.app_secret}
+        response = requests.post(url, data=json.dumps(data), headers=headers)
+        self.tenant_access_token = response.json()["tenant_access_token"]
+        self.expire = time.time() + response.json()["expire"]
+
+    def set_lark_domain(self, host: str) -> None:
+        """Set lark domain."""
+        self.host = host
+
+
+if __name__ == "__main__":
+    app_id = os.environ.get("FEISHU_APP_ID")
+    app_secret = os.environ.get("FEISHU_APP_SECRET")
+    reader = FeishuWikiReader(app_id, app_secret)
+    print(
+        reader.load_data(
+            space_id=os.environ.get("FEISHU_SPACE_ID"),
+            parent_node_token=os.environ.get("FEISHU_PARENT_NODE_TOKEN"),
+        )
+    )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from llama_index.readers.feishu_wiki.base import FeishuWikiReader

		__all__ = ["FeishuWikiReader"]