From 1fd52b43cc7cffb57080e1ff2abbab269ffa0c72 Mon Sep 17 00:00:00 2001 From: zhourunlai Date: Thu, 29 Feb 2024 11:42:17 +0800 Subject: [PATCH] llama-index-readers-feishu-wiki --- .../.gitignore | 153 ++++++++++++++++++ .../llama-index-readers-feishu-wiki/BUILD | 3 + .../CHANGELOG.md | 5 + .../llama-index-readers-feishu-wiki/Makefile | 17 ++ .../llama-index-readers-feishu-wiki/README.md | 23 +++ .../llama_index/readers/feishu_wiki/BUILD | 1 + .../readers/feishu_wiki/__init__.py | 3 + .../llama_index/readers/feishu_wiki/base.py | 151 +++++++++++++++++ .../pyproject.toml | 64 ++++++++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_readers_feishu_wiki.py | 7 + 12 files changed, 428 insertions(+) create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/pyproject.toml create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/test_readers_feishu_wiki.py diff --git 
a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore new file mode 100644 index 00000000000000..990c18de229088 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD new file mode 100644 index 00000000000000..0896ca890d8bff --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md new file mode 100644 index 00000000000000..36bff877abcbe4 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## [0.1.2] - 2024-02-13 + +- Add maintainers and keywords from library.json (llamahub) diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile new file mode 100644 index 00000000000000..b9eab05aa37062 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). 
+ pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md new file mode 100644 index 00000000000000..9d574f5c3f57f8 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md @@ -0,0 +1,23 @@ +# Feishu Wiki Loader + +This loader can traverse all feishu documents under the feishu space. + +## Usage + +To use this loader, you need to: +1. apply for the `wiki:wiki:readonly` permission for the feishu app +2. add the feishu app as the admin of your feishu space, see [here](https://open.feishu.cn/document/server-docs/docs/wiki-v2/wiki-qa#b5da330b) for more help +3. finally, pass your feishu space id to this loader + +```python +from llama_index import download_loader + +app_id = "xxx" +app_secret = "xxx" +space_id = "xxx" +FeishuWikiReader = download_loader("FeishuWikiReader") +loader = FeishuWikiReader(app_id, app_secret) +documents = loader.load_data(space_id=space_id) +``` + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD new file mode 100644 index 00000000000000..db46e8d6c978c6 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py new file mode 100644 index 00000000000000..3a4f56d259dcc8 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py @@ -0,0 +1,3 @@ +from llama_index.readers.feishu_wiki.base import FeishuWikiReader + +__all__ = ["FeishuWikiReader"] diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py new file mode 100644 index 00000000000000..718708dbc795a2 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py @@ -0,0 +1,151 @@ +"""Feishu wiki reader.""" +import json +import os +import time +from typing import List + +import requests +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document + +# Copyright (2023) Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class FeishuWikiReader(BaseReader):
+    """Feishu Wiki reader.
+
+    Walks the node tree of a Feishu wiki space and loads the raw text of
+    every document it can read.
+    """
+
+    host = "https://open.feishu.cn"
+    wiki_nodes_url_path = "/open-apis/wiki/v2/spaces/{}/nodes"
+    documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content"
+    tenant_access_token_internal_url_path = (
+        "/open-apis/auth/v3/tenant_access_token/internal"
+    )
+    # Seconds to wait on any single HTTP request, so that a stalled
+    # connection cannot block the reader forever.
+    request_timeout = 30
+
+    def __init__(self, app_id: str, app_secret: str) -> None:
+        """Store the application credentials.
+
+        Args:
+            app_id: The unique identifier of the application, obtained after
+                the application is created.
+            app_secret: Application key, obtained after creating the
+                application.
+        """
+        super().__init__()
+        self.app_id = app_id
+        self.app_secret = app_secret
+
+        # Cached tenant access token and the time.time() value after which
+        # it must be refreshed; set by _update_tenant_access_token().
+        self.tenant_access_token = ""
+        self.expire = 0
+
+    def load_data(
+        self, space_id: str, parent_node_token: str = None
+    ) -> List[Document]:
+        """Load the documents found under a wiki space.
+
+        Args:
+            space_id (str): a space id.
+            parent_node_token (str[optional]): a parent node token of the
+                space; when given, only nodes below it are traversed.
+
+        Returns:
+            One Document per distinct document id, in traversal order. The id
+            is passed to the Document as ``extra_info["document_id"]``.
+            Nodes whose text could not be read (or is empty) are skipped.
+
+        Raises:
+            ValueError: if ``space_id`` is missing or empty.
+        """
+        if not space_id:
+            raise ValueError('Must specify a "space_id" in `load_kwargs`.')
+
+        document_ids = self._load_space(
+            space_id, parent_node_token=parent_node_token
+        )
+
+        results = []
+        # dict.fromkeys() drops duplicate ids but, unlike set(), keeps the
+        # traversal order, so the output is deterministic.
+        for document_id in dict.fromkeys(document_ids):
+            text = self._load_doc(document_id)
+            if not text:
+                # The fetch failed or returned no text; there is nothing to
+                # build a Document from.
+                continue
+            results.append(
+                Document(text=text, extra_info={"document_id": document_id})
+            )
+        return results
+
+    def _auth_headers(self) -> dict:
+        """Return request headers with a currently valid tenant access token."""
+        if self.tenant_access_token == "" or self.expire < time.time():
+            self._update_tenant_access_token()
+        return {
+            "Authorization": f"Bearer {self.tenant_access_token}",
+            "Content-Type": "application/json; charset=utf-8",
+        }
+
+    def _load_space(self, space_id: str, parent_node_token: str = None) -> List[str]:
+        """Collect the object tokens of every node below a wiki node.
+
+        Args:
+            space_id: the wiki space to list.
+            parent_node_token: list only the children of this node; the
+                top level of the space is listed when omitted.
+
+        Returns:
+            The ``obj_token`` of each node, with the tokens of a node's
+            descendants placed directly after it. A request or decoding
+            failure ends the listing early and returns what was collected
+            up to that point.
+        """
+        url = self.host + self.wiki_nodes_url_path.format(space_id)
+        obj_token_list = []
+        page_token = None
+        while True:
+            params = {}
+            if parent_node_token:
+                params["parent_node_token"] = parent_node_token
+            if page_token:
+                params["page_token"] = page_token
+            # Built outside the try block so that a failed token refresh is
+            # reported to the caller instead of being treated as "no nodes".
+            headers = self._auth_headers()
+            try:
+                response = requests.get(
+                    url,
+                    headers=headers,
+                    params=params,
+                    timeout=self.request_timeout,
+                )
+                result = response.json()
+            except (requests.RequestException, ValueError):
+                break
+            data = result.get("data")
+            if not data:
+                break
+            for item in data.get("items") or []:
+                obj_token_list.append(item["obj_token"])
+                if item.get("has_child"):
+                    obj_token_list.extend(
+                        self._load_space(
+                            space_id=space_id,
+                            parent_node_token=item["node_token"],
+                        )
+                    )
+            # The listing is paginated: keep going while the API reports more
+            # pages and hands back a token to fetch the next one.
+            page_token = data.get("page_token")
+            if not data.get("has_more") or not page_token:
+                break
+        return obj_token_list
+
+    def _load_doc(self, document_id: str) -> str:
+        """Load a document from Feishu Docs.
+
+        Args:
+            document_id: the document id.
+
+        Returns:
+            The document text, or an empty string when the request fails or
+            the response carries no content.
+        """
+        url = self.host + self.documents_raw_content_url_path.format(document_id)
+        headers = self._auth_headers()
+        try:
+            response = requests.get(
+                url, headers=headers, timeout=self.request_timeout
+            )
+            result = response.json()
+        except (requests.RequestException, ValueError):
+            return ""
+        data = result.get("data")
+        if not data:
+            return ""
+        return data.get("content") or ""
+
+    def _update_tenant_access_token(self) -> None:
+        """Fetch a new tenant access token and record when it expires.
+
+        Raises:
+            KeyError: if the response does not contain a token, e.g. because
+                the credentials were rejected.
+        """
+        url = self.host + self.tenant_access_token_internal_url_path
+        headers = {"Content-Type": "application/json; charset=utf-8"}
+        data = {"app_id": self.app_id, "app_secret": self.app_secret}
+        response = requests.post(
+            url,
+            data=json.dumps(data),
+            headers=headers,
+            timeout=self.request_timeout,
+        )
+        # Decode the body once and read both fields from the same payload.
+        payload = response.json()
+        self.tenant_access_token = payload["tenant_access_token"]
+        self.expire = time.time() + payload["expire"]
+
+    def set_lark_domain(self) -> None:
+        """The default API endpoints are for Feishu, in order to switch to Lark, we should use set_lark_domain."""
+        self.host = "https://open.larksuite.com"
+
+
+if __name__ == "__main__":
+    app_id = os.environ.get("FEISHU_APP_ID")
+    app_secret = os.environ.get("FEISHU_APP_SECRET")
+    reader = FeishuWikiReader(app_id, app_secret)
+    print(
+        reader.load_data(
+            space_id=os.environ.get("FEISHU_SPACE_ID"),
+            parent_node_token=os.environ.get("FEISHU_PARENT_NODE_TOKEN"),
+        )
+    )
diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/pyproject.toml
new file mode 100644
index 00000000000000..f8b9c264233fab
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/pyproject.toml
@@ -0,0 +1,64 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+skip =
"*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.readers.feishu_wiki" + +[tool.llamahub.class_authors] +FeishuWikiReader = "zhourunlai" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index readers feishu_wiki integration" +exclude = ["**/BUILD"] +license = "MIT" +maintainers = ["zhourunlai"] +name = "llama-index-readers-feishu-wiki" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.1" +requests = "^2.31.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/BUILD b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/BUILD new file mode 100644 index 00000000000000..dabf212d7e7162 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/__init__.py b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git 
a/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/test_readers_feishu_wiki.py b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/test_readers_feishu_wiki.py
new file mode 100644
index 00000000000000..2fef634282e382
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-feishu-wiki/tests/test_readers_feishu_wiki.py
@@ -0,0 +1,7 @@
+from llama_index.core.readers.base import BaseReader
+from llama_index.readers.feishu_wiki import FeishuWikiReader
+
+
+def test_class():
+    """The reader's MRO must contain a class named like BaseReader."""
+    assert BaseReader.__name__ in {cls.__name__ for cls in FeishuWikiReader.__mro__}