forked from run-llama/llama_index
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add: FeishuWikiReader (run-llama#11491)
- Loading branch information
1 parent
6cb3e49
commit d78690c
Showing
12 changed files
with
428 additions
and
0 deletions.
There are no files selected for viewing
153 changes: 153 additions & 0 deletions
153
llama-index-integrations/readers/llama-index-readers-feishu-wiki/.gitignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
llama_index/_static | ||
.DS_Store | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
bin/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
etc/ | ||
include/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
share/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
.ruff_cache | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
notebooks/ | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# pipenv | ||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | ||
# However, in case of collaboration, if having platform-specific dependencies or dependencies | ||
# having no cross-platform support, pipenv may install dependencies that don't work, or not | ||
# install all needed dependencies. | ||
#Pipfile.lock | ||
|
||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow | ||
__pypackages__/ | ||
|
||
# Celery stuff | ||
celerybeat-schedule | ||
celerybeat.pid | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
pyvenv.cfg | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# Jetbrains | ||
.idea | ||
modules/ | ||
*.swp | ||
|
||
# VsCode | ||
.vscode | ||
|
||
# pipenv | ||
Pipfile | ||
Pipfile.lock | ||
|
||
# pyright | ||
pyrightconfig.json |
3 changes: 3 additions & 0 deletions
3
llama-index-integrations/readers/llama-index-readers-feishu-wiki/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
poetry_requirements( | ||
name="poetry", | ||
) |
5 changes: 5 additions & 0 deletions
5
llama-index-integrations/readers/llama-index-readers-feishu-wiki/CHANGELOG.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# CHANGELOG | ||
|
||
## [0.1.2] - 2024-02-13 | ||
|
||
- Add maintainers and keywords from library.json (llamahub) |
17 changes: 17 additions & 0 deletions
17
llama-index-integrations/readers/llama-index-readers-feishu-wiki/Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
GIT_ROOT ?= $(shell git rev-parse --show-toplevel) | ||
|
||
help: ## Show all Makefile targets. | ||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' | ||
|
||
format: ## Run code autoformatters (black). | ||
pre-commit install | ||
git ls-files | xargs pre-commit run black --files | ||
|
||
lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy | ||
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files | ||
|
||
test: ## Run tests via pytest. | ||
pytest tests | ||
|
||
watch-docs: ## Build and watch documentation. | ||
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ |
24 changes: 24 additions & 0 deletions
24
llama-index-integrations/readers/llama-index-readers-feishu-wiki/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Feishu Wiki Loader | ||
|
||
This loader traverses all Feishu documents under a Feishu wiki space. | ||
|
||
## Usage | ||
|
||
To use this loader, you need to: | ||
|
||
1. apply for the `wiki:wiki:readonly` permission for the Feishu app | ||
2. add the feishu app as the admin of your feishu space, see [here](https://open.feishu.cn/document/server-docs/docs/wiki-v2/wiki-qa#b5da330b) for more help | ||
3. finally, pass your feishu space id to this loader | ||
|
||
```python | ||
from llama_index import download_loader | ||
|
||
app_id = "xxx" | ||
app_secret = "xxx" | ||
space_id = "xxx" | ||
FeishuWikiReader = download_loader("FeishuWikiReader") | ||
loader = FeishuWikiReader(app_id, app_secret) | ||
documents = loader.load_data(space_id=space_id) | ||
``` | ||
|
||
This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. |
1 change: 1 addition & 0 deletions
1
...ntegrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
python_sources() |
3 changes: 3 additions & 0 deletions
3
...tions/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from llama_index.readers.feishu_wiki.base import FeishuWikiReader | ||
|
||
__all__ = ["FeishuWikiReader"] |
150 changes: 150 additions & 0 deletions
150
...egrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
"""Feishu wiki reader.""" | ||
import json | ||
import os | ||
import time | ||
from typing import List | ||
|
||
import requests | ||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
|
||
# Copyright (2023) Bytedance Ltd. and/or its affiliates | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
class FeishuWikiReader(BaseReader):
    """Feishu Wiki reader.

    Walks the node tree of a Feishu wiki space (optionally starting below a
    given parent node) and loads the raw text content of every document it
    finds.
    """

    host = "https://open.feishu.cn"
    wiki_nodes_url_path = "/open-apis/wiki/v2/spaces/{}/nodes"
    documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content"
    tenant_access_token_internal_url_path = (
        "/open-apis/auth/v3/tenant_access_token/internal"
    )

    def __init__(self, app_id: str, app_secret: str) -> None:
        """Create a reader bound to one Feishu application.

        Args:
            app_id: The unique identifier of the application, obtained after
                the application is created.
            app_secret: Application key, obtained after creating the
                application.
        """
        super().__init__()
        self.app_id = app_id
        self.app_secret = app_secret

        # The token is fetched lazily on the first request and refreshed
        # once `expire` (a time.time() timestamp) has passed.
        self.tenant_access_token = ""
        self.expire = 0

    def load_data(self, space_id: str, parent_node_token: str = None) -> List[Document]:
        """Load every readable document of a wiki space.

        Args:
            space_id (str): a space id.
            parent_node_token (str[optional]): a parent node token of the
                space; when given, only nodes below it are traversed.

        Returns:
            One Document per distinct document id, in the order the ids were
            first encountered. Documents whose content could not be fetched
            are skipped.

        Raises:
            ValueError: if ``space_id`` is None.
        """
        if space_id is None:
            raise ValueError('Must specify a "space_id" in `load_kwargs`.')

        document_ids = self._load_space(space_id, parent_node_token=parent_node_token)

        results = []
        # dict.fromkeys removes duplicates while keeping a deterministic order
        # (a plain set would shuffle the documents between runs).
        for document_id in dict.fromkeys(document_ids):
            text = self._load_doc(document_id)
            if text is None:
                # _load_doc signals "could not fetch" with None; skip the
                # node instead of building a Document without text.
                continue
            results.append(
                Document(text=text, extra_info={"document_id": document_id})
            )
        return results

    def _auth_headers(self) -> dict:
        """Return request headers, refreshing the access token if needed."""
        if self.tenant_access_token == "" or self.expire < time.time():
            self._update_tenant_access_token()
        return {
            "Authorization": f"Bearer {self.tenant_access_token}",
            "Content-Type": "application/json; charset=utf-8",
        }

    def _load_space(self, space_id: str, parent_node_token: str = None) -> List[str]:
        """Collect the object tokens of all nodes below a space or node.

        Args:
            space_id: the wiki space id.
            parent_node_token: if given, list the children of this node
                instead of the top level of the space.

        Returns:
            The ``obj_token`` of every node found, including nodes nested
            under children. A request or decoding failure ends the listing
            for that level; whatever was collected so far is returned.
        """
        url = self.host + self.wiki_nodes_url_path.format(space_id)
        obj_token_list = []
        page_token = None

        while True:
            # Let requests build (and URL-encode) the query string.
            params = {}
            if parent_node_token:
                params["parent_node_token"] = parent_node_token
            if page_token:
                params["page_token"] = page_token

            headers = self._auth_headers()
            try:
                response = requests.get(url, headers=headers, params=params)
                result = response.json()
            except (requests.RequestException, ValueError):
                # Best effort: network or JSON-decoding problems stop the
                # listing rather than aborting the whole load.
                break

            data = result.get("data") if isinstance(result, dict) else None
            if not data:
                break

            for item in data.get("items") or []:
                obj_token_list.append(item["obj_token"])
                if item.get("has_child"):
                    obj_token_list.extend(
                        self._load_space(
                            space_id=space_id, parent_node_token=item["node_token"]
                        )
                    )

            # The node listing is paginated: keep going while the response
            # reports more pages and hands back a continuation token.
            page_token = data.get("page_token")
            if not data.get("has_more") or not page_token:
                break

        return obj_token_list

    def _load_doc(self, document_id: str) -> str:
        """Load a document from Feishu Docs.

        Args:
            document_id: the document id.

        Returns:
            The document text, or None when the request fails, the response
            is not valid JSON, or it carries no content.
        """
        url = self.host + self.documents_raw_content_url_path.format(document_id)
        headers = self._auth_headers()
        try:
            response = requests.get(url, headers=headers)
            result = response.json()
        except (requests.RequestException, ValueError):
            return None

        data = result.get("data") if isinstance(result, dict) else None
        if not data:
            return None
        return data.get("content")

    def _update_tenant_access_token(self) -> None:
        """Fetch a fresh tenant_access_token and record when it expires.

        Raises:
            KeyError: if the response lacks ``tenant_access_token`` or
                ``expire`` (for example when the credentials are rejected).
        """
        url = self.host + self.tenant_access_token_internal_url_path
        headers = {"Content-Type": "application/json; charset=utf-8"}
        data = {"app_id": self.app_id, "app_secret": self.app_secret}
        response = requests.post(url, data=json.dumps(data), headers=headers)

        # Decode the body once and validate both fields before mutating state,
        # so a malformed response cannot leave a token without an expiry.
        payload = response.json()
        token = payload["tenant_access_token"]
        lifetime = payload["expire"]
        self.tenant_access_token = token
        self.expire = time.time() + lifetime

    def set_lark_domain(self, host: str) -> None:
        """Set lark domain.

        Args:
            host: base URL (scheme and host) used for all subsequent API
                requests made by this instance.
        """
        self.host = host
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: credentials and the target space are read from
    # environment variables; the loaded documents are printed to stdout.
    env = os.environ
    app_id = env.get("FEISHU_APP_ID")
    app_secret = env.get("FEISHU_APP_SECRET")
    reader = FeishuWikiReader(app_id, app_secret)
    target = {
        "space_id": env.get("FEISHU_SPACE_ID"),
        "parent_node_token": env.get("FEISHU_PARENT_NODE_TOKEN"),
    }
    documents = reader.load_data(**target)
    print(documents)
Oops, something went wrong.