Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add: FeishuWikiReader #11491

Merged
merged 1 commit into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Jetbrains
.idea
modules/
*.swp

# VsCode
.vscode

# pipenv
Pipfile
Pipfile.lock

# pyright
pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
poetry_requirements(
name="poetry",
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## [0.1.2] - 2024-02-13

- Add maintainers and keywords from library.json (llamahub)
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)

help: ## Show all Makefile targets.
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format: ## Run code autoformatters (black).
pre-commit install
git ls-files | xargs pre-commit run black --files

lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test: ## Run tests via pytest.
pytest tests

watch-docs: ## Build and watch documentation.
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Feishu Wiki Loader

This loader traverses all Feishu documents under a Feishu wiki space.

## Usage

To use this loader, you need to:

1. Apply for the `wiki:wiki:readonly` permission for your Feishu app.
2. Add the Feishu app as an admin of your Feishu wiki space; see [here](https://open.feishu.cn/document/server-docs/docs/wiki-v2/wiki-qa#b5da330b) for more help.
3. Finally, pass your Feishu space ID to this loader.

```python
from llama_index import download_loader

app_id = "xxx"
app_secret = "xxx"
space_id = "xxx"
FeishuWikiReader = download_loader("FeishuWikiReader")
loader = FeishuWikiReader(app_id, app_secret)
documents = loader.load_data(space_id=space_id)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from llama_index.readers.feishu_wiki.base import FeishuWikiReader

__all__ = ["FeishuWikiReader"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""Feishu wiki reader."""
import json
import os
import time
from typing import List

import requests
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

# Copyright (2023) Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class FeishuWikiReader(BaseReader):
    """Feishu Wiki reader.

    Walks the node tree of a Feishu wiki space and loads the raw text content
    of every document found under it.

    A tenant access token is requested with the app credentials on first use
    and requested again once its recorded expiry time has passed.
    """

    host = "https://open.feishu.cn"
    wiki_nodes_url_path = "/open-apis/wiki/v2/spaces/{}/nodes"
    documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content"
    tenant_access_token_internal_url_path = (
        "/open-apis/auth/v3/tenant_access_token/internal"
    )

    def __init__(self, app_id: str, app_secret: str) -> None:
        """Store the app credentials; no network request is made here.

        Args:
            app_id: The unique identifier of the application, obtained after
                the application is created.
            app_secret: The application key, obtained after creating the
                application.
        """
        super().__init__()
        self.app_id = app_id
        self.app_secret = app_secret

        # Empty token / zero expiry force a token request on first use.
        self.tenant_access_token = ""
        self.expire = 0

    def load_data(self, space_id: str, parent_node_token: str = None) -> List[Document]:
        """Load every document found under a wiki space.

        Args:
            space_id (str): a space id.
            parent_node_token (str[optional]): a parent node token of the
                space; when given, only nodes below it are traversed.

        Returns:
            One ``Document`` per distinct document id whose content could be
            fetched, with the id stored under ``extra_info["document_id"]``.

        Raises:
            ValueError: if ``space_id`` is None.
        """
        if space_id is None:
            raise ValueError('Must specify a "space_id" in `load_kwargs`.')

        document_ids = self._load_space(space_id, parent_node_token=parent_node_token)
        # De-duplicate while keeping traversal order so results are
        # deterministic (a plain set() would shuffle them between runs).
        document_ids = list(dict.fromkeys(document_ids))

        results = []
        for document_id in document_ids:
            doc = self._load_doc(document_id)
            if doc is None:
                # `_load_doc` returns None when the request fails or the
                # response carries no data; skip instead of building a
                # Document without text.
                continue
            results.append(Document(text=doc, extra_info={"document_id": document_id}))
        return results

    def _get_headers(self) -> dict:
        """Return request headers, refreshing the access token if needed."""
        if self.tenant_access_token == "" or self.expire < time.time():
            self._update_tenant_access_token()
        return {
            "Authorization": f"Bearer {self.tenant_access_token}",
            "Content-Type": "application/json; charset=utf-8",
        }

    def _load_space(self, space_id: str, parent_node_token: str = None) -> List[str]:
        """Recursively collect the ``obj_token`` of every node in a space.

        Args:
            space_id: the space id.
            parent_node_token: when given, list the children of this node
                instead of the top-level nodes.

        Returns:
            The collected object tokens (possibly with duplicates); an empty
            list when the request fails or the response has no data.
        """
        headers = self._get_headers()

        # The URL template is `wiki_nodes_url_path`; the previous code read a
        # non-existent `wiki_spaces_url_path` attribute and always raised
        # AttributeError.
        url = self.host + self.wiki_nodes_url_path.format(space_id)
        # Let requests build (and escape) the query string.
        params = {"parent_node_token": parent_node_token} if parent_node_token else None
        try:
            response = requests.get(url, headers=headers, params=params)
            result = response.json()
        except (requests.RequestException, ValueError):
            # Best effort: a failed request or a non-JSON body yields no nodes.
            return []
        if not result.get("data"):
            return []

        # NOTE(review): only a single response page is read here — confirm
        # whether this endpoint paginates for large spaces.
        obj_token_list = []
        for item in result["data"].get("items") or []:
            obj_token_list.append(item["obj_token"])
            if item["has_child"]:
                child_obj_token_list = self._load_space(
                    space_id=space_id, parent_node_token=item["node_token"]
                )
                if child_obj_token_list:
                    obj_token_list.extend(child_obj_token_list)
        return obj_token_list

    def _load_doc(self, document_id: str) -> str:
        """Load a document from Feishu Docs.

        Args:
            document_id: the document id.

        Returns:
            The document text, or None when the request fails or the response
            has no data.
        """
        url = self.host + self.documents_raw_content_url_path.format(document_id)
        headers = self._get_headers()
        try:
            response = requests.get(url, headers=headers)
            result = response.json()
        except (requests.RequestException, ValueError):
            return None
        if not result.get("data"):
            return None
        return result["data"]["content"]

    def _update_tenant_access_token(self) -> None:
        """Request a new tenant access token and record when it expires."""
        url = self.host + self.tenant_access_token_internal_url_path
        headers = {"Content-Type": "application/json; charset=utf-8"}
        data = {"app_id": self.app_id, "app_secret": self.app_secret}
        response = requests.post(url, data=json.dumps(data), headers=headers)
        # Parse the body once instead of once per field.
        payload = response.json()
        self.tenant_access_token = payload["tenant_access_token"]
        # "expire" is added to the current time to get an absolute deadline.
        self.expire = time.time() + payload["expire"]

    def set_lark_domain(self, host: str) -> None:
        """Override the API host used by this instance."""
        self.host = host


if __name__ == "__main__":
    # Manual smoke run: credentials and target space come from the environment.
    app_id = os.environ.get("FEISHU_APP_ID")
    app_secret = os.environ.get("FEISHU_APP_SECRET")
    reader = FeishuWikiReader(app_id, app_secret)
    print(
        reader.load_data(
            space_id=os.environ.get("FEISHU_SPACE_ID"),
            parent_node_token=os.environ.get("FEISHU_PARENT_NODE_TOKEN"),
        )
    )
Loading
Loading