Skip to content

Commit

Permalink
Add: FeishuWikiReader (run-llama#11491)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhourunlai authored and Izuki Matsuba committed Mar 29, 2024
1 parent 6cb3e49 commit d78690c
Show file tree
Hide file tree
Showing 12 changed files with 428 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Jetbrains
.idea
modules/
*.swp

# VsCode
.vscode

# pipenv
Pipfile
Pipfile.lock

# pyright
pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
poetry_requirements(
name="poetry",
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## [0.1.2] - 2024-02-13

- Add maintainers and keywords from library.json (llamahub)
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)

help: ## Show all Makefile targets.
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format: ## Run code autoformatters (black).
pre-commit install
git ls-files | xargs pre-commit run black --files

lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test: ## Run tests via pytest.
pytest tests

watch-docs: ## Build and watch documentation.
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Feishu Wiki Loader

This loader traverses all Feishu documents under a Feishu wiki space.

## Usage

To use this loader, you need to:

1. Grant the `wiki:wiki:readonly` permission to your Feishu app.
2. Add the Feishu app as an admin of your Feishu wiki space; see [here](https://open.feishu.cn/document/server-docs/docs/wiki-v2/wiki-qa#b5da330b) for more help.
3. Finally, pass your Feishu space id to this loader.

```python
from llama_index import download_loader

app_id = "xxx"
app_secret = "xxx"
space_id = "xxx"
FeishuWikiReader = download_loader("FeishuWikiReader")
loader = FeishuWikiReader(app_id, app_secret)
documents = loader.load_data(space_id=space_id)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from llama_index.readers.feishu_wiki.base import FeishuWikiReader

__all__ = ["FeishuWikiReader"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""Feishu wiki reader."""
import json
import os
import time
from typing import List, Optional

import requests
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

# Copyright (2023) Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class FeishuWikiReader(BaseReader):
    """Feishu Wiki reader.

    Lists the nodes of a Feishu wiki space (recursing into child nodes),
    then fetches the raw text content of every referenced document and
    wraps each one in a ``Document``.

    The URL pieces are class attributes; ``set_lark_domain`` overrides the
    host on a single instance.
    """

    host = "https://open.feishu.cn"
    wiki_nodes_url_path = "/open-apis/wiki/v2/spaces/{}/nodes"
    documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content"
    tenant_access_token_internal_url_path = (
        "/open-apis/auth/v3/tenant_access_token/internal"
    )

    def __init__(self, app_id: str, app_secret: str) -> None:
        """Store the application credentials.

        Args:
            app_id: The unique identifier of the application, obtained after
                the application is created.
            app_secret: Application key, obtained after creating the
                application.
        """
        super().__init__()
        self.app_id = app_id
        self.app_secret = app_secret

        # Cached tenant access token and the epoch time at which it expires.
        # The empty token forces a refresh on the first request.
        self.tenant_access_token = ""
        self.expire = 0

    def load_data(
        self, space_id: str, parent_node_token: Optional[str] = None
    ) -> List[Document]:
        """Load every document found under a wiki space.

        Args:
            space_id: Id of the wiki space to traverse.
            parent_node_token: Optional node token; when given, only the
                nodes below that node are traversed.

        Returns:
            One ``Document`` per distinct object token whose content could be
            fetched, in traversal order. The token is stored under the
            ``document_id`` key of the document's extra info.

        Raises:
            ValueError: If ``space_id`` is ``None``.
        """
        if space_id is None:
            raise ValueError('Must specify a "space_id" in `load_kwargs`.')

        obj_tokens = self._load_space(space_id, parent_node_token=parent_node_token)

        results = []
        # dict.fromkeys drops duplicate tokens while keeping traversal order,
        # so the output order is deterministic (a set would not be).
        for document_id in dict.fromkeys(obj_tokens):
            text = self._load_doc(document_id)
            if text is None:
                # The content could not be fetched; skip the node instead of
                # building a Document with ``text=None``.
                continue
            results.append(
                Document(text=text, extra_info={"document_id": document_id})
            )
        return results

    def _auth_headers(self) -> dict:
        """Return request headers, refreshing the access token if needed."""
        if self.tenant_access_token == "" or self.expire < time.time():
            self._update_tenant_access_token()
        return {
            "Authorization": f"Bearer {self.tenant_access_token}",
            "Content-Type": "application/json; charset=utf-8",
        }

    def _load_space(
        self, space_id: str, parent_node_token: Optional[str] = None
    ) -> List[str]:
        """Collect the object tokens of all nodes below a space or node.

        Args:
            space_id: Id of the wiki space.
            parent_node_token: Optional node token to list the children of.

        Returns:
            The ``obj_token`` of every listed node, each followed by the
            tokens of its descendants. A failed request or a response
            without ``data`` ends the listing; whatever was collected up to
            that point is returned (an empty list if the first page fails).
        """
        url = self.host + self.wiki_nodes_url_path.format(space_id)
        params = {}
        if parent_node_token:
            params["parent_node_token"] = parent_node_token

        obj_token_list: List[str] = []
        while True:
            # Built on every iteration so a token that expires during a long
            # traversal is refreshed. Kept outside the ``try`` so that
            # authentication failures are not silently swallowed.
            headers = self._auth_headers()
            try:
                response = requests.get(url, headers=headers, params=params)
                result = response.json()
            except (requests.RequestException, ValueError):
                break

            data = result.get("data")
            if not data:
                break

            for item in data.get("items") or []:
                obj_token_list.append(item["obj_token"])
                if item["has_child"]:
                    obj_token_list.extend(
                        self._load_space(
                            space_id=space_id, parent_node_token=item["node_token"]
                        )
                    )

            # Follow the listing's pagination; stop when the API reports no
            # further page or gives no token to continue with.
            page_token = data.get("page_token")
            if not data.get("has_more") or not page_token:
                break
            params["page_token"] = page_token
        return obj_token_list

    def _load_doc(self, document_id: str) -> Optional[str]:
        """Load a document from Feishu Docs.

        Args:
            document_id: The document id.

        Returns:
            The document text, or ``None`` when the request fails or the
            response carries no content.
        """
        url = self.host + self.documents_raw_content_url_path.format(document_id)
        headers = self._auth_headers()
        try:
            response = requests.get(url, headers=headers)
            result = response.json()
        except (requests.RequestException, ValueError):
            return None

        data = result.get("data")
        if not data:
            return None
        return data.get("content")

    def _update_tenant_access_token(self) -> None:
        """Fetch a new tenant access token and record its expiry time."""
        url = self.host + self.tenant_access_token_internal_url_path
        headers = {"Content-Type": "application/json; charset=utf-8"}
        data = {"app_id": self.app_id, "app_secret": self.app_secret}
        response = requests.post(url, data=json.dumps(data), headers=headers)
        # Parse the body once and read both fields from the same payload.
        payload = response.json()
        self.tenant_access_token = payload["tenant_access_token"]
        self.expire = time.time() + payload["expire"]

    def set_lark_domain(self, host: str) -> None:
        """Set the API host used by this instance (e.g. a Lark domain)."""
        self.host = host


if __name__ == "__main__":
    # Manual run: credentials and the target space are read from the
    # environment, and the loaded documents are printed.
    env = os.environ
    reader = FeishuWikiReader(env.get("FEISHU_APP_ID"), env.get("FEISHU_APP_SECRET"))
    documents = reader.load_data(
        space_id=env.get("FEISHU_SPACE_ID"),
        parent_node_token=env.get("FEISHU_PARENT_NODE_TOKEN"),
    )
    print(documents)
Loading

0 comments on commit d78690c

Please sign in to comment.