[feature] Add StructuredDataReader support for xlsx, csv, json and js…
ZhaoyangLiu-Leo authored Jun 26, 2024
1 parent 6a08a96 commit 06ad82c
Showing 11 changed files with 426 additions and 0 deletions.
@@ -0,0 +1,153 @@
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Jetbrains
.idea
modules/
*.swp

# VsCode
.vscode

# pipenv
Pipfile
Pipfile.lock

# pyright
pyrightconfig.json
@@ -0,0 +1,3 @@
poetry_requirements(
name="poetry",
)
@@ -0,0 +1,17 @@
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)

help: ## Show all Makefile targets.
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format: ## Run code autoformatters (black).
pre-commit install
git ls-files | xargs pre-commit run black --files

lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test: ## Run tests via pytest.
pytest tests

watch-docs: ## Build and watch documentation.
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
@@ -0,0 +1,46 @@
# LlamaIndex Readers Integration: Structured-Data

The `StructuredDataReader` class supports reading files in JSON, JSONL, CSV, and XLSX formats. Its `col_index` and `col_metadata` parameters control which columns are written into each document's main text and which are stored as additional metadata (a worked example follows the Usage section below).

## Install package

```bash
pip install llama-index-readers-structured-data
```

Or install locally:

```bash
pip install -e llama-index-integrations/readers/llama-index-readers-structured-data
```

## Usage

1. For a single file:

```python
from pathlib import Path
from llama_index.readers.structured_data.base import StructuredDataReader

parser = StructuredDataReader(col_index=["col1", "col2"], col_metadata=0)
documents = parser.load_data(Path("your/file/path.json"))
```

2. For a directory of files:

```python
from pathlib import Path
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.structured_data.base import StructuredDataReader

parser = StructuredDataReader(col_index=[1, -1], col_metadata="col3")
file_extractor = {
".xlsx": parser,
".csv": parser,
".json": parser,
".jsonl": parser,
}
documents = SimpleDirectoryReader(
"your/dic/path", file_extractor=file_extractor
).load_data()
```
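
To make the column mapping concrete, here is a hedged sketch with a hypothetical `data.csv` (file name and contents invented for illustration): each row becomes one `Document`, the `col_index` columns are joined into the text with `col_joiner` (default `", "`), and `col_metadata` columns become per-document metadata.

```python
from pathlib import Path
from llama_index.readers.structured_data.base import StructuredDataReader

# Hypothetical data.csv:
#   col1,col2,col3
#   a,1,x
#   b,2,y
parser = StructuredDataReader(col_index=["col1", "col2"], col_metadata="col3")
documents = parser.load_data(Path("data.csv"))

# Each row becomes one Document; e.g. for the first row:
#   documents[0].text     -> "a, 1"
#   documents[0].metadata -> {"col3": "x"}
```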
@@ -0,0 +1 @@
python_sources()
@@ -0,0 +1,4 @@
from llama_index.readers.structured_data.base import StructuredDataReader


__all__ = ["StructuredDataReader"]
@@ -0,0 +1,138 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import pandas as pd
from fsspec import AbstractFileSystem
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class StructuredDataReader(BaseReader):
"""
    BaseReader parser with support for JSON, JSONL, CSV, and Excel (.xlsx) files.
...
Args:
col_joiner (str): The string to join the columns with. Defaults to ', '.
col_index (str, int, or list): The list of columns to be used as index.
col_metadata (None, str, int, or list): The list of columns to be used as metadata.
...
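    Example:
        An illustrative sketch (hypothetical file path)::

            from pathlib import Path

            reader = StructuredDataReader(col_index=["col1", "col2"], col_metadata="col3")
            documents = reader.load_data(Path("data.csv"))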
"""

def __init__(
self,
*args: Any,
col_joiner: str = ", ",
pandas_config: dict = {},
col_index: Union[str, int, List],
col_metadata: Optional[Union[str, int, List]] = None,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._col_joiner = col_joiner
self._pandas_config = pandas_config
self._col_index = col_index
self._col_metadata = col_metadata

def _load_dataframe(
self, file: Path, fs: Optional[AbstractFileSystem] = None
) -> pd.DataFrame:
file_extension = file.suffix.lower()

        # Each loader forwards the user-supplied pandas_config as keyword
        # arguments, so the lambdas must accept them.
        read_funcs = {
            ".csv": lambda f, **kwargs: pd.read_csv(f, **kwargs),
            ".xlsx": lambda f, **kwargs: pd.read_excel(f, **kwargs),
            ".json": lambda f, **kwargs: pd.read_json(f, encoding="utf-8", **kwargs),
            ".jsonl": lambda f, **kwargs: pd.read_json(
                f, encoding="utf-8", lines=True, **kwargs
            ),
        }

if file_extension not in read_funcs:
raise ValueError(
f"Unsupported file extension '{file_extension}'. Supported extensions are 'json', 'csv', 'xlsx', and 'jsonl'."
)

if fs:
with fs.open(file) as f:
df = read_funcs[file_extension](f, **self._pandas_config)
else:
df = read_funcs[file_extension](file, **self._pandas_config)
return df

def _validate_column(self, index_name, column_index, df):
if isinstance(column_index, int):
            assert (
                -len(df.columns) <= column_index < len(df.columns)
            ), f"The {index_name} {column_index} exceeds the range of columns in the dataframe: ({len(df.columns)})"
elif isinstance(column_index, str):
assert (
column_index in df.columns
), f"The {index_name} must be in the dataframe"
else:
if all(isinstance(item, int) for item in column_index):
            assert all(
                -len(df.columns) <= item < len(df.columns) for item in column_index
            ), f"Some items in {index_name} exceed the range of columns in the dataframe: ({len(df.columns)})"
elif all(isinstance(item, str) for item in column_index):
assert set(column_index).issubset(
df.columns
), f"All columns in {index_name} must be in the dataframe"
else:
                raise ValueError(
                    "Mixing int and str items in the same column config is not supported."
                )

def load_data(
self,
file: Path,
extra_info: Optional[Dict] = None,
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
"""Parse file."""
df = self._load_dataframe(file, fs)

        assert self._col_index is not None, "The col_index must be specified"
self._validate_column("col_index", self._col_index, df)

if isinstance(self._col_index, int) or (
isinstance(self._col_index, list)
and all(isinstance(item, int) for item in self._col_index)
):
df_text = df.iloc[:, self._col_index]
else:
df_text = df[self._col_index]

if isinstance(df_text, pd.DataFrame):
text_list = df_text.apply(
lambda row: self._col_joiner.join(row.astype(str).tolist()), axis=1
).tolist()
elif isinstance(df_text, pd.Series):
text_list = df_text.tolist()

        # Compare against None explicitly: a metadata column index of 0 is falsy but valid.
        if self._col_metadata is None:
return [
Document(text=text_tuple, metadata=(extra_info or {}))
for text_tuple in text_list
]
else:
self._validate_column("col_metadata", self._col_metadata, df)
if isinstance(self._col_metadata, int) or (
isinstance(self._col_metadata, list)
and all(isinstance(item, int) for item in self._col_metadata)
):
df_metadata = df.iloc[:, self._col_metadata]
else:
df_metadata = df[self._col_metadata]

if isinstance(df_metadata, pd.Series):
df_metadata = pd.DataFrame(df_metadata)

metadata_list = df_metadata.to_dict(orient="records")

return [
Document(
text=text_tuple, metadata={**(metadata_tuple), **(extra_info or {})}
)
for text_tuple, metadata_tuple in zip(text_list, metadata_list)
]