From 9fb50191368d1dc30c92f6fb52ce918700b2d855 Mon Sep 17 00:00:00 2001 From: Thor Whalen Date: Wed, 3 Jan 2024 12:21:16 +0000 Subject: [PATCH] feat: Actual code --- .gitignore | 115 +++++++++++++++++++++++++++ LICENSE | 21 +++++ README.md | 96 +++++++++++++++++++++++ chromadol/__init__.py | 96 +++++++++++++++++++++++ chromadol/base.py | 146 +++++++++++++++++++++++++++++++++++ chromadol/tests/__init__.py | 1 + chromadol/tests/base_test.py | 66 ++++++++++++++++ docsrc/.gitignore | 1 + docsrc/Makefile | 33 ++++++++ docsrc/conf.py | 82 ++++++++++++++++++++ docsrc/index.rst | 18 +++++ docsrc/make.bat | 35 +++++++++ setup.cfg | 21 +++++ setup.py | 3 + 14 files changed, 734 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 chromadol/__init__.py create mode 100644 chromadol/base.py create mode 100644 chromadol/tests/__init__.py create mode 100644 chromadol/tests/base_test.py create mode 100644 docsrc/.gitignore create mode 100644 docsrc/Makefile create mode 100644 docsrc/conf.py create mode 100644 docsrc/index.rst create mode 100644 docsrc/make.bat create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..76eb2d8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,115 @@ +wads_configs.json +data/wads_configs.json +wads/data/wads_configs.json + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + + +.DS_Store +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +_build + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/* + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# PyCharm +.idea diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8aa2645 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..677fe1c --- /dev/null +++ b/README.md @@ -0,0 +1,96 @@ +# chromadol +Data Object Layer for ChromaDB + + +To install: ```pip install chromadol``` + + +# Example usage + +To make a `ChromaClient` DOL, you can specify a `chromadb` `Client`, `PersistentClient` (etc.) +instance, or specify a string (which will be interpreted as a path to a directory to +save the data to in a `PersistentClient` instance). + + >>> from chromadol import ChromaClient + >>> import tempfile, os + >>> with tempfile.TemporaryDirectory() as temp_dir: + ... tempdir = os.path.join(temp_dir, "chromadol_test") + ... os.makedirs(tempdir) + >>> client = ChromaClient(tempdir) + +Removing all contents of client to be able to run a test on a clean slate + + >>> for k in client: + ... del client[k] + + +There's nothing yet: + + >>> list(client) + [] + +Now let's "get" a collection. + + >>> collection = client['chromadol_test'] + +Note that just accessing the collection creates it (by default) + + + >>> list(client) + ['chromadol_test'] + +There's nothing in the collection yet: + + >>> list(collection) + [] + +So let's write something. +Note that `chromadb` is designed to operate on multiple documents at once, +so the "chromadb-natural" way of specifying its keys and contents (and any extras) +would be like this: + + >>> collection[['piece', 'of']] = { + ... 'documents': ['contents for piece', 'contents for of'], + ... 'metadatas': [{'author': 'me'}, {'author': 'you'}], + ... } + >>> list(collection) + ['piece', 'of'] + >>> + >>> assert collection[['piece', 'of']] == { + ... 'ids': ['piece', 'of'], + ... 'embeddings': None, + ... 
'metadatas': [{'author': 'me'}, {'author': 'you'}], + ... 'documents': ['contents for piece', 'contents for of'], + ... 'uris': None, + ... 'data': None, + ... } + + +But you can read or write one document at a time too. + + >>> collection['cake'] = { + ... "documents": "contents for cake", + ... } + >>> list(collection) + ['piece', 'of', 'cake'] + >>> assert collection['cake'] == { + ... 'ids': ['cake'], + ... 'embeddings': None, + ... 'metadatas': [None], + ... 'documents': ['contents for cake'], + ... 'uris': None, + ... 'data': None, + ... } + +In fact, see that if you only want to specify the "documents" part of the information, +you can just write a string instead of a dictionary: + + >>> collection['cake'] = 'a different cake' + >>> assert collection['cake'] == { + ... 'ids': ['cake'], + ... 'embeddings': None, + ... 'metadatas': [None], + ... 'documents': ['a different cake'], + ... 'uris': None, + ... 'data': None, + ... } diff --git a/chromadol/__init__.py b/chromadol/__init__.py new file mode 100644 index 0000000..e5d5074 --- /dev/null +++ b/chromadol/__init__.py @@ -0,0 +1,96 @@ +"""Data Object Layer (DOL) for ChromaDB + +Example usage: + +To make a `ChromaClient` DOL, you can specify a `chromadb` `Client`, `PersistentClient` (etc.) +instance, or specify a string (which will be interpreted as a path to a directory to +save the data to in a `PersistentClient` instance). + +>>> from chromadol import ChromaClient +>>> import tempfile, os +>>> with tempfile.TemporaryDirectory() as temp_dir: +... tempdir = os.path.join(temp_dir, "chromadol_test") +... os.makedirs(tempdir) +>>> client = ChromaClient(tempdir) + +Removing all contents of client to be able to run a test on a clean slate + +>>> for k in client: +... del client[k] +... + +There's nothing yet: + +>>> list(client) +[] + +Now let's "get" a collection. 
+ +>>> collection = client['chromadol_test'] + +Note that just accessing the collection creates it (by default) + + +>>> list(client) +['chromadol_test'] + +There's nothing in the collection yet: + +>>> list(collection) +[] + +So let's write something. +Note that `chromadb` is designed to operate on multiple documents at once, +so the "chromadb-natural" way of specifying its keys and contents (and any extras) +would be like this: + +>>> collection[['piece', 'of']] = { +... 'documents': ['contents for piece', 'contents for of'], +... 'metadatas': [{'author': 'me'}, {'author': 'you'}], +... } +>>> list(collection) +['piece', 'of'] +>>> +>>> assert collection[['piece', 'of']] == { +... 'ids': ['piece', 'of'], +... 'embeddings': None, +... 'metadatas': [{'author': 'me'}, {'author': 'you'}], +... 'documents': ['contents for piece', 'contents for of'], +... 'uris': None, +... 'data': None, +... } + + +But you can read or write one document at a time too. + +>>> collection['cake'] = { +... "documents": "contents for cake", +... } +>>> list(collection) +['piece', 'of', 'cake'] +>>> assert collection['cake'] == { +... 'ids': ['cake'], +... 'embeddings': None, +... 'metadatas': [None], +... 'documents': ['contents for cake'], +... 'uris': None, +... 'data': None, +... } + +In fact, see that if you only want to specify the "documents" part of the information, +you can just write a string instead of a dictionary: + +>>> collection['cake'] = 'a different cake' +>>> assert collection['cake'] == { +... 'ids': ['cake'], +... 'embeddings': None, +... 'metadatas': [None], +... 'documents': ['a different cake'], +... 'uris': None, +... 'data': None, +... 
} + + +""" + +from chromadol.base import ChromaCollection, ChromaClient diff --git a/chromadol/base.py b/chromadol/base.py new file mode 100644 index 0000000..7684689 --- /dev/null +++ b/chromadol/base.py @@ -0,0 +1,146 @@ +"""Base objects for chromadol.""" + +from typing import MutableMapping, Union +from functools import cached_property +from dol.appendable import appendable, mk_item2kv_for + +from chromadb import Client, PersistentClient, GetResult + +dflt_create_collection_kwargs = dict() + + +class ChromaClient(MutableMapping): + def __init__(self, client=None, *, get_or_create=True, **create_collection_kwargs): + """ + Initializes the reader with a chromadb Client instance. + + :param client: An instance of chromadb.Client. + """ + if client is None: + client = Client() + elif isinstance(client, str): + client = PersistentClient(client) + self.client = client + self._create_collection_kwargs_for_getitem = dict( + create_collection_kwargs, get_or_create=get_or_create + ) + self._create_collection_kwargs_for_setitem = create_collection_kwargs + + def __iter__(self): + """ + Iterates over the names of collections in the chromadb Client. + """ + return (obj.name for obj in self.client.list_collections()) + + def __getitem__(self, k: str): + """ + Retrieves a collection by its name. Creates the collection if it doesn't exist. + + :param k: The name of the collection to retrieve. + :return: The collection object. + """ + return ChromaCollection( + self.client.create_collection( + k, **self._create_collection_kwargs_for_getitem + ) + ) + + def __setitem__(self, k: str, v: dict): + """ + Creates or updates a collection. + + :param k: The name of the collection. 
+ :param v: a dict that will be used to populate the collection via .add_documents(**v) + """ + # Implementation depends on how collections are created or updated in chromadb + # Example: + collection = self.client.create_collection( + name=k, **self._create_collection_kwargs_for_setitem + ) + if v: + collection.add_documents(**v) + + def __delitem__(self, k: str): + """ + Deletes a collection. + + :param k: The name of the collection to delete. + """ + self.client.delete_collection(k) + + def __len__(self): + """ + Returns the number of collections in the client. + """ + return len(self.client.list_collections()) + + def __contains__(self, k): + """ + Returns True if the client has a collection with the given name. + """ + existing_names = set(self) + return k in existing_names + + def clear(self): + """ + This method is here, in fact, to disable the clear method, that would + otherwise be inherited from MutableMapping. + Its existence is too dangerous, as it would delete all collections in the + client. + If you want to actually delete all collections in the client, do so explicitly + by iterating over the client and deleting each collection, as such: + + >>> for k in chroma_client_instance: # doctest: +SKIP + ... del chroma_client_instance[k] + + """ + raise NotImplementedError("Disabled for safety reasons.") + + + + +def int_string(x): + return str(int(x)) + + +item2kv = mk_item2kv_for.utc_key(factor=1e9, time_postproc=int_string) + + +@appendable(item2kv=item2kv) +class ChromaCollection(MutableMapping): + def __init__(self, collection): + """ + Initializes the store with a chromadb Collection instance. + + :param collection: An instance of chromadb.Collection. 
+ """ + self.collection = collection + + @property + def _ids(self): + collection_elements = self.collection.get() + return collection_elements['ids'] + + def __iter__(self): + return iter(self._ids) + + def __getitem__(self, k: str) -> GetResult: + return self.collection.get(k) + + def __len__(self): + return self.collection.count() + + def __contains__(self, k): + try: + self.collection.get(k) + return True + except KeyError: + return False + + def __setitem__(self, k: str, v: Union[dict, str]): + if isinstance(v, str): + v = {'documents': [v]} + self.collection.upsert(k, **v) + + def __delitem__(self, k: str): + self.collection.delete(k) diff --git a/chromadol/tests/__init__.py b/chromadol/tests/__init__.py new file mode 100644 index 0000000..ad5e12d --- /dev/null +++ b/chromadol/tests/__init__.py @@ -0,0 +1 @@ +"""chromadol tests.""" \ No newline at end of file diff --git a/chromadol/tests/base_test.py b/chromadol/tests/base_test.py new file mode 100644 index 0000000..824acf1 --- /dev/null +++ b/chromadol/tests/base_test.py @@ -0,0 +1,66 @@ +"""Test base.py""" + + +from chromadol.base import ChromaClient + + +def test_simple(): + """A simple test of the ChromaClient and ChromaCollection classes.""" + + + import tempfile, os + with tempfile.TemporaryDirectory() as temp_dir: + tempdir = os.path.join(temp_dir, "chromadol_test") + os.makedirs(tempdir) + client = ChromaClient(tempdir) + + # removing all contents of client to be able to run a test on a clean slate + for k in client: + del client[k] + assert list(client) == [] + collection = client['chromadol_test'] + # note that just accessing the collection creates it (by default) + assert list(client) == ['chromadol_test'] + assert list(collection) == [] + + # chromadb is designed to operate on multiple documents at once, so + # specifying it's keys and contents (and any extras) list this: + collection[['piece', 'of']] = { + 'documents': ['contents for piece', 'contents for of'], + 'metadatas': [{'author': 'me'}, 
{'author': 'you'}], + } + assert list(collection) == ['piece', 'of'] + + assert collection[['piece', 'of']] == { + 'ids': ['piece', 'of'], + 'embeddings': None, + 'metadatas': [{'author': 'me'}, {'author': 'you'}], + 'documents': ['contents for piece', 'contents for of'], + 'uris': None, + 'data': None, + } + + # But you can read or write one document at a time too. + collection['cake'] = { + "documents": "contents for cake", + } + assert list(collection) == ['piece', 'of', 'cake'] + assert collection['cake'] == { + 'ids': ['cake'], + 'embeddings': None, + 'metadatas': [None], + 'documents': ['contents for cake'], + 'uris': None, + 'data': None, + } + + # In fact, see that if you only want to specify the "documents" part of the information, + collection['cake'] = 'a different cake' + assert collection['cake'] == { + 'ids': ['cake'], + 'embeddings': None, + 'metadatas': [None], + 'documents': ['a different cake'], + 'uris': None, + 'data': None, + } diff --git a/docsrc/.gitignore b/docsrc/.gitignore new file mode 100644 index 0000000..69fa449 --- /dev/null +++ b/docsrc/.gitignore @@ -0,0 +1 @@ +_build/ diff --git a/docsrc/Makefile b/docsrc/Makefile new file mode 100644 index 0000000..5f01b86 --- /dev/null +++ b/docsrc/Makefile @@ -0,0 +1,33 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build +GITHUBPAGESDIR = ../docs +GITLABPAGESDIR = ../public + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +github: + @make html + @cp -a "$(BUILDDIR)"/html/. $(GITHUBPAGESDIR) + +gitlab: + @make html + @cp -a "$(BUILDDIR)"/html/. 
$(GITLABPAGESDIR) + +clean: + rm -rfv $(BUILDDIR) $(GITHUBPAGESDIR) $(GITLABPAGESDIR) + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docsrc/conf.py b/docsrc/conf.py new file mode 100644 index 0000000..cc88c81 --- /dev/null +++ b/docsrc/conf.py @@ -0,0 +1,82 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. +# For a full list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys + +sys.path.insert(0, os.path.abspath('..')) + +# -- Project information ----------------------------------------------------- +from epythet.config_parser import parse_config +from pathlib import Path + +project, copyright, author, release, display_name = parse_config( + Path(__file__).absolute().parent.parent / 'setup.cfg' +) + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx_toggleprompt', + 'sphinx_copybutton', + 'sphinx.ext.autodoc', # Include documentation from docstrings + 'sphinx.ext.doctest', # Test snippets in the documentation + 'sphinx.ext.githubpages', # This extension creates .nojekyll file + 'sphinx.ext.graphviz', # Add Graphviz graphs + 'sphinx.ext.napoleon', # Support for NumPy and Google style docstrings + 'sphinx.ext.todo', # Support for todo items + 'sphinx.ext.viewcode', # Add links to highlighted source code + 'myst_parser', # Parse .md files +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + + +# -- Options for Markdown support ------------------------------------------- +# TODO: fix md support so that it doesn't interfere with rst docs +# import commonmark +# +# +# def docstring(app, what, name, obj, options, lines): +# md = '\n'.join(lines) +# ast = commonmark.Parser().parse(md) +# rst = commonmark.ReStructuredTextRenderer().render(ast) +# lines.clear() +# lines += rst.splitlines() +# +# +# def setup(app): +# app.connect('autodoc-process-docstring', docstring) + + +toggleprompt_offset_right = 30 diff --git a/docsrc/index.rst b/docsrc/index.rst new file mode 100644 index 0000000..3dcb7cc --- /dev/null +++ b/docsrc/index.rst @@ -0,0 +1,18 @@ +Welcome to chromadol's documentation! +===================================== + + +.. include:: ./table_of_contents.rst + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + + +Release: |release| + +Last change: |today| diff --git a/docsrc/make.bat b/docsrc/make.bat new file mode 100644 index 0000000..2119f51 --- /dev/null +++ b/docsrc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..4c1b284 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,21 @@ +[metadata] +root_url = https://github.com/i2mint/ +license = mit +author = OtoSense +version = 0.0.1 +description = Data Object Layer for ChromaDB +description_file = README.md +long_description = file:README.md +long_description_content_type = text/markdown +keywords = + +name = chromadol +display_name = chromadol + +[options] +packages = find: +include_package_data = True +zip_safe = False +install_requires = + + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..201cd4c --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup() # Note: Everything should be in the local setup.cfg