Skip to content

Commit

Permalink
Vector DB Matrix: Vendor Data (#44)
Browse files Browse the repository at this point in the history
* VDB table data - json schema, import utility from the original sheet & the vendor json files.

* Automatic json schema validation for all vendor json files in PRs.

* Automatic bundling of vendor json files and push to GCS when PRs merge to main.
  • Loading branch information
svonava authored Jan 10, 2024
1 parent ee2c4f3 commit 693b2a3
Show file tree
Hide file tree
Showing 51 changed files with 6,080 additions and 277 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/json-bundle.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Builds a bundle from the vendor JSON files and uploads the bundle and the
# vendor JSON schema to Google Cloud Storage on every push to main.
name: Bundle JSON files
on:
  push:
    branches:
      - main

jobs:
  bundle:
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
      # Needed so the auth step can request an OIDC token for
      # workload identity federation.
      id-token: 'write'

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # v5 replaces v3, which runs on a deprecated Node runtime; the
      # python-version input is unchanged.
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      # NOTE(review): the action reference below was mangled by e-mail
      # obfuscation when this file was extracted. It is presumably
      # getsentry/action-setup-venv pinned to a version tag - restore the
      # real `owner/repo@version` before using this workflow.
      - uses: getsentry/[email protected]
        id: venv
        with:
          python-version: '3.12'
          # Must be the same requirements file that install-cmd installs,
          # otherwise the cache key ignores dependency changes.
          cache-dependency-path: |
            docs/tools/vdb_table/requirements.txt
          install-cmd: pip install -r docs/tools/vdb_table/requirements.txt

      # The glob is quoted so the shell passes it through literally and
      # data_utils.py receives the pattern itself.
      # NOTE(review): presumably writes ./bundle.json, which the
      # upload-bundle step expects - confirm against data_utils.py.
      - name: Generate bundle
        id: bundle
        run: |
          python docs/tools/vdb_table/data_utils.py json_to_bundle -dd "docs/tools/vdb_table/data/*"

      # NOTE(review): the service_account value below was mangled by e-mail
      # obfuscation during extraction - restore the real service account
      # address before using this workflow.
      - id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/903342166386/locations/global/workloadIdentityPools/github-pool/providers/github-provider'
          service_account: '[email protected]'

      # NOTE(review): upload-cloud-storage documents `destination` as
      # `bucket` or `bucket/prefix`, so the file name is appended to it.
      # Confirm the resulting object paths are the ones consumers read.
      - id: 'upload-bundle'
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: './bundle.json'
          destination: 'vectorhub-bundle/vectorhub-bundle'

      - id: 'upload-schema'
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'docs/tools/vdb_table/vendor.schema.json'
          destination: 'vectorhub-bundle/vendor.schema.json'
2 changes: 1 addition & 1 deletion .github/workflows/json-validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ jobs:
uses: GrantBirki/[email protected]
with:
base_dir: docs/tools/vdb_table/data
json_schema: docs/tools/vdb_table/data/vendor.schema.json
json_schema: docs/tools/vdb_table/vendor.schema.json
json_schema_version: "draft-2020-12"
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
venv/
.env
153 changes: 153 additions & 0 deletions docs/tools/vdb_table/data/activeloop.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
{
"name": "Activeloop Deep Lake",
"links": {
"docs": "https://docs.activeloop.ai/",
"github": "https://github.com/activeloopai/deeplake",
"website": "https://www.activeloop.ai/",
"vendor_discussion": "https://github.com/superlinked/VectorHub/discussions/97",
"poc_github": "https://github.com/davidbuniat",
"slug": "activeloop"
},
"oss": {
"support": "full",
"source_url": "https://github.com/activeloopai/deeplake",
"comment": "https://github.com/activeloopai/deeplake"
},
"license": {
"value": "MPL 2.0",
"source_url": "",
"comment": ""
},
"dev_languages": {
"value": [
"python",
"c++"
],
"source_url": "",
"comment": ""
},
"github_stars": 7200,
"vector_launch_year": 2023,
"metadata_filter": {
"support": "",
"source_url": "",
"comment": ""
},
"hybrid_search": {
"support": "",
"source_url": "https://docs.activeloop.ai/performance-features/querying-datasets/query-syntax",
"comment": "While you can run embedding search + contains(text, 'keyword') or multiple of those (keyword search inside text tensor), since BM25 is not available I wouldn't call it a full hybrid search. https://docs.activeloop.ai/performance-features/querying-datasets/query-syntax"
},
"facets": {
"support": "",
"source_url": "",
"comment": ""
},
"geo_search": {
"support": "none",
"source_url": "",
"comment": "although doable with UDFs and location storage"
},
"multi_vec": {
"support": "full",
"source_url": "https://docs.activeloop.ai/technical-details/data-layout",
"comment": "https://docs.activeloop.ai/technical-details/data-layout"
},
"sparse_vectors": {
"support": "partial",
"source_url": "https://docs.deeplake.ai/en/latest/Htypes.html",
"comment": "no native sparse vector support, although it supports all numpy arrays hence can also store sparse numpy arrays"
},
"bm25": {
"support": "none",
"source_url": "",
"comment": ""
},
"full_text": {
"support": "partial",
"source_url": "",
"comment": "you can search, e.g. for keywords, with the TQL/SQL contains(...) function, but I assume that by full-text search engine you mean more than just text search"
},
"embeddings_text": {
"support": "",
"source_url": "https://docs.activeloop.ai/quickstart",
"comment": "https://docs.activeloop.ai/quickstart#creating-your-first-vector-store"
},
"embeddings_image": {
"support": "",
"source_url": "",
"comment": ""
},
"embeddings_structured": {
"support": "",
"source_url": "",
"comment": ""
},
"rag": {
"support": "",
"source_url": "",
"comment": ""
},
"recsys": {
"support": "",
"source_url": "",
"comment": ""
},
"langchain": {
"support": "full",
"source_url": "https://python.langchain.com/docs/integrations/vectorstores/activeloop_deeplake",
"comment": "https://python.langchain.com/docs/integrations/vectorstores/activeloop_deeplake"
},
"llamaindex": {
"support": "full",
"source_url": "https://docs.llamaindex.ai/en/stable/examples/vector_stores/DeepLakeIndexDemo.html",
"comment": "https://docs.llamaindex.ai/en/stable/examples/vector_stores/DeepLakeIndexDemo.html"
},
"managed_cloud": {
"support": "full",
"source_url": "",
"comment": ""
},
"pricing": {
"value": "https://www.activeloop.ai/pricing/ Free up to 200GB, then $100 per 30M embeddings (200GB) https://www.activeloop.ai/resources/deep-lake-hnsw-index-rapidly-query-35-m-vectors-save-80/",
"source_url": "https://www.activeloop.ai/pricing/",
"comment": ""
},
"in_process": {
"support": "full",
"source_url": "",
"comment": ""
},
"multi_tenancy": {
"support": "full",
"source_url": "https://docs.deeplake.ai/en/latest/deeplake.html",
"comment": "create a dataset per tenant, similar to collections/namespaces"
},
"disk_index": {
"support": "full",
"source_url": "https://docs.activeloop.ai/performance-features/index-for-ann-search",
"comment": "Custom Memory optimized HNSW that sits on top of an object storage (including FS)"
},
"ephemeral": {
"support": "",
"source_url": "",
"comment": ""
},
"sharding": {
"support": "partial",
"source_url": "https://docs.activeloop.ai/technical-details/data-layout",
"comment": ""
},
"doc_size": {
"bytes": 0,
"unlimited": true,
"source_url": "",
"comment": ""
},
"vector_dims": {
"value": 0,
"unlimited": true,
"source_url": "",
"comment": ""
}
}
152 changes: 152 additions & 0 deletions docs/tools/vdb_table/data/anariai.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
{
"name": "Anari AI",
"links": {
"docs": "",
"github": "https://github.com/Anari-AI",
"website": "https://anari.ai/vector-acceleration-engine/",
"vendor_discussion": "https://github.com/superlinked/VectorHub/discussions/99",
"poc_github": "https://github.com/jovan-stojanovic",
"slug": "anariai"
},
"oss": {
"support": "none",
"source_url": "",
"comment": ""
},
"license": {
"value": "Proprietary",
"source_url": "",
"comment": ""
},
"dev_languages": {
"value": [
""
],
"source_url": "",
"comment": ""
},
"github_stars": 0,
"vector_launch_year": 2023,
"metadata_filter": {
"support": "",
"source_url": "",
"comment": ""
},
"hybrid_search": {
"support": "none",
"source_url": "",
"comment": ""
},
"facets": {
"support": "",
"source_url": "",
"comment": ""
},
"geo_search": {
"support": "none",
"source_url": "",
"comment": ""
},
"multi_vec": {
"support": "",
"source_url": "",
"comment": ""
},
"sparse_vectors": {
"support": "",
"source_url": "",
"comment": ""
},
"bm25": {
"support": "none",
"source_url": "",
"comment": ""
},
"full_text": {
"support": "",
"source_url": "",
"comment": ""
},
"embeddings_text": {
"support": "",
"source_url": "",
"comment": ""
},
"embeddings_image": {
"support": "",
"source_url": "",
"comment": ""
},
"embeddings_structured": {
"support": "",
"source_url": "",
"comment": ""
},
"rag": {
"support": "",
"source_url": "",
"comment": ""
},
"recsys": {
"support": "",
"source_url": "",
"comment": ""
},
"langchain": {
"support": "none",
"source_url": "",
"comment": ""
},
"llamaindex": {
"support": "none",
"source_url": "",
"comment": ""
},
"managed_cloud": {
"support": "full",
"source_url": "",
"comment": ""
},
"pricing": {
"value": "",
"source_url": "",
"comment": ""
},
"in_process": {
"support": "",
"source_url": "",
"comment": ""
},
"multi_tenancy": {
"support": "",
"source_url": "",
"comment": ""
},
"disk_index": {
"support": "",
"source_url": "",
"comment": ""
},
"ephemeral": {
"support": "none",
"source_url": "",
"comment": ""
},
"sharding": {
"support": "",
"source_url": "",
"comment": ""
},
"doc_size": {
"bytes": 0,
"unlimited": true,
"source_url": "",
"comment": ""
},
"vector_dims": {
"value": 0,
"unlimited": false,
"source_url": "",
"comment": ""
}
}
Loading

0 comments on commit 693b2a3

Please sign in to comment.