Skip to content

Commit

Permalink
Add a CI job to generate backwards compatibility test indexes and a p…
Browse files Browse the repository at this point in the history
…ython unit test to query them (#206)
  • Loading branch information
jparismorgan committed Feb 12, 2024
1 parent 355da9e commit 0444c49
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 0 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/build_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,58 @@ on:
- '*wheel*' # must quote since "*" is a YAML reserved character; we want a string

jobs:
generate_backwards_compatibility_data:
name: Generate Backwards Compatibility Data
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3

# Based on https://github.com/TileDB-Inc/conda-forge-nightly-controller/blob/51519a0f8340b32cf737fcb59b76c6a91c42dc47/.github/workflows/activity.yml#L19C10-L19C10
- name: Setup git
run: |
git config user.name "GitHub Actions"
git config user.email "[email protected]"
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: "3.9"

- name: Print Python version
run: |
which python
which pip
python --version
- name: Build Indexes
run: |
# Get the release tag.
release_tag=$(git describe --tags --abbrev=0)
echo $release_tag
# Install dependencies.
cd apis/python && pip install . && cd ../..
# Generate data.
python backwards-compatibility-data/generate_data.py $release_tag
# Push this data to a new branch and create a PR from it.
git fetch
branch_name="update-backwards-compatibility-data-${release_tag}"
echo $branch_name
git checkout -b "$branch_name"
git add backwards-compatibility-data/data/
git commit -m "[automated] Update backwards-compatibility-data for release $release_tag"
git push origin "$branch_name"
gh pr create --base main --head "$branch_name" --title "[automated] Update backwards-compatibility-data for release $release_tag"
env:
GH_TOKEN: ${{ github.token }}

build_wheels:
name: Build wheels on ${{ matrix.os }}
# TODO(paris): Add this back once generate_backwards_compatibility_data is confirmed to work.
# needs: generate_backwards_compatibility_data
runs-on: ${{ matrix.os }}
strategy:
matrix:
Expand Down
38 changes: 38 additions & 0 deletions apis/python/test/test_backwards_compatibility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from common import *

from tiledb.vector_search.flat_index import FlatIndex
from tiledb.vector_search.ivf_flat_index import IVFFlatIndex
from tiledb.vector_search.utils import load_fvecs

MINIMUM_ACCURACY = 0.85

def test_query_old_indices():
    '''
    Tests that current code can query indices which were written to disk by old code.
    '''
    # The checked-in compatibility data lives three directories above this test file.
    compat_root = os.path.join(
        os.path.dirname(__file__), '..', '..', '..', 'backwards-compatibility-data'
    )
    data_root = os.path.join(compat_root, 'data')
    vectors = load_fvecs(os.path.join(compat_root, 'siftmicro_base.fvecs'))
    expected_ids = [0, 3, 4, 8, 10, 19, 28, 31, 39, 40, 41, 47, 49, 50, 56, 64, 68, 70, 71, 79, 82, 89, 90, 94]
    query_vectors = vectors[expected_ids]

    for version_name in os.listdir(data_root):
        version_path = os.path.join(data_root, version_name)
        if not os.path.isdir(version_path):
            continue

        for index_name in os.listdir(version_path):
            index_uri = os.path.join(version_path, index_name)
            if not os.path.isdir(index_uri):
                continue

            # Dispatch on the directory name; check "ivf_flat" before "flat"
            # because the latter is a substring of the former.
            if "ivf_flat" in index_name:
                index = IVFFlatIndex(uri=index_uri)
            elif "flat" in index_name:
                index = FlatIndex(uri=index_uri)
            else:
                assert False, f"Unknown index name: {index_name}"

            # Each query vector is taken from the base data, so its own id must
            # be the single nearest neighbour, at distance zero.
            distances, ids = index.query(query_vectors, k=1)
            assert ids.flatten().tolist() == expected_ids
            assert distances.flatten().tolist() == [0] * len(expected_ids)
19 changes: 19 additions & 0 deletions backwards-compatibility-data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
### What
This folder contains test indices built using different versions of TileDB-Vector-Search. It is used to test the ability of the latest version of TileDB-Vector-Search to load and query arrays built by previous versions.

### Usage
To generate new data, run:
```bash
cd apis/python
pip install .
cd ../..
python backwards-compatibility-data/generate_data.py my_version
```
This will build new indexes and save them to `backwards-compatibility-data/data/my_version`.

To run the backwards compatibility test:
```bash
cd apis/python
pip install ".[test]"
pytest test/test_backwards_compatibility.py -s
```
2 changes: 2 additions & 0 deletions backwards-compatibility-data/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
### What
Holds test indices built using different versions of TileDB-Vector-Search.
54 changes: 54 additions & 0 deletions backwards-compatibility-data/generate_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import shutil

from tiledb.vector_search.ingestion import ingest
from tiledb.vector_search.utils import load_fvecs, write_fvecs

def create_sift_micro():
    '''
    Create a smaller version of the base SIFT 10K dataset (http://corpus-texmex.irisa.fr). You
    don't need to run this again, but it's saved here just in case. To query an index built with
    this data just select vectors from this file as the query vectors.
    '''
    here = os.path.dirname(os.path.abspath(__file__))
    source_uri = os.path.join(
        here, "..", "apis", "python", "test", "data", "siftsmall", "siftsmall_base.fvecs"
    )
    destination_uri = os.path.join(here, "siftmicro_base.fvecs")
    # Keep only the first 100 base vectors.
    write_fvecs(destination_uri, load_fvecs(source_uri)[:100])

def generate_release_data(version):
    '''
    Build the backwards-compatibility test indexes for `version`.

    Creates a fresh `data/<version>` directory next to this script, ingests the
    micro SIFT base vectors into every supported (index type, data type)
    combination, and sanity-checks each index with an exact-match query before
    leaving it on disk.
    '''
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Start from a clean directory for this version.
    release_dir = os.path.join(script_dir, "data", version)
    shutil.rmtree(release_dir, ignore_errors=True)
    os.makedirs(release_dir, exist_ok=True)

    # Load the base vectors and pick a fixed subset to use as queries.
    base = load_fvecs(os.path.join(script_dir, "siftmicro_base.fvecs"))
    expected_ids = [0, 3, 4, 8, 10, 19, 28, 31, 39, 40, 41, 47, 49, 50, 56, 64, 68, 70, 71, 79, 82, 89, 90, 94]
    queries = base[expected_ids]

    # Generate each index and query to make sure it works before we write it.
    combos = [
        (index_type, data_type)
        for index_type in ("FLAT", "IVF_FLAT")
        for data_type in ("float32", "uint8")
    ]
    for index_type, data_type in combos:
        index_uri = f"{release_dir}/{index_type.lower()}_{data_type}"
        print(f"Creating index at {index_uri}")
        index = ingest(
            index_type=index_type,
            index_uri=index_uri,
            input_vectors=base.astype(data_type),
        )

        # Each query vector is in the base data, so its own id must be the
        # nearest neighbour at distance zero.
        result_d, result_i = index.query(queries, k=1)
        assert result_i.flatten().tolist() == expected_ids
        assert result_d.flatten().tolist() == [0] * len(expected_ids)

if __name__ == "__main__":
    import argparse

    # CLI entry point: build backwards-compatibility indexes for one release.
    p = argparse.ArgumentParser()
    p.add_argument(
        "version",
        # Fixed duplicated words in the original help text ("the of the").
        help="The name of the TileDB-Vector-Search version which we are creating indices for.",
    )
    args = p.parse_args()
    print(f"Building indexes for version {args.version}")
    generate_release_data(args.version)
Binary file added backwards-compatibility-data/siftmicro_base.fvecs
Binary file not shown.

0 comments on commit 0444c49

Please sign in to comment.