Add a CI job to generate backwards compatibility test indexes and a python unit test to query them #206

Merged · 36 commits · Jan 23, 2024

Commits (36)
87e9101
script to generate and test CI job
jparismorgan Jan 10, 2024
b3ca32f
disable other CI jobs
jparismorgan Jan 10, 2024
6e02d48
ci testing
jparismorgan Jan 10, 2024
5cba6d6
ci testing
jparismorgan Jan 10, 2024
aaaab7c
ci testing
jparismorgan Jan 10, 2024
223c0f0
ci testing
jparismorgan Jan 10, 2024
f723dbd
ci testing
jparismorgan Jan 10, 2024
41178d6
ci testing
jparismorgan Jan 10, 2024
0f4ac1b
ci testing
jparismorgan Jan 10, 2024
e624de4
ci testing
jparismorgan Jan 10, 2024
6940ea7
ci testing
jparismorgan Jan 10, 2024
ff3bde7
ci testing
jparismorgan Jan 10, 2024
191021a
ci testing
jparismorgan Jan 10, 2024
c2a1b32
ci testing
jparismorgan Jan 10, 2024
a3eed69
Merge branch 'main' of https://github.com/TileDB-Inc/TileDB-Vector-Se…
jparismorgan Jan 11, 2024
525cb9e
ci testing
jparismorgan Jan 11, 2024
b03d525
ci testing
jparismorgan Jan 11, 2024
48b48b8
ci testing
jparismorgan Jan 11, 2024
a615991
ci testing
jparismorgan Jan 11, 2024
0760ae4
ci testing
jparismorgan Jan 11, 2024
ac630ee
fix create_sift_micro and try on new branch
jparismorgan Jan 11, 2024
1ae4bda
Merge branch 'main' of https://github.com/TileDB-Inc/TileDB-Vector-Se…
jparismorgan Jan 17, 2024
204c410
disable two other CI jobs during testing
jparismorgan Jan 17, 2024
898e87e
test with updated release tag
jparismorgan Jan 17, 2024
d1f6bbe
fix broken log line
jparismorgan Jan 17, 2024
79b4b59
add back commented out ci files, fix typo
jparismorgan Jan 19, 2024
1e83943
update build-wheels.yml
jparismorgan Jan 19, 2024
25ef67d
cleanup code
jparismorgan Jan 19, 2024
e3fcc5a
cleanup code
jparismorgan Jan 19, 2024
ec1a1bc
cleanup generate_data.py and other files
jparismorgan Jan 19, 2024
876763c
cleanup code
jparismorgan Jan 19, 2024
fd7c7a3
add file to data/ folder so it gets created
jparismorgan Jan 19, 2024
673ef90
test creating PR instead of pushing
jparismorgan Jan 22, 2024
1c9f07d
use a dummy release tag
jparismorgan Jan 22, 2024
e23bd5d
add GH_TOKEN
jparismorgan Jan 22, 2024
247d9fb
cleanup code
jparismorgan Jan 22, 2024
50 changes: 50 additions & 0 deletions .github/workflows/build-wheels.yml
@@ -13,8 +13,58 @@ on:
- '*wheel*' # must quote since "*" is a YAML reserved character; we want a string

jobs:
  generate_backwards_compatibility_data:
    name: Generate Backwards Compatibility Data
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      # Based on https://github.com/TileDB-Inc/conda-forge-nightly-controller/blob/51519a0f8340b32cf737fcb59b76c6a91c42dc47/.github/workflows/activity.yml#L19C10-L19C10
      - name: Setup git
        run: |
          git config user.name "GitHub Actions"
          git config user.email "[email protected]"

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.9"

      - name: Print Python version
        run: |
          which python
          which pip
          python --version

      - name: Build Indexes
        run: |
          # Get the release tag.
          release_tag=$(git describe --tags --abbrev=0)
          echo $release_tag

          # Install dependencies.
          cd apis/python && pip install . && cd ../..

          # Generate data.
          python backwards-compatibility-data/generate_data.py $release_tag

          # Push this data to a new branch and create a PR from it.
          git fetch
          branch_name="update-backwards-compatibility-data-${release_tag}"
          echo $branch_name
          git checkout -b "$branch_name"
          git add backwards-compatibility-data/data/
          git commit -m "[automated] Update backwards-compatibility-data for release $release_tag"
          git push origin "$branch_name"
          gh pr create --base main --head "$branch_name" --title "[automated] Update backwards-compatibility-data for release $release_tag"
        env:
          GH_TOKEN: ${{ github.token }}

  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    # TODO(paris): Add this back once generate_backwards_compatibility_data is confirmed to work.
    # needs: generate_backwards_compatibility_data
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
38 changes: 38 additions & 0 deletions apis/python/test/test_backwards_compatibility.py
@@ -0,0 +1,38 @@
from common import *

from tiledb.vector_search.flat_index import FlatIndex
from tiledb.vector_search.ivf_flat_index import IVFFlatIndex
from tiledb.vector_search.utils import load_fvecs

MINIMUM_ACCURACY = 0.85

def test_query_old_indices():
    '''
    Tests that current code can query indices which were written to disk by old code.
    '''
    backwards_compatibility_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'backwards-compatibility-data')
    datasets_path = os.path.join(backwards_compatibility_path, 'data')
    base = load_fvecs(os.path.join(backwards_compatibility_path, 'siftmicro_base.fvecs'))
    query_indices = [0, 3, 4, 8, 10, 19, 28, 31, 39, 40, 41, 47, 49, 50, 56, 64, 68, 70, 71, 79, 82, 89, 90, 94]
    queries = base[query_indices]

    for directory_name in os.listdir(datasets_path):
        version_path = os.path.join(datasets_path, directory_name)
        if not os.path.isdir(version_path):
            continue

        for index_name in os.listdir(version_path):
            index_uri = os.path.join(version_path, index_name)
            if not os.path.isdir(index_uri):
                continue

            if "ivf_flat" in index_name:
                index = IVFFlatIndex(uri=index_uri)
            elif "flat" in index_name:
                index = FlatIndex(uri=index_uri)
            else:
                assert False, f"Unknown index name: {index_name}"

            result_d, result_i = index.query(queries, k=1)
            assert query_indices == result_i.flatten().tolist()
            assert result_d.flatten().tolist() == [0 for _ in range(len(query_indices))]
19 changes: 19 additions & 0 deletions backwards-compatibility-data/README.md
@@ -0,0 +1,19 @@
### What
This folder contains test indices built using different versions of TileDB-Vector-Search. It is used to test the ability of the latest version of TileDB-Vector-Search to load and query arrays built by previous versions.

### Usage
To generate new data, run:
```bash
cd apis/python
pip install .
cd ../..
python backwards-compatibility-data/generate_data.py my_version
```
This will build new indexes and save them to `backwards-compatibility-data/data/my_version`.
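
With the current defaults in `generate_data.py` (FLAT and IVF_FLAT indexes over float32 and uint8 vectors), the version directory should end up looking roughly like this (names shown are illustrative):
```
backwards-compatibility-data/
├── siftmicro_base.fvecs
└── data/
    └── my_version/
        ├── flat_float32/
        ├── flat_uint8/
        ├── ivf_flat_float32/
        └── ivf_flat_uint8/
```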

To run the backwards compatibility test:
```bash
cd apis/python
pip install ".[test]"
pytest test/test_backwards_compatibility.py -s
```
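
To poke at a single generated index outside of pytest, here is a minimal sketch using the same classes and helpers as `test_backwards_compatibility.py` (paths are illustrative and assume a `my_version` directory generated as above):
```python
import os

from tiledb.vector_search.flat_index import FlatIndex
from tiledb.vector_search.utils import load_fvecs

# Illustrative paths; point index_uri at any version/index directory under data/.
root = "backwards-compatibility-data"
index_uri = os.path.join(root, "data", "my_version", "flat_float32")

# Queries are vectors taken from the base data itself, so each query's
# nearest neighbor should be itself at distance 0.
base = load_fvecs(os.path.join(root, "siftmicro_base.fvecs"))
query_indices = [0, 3, 4]
queries = base[query_indices]

index = FlatIndex(uri=index_uri)
distances, ids = index.query(queries, k=1)
assert ids.flatten().tolist() == query_indices
assert distances.flatten().tolist() == [0, 0, 0]
```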
2 changes: 2 additions & 0 deletions backwards-compatibility-data/data/README.md
@@ -0,0 +1,2 @@
### What
Holds test indices built using different versions of TileDB-Vector-Search.
54 changes: 54 additions & 0 deletions backwards-compatibility-data/generate_data.py
@@ -0,0 +1,54 @@
import os
import shutil

from tiledb.vector_search.ingestion import ingest
from tiledb.vector_search.utils import load_fvecs, write_fvecs

def create_sift_micro():
    '''
    Create a smaller version of the base SIFT 10K dataset (http://corpus-texmex.irisa.fr). You
    don't need to run this again, but it's saved here just in case. To query an index built with
    this data, just select vectors from this file as the query vectors.
    '''
    script_dir = os.path.dirname(os.path.abspath(__file__))
    base_uri = os.path.join(script_dir, "..", "apis", "python", "test", "data", "siftsmall", "siftsmall_base.fvecs")
    write_fvecs(os.path.join(script_dir, "siftmicro_base.fvecs"), load_fvecs(base_uri)[:100])


def generate_release_data(version):
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Create the new release directory.
    release_dir = os.path.join(script_dir, "data", version)
    shutil.rmtree(release_dir, ignore_errors=True)
    os.makedirs(release_dir, exist_ok=True)

    # Get the data we'll use to generate the index.
    base_uri = os.path.join(script_dir, "siftmicro_base.fvecs")
    base = load_fvecs(base_uri)
    indices = [0, 3, 4, 8, 10, 19, 28, 31, 39, 40, 41, 47, 49, 50, 56, 64, 68, 70, 71, 79, 82, 89, 90, 94]
    queries = base[indices]

    # Generate each index and query it to make sure it works before we write it.
    index_types = ["FLAT", "IVF_FLAT"]
    data_types = ["float32", "uint8"]
    for index_type in index_types:
        for data_type in data_types:
            index_uri = f"{release_dir}/{index_type.lower()}_{data_type}"
            print(f"Creating index at {index_uri}")
            index = ingest(
                index_type=index_type,
                index_uri=index_uri,
                input_vectors=base.astype(data_type),
            )

            result_d, result_i = index.query(queries, k=1)
            assert indices == result_i.flatten().tolist()
            assert result_d.flatten().tolist() == [0 for _ in range(len(indices))]


if __name__ == "__main__":
    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("version", help="The name of the TileDB-Vector-Search version for which we are creating indices.")
    args = p.parse_args()
    print(f"Building indexes for version {args.version}")
    generate_release_data(args.version)
Binary file added backwards-compatibility-data/siftmicro_base.fvecs
Binary file not shown.