Merge branch 'main' of https://github.com/TileDB-Inc/TileDB-Vector-Se…

…arch into jparismorgan/ivf-pq-temp-dir-update
TileDB-Inc · Oct 17, 2024 · b6af0fa · b6af0fa
2 parents decbe16 + eaf1a5f
commit b6af0fa
Show file tree

Hide file tree

Showing 17 changed files with 972 additions and 327 deletions.
diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml
@@ -13,6 +13,7 @@ jobs:
         os: [ubuntu-latest]
         python-version: ["3.9"]
     runs-on: ${{ matrix.os }}
+    continue-on-error: true
     steps:
       - name: Install OpenBLAS
         run: sudo apt install libopenblas-dev
@@ -29,6 +30,62 @@ jobs:
       - name: Build and test python
         run: |
           pip install .[test]
+
+          pip list
+
+          cd apis/python
+          pytest -n logical --durations=0
+          # TODO: fix editable on linux
+          #pip uninstall -y tiledb.vector_search
+          #pip install -e .
+          #pytest
+          pip install -r test/ipynb/requirements.txt
+          export TILEDB_REST_TOKEN=$TILEDB_CLOUD_HELPER_VAR
+          pytest -n logical --durations=0 --nbmake test/ipynb
+        env:
+          TILEDB_CLOUD_HELPER_VAR: ${{ secrets.TILEDB_CLOUD_HELPER_VAR }}
+        shell: bash -el {0}
+        # TODO(paris): This step builds and runs with numpy 2, so it is allowed to fail for now.
+        # Remove `continue-on-error` once the UDFs have numpy 2 and do not fail.
+        continue-on-error: true
+      - name: Check tiledb-vector-search version
+        run: |
+          python -c "from tiledb.vector_search.version import version; print(version)"
+
+  # TODO(paris): This is a temporary job where we will build with numpy2, but run with numpy1.
+  # Remove once the UDFs have numpy2 and do not fail.
+  run-tests-numpy-1:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.9"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Install OpenBLAS
+        run: sudo apt install libopenblas-dev
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Print Python version
+        run: |
+          which python
+          which pip
+          python --version
+      - name: Build and test python
+        run: |
+          # This will build with numpy 2.
+          pip install .[test]
+
+          pip list
+
+          # Then we will uninstall numpy 2 and install numpy 1.
+          pip uninstall -y numpy
+          pip install numpy==1.25.0
+
+          pip list
+
           cd apis/python
           pytest -n logical --durations=0
           # TODO: fix editable on linux

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
       - id: prettier
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.0.265"
+    rev: "v0.4.4"
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

diff --git a/apis/python/examples/object_api/bioimg_similarity_search.ipynb b/apis/python/examples/object_api/bioimg_similarity_search.ipynb
diff --git a/apis/python/src/tiledb/vector_search/embeddings/__init__.py b/apis/python/src/tiledb/vector_search/embeddings/__init__.py
@@ -1,3 +1,4 @@
+from .huggingface_auto_image_embedding import HuggingfaceAutoImageEmbedding
 from .image_resnetv2_embedding import ImageResNetV2Embedding
 from .langchain_embedding import LangChainEmbedding
 from .object_embedding import ObjectEmbedding
@@ -11,6 +12,7 @@
     "ObjectEmbedding",
     "SomaGenePTwEmbedding",
     "ImageResNetV2Embedding",
+    "HuggingfaceAutoImageEmbedding",
     "RandomEmbedding",
     "SentenceTransformersEmbedding",
     "LangChainEmbedding",

diff --git a/apis/python/src/tiledb/vector_search/embeddings/huggingface_auto_image_embedding.py b/apis/python/src/tiledb/vector_search/embeddings/huggingface_auto_image_embedding.py
@@ -0,0 +1,76 @@
+from typing import Dict, Optional, OrderedDict
+
+import numpy as np
+
+
+class HuggingfaceAutoImageEmbedding:
+    """
+    Image embedding backed by a Hugging Face ``AutoImageProcessor`` / ``AutoModel`` pair.
+
+    The embedding of an image is the entry at index 0 along the second axis of the
+    model's ``last_hidden_state``. Call ``load()`` before ``embed()``: the processor
+    and the model are ``None`` until then.
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        dimensions: int,
+        device: Optional[str] = None,
+        cache_folder: Optional[str] = None,
+        batch_size: int = 64,
+    ):
+        """
+        Store the configuration; nothing is downloaded or loaded here.
+
+        Parameters
+        ----------
+        model_name_or_path: str
+            Identifier passed to ``from_pretrained`` for both the processor and the model.
+        dimensions: int
+            Number of columns of the array returned by ``embed()``.
+        device: Optional[str]
+            Stored and returned by ``init_kwargs()``.
+            NOTE(review): not applied to the model or the inputs anywhere in this class.
+        cache_folder: Optional[str]
+            Passed as ``cache_dir`` to ``from_pretrained`` in ``load()``.
+        batch_size: int
+            Maximum number of images sent through the model in one call.
+        """
+        self.model_name_or_path = model_name_or_path
+        self.dim_num = dimensions
+        self.device = device
+        self.cache_folder = cache_folder
+        self.batch_size = batch_size
+        self.processor = None
+        self.model = None
+
+    def init_kwargs(self) -> Dict:
+        """Return the keyword arguments that recreate this object through ``__init__``."""
+        return {
+            "model_name_or_path": self.model_name_or_path,
+            "dimensions": self.dim_num,
+            "device": self.device,
+            "cache_folder": self.cache_folder,
+            "batch_size": self.batch_size,
+        }
+
+    def dimensions(self) -> int:
+        """Return the number of dimensions of each embedding vector."""
+        return self.dim_num
+
+    def vector_type(self) -> np.dtype:
+        """Return the numpy type of the embeddings (``np.float32``)."""
+        return np.float32
+
+    def load(self) -> None:
+        """Instantiate the image processor and the model from ``model_name_or_path``."""
+        # Imported lazily so that importing this module does not require transformers.
+        from transformers import AutoImageProcessor
+        from transformers import AutoModel
+
+        # cache_dir=None keeps the library default, so this is a no-op unless
+        # the caller supplied a cache_folder.
+        self.processor = AutoImageProcessor.from_pretrained(
+            self.model_name_or_path, cache_dir=self.cache_folder
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_name_or_path, cache_dir=self.cache_folder
+        )
+
+    def embed(self, objects: OrderedDict, metadata: OrderedDict) -> np.ndarray:
+        """
+        Embed every image in ``objects`` and return one row per image, in input order.
+
+        Parameters
+        ----------
+        objects: OrderedDict
+            ``objects["image"][i]`` is the pixel data of image ``i`` and
+            ``objects["shape"][i]`` is the shape it is reshaped to before being
+            handed to ``PIL.Image.fromarray``.
+        metadata: OrderedDict
+            Not used by this embedding.
+
+        Returns
+        -------
+        np.ndarray
+            ``float32`` array of shape ``(len(objects["image"]), dimensions)``.
+        """
+        from PIL import Image
+
+        size = len(objects["image"])
+        embeddings = np.zeros((size, self.dim_num), dtype=np.float32)
+        # A non-positive batch_size degrades to one image per model call.
+        step = max(1, self.batch_size)
+        for start in range(0, size, step):
+            stop = min(start + step, size)
+            image_batch = [
+                Image.fromarray(
+                    np.reshape(objects["image"][image_id], objects["shape"][image_id])
+                )
+                for image_id in range(start, stop)
+            ]
+            inputs = self.processor(images=image_batch, return_tensors="pt")
+            batch_embeddings = (
+                self.model(**inputs).last_hidden_state[:, 0].cpu().detach().numpy()
+            )
+            # Each batch is written at its own offset; a fixed offset of 0 would make
+            # every batch overwrite the previous one and leave later rows zeroed.
+            embeddings[start:stop] = batch_embeddings
+        return embeddings
diff --git a/apis/python/src/tiledb/vector_search/object_readers/__init__.py b/apis/python/src/tiledb/vector_search/object_readers/__init__.py
@@ -1,5 +1,4 @@
-from .bioimage_reader import BioImagePartition
-from .bioimage_reader import BioImageReader
+from .bioimage_reader import BioImageDirectoryReader
 from .directory_reader import DirectoryImageReader
 from .directory_reader import DirectoryPartition
 from .directory_reader import DirectoryReader
@@ -18,8 +17,7 @@
     "SomaAnnDataReader",
     "TileDB1DArrayPartition",
     "TileDB1DArrayReader",
-    "BioImagePartition",
-    "BioImageReader",
+    "BioImageDirectoryReader",
     "DirectoryReader",
     "DirectoryTextReader",
     "DirectoryImageReader",