Skip to content

Commit

Permalink
Add tests to test different types of media format for LanguageBind mo…
Browse files Browse the repository at this point in the history
…dels(#1073)
  • Loading branch information
wanliAlex authored Jan 7, 2025
1 parent 3d7915d commit 5b4f604
Show file tree
Hide file tree
Showing 8 changed files with 124 additions and 18 deletions.
1 change: 1 addition & 0 deletions .github/workflows/largemodel_unit_test_CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ jobs:
export VESPA_QUERY_URL=http://localhost:8080
export MARQO_MAX_CPU_MODEL_MEMORY=15
export MARQO_MAX_CUDA_MODEL_MEMORY=15
export HF_HUB_ENABLE_HF_TRANSFER=1
export PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID=${{ secrets.PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID }}
export PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY=${{ secrets.PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY }}
Expand Down
30 changes: 27 additions & 3 deletions tests/marqo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,23 @@ class TestAudioUrls(str, Enum):
AUDIO2 = "https://marqo-ecs-50-audio-test-dataset.s3.us-east-1.amazonaws.com/audios/1-115545-C-48.wav"
AUDIO3 = "https://marqo-ecs-50-audio-test-dataset.s3.us-east-1.amazonaws.com/audios/1-119125-A-45.wav"

MP3_AUDIO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample3.mp3"
ACC_AUDIO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample3.aac"
OGG_AUDIO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample3.ogg"

FLAC_AUDIO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample3.flac"


class TestVideoUrls(str, Enum):
    """Remote video URLs used as fixtures by the test suite.

    Members are plain strings (the enum mixes in ``str``); tests read the URL via ``.value``.
    """
    # Presumably stops pytest from collecting this "Test"-prefixed class as a test case - confirm.
    __test__ = False
    # MP4 clips hosted in the marqo-k400 video test dataset bucket.
    VIDEO1 = "https://marqo-k400-video-test-dataset.s3.us-east-1.amazonaws.com/videos/--_S9IDQPLg_000135_000145.mp4"
    VIDEO2 = "https://marqo-k400-video-test-dataset.s3.us-east-1.amazonaws.com/videos/---QUuC4vJs_000084_000094.mp4"
    VIDEO3 = "https://marqo-k400-video-test-dataset.s3.us-east-1.amazonaws.com/videos/--mI_-gaZLk_000018_000028.mp4"

    # Identically named sample file in other container formats (MKV, WebM, AVI).
    MKV_VIDEO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample_640x360.mkv"
    WEBM_VIDEO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample_640x360.webm"
    AVI_VIDEO1 = "https://opensource-languagebind-models.s3.us-east-1.amazonaws.com/test-media-types/sample_640x360.avi"



class MarqoTestCase(unittest.TestCase):
Expand Down Expand Up @@ -109,10 +119,24 @@ def setUp(self) -> None:

def clear_indexes(self, indexes: List[MarqoIndex]):
for index in indexes:
self.clear_index_by_name(index.schema_name)
self.clear_index_by_schema_name(index.schema_name)

def clear_index_by_index_name(self, index_name: str):
"""Delete all documents in the given index.
def clear_index_by_name(self, index_name: str):
self.pyvespa_client.delete_all_docs(self.CONTENT_CLUSTER, index_name)
Args:
index_name: The name of the index to clear.
"""
schema_name = self.index_management.get_index(index_name).schema_name
return self.clear_index_by_schema_name(schema_name)

def clear_index_by_schema_name(self, schema_name: str):
    """Remove every document stored under the given schema.

    Args:
        schema_name: Schema name of the index to wipe. This is the schema name,
            which is not the same as the index name.
    """
    vespa_client = self.pyvespa_client
    vespa_client.delete_all_docs(self.CONTENT_CLUSTER, schema_name)

def random_index_name(self) -> str:
return 'a' + str(uuid.uuid4()).replace('-', '')
Expand Down
97 changes: 89 additions & 8 deletions tests/tensor_search/integ_tests/test_add_documents_combined.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import os
import unittest.mock
import unittest.mock
import uuid
from unittest import mock
from unittest.mock import patch
Expand All @@ -21,8 +19,8 @@
from marqo.tensor_search import add_docs
from marqo.tensor_search import streaming_media_processor
from marqo.tensor_search import tensor_search
from tests.marqo_test import MarqoTestCase, TestImageUrls, TestAudioUrls, TestVideoUrls
from marqo.tensor_search.models.preprocessors_model import Preprocessors
from tests.marqo_test import MarqoTestCase, TestImageUrls, TestAudioUrls, TestVideoUrls


class TestAddDocumentsCombined(MarqoTestCase):
Expand Down Expand Up @@ -902,15 +900,15 @@ def get_docs():

self.maxDiff = None # allow output all diffs
with self.subTest(f'{index.name} with type {index.type}'):
self.clear_index_by_name(index_name=index.schema_name)
self.clear_index_by_schema_name(schema_name=index.schema_name)
add_docs(BatchVectorisationMode.PER_FIELD)
docs_added_using_per_field_strategy = get_docs()

self.clear_index_by_name(index_name=index.schema_name)
self.clear_index_by_schema_name(schema_name=index.schema_name)
add_docs(BatchVectorisationMode.PER_DOCUMENT)
docs_added_using_per_doc_strategy = get_docs()

self.clear_index_by_name(index_name=index.schema_name)
self.clear_index_by_schema_name(schema_name=index.schema_name)
add_docs(BatchVectorisationMode.PER_DOCUMENT)
docs_added_using_per_batch_strategy = get_docs()

Expand Down Expand Up @@ -1243,7 +1241,6 @@ def test_video_size_limit_in_batch(self):
tensor_fields=tensor_fields
)
).dict(exclude_none=True, by_alias=True)
print(result)

# Verify results
self.assertTrue(result["errors"]) # Should have errors due to second document
Expand All @@ -1265,4 +1262,88 @@ def test_video_size_limit_in_batch(self):
).dict(exclude_none=True, by_alias=True)

self.assertEqual(1, len(get_result["results"]))
self.assertEqual("1", get_result["results"][0]["_id"])
self.assertEqual("1", get_result["results"][0]["_id"])

def test_supported_audio_format(self):
    """Verify that the LanguageBind indexes accept several audio formats in add_documents and search.

    Each (audio format, index) pair runs in its own subTest: the index is emptied, one document
    referencing the audio URL is added, and the document and vector counts are checked. A tensor
    search that uses the same URL as the query is then issued for every format except AAC.
    """
    language_bind_indexes = (
        self.structured_language_bind_index_name,
        self.unstructured_language_bind_index_name,
    )
    audio_samples = (
        (TestAudioUrls.MP3_AUDIO1.value, "mp3"),
        (TestAudioUrls.ACC_AUDIO1.value, "aac"),
        (TestAudioUrls.OGG_AUDIO1.value, "ogg"),
        (TestAudioUrls.FLAC_AUDIO1.value, "flac"),
    )

    for audio_url, audio_format in audio_samples:
        for index_name in language_bind_indexes:
            with self.subTest(f"{index_name} - {audio_format}"):
                # Start every case from an empty index so the counts below are exact.
                marqo_index = self.index_management.get_index(index_name=index_name)
                self.clear_index_by_schema_name(schema_name=marqo_index.schema_name)
                stats_before = self.monitoring.get_index_stats_by_name(index_name=index_name)
                self.assertEqual(0, stats_before.number_of_documents)

                # Only the unstructured index is given tensor fields at add time.
                if index_name == self.unstructured_language_bind_index_name:
                    tensor_fields = ["audio_field_1"]
                else:
                    tensor_fields = None

                add_params = AddDocsParams(
                    index_name=index_name,
                    docs=[{"audio_field_1": audio_url, "_id": "1"}],
                    tensor_fields=tensor_fields,
                )
                res = tensor_search.add_documents(self.config, add_docs_params=add_params)
                self.assertFalse(res.errors, msg=res.dict())

                docs_after = self.monitoring.get_index_stats_by_name(index_name=index_name).number_of_documents
                self.assertEqual(1, docs_after)
                vectors_after = self.monitoring.get_index_stats_by_name(index_name=index_name).number_of_vectors
                self.assertGreaterEqual(vectors_after, 1)

                # Searching with an .aac URL as the query is not supported, so that case only indexes.
                if audio_url != TestAudioUrls.ACC_AUDIO1.value:
                    _ = tensor_search.search(
                        config=self.config,
                        index_name=index_name,
                        text=audio_url,
                        search_method="TENSOR",
                    )

def test_supported_video_format(self):
    """Verify that the LanguageBind indexes accept several video formats in add_documents and search.

    Each (video format, index) pair runs in its own subTest: the index is emptied, one document
    referencing the video URL is added, the document and vector counts are checked, and a tensor
    search that uses the same URL as the query is issued.
    """
    language_bind_indexes = (
        self.structured_language_bind_index_name,
        self.unstructured_language_bind_index_name,
    )
    video_samples = (
        (TestVideoUrls.AVI_VIDEO1.value, "avi"),
        (TestVideoUrls.MKV_VIDEO1.value, "mkv"),
        (TestVideoUrls.WEBM_VIDEO1.value, "webm"),
    )

    for video_url, video_format in video_samples:
        for index_name in language_bind_indexes:
            with self.subTest(f"{index_name} - {video_format}"):
                # Start every case from an empty index so the counts below are exact.
                marqo_index = self.index_management.get_index(index_name=index_name)
                self.clear_index_by_schema_name(schema_name=marqo_index.schema_name)
                stats_before = self.monitoring.get_index_stats_by_name(index_name=index_name)
                self.assertEqual(0, stats_before.number_of_documents)

                # Only the unstructured index is given tensor fields at add time.
                if index_name == self.unstructured_language_bind_index_name:
                    tensor_fields = ["video_field_1"]
                else:
                    tensor_fields = None

                add_params = AddDocsParams(
                    index_name=index_name,
                    docs=[{"video_field_1": video_url, "_id": "1"}],
                    tensor_fields=tensor_fields,
                )
                res = tensor_search.add_documents(self.config, add_docs_params=add_params)
                self.assertFalse(res.errors, msg=res.dict())

                docs_after = self.monitoring.get_index_stats_by_name(index_name=index_name).number_of_documents
                self.assertEqual(1, docs_after)
                vectors_after = self.monitoring.get_index_stats_by_name(index_name=index_name).number_of_vectors
                self.assertGreaterEqual(vectors_after, 1)

                _ = tensor_search.search(
                    config=self.config,
                    index_name=index_name,
                    text=video_url,
                    search_method="TENSOR",
                )
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def _check_get_docs(doc_count, title_value):

doc_counts = 1, 2, 25
for c in doc_counts:
self.clear_index_by_name(self.image_index_with_random_model)
self.clear_index_by_index_name(self.image_index_with_random_model)

res1 = self.add_documents(
self.config,
Expand Down Expand Up @@ -744,7 +744,7 @@ def test_duplicate_ids_behaviour(self):
]

for documents, number_of_docs, msg in test_cases:
self.clear_index_by_name(self.default_text_index)
self.clear_index_by_index_name(self.default_text_index)
with self.subTest(msg):
r = self.add_documents(config=self.config,
add_docs_params=AddDocsParams(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,7 @@ def _check_get_docs(doc_count, title_value):

doc_counts = 1, 2, 25
for c in doc_counts:
self.clear_index_by_name(self.index_name_img_random)
self.clear_index_by_index_name(self.index_name_img_random)

res1 = self.add_documents(
self.config,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ def _check_get_docs(doc_count, title_value):

doc_counts = 1, 2, 25
for c in doc_counts:
self.clear_index_by_name(self.image_index_with_random_model)
self.clear_index_by_index_name(self.image_index_with_random_model)

res1 = self.add_documents(
self.config,
Expand Down Expand Up @@ -755,7 +755,7 @@ def test_duplicate_ids_behaviour(self):
]

for documents, number_of_docs, msg in test_cases:
self.clear_index_by_name(self.default_text_index)
self.clear_index_by_index_name(self.default_text_index)
with self.subTest(msg):
r = self.add_documents(config=self.config,
add_docs_params=AddDocsParams(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1338,7 +1338,7 @@ def test_search_returned_documents(self):

for document, msg in [full_fields_document, partial_fields_document, no_field_documents]:
with self.subTest(msg):
self.clear_index_by_name(self.default_text_index)
self.clear_index_by_index_name(self.default_text_index)
self.add_documents(
config=self.config,
add_docs_params=AddDocsParams(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1353,7 +1353,7 @@ def test_search_returned_documents(self):

for document, msg in [full_fields_document, partial_fields_document, no_field_documents]:
with self.subTest(msg):
self.clear_index_by_name(self.default_text_index)
self.clear_index_by_index_name(self.default_text_index)
self.add_documents(
config=self.config,
add_docs_params=AddDocsParams(
Expand Down

0 comments on commit 5b4f604

Please sign in to comment.