From f14147bf80c135a5ae8dd78c012f66c5844d6464 Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Thu, 26 Dec 2024 11:13:20 -0800 Subject: [PATCH] Update cosine score translation for nmslib Nmslib has a different score range than exact search. After introducing the approximate threshold setting, both exact search and approx search can now be executed as part of the same knn search API. Hence, to keep scores consistent, we are overriding the score translation for nmslib for approx search. Signed-off-by: Vijayan Balasubramanian --- CHANGELOG.md | 1 + .../knn/index/engine/nmslib/Nmslib.java | 11 +++- .../org/opensearch/knn/index/NmslibIT.java | 58 +++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a19b53fd8..ed76dd58b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315] * Release query vector memory after execution (#2346)[https://github.com/opensearch-project/k-NN/pull/2346] * Fix shard level rescoring disabled setting flag (#2352)[https://github.com/opensearch-project/k-NN/pull/2352] +* Update cosine score translation for nmslib (#2357)[https://github.com/opensearch-project/k-NN/pull/2357] ### Infrastructure * Updated C++ version in JNI from c++11 to c++17 [#2259](https://github.com/opensearch-project/k-NN/pull/2259) * Upgrade bytebuddy and objenesis version to match OpenSearch core and, update github ci runner for macos [#2279](https://github.com/opensearch-project/k-NN/pull/2279) diff --git a/src/main/java/org/opensearch/knn/index/engine/nmslib/Nmslib.java b/src/main/java/org/opensearch/knn/index/engine/nmslib/Nmslib.java index 4d7f7f423..62b7fdaba 100644 --- a/src/main/java/org/opensearch/knn/index/engine/nmslib/Nmslib.java +++ b/src/main/java/org/opensearch/knn/index/engine/nmslib/Nmslib.java @@ 
-14,7 +14,6 @@ import org.opensearch.knn.index.engine.NativeLibrary; import org.opensearch.knn.index.engine.ResolvedMethodContext; -import java.util.Collections; import java.util.Map; import java.util.function.Function; @@ -30,8 +29,16 @@ public class Nmslib extends NativeLibrary { final static Map METHODS = ImmutableMap.of(METHOD_HNSW, new NmslibHNSWMethod()); - public final static Nmslib INSTANCE = new Nmslib(METHODS, Collections.emptyMap(), CURRENT_VERSION, EXTENSION); private final MethodResolver methodResolver; + // Map that overrides OpenSearch score translation by space type of scores returned by nmslib + private final static Map> SCORE_TRANSLATIONS = ImmutableMap.of( + SpaceType.COSINESIMIL, + // To be consistent with exact search, we will be using same formula used by lucene as mentioned + // here + // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 + rawScore -> Math.max((2.0F - rawScore) / 2.0F, 0.0F) + ); + public final static Nmslib INSTANCE = new Nmslib(METHODS, SCORE_TRANSLATIONS, CURRENT_VERSION, EXTENSION); /** * Constructor for Nmslib diff --git a/src/test/java/org/opensearch/knn/index/NmslibIT.java b/src/test/java/org/opensearch/knn/index/NmslibIT.java index 8ca436bf4..e2e7613a2 100644 --- a/src/test/java/org/opensearch/knn/index/NmslibIT.java +++ b/src/test/java/org/opensearch/knn/index/NmslibIT.java @@ -195,6 +195,64 @@ public void testEndToEnd() throws Exception { fail("Graphs are not getting evicted"); } + public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception { + String indexName = "test-index-1"; + String fieldName = "test-field-1"; + SpaceType spaceType = SpaceType.COSINESIMIL; + Integer dimension = testData.indexData.vectors[0].length; + + // Create an index + XContentBuilder builder = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject(fieldName) 
+ .field("type", "knn_vector") + .field("dimension", dimension) + .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, spaceType.getValue()) + .startObject(KNNConstants.KNN_METHOD) + .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW) + .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName()) + .endObject() + .endObject() + .endObject() + .endObject(); + + Map mappingMap = xContentBuilderToMap(builder); + String mapping = builder.toString(); + + createKnnIndex(indexName, buildKNNIndexSettings(0), mapping); + + // Index one document + addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray()); + + // Assert we have the right number of documents in the index + refreshAllIndices(); + assertEquals(1, getDocCount(indexName)); + // update threshold setting to skip building graph + updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1)); + // add duplicate document with different id + addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray()); + assertEquals(2, getDocCount(indexName)); + final int k = 2; + // search index + Response response = searchKNNIndex( + indexName, + KNNQueryBuilder.builder().fieldName(fieldName).vector(testData.queries[0]).k(k).build(), + k + ); + String responseBody = EntityUtils.toString(response.getEntity()); + List knnResults = parseSearchResponse(responseBody, fieldName); + assertEquals(k, knnResults.size()); + + List actualScores = parseSearchResponseScore(responseBody, fieldName); + + // both document should have identical score + assertEquals(actualScores.get(0), actualScores.get(1), 0.001); + + // Delete index + deleteKNNIndex(indexName); + } + @SneakyThrows private void validateSearch( final String indexName,