From 7d19cac76bffde00392e4fb136f74124bd414319 Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Thu, 26 Dec 2024 11:13:20 -0800 Subject: [PATCH] Have one score definition for cosinesimilarity Currently we have different score calculation for cosine similarity, for ex: script score, approximate search, exact search has diffent formula to convert distance to cosine similarity that is aligned with OpenSearch score. To keep it consistent, we will be using one defintion which is used by Lucene as standard definition for cosine similarity for all search types. Signed-off-by: Vijayan Balasubramanian --- CHANGELOG.md | 1 + .../org/opensearch/knn/index/SpaceType.java | 4 +- .../knn/plugin/script/KNNScoringSpace.java | 7 ++- .../org/opensearch/knn/index/NmslibIT.java | 58 +++++++++++++++++++ .../plugin/script/KNNScoringSpaceTests.java | 2 +- 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a19b53fd8..ddba48075 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315] * Release query vector memory after execution (#2346)[https://github.com/opensearch-project/k-NN/pull/2346] * Fix shard level rescoring disabled setting flag (#2352)[https://github.com/opensearch-project/k-NN/pull/2352] +* Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357] ### Infrastructure * Updated C++ version in JNI from c++11 to c++17 [#2259](https://github.com/opensearch-project/k-NN/pull/2259) * Upgrade bytebuddy and objenesis version to match OpenSearch core and, update github ci runner for macos [#2279](https://github.com/opensearch-project/k-NN/pull/2279) diff --git a/src/main/java/org/opensearch/knn/index/SpaceType.java b/src/main/java/org/opensearch/knn/index/SpaceType.java index abe265a01..c8e13d4b3 100644 --- a/src/main/java/org/opensearch/knn/index/SpaceType.java +++ b/src/main/java/org/opensearch/knn/index/SpaceType.java @@ -62,7 +62,9 @@ public float scoreToDistanceTranslation(float score) { COSINESIMIL("cosinesimil") { @Override public float scoreTranslation(float rawScore) { - return 1 / (1 + rawScore); + // To be consistent, we will be using same formula used by lucene as mentioned below + // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 + return Math.max((2.0F - rawScore) / 2.0F, 0.0F); } @Override diff --git a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java index 71616c9fd..9744796c6 100644 --- a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java +++ b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java @@ -144,7 +144,12 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) { protected BiFunction getScoringMethod(final float[] processedQuery) { SpaceType.COSINESIMIL.validateVector(processedQuery); float qVectorSquaredMagnitude = getVectorMagnitudeSquared(processedQuery); - return (float[] q, float[] v) -> 1 + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude); + // To be consistent, we will be using same formula used by lucene as mentioned below + // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 + return (float[] q, float[] v) -> Math.max( + (1.0F + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude)) / 2.0F, + 0.0F + ); } } diff --git a/src/test/java/org/opensearch/knn/index/NmslibIT.java b/src/test/java/org/opensearch/knn/index/NmslibIT.java index 8ca436bf4..e2e7613a2 100644 --- a/src/test/java/org/opensearch/knn/index/NmslibIT.java +++ b/src/test/java/org/opensearch/knn/index/NmslibIT.java @@ -195,6 +195,64 @@ public void testEndToEnd() throws Exception { fail("Graphs are not getting evicted"); } + public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception { + String indexName = "test-index-1"; + String fieldName = "test-field-1"; + SpaceType spaceType = SpaceType.COSINESIMIL; + Integer dimension = testData.indexData.vectors[0].length; + + // Create an index + XContentBuilder builder = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject(fieldName) + .field("type", "knn_vector") + .field("dimension", dimension) + .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, spaceType.getValue()) + .startObject(KNNConstants.KNN_METHOD) + .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW) + .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName()) + .endObject() + .endObject() + .endObject() + .endObject(); + + Map mappingMap = xContentBuilderToMap(builder); + String mapping = builder.toString(); + + createKnnIndex(indexName, buildKNNIndexSettings(0), mapping); + + // Index one document + addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray()); + + // Assert we have the right number of documents in the index + refreshAllIndices(); + assertEquals(1, getDocCount(indexName)); + // update threshold setting to skip building graph + updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1)); + // add duplicate document with different id + addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray()); + assertEquals(2, getDocCount(indexName)); + final int k = 2; + // search index + Response response = searchKNNIndex( + indexName, + KNNQueryBuilder.builder().fieldName(fieldName).vector(testData.queries[0]).k(k).build(), + k + ); + String responseBody = EntityUtils.toString(response.getEntity()); + List knnResults = parseSearchResponse(responseBody, fieldName); + assertEquals(k, knnResults.size()); + + List actualScores = parseSearchResponseScore(responseBody, fieldName); + + // both document should have identical score + assertEquals(actualScores.get(0), actualScores.get(1), 0.001); + + // Delete index + deleteKNNIndex(indexName); + } + @SneakyThrows private void validateSearch( final String indexName, diff --git a/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java b/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java index 4fc549d6b..56ae4a39e 100644 --- a/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java +++ b/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java @@ -86,7 +86,7 @@ public void testCosineSimilarity_whenValid_thenSucceed() { getMappingConfigForMethodMapping(knnMethodContext, 3) ); KNNScoringSpace.CosineSimilarity cosineSimilarity = new KNNScoringSpace.CosineSimilarity(arrayListQueryObject, fieldType); - assertEquals(2F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F); + assertEquals(1F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F); // invalid zero vector final List queryZeroVector = List.of(0.0f, 0.0f, 0.0f);