Bugfix/fix hnsw search termination check (#14215)

Previously related PR: #12770. While my original change helped move us towards saner HNSW search behavior, it will still actually explore a candidate if its score is `==` the minimum accepted similarity. This devolves in the degenerate case where all vectors are the same. This change adjusts the minimum required candidate score to use `Math.nextUp`, similar to TopScoreDocCollector. Related to (but doesn't fully solve): #14214
apache · Feb 11, 2025 · a6a96cd · a6a96cd
1 parent 3208920
commit a6a96cd
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 3 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -116,6 +116,8 @@ Bug Fixes
 * GITHUB#14126: Avoid overflow in index input slices invariant checks
   (Chris Hegarty)
 
+* GITHUB#14215: Fix degenerate case in HNSW where all vectors have the same score. (Ben Trent)
+
 Other
 ---------------------
 

diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java
@@ -212,7 +212,7 @@ void searchLevel(
 
     // A bound that holds the minimum similarity to the query vector that a candidate vector must
     // have to be considered.
-    float minAcceptedSimilarity = results.minCompetitiveSimilarity();
+    float minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity());
     while (candidates.size() > 0 && results.earlyTerminated() == false) {
       // get the best candidate (closest or best scoring)
       float topCandidateSimilarity = candidates.topScore();
@@ -238,7 +238,7 @@ void searchLevel(
           candidates.add(friendOrd, friendSimilarity);
           if (acceptOrds == null || acceptOrds.get(friendOrd)) {
             if (results.collect(friendOrd, friendSimilarity)) {
-              minAcceptedSimilarity = results.minCompetitiveSimilarity();
+              minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity());
             }
           }
         }

diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java
@@ -16,7 +16,11 @@
  */
 package org.apache.lucene.document;
 
+import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT;
+
 import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import java.io.IOException;
+import java.util.Arrays;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -36,6 +40,24 @@
 public class TestManyKnnDocs extends LuceneTestCase {
   // gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true
 
+  public void testSameVectorIndexedMultipleTimes() throws IOException {
+    try (Directory d = newDirectory()) {
+      float[] vector = new float[16];
+      Arrays.fill(vector, 0.5f);
+      try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
+        for (int j = 1; j <= 100_000; j++) {
+          Document doc = new Document();
+          doc.add(new KnnFloatVectorField("field", vector, DOT_PRODUCT));
+          w.addDocument(doc);
+          if (j % 1000 == 0) {
+            w.flush();
+          }
+        }
+        w.commit();
+      }
+    }
+  }
+
   public void testLargeSegment() throws Exception {
     IndexWriterConfig iwc = new IndexWriterConfig();
     iwc.setCodec(
@@ -46,7 +68,7 @@ public void testLargeSegment() throws Exception {
     mp.setSegmentsPerTier(256); // only merge once at the end when we ask
     iwc.setMergePolicy(mp);
     String fieldName = "field";
-    VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
+    VectorSimilarityFunction similarityFunction = DOT_PRODUCT;
 
     try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs"));
         IndexWriter iw = new IndexWriter(dir, iwc)) {
-Original file line number
+Diff line change
@@ Expand Up / @@ -116,6 +116,8 @@ Bug Fixes @@
     * GITHUB#14126: Avoid overflow in index input slices invariant checks
       (Chris Hegarty)
+    * GITHUB#14215: Fix degenerate case in HNSW where all vectors have the same score. (Ben Trent)
     Other
     ---------------------
@@ Expand Down @@