Skip to content

Commit

Permalink
Bugfix/fix hnsw search termination check (#14215)
Browse files Browse the repository at this point in the history
previously related PR: #12770

While my original change helped move us towards saner HNSW search behavior, it will still actually explore a candidate if its score is `==` the minimum accepted score. This devolves in the degenerate case where all vectors are the same.

This change adjusts the minimum required candidate score to be `Math.nextUp` of the minimum competitive similarity, similar to TopScoreDocCollector.
related to (but doesn't fully solve): #14214
  • Loading branch information
benwtrent authored Feb 11, 2025
1 parent 3208920 commit a6a96cd
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 3 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ Bug Fixes
* GITHUB#14126: Avoid overflow in index input slices invariant checks
(Chris Hegarty)

* GITHUB#14215: Fix degenerate case in HNSW where all vectors have the same score. (Ben Trent)

Other
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ void searchLevel(

// A bound that holds the minimum similarity to the query vector that a candidate vector must
// have to be considered.
float minAcceptedSimilarity = results.minCompetitiveSimilarity();
float minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity());
while (candidates.size() > 0 && results.earlyTerminated() == false) {
// get the best candidate (closest or best scoring)
float topCandidateSimilarity = candidates.topScore();
Expand All @@ -238,7 +238,7 @@ void searchLevel(
candidates.add(friendOrd, friendSimilarity);
if (acceptOrds == null || acceptOrds.get(friendOrd)) {
if (results.collect(friendOrd, friendSimilarity)) {
minAcceptedSimilarity = results.minCompetitiveSimilarity();
minAcceptedSimilarity = Math.nextUp(results.minCompetitiveSimilarity());
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
*/
package org.apache.lucene.document;

import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT;

import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
Expand All @@ -36,6 +40,24 @@
public class TestManyKnnDocs extends LuceneTestCase {
// gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true

/**
 * Indexes one identical 16-dimensional vector (every component 0.5) into 100,000 documents,
 * flushing the writer after every 1,000 documents and committing once at the end.
 *
 * <p>The test contains no explicit assertions: it succeeds when all of the indexing, flushing and
 * committing completes without throwing. NOTE(review): presumably this guards the degenerate
 * "all vectors score the same" HNSW case described in the accompanying change; confirm.
 */
public void testSameVectorIndexedMultipleTimes() throws IOException {
  final int batchCount = 100;
  final int docsPerBatch = 1000;

  // Every document shares this exact vector instance.
  float[] sharedVector = new float[16];
  Arrays.fill(sharedVector, 0.5f);

  // Resources are closed in reverse order: the writer first, then the directory.
  try (Directory directory = newDirectory();
      IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig())) {
    for (int batch = 0; batch < batchCount; batch++) {
      for (int i = 0; i < docsPerBatch; i++) {
        Document document = new Document();
        document.add(new KnnFloatVectorField("field", sharedVector, DOT_PRODUCT));
        writer.addDocument(document);
      }
      // One flush per completed batch of 1,000 documents.
      writer.flush();
    }
    writer.commit();
  }
}

public void testLargeSegment() throws Exception {
IndexWriterConfig iwc = new IndexWriterConfig();
iwc.setCodec(
Expand All @@ -46,7 +68,7 @@ public void testLargeSegment() throws Exception {
mp.setSegmentsPerTier(256); // only merge once at the end when we ask
iwc.setMergePolicy(mp);
String fieldName = "field";
VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
VectorSimilarityFunction similarityFunction = DOT_PRODUCT;

try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs"));
IndexWriter iw = new IndexWriter(dir, iwc)) {
Expand Down

0 comments on commit a6a96cd

Please sign in to comment.