From 86c681e8e66e8c7d8e007c628131f356fc92df73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Gill=C3=A9?= <philipp.gille@gmail.com>
Date: Sun, 17 Mar 2024 15:28:07 +0100
Subject: [PATCH 1/2] Use max-heap for query results instead of sorting huge
 slice

---
 collection.go | 27 ++++++------------
 query.go      | 79 +++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/collection.go b/collection.go
index b4186b0..52d8844 100644
--- a/collection.go
+++ b/collection.go
@@ -1,7 +1,6 @@
 package chromem
 
 import (
-	"cmp"
 	"context"
 	"errors"
 	"fmt"
@@ -323,30 +322,20 @@ func (c *Collection) QueryEmbedding(ctx context.Context, queryEmbedding []float3
 		return nil, nil
 	}
 
-	// For the remaining documents, calculate cosine similarity.
-	docSims, err := calcDocSimilarity(ctx, queryEmbedding, filteredDocs)
+	// For the remaining documents, get the most similar docs.
+	nMaxDocs, err := getMostSimilarDocs(ctx, queryEmbedding, filteredDocs, nResults)
 	if err != nil {
-		return nil, fmt.Errorf("couldn't calculate cosine similarity: %w", err)
+		return nil, fmt.Errorf("couldn't get most similar docs: %w", err)
 	}
 
-	// Sort by similarity
-	slices.SortFunc(docSims, func(i, j docSim) int {
-		// i, j; for descending order
-		return cmp.Compare(j.similarity, i.similarity)
-	})
-
-	// Return the top nResults or len(docSim), whichever is smaller
-	if len(docSims) < nResults {
-		nResults = len(docSims)
-	}
 	res := make([]Result, 0, nResults)
 	for i := 0; i < nResults; i++ {
 		res = append(res, Result{
-			ID:         docSims[i].docID,
-			Metadata:   c.documents[docSims[i].docID].Metadata,
-			Embedding:  c.documents[docSims[i].docID].Embedding,
-			Content:    c.documents[docSims[i].docID].Content,
-			Similarity: docSims[i].similarity,
+			ID:         nMaxDocs[i].docID,
+			Metadata:   c.documents[nMaxDocs[i].docID].Metadata,
+			Embedding:  c.documents[nMaxDocs[i].docID].Embedding,
+			Content:    c.documents[nMaxDocs[i].docID].Content,
+			Similarity: nMaxDocs[i].similarity,
 		})
 	}
 
diff --git a/query.go b/query.go
index 9da1be2..240060c 100644
--- a/query.go
+++ b/query.go
@@ -1,9 +1,12 @@
 package chromem
 
 import (
+	"cmp"
+	"container/heap"
 	"context"
 	"fmt"
 	"runtime"
+	"slices"
 	"strings"
 	"sync"
 )
@@ -15,6 +18,70 @@ type docSim struct {
 	similarity float32
 }
 
+// docMaxHeap orders docSims by similarity for container/heap. Because Less uses "<", the root h[0]
+// holds the LOWEST similarity. See https://pkg.go.dev/container/heap@go1.22#example-package-IntHeap
+type docMaxHeap []docSim
+
+func (h docMaxHeap) Len() int           { return len(h) }
+func (h docMaxHeap) Less(i, j int) bool { return h[i].similarity < h[j].similarity }
+func (h docMaxHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h *docMaxHeap) Push(x any) {
+	// Pointer receiver: append changes the slice's length, which must be
+	// written back through h. x must be a docSim, otherwise this panics.
+	*h = append(*h, x.(docSim))
+}
+
+func (h *docMaxHeap) Pop() any {
+	// Detach the last element of the slice and hand it back to the caller.
+	last := len(*h) - 1
+	item := (*h)[last]
+	*h = (*h)[:last]
+	return item
+}
+
+// maxDocSims keeps the n highest-similarity docSims in a heap capped at size entries whose root
+// is the lowest kept similarity. lock guards h; the slice returned by values() is not protected.
+// In our benchmarks this was faster than sorting a slice of docSims at the end.
+type maxDocSims struct {
+	h    docMaxHeap
+	lock sync.RWMutex
+	size int
+}
+
+// newMaxDocSims returns an empty maxDocSims that retains at most size docSims.
+func newMaxDocSims(size int) *maxDocSims {
+	mds := new(maxDocSims)
+	mds.size = size
+	mds.h = make(docMaxHeap, 0, size) // capacity reserved up front
+	return mds
+}
+
+// add offers doc to the heap; once size docs are held, the root (lowest similarity) is evicted for a better doc.
+func (mds *maxDocSims) add(doc docSim) {
+	mds.lock.Lock()
+	defer mds.lock.Unlock()
+	switch kept := mds.h.Len(); {
+	case kept < mds.size:
+		heap.Push(&mds.h, doc)
+	case kept > 0 && mds.h[0].similarity < doc.similarity:
+		heap.Pop(&mds.h)
+		heap.Push(&mds.h, doc)
+	}
+}
+
+// values returns the docSims in the heap, sorted by similarity (descending).
+// It sorts the heap's backing slice in place and returns that same slice, which breaks the
+// heap order add() relies on. Only call it after all calls to add() have finished.
+func (mds *maxDocSims) values() []docSim {
+	mds.lock.Lock() // write lock, not RLock: the in-place sort below mutates mds.h
+	defer mds.lock.Unlock()
+	slices.SortFunc(mds.h, func(i, j docSim) int {
+		return cmp.Compare(j.similarity, i.similarity)
+	})
+	return mds.h
+}
+
 // filterDocs filters a map of documents by metadata and content.
 // It does this concurrently.
 func filterDocs(docs map[string]*Document, where, whereDocument map[string]string) []*Document {
@@ -95,9 +162,8 @@ func documentMatchesFilters(document *Document, where, whereDocument map[string]
 	return true
 }
 
-func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*Document) ([]docSim, error) {
-	similarities := make([]docSim, 0, len(docs))
-	similaritiesLock := sync.Mutex{}
+func getMostSimilarDocs(ctx context.Context, queryVectors []float32, docs []*Document, n int) ([]docSim, error) {
+	nMaxDocs := newMaxDocSims(n)
 
 	// Determine concurrency. Use number of docs or CPUs, whichever is smaller.
 	numCPUs := runtime.NumCPU()
@@ -152,10 +218,7 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*Docu
 					return
 				}
 
-				similaritiesLock.Lock()
-				// We don't defer the unlock because we want to unlock much earlier.
-				similarities = append(similarities, docSim{docID: doc.ID, similarity: sim})
-				similaritiesLock.Unlock()
+				nMaxDocs.add(docSim{docID: doc.ID, similarity: sim})
 			}
 		}(docs[start:end])
 	}
@@ -166,5 +229,5 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*Docu
 		return nil, sharedErr
 	}
 
-	return similarities, nil
+	return nMaxDocs.values(), nil
 }

From 787400412d73bec3532fa9c80c84e64f32341a1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Gill=C3=A9?= <philipp.gille@gmail.com>
Date: Sun, 17 Mar 2024 15:38:40 +0100
Subject: [PATCH 2/2] Update benchmark numbers

---
 CHANGELOG.md |  4 ++--
 README.md    | 24 ++++++++++++------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 733a6af..f3c7cd4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
 vNext
 -----
 
-In this release the main feature is query performance improvement (5x faster, 99% fewer memory allocations). There's also a new code example for semantic search across 5,000 arXiv papers.
+In this release the main feature is query performance improvement (5x faster, 98% fewer memory allocations). There's also a new code example for semantic search across 5,000 arXiv papers.
 
 ### Added
 
@@ -18,7 +18,7 @@ In this release the main feature is query performance improvement (5x faster, 99
 ### Improved
 
 - Changed the example link target to directory instead of `main.go` file (PR [#43](https://github.com/philippgille/chromem-go/pull/43))
-- Improved query performance (5x faster, 99% fewer memory allocations) (PR [#47](https://github.com/philippgille/chromem-go/pull/47), [#53](https://github.com/philippgille/chromem-go/pull/53))
+- Improved query performance (5x faster, 98% fewer memory allocations) (PR [#47](https://github.com/philippgille/chromem-go/pull/47), [#53](https://github.com/philippgille/chromem-go/pull/53))
 
 ### Fixed
 
diff --git a/README.md b/README.md
index 5ce3c49..5b817ef 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Because `chromem-go` is embeddable it enables you to add retrieval augmented gen
 
 It's *not* a library to connect to Chroma and also not a reimplementation of it in Go. It's a database on its own.
 
-The focus is not scale (millions of documents) or number of features, but simplicity and performance for the most common use cases. On a mid-range 2020 Intel laptop CPU you can query 1,000 documents in 0.3 ms and 100,000 documents in 50-60 ms, both with just 41 memory allocations. See [Benchmarks](#benchmarks) for details.
+The focus is not scale (millions of documents) or number of features, but simplicity and performance for the most common use cases. On a mid-range 2020 Intel laptop CPU you can query 1,000 documents in 0.3 ms and 100,000 documents in 40 ms, with very few and small memory allocations. See [Benchmarks](#benchmarks) for details.
 
 > ⚠️ The project is in beta, under heavy construction, and may introduce breaking changes in releases before `v1.0.0`. All changes are documented in the [`CHANGELOG`](./CHANGELOG.md).
 
@@ -197,18 +197,18 @@ goos: linux
 goarch: amd64
 pkg: github.com/philippgille/chromem-go
 cpu: 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
-BenchmarkCollection_Query_NoContent_100-8          10000     106441 ns/op     6393 B/op       41 allocs/op
-BenchmarkCollection_Query_NoContent_1000-8          2278     494254 ns/op    35570 B/op       41 allocs/op
-BenchmarkCollection_Query_NoContent_5000-8           416    2767125 ns/op   166634 B/op       41 allocs/op
-BenchmarkCollection_Query_NoContent_25000-8           70   15165139 ns/op   813800 B/op       41 allocs/op
-BenchmarkCollection_Query_NoContent_100000-8          19   58823464 ns/op  3205865 B/op       41 allocs/op
-BenchmarkCollection_Query_100-8                    11269     105990 ns/op     6385 B/op       41 allocs/op
-BenchmarkCollection_Query_1000-8                    2364     494212 ns/op    35574 B/op       41 allocs/op
-BenchmarkCollection_Query_5000-8                     481    2750438 ns/op   166647 B/op       41 allocs/op
-BenchmarkCollection_Query_25000-8                     93   13143419 ns/op   813805 B/op       41 allocs/op
-BenchmarkCollection_Query_100000-8                    20   51727357 ns/op  3205871 B/op       41 allocs/op
+BenchmarkCollection_Query_NoContent_100-8          13164      90276 ns/op     5176 B/op       95 allocs/op
+BenchmarkCollection_Query_NoContent_1000-8          2142     520261 ns/op    13558 B/op      141 allocs/op
+BenchmarkCollection_Query_NoContent_5000-8           561    2150354 ns/op    47096 B/op      173 allocs/op
+BenchmarkCollection_Query_NoContent_25000-8          120    9890177 ns/op   211783 B/op      208 allocs/op
+BenchmarkCollection_Query_NoContent_100000-8          30   39574238 ns/op   810370 B/op      232 allocs/op
+BenchmarkCollection_Query_100-8                    13225      91058 ns/op     5177 B/op       95 allocs/op
+BenchmarkCollection_Query_1000-8                    2226     519693 ns/op    13552 B/op      140 allocs/op
+BenchmarkCollection_Query_5000-8                     550    2128121 ns/op    47108 B/op      173 allocs/op
+BenchmarkCollection_Query_25000-8                    100   10063260 ns/op   211705 B/op      205 allocs/op
+BenchmarkCollection_Query_100000-8                    30   39404005 ns/op   810295 B/op      229 allocs/op
 PASS
-ok   github.com/philippgille/chromem-go 26.187s
+ok   github.com/philippgille/chromem-go 28.402s
 ```
 
 ## Motivation