sourcegraph · keegancsmith · Jan 22, 2025 · jtibshirani · Jan 22, 2025 · jtibshirani
diff --git a/build/builder.go b/build/builder.go
@@ -468,14 +468,21 @@ func (o *Options) findShard() string {
 	// Brute force finding the shard in compound shards. We should only hit this
 	// code path for repositories that are not already existing or are in
 	// compound shards.
-	//
-	// TODO add an oracle which can speed this up in the case of repositories
-	// already in compound shards.
 	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 	if err != nil {
 		return ""
 	}
 	for _, fn := range compoundShards {
+		// PERF: ReadMetadataPathAlive can be relatively slow on instances with
+		// thousands of tiny repos in compound shards. This is a much faster check
+		// to see if we need to do more work to check.
+		//
+		// If we are still seeing performance issues, we should consider adding
+		// some sort of global oracle here to avoid filepath.Glob and checking
+		// each compound shard.
+		if !zoekt.MaybeContainRepo(fn, o.RepositoryDescription.ID) {
+			continue
+		}
 		repos, _, err := zoekt.ReadMetadataPathAlive(fn)
 		if err != nil {
 			continue

diff --git a/indexbuilder.go b/indexbuilder.go
@@ -526,6 +526,23 @@ func (b *IndexBuilder) branchMask(br string) uint64 {
 	return 0
 }
 
+// repoIDs collects the sourcegraph ID of every repo held by the builder. The
+// boolean is false when there are no repos or when any repo is missing its ID.
+func (b *IndexBuilder) repoIDs() ([]uint32, bool) {
+	if len(b.repoList) == 0 {
+		return nil, false
+	}
+	ids := make([]uint32, len(b.repoList))
+	for i := range b.repoList {
+		id := b.repoList[i].ID
+		if id == 0 {
+			return nil, false
+		}
+		ids[i] = id
+	}
+	return ids, true
+}
+
 type DocChecker struct {
 	// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
 	trigrams map[ngram]struct{}

diff --git a/read.go b/read.go
@@ -24,6 +24,7 @@ import (
 	"slices"
 	"sort"
 
+	"github.com/RoaringBitmap/roaring"
 	"github.com/rs/xid"
 )
 
@@ -648,6 +649,54 @@ func IndexFilePaths(p string) ([]string, error) {
 	return exist, nil
 }
 
+// MaybeContainRepo reports whether the shard at path p might hold repoID. A
+// false result is definitive: the shard does not contain the repo. A true
+// result is only a hint and the caller must still verify it.
+//
+// It exists as a performance optimization, mainly for builder (see
+// findShard), so that large compound shard metadata does not have to be
+// unmarshalled just to rule a shard out. It is best-effort: if it encounters
+// any error it returns true (ie indicating you need to do more checks).
+func MaybeContainRepo(p string, repoID uint32) bool {
+	file, err := os.Open(p)
+	if err != nil {
+		return true
+	}
+	defer file.Close()
+
+	indexFile, err := NewIndexFile(file)
+	if err != nil {
+		return true
+	}
+	defer indexFile.Close()
+
+	var toc indexTOC
+	tocReader := &reader{r: indexFile}
+	if err := tocReader.readTOCSections(&toc, []string{"reposIDsBitmap"}); err != nil {
+		return true
+	}
+
+	// A zero-sized section means the shard does not yet contain
+	// reposIDsBitmap, so we cannot tell whether it contains the repo.
+	section := toc.reposIDsBitmap
+	if section.sz == 0 {
+		return true
+	}
+
+	raw, err := indexFile.Read(section.off, section.sz)
+	if err != nil {
+		return true
+	}
+
+	// NOTE(review): FromUnsafeBytes presumably aliases raw; ids is not retained.
+	var ids roaring.Bitmap
+	if _, err := ids.FromUnsafeBytes(raw); err != nil {
+		return true
+	}
+
+	return ids.Contains(repoID)
+}
+
 func loadIndexData(r IndexFile) (*indexData, error) {
 	rd := &reader{r: r}
 

diff --git a/testdata/shards/repo2_v16.00000.zoekt b/testdata/shards/repo2_v16.00000.zoekt
diff --git a/testdata/shards/repo_v16.00000.zoekt b/testdata/shards/repo_v16.00000.zoekt
diff --git a/toc.go b/toc.go
@@ -96,7 +96,8 @@ type indexTOC struct {
 	contentChecksums simpleSection
 	runeDocSections  simpleSection
 
-	repos simpleSection
+	repos          simpleSection
+	reposIDsBitmap simpleSection
 
 	ranks simpleSection
 }
@@ -187,6 +188,8 @@ func (t *indexTOC) sectionsTaggedList() []taggedSection {
 		{"nameBloom", &unusedSimple},
 		{"contentBloom", &unusedSimple},
 		{"ranks", &unusedSimple},
+
+		{"reposIDsBitmap", &t.reposIDsBitmap},
 	}
 }
 

diff --git a/write.go b/write.go
@@ -23,6 +23,8 @@ import (
 	"io"
 	"sort"
 	"time"
+
+	"github.com/RoaringBitmap/roaring"
 )
 
 func (w *writer) writeTOC(toc *indexTOC) {
@@ -66,6 +68,12 @@ func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
 	s.writeStrings(w, keys)
 }
 
+func writeUint32Bitmap(w *writer, dat []uint32) {
+	bm := roaring.BitmapOf(dat...) // roaring bitmap holding the values of dat
+	bm.RunOptimize()               // asks roaring to compact the bitmap before it is serialized
+	_, _ = bm.WriteTo(w)           // result dropped; NOTE(review): presumably w records write errors — confirm
+}
+
 func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
 	charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
 ) {
@@ -169,6 +177,12 @@ func (b *IndexBuilder) Write(out io.Writer) error {
 		toc.repos.end(w)
 	}
 
+	if repoIDs, ok := b.repoIDs(); ok && next {
+		toc.reposIDsBitmap.start(w)
+		writeUint32Bitmap(w, repoIDs)
+		toc.reposIDsBitmap.end(w)
+	}
+
 	indexTime := b.IndexTime
 	if indexTime.IsZero() {
 		indexTime = time.Now().UTC()