From 51b6e52b8a99355fa83bda46e41d41039b59ff92 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 13 Jul 2023 09:55:35 +0200 Subject: [PATCH 1/6] zoekt: implement mode which has same behaviour as attribution search This is to help reproduce slow attribution searches we have on sourcegraph.com. --- cmd/zoekt/main.go | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/cmd/zoekt/main.go b/cmd/zoekt/main.go index 0744454fc..336739ade 100644 --- a/cmd/zoekt/main.go +++ b/cmd/zoekt/main.go @@ -18,6 +18,7 @@ import ( "context" "flag" "fmt" + "io" "log" "os" "path/filepath" @@ -85,6 +86,7 @@ func main() { verbose := flag.Bool("v", false, "print some background data") withRepo := flag.Bool("r", false, "print the repo before the file name") list := flag.Bool("l", false, "print matching filenames only") + exact := flag.Bool("exact_stdin", false, "look for exact matches on STDIN") flag.Usage = func() { name := os.Args[0] @@ -95,12 +97,39 @@ func main() { } flag.Parse() - if len(flag.Args()) == 0 { + var pat string + var q query.Q + var sOpts zoekt.SearchOptions + if *exact { + needle, err := io.ReadAll(os.Stdin) + if err != nil { + log.Fatal(err) + } + pat = string(needle) + q = &query.Substring{ + Pattern: pat, + CaseSensitive: true, + Content: true, + } + sOpts = zoekt.SearchOptions{ + ShardMaxMatchCount: 10_000, + ShardRepoMaxMatchCount: 1, + TotalMaxMatchCount: 100_000, + MaxWallTime: 20 * time.Second, + MaxDocDisplayCount: 5, + } + } else if len(flag.Args()) == 0 { fmt.Fprintf(os.Stderr, "Pattern is missing.\n") flag.Usage() os.Exit(2) + } else { + var err error + pat = flag.Arg(0) + q, err = query.Parse(pat) + if err != nil { + log.Fatal(err) + } } - pat := flag.Arg(0) var searcher zoekt.Searcher var err error @@ -114,16 +143,11 @@ func main() { log.Fatal(err) } - query, err := query.Parse(pat) - if err != nil { - log.Fatal(err) - } if *verbose { - log.Println("query:", query) + log.Println("query:", q) } - var sOpts zoekt.SearchOptions - sres, err := searcher.Search(context.Background(), query, &sOpts) + sres, err := searcher.Search(context.Background(), q, &sOpts) if *cpuProfile != "" { // If profiling, do it another time so we measure with // warm caches. @@ -141,7 +165,7 @@ func main() { log.Fatal(err) } for { - sres, _ = searcher.Search(context.Background(), query, &sOpts) + sres, _ = searcher.Search(context.Background(), q, &sOpts) if time.Since(t) > *profileTime { break } From 4fa6cca992fdfb579c30034bb0b682868b91730e Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 13 Jul 2023 15:28:29 +0200 Subject: [PATCH 2/6] zoekt: add fgprof for full profiling Useful in local testing to capture both on and off cpu time spent. Should consider shipping this in the webserver as well. Test Plan: ran zoekt with -full_profile flag and inspected output in pprof. --- cmd/zoekt/BUILD.bazel | 1 + cmd/zoekt/main.go | 27 +++++++++++++++++++++++++++ deps.bzl | 22 ++++++++++++++++++++++ go.mod | 1 + go.sum | 8 ++++++++ 5 files changed, 59 insertions(+) diff --git a/cmd/zoekt/BUILD.bazel b/cmd/zoekt/BUILD.bazel index 19fd19324..e5329e277 100644 --- a/cmd/zoekt/BUILD.bazel +++ b/cmd/zoekt/BUILD.bazel @@ -9,6 +9,7 @@ go_library( "//:zoekt", "//query", "//shards", + "@com_github_felixge_fgprof//:fgprof", ], ) diff --git a/cmd/zoekt/main.go b/cmd/zoekt/main.go index 336739ade..d14871656 100644 --- a/cmd/zoekt/main.go +++ b/cmd/zoekt/main.go @@ -25,6 +25,7 @@ import ( "runtime/pprof" "time" + "github.com/felixge/fgprof" "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/query" "github.com/sourcegraph/zoekt/shards" @@ -82,6 +83,7 @@ func main() { index := flag.String("index_dir", filepath.Join(os.Getenv("HOME"), ".zoekt"), "search for index files in `directory`") cpuProfile := flag.String("cpu_profile", "", "write cpu profile to `file`") + fullProfile := flag.String("full_profile", "", "write full profile to `file`") profileTime := flag.Duration("profile_time", time.Second, "run this long to gather stats.") verbose := flag.Bool("v", false, "print some background data") withRepo := flag.Bool("r", false, "print the repo before the file name") @@ -173,6 +175,31 @@ func main() { pprof.StopCPUProfile() } + if *fullProfile != "" { + // If profiling, do it another time so we measure with + // warm caches. + f, err := os.Create(*fullProfile) + if err != nil { + log.Fatal(err) + } + defer f.Close() + if *verbose { + log.Println("Displaying matches...") + } + + t := time.Now() + stopProfile := fgprof.Start(f, fgprof.FormatPprof) + for { + sres, _ = searcher.Search(context.Background(), q, &sOpts) + if time.Since(t) > *profileTime { + break + } + } + if err := stopProfile(); err != nil { + log.Fatal(err) + } + } + if err != nil { log.Fatal(err) } diff --git a/deps.bzl b/deps.bzl index 5fb163dec..b3ea8bce7 100644 --- a/deps.bzl +++ b/deps.bzl @@ -170,6 +170,13 @@ def go_dependencies(): sum = "h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=", version = "v2.2.0", ) + go_repository( + name = "com_github_chzyer_logex", + build_file_proto_mode = "disable_global", + importpath = "github.com/chzyer/logex", + sum = "h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE=", + version = "v1.1.10", + ) go_repository( name = "com_github_chzyer_readline", @@ -178,6 +185,13 @@ def go_dependencies(): sum = "h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=", version = "v1.5.1", ) + go_repository( + name = "com_github_chzyer_test", + build_file_proto_mode = "disable_global", + importpath = "github.com/chzyer/test", + sum = "h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8=", + version = "v0.0.0-20180213035817-a1ea475d72b1", + ) go_repository( name = "com_github_client9_misspell", @@ -346,6 +360,14 @@ def go_dependencies(): sum = "h1:Q7juDM0QtcnhCpeyLGQKyg4TOIghuNXrkL32pHAUMxo=", version = "v1.1.0", ) + go_repository( + name = "com_github_felixge_fgprof", + build_file_proto_mode = "disable_global", + importpath = "github.com/felixge/fgprof", + sum = "h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=", + version = "v0.9.3", + ) + go_repository( name = "com_github_flosch_pongo2_v4", build_file_proto_mode = "disable_global", diff --git a/go.mod b/go.mod index 234ad611f..003a741ab 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/andygrunwald/go-gerrit v0.0.0-20230628115649-c44fe2fbf2ca github.com/bmatcuk/doublestar v1.3.4 github.com/edsrzf/mmap-go v1.1.0 + github.com/felixge/fgprof v0.9.3 github.com/fsnotify/fsnotify v1.6.0 github.com/gfleury/go-bitbucket-v1 v0.0.0-20230626192437-8d7be5866751 github.com/go-enry/go-enry/v2 v2.8.4 diff --git a/go.sum b/go.sum index 6c7939b2f..2635368fc 100644 --- a/go.sum +++ b/go.sum @@ -46,6 +46,9 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudflare/circl v1.3.3 h1:fE/Qz0QdIGqeWfnwq0RE0R7MI51s0M2E4Ga9kq5AEMs= github.com/cloudflare/circl v1.3.3/go.mod h1:5XYMA4rFBvNIrhs50XuiBJ15vF2pZn4nnUKZrLbUZFA= @@ -81,6 +84,8 @@ github.com/envoyproxy/protoc-gen-validate v0.10.1 h1:c0g45+xCJhdgFGw7a5QAfdS4byA github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= +github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g= +github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= @@ -155,6 +160,7 @@ github.com/google/go-github/v27 v27.0.6/go.mod h1:/0Gr8pJ55COkmv+S/yPKCczSkUPIM/ github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= +github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 h1:hR7/MlvK23p6+lIw9SN1TigNLn9ZnF3W4SYRKq2gAHs= github.com/google/pprof v0.0.0-20230602150820-91b7bce49751/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA= github.com/google/s2a-go v0.1.4 h1:1kZ/sQM3srePvKs3tXAvQzo66XfcReoqFpIpIccE7Oc= @@ -180,6 +186,7 @@ github.com/hashicorp/go-hclog v0.16.2 h1:K4ev2ib4LdQETX5cSZBG0DVLk1jwGqSPXBjdah3 github.com/hashicorp/go-hclog v0.16.2/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= github.com/hashicorp/go-retryablehttp v0.7.4 h1:ZQgVdpTdAL7WpMIwLzCfbalOcSUdkDZnpUv3/+BxzFA= github.com/hashicorp/go-retryablehttp v0.7.4/go.mod h1:Jy/gPYAdjqffZ/yFGCFV2doI5wjtH1ewM9u8iYVjtX8= +github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= @@ -427,6 +434,7 @@ golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= From e1ff3ded2846dd80304ad73d8e40ab2e45e70026 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 13 Jul 2023 17:03:08 +0200 Subject: [PATCH 3/6] zoekt: output run count summary --- cmd/zoekt/main.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cmd/zoekt/main.go b/cmd/zoekt/main.go index d14871656..a8dd219f2 100644 --- a/cmd/zoekt/main.go +++ b/cmd/zoekt/main.go @@ -166,9 +166,14 @@ func main() { if err := pprof.StartCPUProfile(f); err != nil { log.Fatal(err) } + count := 0 for { sres, _ = searcher.Search(context.Background(), q, &sOpts) - if time.Since(t) > *profileTime { + count++ + if elapsed := time.Since(t); elapsed > *profileTime { + if *verbose { + log.Printf("ran %d times in %v (%f searches/s)", count, elapsed, float64(count)/elapsed.Seconds()) + } break } } @@ -189,9 +194,14 @@ func main() { t := time.Now() stopProfile := fgprof.Start(f, fgprof.FormatPprof) + count := 0 for { sres, _ = searcher.Search(context.Background(), q, &sOpts) - if time.Since(t) > *profileTime { + count++ + if elapsed := time.Since(t); elapsed > *profileTime { + if *verbose { + log.Printf("ran %d times in %v (%f searches/s)", count, elapsed, float64(count)/elapsed.Seconds()) + } break } } From 076d554b7f78625e6f85732bbd7a070add1806c1 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 13 Jul 2023 17:03:25 +0200 Subject: [PATCH 4/6] make it possible to use in memory map --- inmemoffset.go | 16 ++++++++++++++++ read.go | 28 +++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 inmemoffset.go diff --git a/inmemoffset.go b/inmemoffset.go new file mode 100644 index 000000000..3d3a6a9fd --- /dev/null +++ b/inmemoffset.go @@ -0,0 +1,16 @@ +package zoekt + +type inMemoryNgrams map[ngram]simpleSection + +func (m inMemoryNgrams) Get(gram ngram) simpleSection { + ss, _ := m[gram] + return ss +} + +func (m inMemoryNgrams) DumpMap() map[ngram]simpleSection { + return map[ngram]simpleSection(m) +} + +func (m inMemoryNgrams) SizeBytes() int { + return 0 // a bit complicated to calculate for real +} diff --git a/read.go b/read.go index 9f49b3b6a..6383a02ce 100644 --- a/read.go +++ b/read.go @@ -288,7 +288,13 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { return nil, err } - if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { + if os.Getenv("ZOEKT_INMEM") != "" { + offsetMap, err := d.readInMemoryNgrams(toc) + if err != nil { + return nil, err + } + d.ngrams = offsetMap + } else if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { offsetMap, err := d.readNgrams(toc) if err != nil { return nil, err @@ -507,6 +513,26 @@ func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSecti return bi, nil } +func (d *indexData) readInMemoryNgrams(toc *indexTOC) (inMemoryNgrams, error) { + textContent, err := d.readSectionBlob(toc.ngramText) + if err != nil { + return nil, err + } + postingsIndex := toc.postings.relativeIndex() + + ngrams := make(inMemoryNgrams, len(textContent)/ngramEncoding) + for i := 0; i < len(textContent); i += ngramEncoding { + j := i / ngramEncoding + ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) + ngrams[ng] = simpleSection{ + toc.postings.data.off + postingsIndex[j], + postingsIndex[j+1] - postingsIndex[j], + } + } + + return ngrams, nil +} + func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) { nameNgramText, err := d.readSectionBlob(toc.nameNgramText) if err != nil { From b1b0f77367bb9d5fe347b3ad452d7a5abce97170 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 14 Jul 2023 15:14:17 +0200 Subject: [PATCH 5/6] sort ngrams before looking them up We believe this will improve performance of the btree lookups. Test Plan: go test ./... --- bits.go | 15 +++++---------- go.mod | 1 + go.sum | 2 ++ indexdata.go | 35 +++++++++++++++++++++++------------ 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/bits.go b/bits.go index 7539686a4..649088c8a 100644 --- a/bits.go +++ b/bits.go @@ -106,10 +106,9 @@ func (n ngram) String() string { } type runeNgramOff struct { - ngram ngram - byteSize uint32 // size of ngram - byteOff uint32 - runeOff uint32 + ngram ngram + // index is the original index inside of the returned array of splitNGrams + index uint32 } func splitNGrams(str []byte) []runeNgramOff { @@ -120,9 +119,7 @@ func splitNGrams(str []byte) []runeNgramOff { result := make([]runeNgramOff, 0, len(str)) var i uint32 - chars := -1 for len(str) > 0 { - chars++ r, sz := utf8.DecodeRune(str) str = str[sz:] runeGram[0] = runeGram[1] @@ -139,10 +136,8 @@ func splitNGrams(str []byte) []runeNgramOff { ng := runesToNGram(runeGram) result = append(result, runeNgramOff{ - ngram: ng, - byteSize: i - off[0], - byteOff: off[0], - runeOff: uint32(chars), + ngram: ng, + index: uint32(len(result)), }) } return result diff --git a/go.mod b/go.mod index 003a741ab..ff9dddd45 100644 --- a/go.mod +++ b/go.mod @@ -46,6 +46,7 @@ require ( go.opentelemetry.io/otel/trace v1.16.0 go.uber.org/atomic v1.11.0 go.uber.org/automaxprocs v1.5.2 + golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 golang.org/x/net v0.11.0 golang.org/x/oauth2 v0.9.0 golang.org/x/sync v0.3.0 diff --git a/go.sum b/go.sum index 2635368fc..9d621e029 100644 --- a/go.sum +++ b/go.sum @@ -368,6 +368,8 @@ golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw= +golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= diff --git a/indexdata.go b/indexdata.go index 02c59fb4a..8968c61f9 100644 --- a/indexdata.go +++ b/indexdata.go @@ -23,6 +23,7 @@ import ( "unicode/utf8" "github.com/sourcegraph/zoekt/query" + "golang.org/x/exp/slices" ) // indexData holds the pattern-independent data that we have to have @@ -388,6 +389,11 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult // Find the 2 least common ngrams from the string. ngramOffs := splitNGrams([]byte(query.Pattern)) + // PERF: Sort to increase the chances adjacent checks are in the same btree + // bucket (which can cause disk IO). + slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { + return a.ngram < b.ngram + }) frequencies := make([]uint32, 0, len(ngramOffs)) ngramLookups := 0 for _, o := range ngramOffs { @@ -415,18 +421,22 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult frequencies = append(frequencies, freq) } - firstI := firstMinarg(frequencies) - frequencies[firstI] = maxUInt32 - lastI := lastMinarg(frequencies) - if firstI > lastI { - lastI, firstI = firstI, lastI + + var first, last runeNgramOff + { + firstI := firstMinarg(frequencies) + frequencies[firstI] = maxUInt32 + lastI := lastMinarg(frequencies) + first = ngramOffs[firstI] + last = ngramOffs[lastI] + if first.index > last.index { + last, first = first, last + } } - firstNG := ngramOffs[firstI].ngram - lastNG := ngramOffs[lastI].ngram iter := &ngramDocIterator{ - leftPad: firstI, - rightPad: uint32(utf8.RuneCountInString(str)) - firstI, + leftPad: first.index, + rightPad: uint32(utf8.RuneCountInString(str)) - first.index, ngramLookups: ngramLookups, } if query.FileName { @@ -435,15 +445,16 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult iter.ends = d.fileEndRunes } - if firstI != lastI { - i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName) + if first != last { + runeDist := last.index - first.index + i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) if err != nil { return nil, err } iter.iter = i } else { - hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName) + hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) if err != nil { return nil, err } From a6e726c0b599a985dbc963cbe2106c4cbb205179 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 14 Jul 2023 15:14:46 +0200 Subject: [PATCH 6/6] maybe: use sort.Search in btree inner nodes keys only has a length of 50, so might not be faster. --- btree.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/btree.go b/btree.go index 72f097ac3..70bf73311 100644 --- a/btree.go +++ b/btree.go @@ -208,12 +208,13 @@ func (n *innerNode) insert(ng ngram, opts btreeOpts) { // See btree.find func (n *innerNode) find(ng ngram) (int, int) { - for i, k := range n.keys { - if ng < k { - return n.children[i].find(ng) - } + i := sort.Search(len(n.keys), func(i int) bool { + return ng < n.keys[i] + }) + if i >= len(n.children) { + i = len(n.children) - 1 } - return n.children[len(n.children)-1].find(ng) + return n.children[i].find(ng) } // See btree.find