From 0e19915d63173b74c0567d403170c722f909276e Mon Sep 17 00:00:00 2001 From: psadac Date: Sat, 21 Sep 2024 21:41:05 +0200 Subject: [PATCH 1/5] Update CI to use 1.23 --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fbfb468..01c24d7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,16 +4,16 @@ jobs: test: strategy: matrix: - go-version: [1.19.x, 1.20.x] + go-version: [1.22.x, 1.23.x] os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v5 with: go-version: ${{ matrix.go-version }} - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Lint run: make lint - name: Test From 52114d15083fc4cc22376526f6145814d7974cfe Mon Sep 17 00:00:00 2001 From: psadac Date: Sat, 21 Sep 2024 22:06:49 +0200 Subject: [PATCH 2/5] Update dgryski/trifles to latest version --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 4fcfe43..fe647dc 100644 --- a/go.mod +++ b/go.mod @@ -4,5 +4,5 @@ go 1.13 require ( github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 - github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 + github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 ) diff --git a/go.sum b/go.sum index 74d92aa..4cd4723 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,4 @@ github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= -github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= -github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= 
+github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= +github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= From 97d5a6d8495fead2efda1d2007f67b6944401fa6 Mon Sep 17 00:00:00 2001 From: psadac Date: Sat, 21 Sep 2024 22:38:14 +0200 Subject: [PATCH 3/5] Use min() builtin function, update Go minimum version to 1.21 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchstat does not indicate any performance regression. goos: linux goarch: amd64 pkg: github.com/agnivade/levenshtein cpu: AMD Ryzen 7 7840U w/ Radeon 780M Graphics │ before.txt │ after.txt │ │ sec/op │ sec/op vs base │ Simple/ASCII-16 134.3n ± 0% 134.5n ± 0% ~ (p=0.135 n=20) Simple/French-16 253.8n ± 0% 253.5n ± 0% ~ (p=0.155 n=20) Simple/Nordic-16 499.4n ± 0% 498.1n ± 0% -0.26% (p=0.005 n=20) Simple/long_string-16 1.853µ ± 0% 1.852µ ± 0% ~ (p=0.878 n=20) Simple/Tibetan-16 414.3n ± 0% 414.4n ± 0% ~ (p=0.952 n=20) All/ASCII/agniva-16 136.4n ± 0% 136.4n ± 0% ~ (p=0.941 n=20) All/ASCII/arbovm-16 193.0n ± 0% 192.6n ± 0% ~ (p=0.151 n=20) All/ASCII/dgryski-16 198.6n ± 0% 198.8n ± 1% ~ (p=0.292 n=20) All/French/agniva-16 255.9n ± 0% 255.9n ± 0% ~ (p=0.899 n=20) All/French/arbovm-16 332.1n ± 0% 331.6n ± 0% ~ (p=0.857 n=20) All/French/dgryski-16 333.2n ± 0% 333.1n ± 0% ~ (p=0.732 n=20) All/Nordic/agniva-16 500.1n ± 0% 499.9n ± 0% ~ (p=0.673 n=20) All/Nordic/arbovm-16 604.1n ± 0% 606.6n ± 0% +0.41% (p=0.041 n=20) All/Nordic/dgryski-16 612.4n ± 0% 612.4n ± 0% ~ (p=0.984 n=20) All/Tibetan/agniva-16 414.4n ± 0% 414.5n ± 0% ~ (p=0.753 n=20) All/Tibetan/arbovm-16 501.4n ± 1% 499.1n ± 1% ~ (p=0.262 n=20) All/Tibetan/dgryski-16 504.9n ± 0% 504.8n ± 0% ~ (p=0.888 n=20) geomean 365.7n 365.6n -0.03% --- go.mod | 2 +- levenshtein.go | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index fe647dc..f71f642 100644 --- a/go.mod +++ 
b/go.mod @@ -1,6 +1,6 @@ module github.com/agnivade/levenshtein -go 1.13 +go 1.21 require ( github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 diff --git a/levenshtein.go b/levenshtein.go index f727a66..260b69c 100644 --- a/levenshtein.go +++ b/levenshtein.go @@ -71,7 +71,7 @@ func ComputeDistance(a, b string) int { for j := 1; j <= lenS1; j++ { current := x[j-1] // match if s2[i-1] != s1[j-1] { - current = min(min(x[j-1]+1, prev+1), x[j]+1) + current = min(x[j-1]+1, prev+1, x[j]+1) } x[j-1] = prev prev = current @@ -80,10 +80,3 @@ func ComputeDistance(a, b string) int { } return int(x[lenS1]) } - -func min(a, b uint16) uint16 { - if a < b { - return a - } - return b -} From b365fba2205a2d1cd852a4f10a0429c36c66800c Mon Sep 17 00:00:00 2001 From: psadac Date: Sun, 22 Sep 2024 11:13:50 +0200 Subject: [PATCH 4/5] Remove leading and trailing identical runes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmarks show approximately a 30% improvement. It's probably overestimated due to the lack of randomness of test inputs. 
goos: linux goarch: amd64 pkg: github.com/agnivade/levenshtein cpu: AMD Ryzen 7 7840U w/ Radeon 780M Graphics │ before.txt │ after.txt │ │ sec/op │ sec/op vs base │ Simple/ASCII-16 133.80n ± 1% 78.92n ± 0% -41.02% (p=0.000 n=20) Simple/French-16 253.8n ± 0% 128.1n ± 0% -49.50% (p=0.000 n=20) Simple/Nordic-16 494.8n ± 0% 205.8n ± 0% -58.41% (p=0.000 n=20) Simple/long_string-16 1847.5n ± 0% 208.2n ± 0% -88.73% (p=0.000 n=20) Simple/Tibetan-16 410.5n ± 0% 277.8n ± 1% -32.34% (p=0.000 n=20) All/ASCII/agniva-16 135.30n ± 0% 79.38n ± 0% -41.33% (p=0.000 n=20) All/ASCII/arbovm-16 192.0n ± 0% 191.0n ± 1% -0.52% (p=0.015 n=20) All/ASCII/dgryski-16 198.4n ± 0% 196.0n ± 0% -1.21% (p=0.000 n=20) All/French/agniva-16 253.4n ± 0% 128.8n ± 0% -49.16% (p=0.000 n=20) All/French/arbovm-16 330.5n ± 0% 319.7n ± 0% -3.25% (p=0.000 n=20) All/French/dgryski-16 331.2n ± 0% 332.2n ± 0% ~ (p=0.092 n=20) All/Nordic/agniva-16 495.2n ± 0% 206.9n ± 0% -58.21% (p=0.000 n=20) All/Nordic/arbovm-16 600.2n ± 0% 588.1n ± 0% -2.01% (p=0.000 n=20) All/Nordic/dgryski-16 609.2n ± 0% 607.8n ± 0% -0.24% (p=0.020 n=20) All/Tibetan/agniva-16 409.2n ± 0% 275.8n ± 0% -32.60% (p=0.000 n=20) All/Tibetan/arbovm-16 497.5n ± 1% 483.9n ± 0% -2.74% (p=0.000 n=20) All/Tibetan/dgryski-16 503.9n ± 1% 498.1n ± 0% -1.15% (p=0.000 n=20) geomean 363.5n 237.3n -34.71% │ before.txt │ after.txt │ │ B/op │ B/op vs base │ Simple/ASCII-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/French-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Nordic-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/long_string-16 464.0 ± 0% 368.0 ± 0% -20.69% (p=0.000 n=20) Simple/Tibetan-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/ASCII/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/ASCII/arbovm-16 96.00 ± 0% 96.00 ± 0% ~ (p=1.000 n=20) ¹ All/ASCII/dgryski-16 96.00 ± 0% 96.00 ± 0% ~ (p=1.000 n=20) ¹ All/French/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/French/arbovm-16 128.0 ± 0% 128.0 ± 0% ~ (p=1.000 n=20) ¹ 
All/French/dgryski-16 128.0 ± 0% 128.0 ± 0% ~ (p=1.000 n=20) ¹ All/Nordic/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/Nordic/arbovm-16 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=20) ¹ All/Nordic/dgryski-16 192.0 ± 0% 192.0 ± 0% ~ (p=1.000 n=20) ¹ All/Tibetan/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/Tibetan/arbovm-16 160.0 ± 0% 160.0 ± 0% ~ (p=1.000 n=20) ¹ All/Tibetan/dgryski-16 160.0 ± 0% 160.0 ± 0% ~ (p=1.000 n=20) ¹ geomean ² -1.35% ² ¹ all samples are equal ² summaries must be >0 to compute geomean │ before.txt │ after.txt │ │ allocs/op │ allocs/op vs base │ Simple/ASCII-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/French-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Nordic-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/long_string-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Tibetan-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/ASCII/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/ASCII/arbovm-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/ASCII/dgryski-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/French/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/French/arbovm-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/French/dgryski-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/Nordic/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/Nordic/arbovm-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/Nordic/dgryski-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/Tibetan/agniva-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ All/Tibetan/arbovm-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ All/Tibetan/dgryski-16 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=20) ¹ geomean ² -2.36% ² ¹ all samples are equal ² summaries must be >0 to compute geomean --- levenshtein.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/levenshtein.go b/levenshtein.go index 260b69c..861f409 100644 --- a/levenshtein.go +++ b/levenshtein.go @@ -41,6 +41,25 @@ func ComputeDistance(a, b string) int { if len(s1) > len(s2) { s1, 
s2 = s2, s1 } + + // Remove trailing identical runes. + for i := 0; i < len(s1); i++ { + if s1[len(s1)-1-i] != s2[len(s2)-1-i] { + s1 = s1[:len(s1)-i] + s2 = s2[:len(s2)-i] + break + } + } + + // Remove leading identical runes. + for i := 0; i < len(s1); i++ { + if s1[i] != s2[i] { + s1 = s1[i:] + s2 = s2[i:] + break + } + } + lenS1 := len(s1) lenS2 := len(s2) From 1a1899bb57809ca06649253ad5a51a0d7ba9f01e Mon Sep 17 00:00:00 2001 From: psadac Date: Mon, 23 Sep 2024 21:07:30 +0200 Subject: [PATCH 5/5] Add tests on long strings with few different characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmarks are run before and after optimization "Remove leading and trailing identical runes". Long strings with differences at the beginning (long_lead), in the middle (long_middle) or at the end (long_trail) show significant improvements in processing time and memory allocations. When the optimization is ineffective due to different leading and trailing characters (long_diff) there is no change in processing time or memory allocation. 
goos: linux goarch: amd64 pkg: github.com/agnivade/levenshtein cpu: AMD Ryzen 7 7840U w/ Radeon 780M Graphics │ before.txt │ after.txt │ │ sec/op │ sec/op vs base │ Simple/ASCII-16 134.20n ± 0% 79.03n ± 0% -41.11% (p=0.000 n=20) Simple/French-16 254.8n ± 0% 129.7n ± 0% -49.09% (p=0.000 n=20) Simple/Nordic-16 500.6n ± 1% 208.0n ± 0% -58.45% (p=0.000 n=20) Simple/Long_lead-16 1862.0n ± 0% 209.6n ± 1% -88.75% (p=0.000 n=20) Simple/Long_middle-16 3613.0n ± 0% 325.0n ± 0% -91.00% (p=0.000 n=20) Simple/Long_trail-16 3911.0n ± 0% 399.0n ± 1% -89.80% (p=0.000 n=20) Simple/Long_diff-16 4.030µ ± 0% 4.029µ ± 1% ~ (p=0.899 n=20) Simple/Tibetan-16 413.0n ± 0% 277.3n ± 0% -32.86% (p=0.000 n=20) geomean 964.6n 299.5n -68.95% │ before.txt │ after.txt │ │ B/op │ B/op vs base │ Simple/ASCII-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/French-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Nordic-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Long_lead-16 464.0 ± 0% 368.0 ± 0% -20.69% (p=0.000 n=20) Simple/Long_middle-16 672.0 ± 0% 544.0 ± 0% -19.05% (p=0.000 n=20) Simple/Long_trail-16 720.0 ± 0% 576.0 ± 0% -20.00% (p=0.000 n=20) Simple/Long_diff-16 720.0 ± 0% 720.0 ± 0% ~ (p=1.000 n=20) ¹ Simple/Tibetan-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ geomean ² -7.99% ² ¹ all samples are equal ² summaries must be >0 to compute geomean │ before.txt │ after.txt │ │ allocs/op │ allocs/op vs base │ Simple/ASCII-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/French-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Nordic-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Long_lead-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Long_middle-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Long_trail-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Long_diff-16 3.000 ± 0% 3.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Tibetan-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ geomean ² -14.11% ² ¹ all samples are equal ² summaries must be >0 to compute geomean --- 
levenshtein_test.go | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/levenshtein_test.go b/levenshtein_test.go index dd296d3..dd3607a 100644 --- a/levenshtein_test.go +++ b/levenshtein_test.go @@ -66,13 +66,35 @@ func BenchmarkSimple(b *testing.B) { name string }{ // ASCII - {"levenshtein", "frankenstein", "ASCII"}, + {a: "levenshtein", b: "frankenstein", name: "ASCII"}, // Testing acutes and umlauts - {"resumé and café", "resumés and cafés", "French"}, - {"Hafþór Júlíus Björnsson", "Hafþor Julius Bjornsson", "Nordic"}, - {"a very long string that is meant to exceed", "another very long string that is meant to exceed", "long string"}, + {a: "resumé and café", b: "resumés and cafés", name: "French"}, + {a: "Hafþór Júlíus Björnsson", b: "Hafþor Julius Bjornsson", name: "Nordic"}, + + // Long strings + { + a: "a very long string that is meant to exceed", + b: "another very long string that is meant to exceed", + name: "Long lead", + }, + { + a: "a very long string with a word in the middle that is different", + b: "a very long string with some text in the middle that is different", + name: "Long middle", + }, + { + a: "a very long string with some text at the end that is not the same", + b: "a very long string with some text at the end that is very different", + name: "Long trail", + }, + { + a: "+a very long string with different leading and trailing characters+", + b: "-a very long string with different leading and trailing characters-", + name: "Long diff", + }, + // Only 2 characters are less in the 2nd string - {"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", "Tibetan"}, + {a: "།་གམ་འས་པ་་མ།", b: "།་གམའས་པ་་མ", name: "Tibetan"}, } tmp := 0 for _, test := range tests {