From 278eaf1c2264390c21e3055e46df0fb0f5e47a8e Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 17 Apr 2019 13:54:34 +0200
Subject: [PATCH 1/7] tokenizer: move flex-based to modules

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_c.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/tokenizer/tokenize_c.go b/internal/tokenizer/tokenize_c.go
index 2d640abc..e3a68f2d 100644
--- a/internal/tokenizer/tokenize_c.go
+++ b/internal/tokenizer/tokenize_c.go
@@ -2,7 +2,7 @@
 
 package tokenizer
 
-import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
+import "github.com/src-d/enry/v2/internal/tokenizer/flex"
 
 // Tokenize returns lexical tokens from content. The tokens returned match what
 // the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.

From 8bdc830833f299a7173b1855e1688de945b197e5 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 17 Apr 2019 19:28:06 +0200
Subject: [PATCH 2/7] token: new test case with Unicode replacement

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 881df30b..36d45855 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -115,6 +115,13 @@ func TestTokenize(t *testing.T) {
 	}
 }
 
+func TestTokenizerLatin1AsUtf8(t *testing.T) {
+	content := []byte("th\xe5 filling") // `th� filling`
+	t.Logf("%v - %q", content, string(content))
+	tokens := Tokenize(content)
+	require.Equal(t, 3, len(tokens))
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {

From a724a2f8416e6edd6ab1e429fcf5ef452dae6c4d Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 7 May 2019 13:46:36 +0200
Subject: [PATCH 3/7] token: test case for regexp + non-valid UTF8

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 32 +++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 36d45855..4737c8ea 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -119,9 +119,41 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 	content := []byte("th\xe5 filling") // `th� filling`
 	t.Logf("%v - %q", content, string(content))
 	tokens := Tokenize(content)
+	for i, token := range tokens {
+		t.Logf("token %d, %s", i+1, token)
+	}
 	require.Equal(t, 3, len(tokens))
 }
 
+func TestRegexpOnInvalidUtf8(t *testing.T) {
+	origContent := []struct {
+		bytes  []byte
+		tokens []string
+	}{
+		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `th� filling`
+		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
+		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+	}
+	re := reRegularToken
+
+	for _, content := range origContent {
+		t.Run("", func(t *testing.T) {
+			t.Logf("%v - %q", content, string(content.bytes))
+
+			tokens := re.FindAll(content.bytes, -1)
+			require.Equal(t, len(content.tokens), len(tokens))
+
+			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			t.Logf("content:%q, tokens:[", newContent)
+			for i, token := range tokens {
+				t.Logf("\t%q,", string(token))
+				require.Equal(t, content.tokens[i], string(token))
+			}
+			t.Logf(" ]\n")
+		})
+	}
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
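A note on the two test patches above: the splitting behavior that TestRegexpOnInvalidUtf8 pins down can be reproduced with the standard library alone, since Go's regexp engine decodes each invalid UTF-8 byte as U+FFFD. A minimal sketch, using an illustrative stand-in pattern (the actual reRegularToken expression may differ):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Illustrative stand-in for reRegularToken; the real pattern differs.
	re := regexp.MustCompile(`[0-9A-Za-z_]+`)

	// "\xe0" is a bare Latin-1 byte and therefore invalid UTF-8. The regexp
	// engine decodes it as U+FFFD, which falls outside the character class
	// above, so the byte acts as a separator and the match splits in two,
	// consistent with the {"th", "filling"} fixture in TestRegexpOnInvalidUtf8.
	content := []byte("th\xe0 filling")
	for _, tok := range re.FindAll(content, -1) {
		fmt.Printf("%q\n", tok) // "th", then "filling"
	}
}
```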
From 48fc84a5553559dc47bdf26c0768e3bbf77a670f Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 15:26:22 +0200
Subject: [PATCH 4/7] ci: bump oniguruma version v5.x -> v6.9.1

Signed-off-by: Alexander Bezzubov
---
 .travis.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 70c82de0..004bc52f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,5 @@
 dist: trusty
-
 language: go
-
 go:
   - '1.12.x'
   - '1.11.x'
@@ -10,17 +8,13 @@ env:
   global:
     - GO_VERSION_FOR_JVM='1.11.x'
     - CGO_ENABLED=0
     - GO111MODULE=on
+    - ONIGURUMA_VERSION='6.9.1'
   matrix:
     - ONIGURUMA=0
     - ONIGURUMA=1
 matrix:
   fast_finish: true
-addons:
-  apt:
-    packages:
-      - libonig-dev
-
 stages:
   - name: test
   - name: release
@@ -31,9 +25,15 @@ jobs:
       stage: test
       install:
         - >
          if [[ "${ONIGURUMA}" -gt 0 ]]; then
-            export CGO_ENABLED=1;
-            export GO_TAGS='oniguruma';
+            export CGO_ENABLED=1
+            export GO_TAGS='oniguruma'
+            # install oniguruma manually as trusty has only ancient 5.x
+            sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627
+            wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
+            sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
+            wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
+            sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
          fi;
       script:
         - make test-coverage

From fb267d3aff20f91958949cb337f8cda25fdf7af0 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 15:29:36 +0200
Subject: [PATCH 5/7] bump src-d/go-oniguruma to v1.1.0

A result of:
```
go get github.com/src-d/go-oniguruma
go mod tidy
```

Signed-off-by: Alexander Bezzubov
---
 go.mod | 2 +-
 go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index 61916f57..52578103 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/src-d/enry/v2
 go 1.12
 
 require (
-	github.com/src-d/go-oniguruma v1.0.0
+	github.com/src-d/go-oniguruma v1.1.0
 	github.com/stretchr/testify v1.3.0
 	github.com/toqueteos/trie v1.0.0 // indirect
 	gopkg.in/toqueteos/substring.v1 v1.0.2
diff --git a/go.sum b/go.sum
index 418720b8..14c3ce6b 100644
--- a/go.sum
+++ b/go.sum
@@ -2,8 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/src-d/go-oniguruma v1.0.0 h1:JDk5PUAjreGsGAKLsoDLNmrsaryjJ5RqT3h+Si6aw/E=
-github.com/src-d/go-oniguruma v1.0.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
+github.com/src-d/go-oniguruma v1.1.0 h1:EG+Nm5n2JqWUaCjtM0NtutPxU7ZN5Tp50GWrrV8bTww=
+github.com/src-d/go-oniguruma v1.1.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
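For context before the next patch: compiling patterns with the ASCII encoding makes oniguruma match byte-wise instead of decoding the input as UTF-8, which lines up with the invalid-UTF-8 fixtures tested above. A minimal sketch, assuming go-oniguruma v1.1.0 mirrors the standard regexp API for FindAll and that liboniguruma is installed (the pattern is illustrative; building requires cgo):

```go
package main

import (
	"fmt"

	rubex "github.com/src-d/go-oniguruma"
)

func main() {
	// MustCompileASCII, which the patch below switches to, compiles the
	// pattern with the ASCII encoding, so oniguruma scans bytes directly
	// and is not tripped up by input that is not valid UTF-8, such as the
	// bare Latin-1 byte "\xe0" here.
	re := rubex.MustCompileASCII(`[0-9A-Za-z_]+`) // illustrative pattern

	fmt.Printf("%q\n", re.FindAll([]byte("th\xe0 filling"), -1)) // ["th" "filling"]
}
```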
From 9a7b370b1792da936415c300b0a7843f01603917 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 15:31:55 +0200
Subject: [PATCH 6/7] regex: in oniguruma profile, switch to ASCII matching

Signed-off-by: Alexander Bezzubov
---
 regex/oniguruma.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regex/oniguruma.go b/regex/oniguruma.go
index 2083569e..8caf644c 100644
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -9,7 +9,7 @@ import (
 type EnryRegexp = *rubex.Regexp
 
 func MustCompile(str string) EnryRegexp {
-	return rubex.MustCompile(str)
+	return rubex.MustCompileASCII(str)
 }
 
 func QuoteMeta(s string) string {

From f3ceaa6330cebaf43b97bdf31ba67f5e3132f295 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 22:17:32 +0200
Subject: [PATCH 7/7] token: refactor & simplify test fixtures

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 4737c8ea..36378ef6 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -127,23 +127,23 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 
 func TestRegexpOnInvalidUtf8(t *testing.T) {
 	origContent := []struct {
-		bytes  []byte
+		text   string
 		tokens []string
 	}{
-		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `th� filling`
-		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
-		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+		{"th\xe0 filling", []string{"th", "filling"}},   // `th� filling`
+		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
+		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
 	}
 	re := reRegularToken
 
 	for _, content := range origContent {
 		t.Run("", func(t *testing.T) {
-			t.Logf("%v - %q", content, string(content.bytes))
-
-			tokens := re.FindAll(content.bytes, -1)
+			t.Logf("%v - %q", content, content.text)
+			input := []byte(content.text)
+			tokens := re.FindAll(input, -1)
 			require.Equal(t, len(content.tokens), len(tokens))
 
-			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			newContent := re.ReplaceAll(input, []byte(` `))
 			t.Logf("content:%q, tokens:[", newContent)
 			for i, token := range tokens {
 				t.Logf("\t%q,", string(token))