From 278eaf1c2264390c21e3055e46df0fb0f5e47a8e Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 17 Apr 2019 13:54:34 +0200
Subject: [PATCH 1/7] tokenizer: move flex-based to modules

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_c.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/tokenizer/tokenize_c.go b/internal/tokenizer/tokenize_c.go
index 2d640abc..e3a68f2d 100644
--- a/internal/tokenizer/tokenize_c.go
+++ b/internal/tokenizer/tokenize_c.go
@@ -2,7 +2,7 @@
 
 package tokenizer
 
-import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
+import "github.com/src-d/enry/v2/internal/tokenizer/flex"
 
 // Tokenize returns lexical tokens from content. The tokens returned match what
 // the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.

From 8bdc830833f299a7173b1855e1688de945b197e5 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 17 Apr 2019 19:28:06 +0200
Subject: [PATCH 2/7] token: new test case with Unicode replacement

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 881df30b..36d45855 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -115,6 +115,13 @@ func TestTokenize(t *testing.T) {
 	}
 }
 
+func TestTokenizerLatin1AsUtf8(t *testing.T) {
+	content := []byte("th\xe5 filling") // `th� filling`
+	t.Logf("%v - %q", content, string(content))
+	tokens := Tokenize(content)
+	require.Equal(t, 3, len(tokens))
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {

From a724a2f8416e6edd6ab1e429fcf5ef452dae6c4d Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 7 May 2019 13:46:36 +0200
Subject: [PATCH 3/7] token: test case for regexp + non-valid UTF8

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 32 +++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 36d45855..4737c8ea 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -119,9 +119,41 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 	content := []byte("th\xe5 filling") // `th� filling`
 	t.Logf("%v - %q", content, string(content))
 	tokens := Tokenize(content)
+	for i, token := range tokens {
+		t.Logf("token %d, %s", i+1, token)
+	}
 	require.Equal(t, 3, len(tokens))
 }
 
+func TestRegexpOnInvalidUtf8(t *testing.T) {
+	origContent := []struct {
+		bytes  []byte
+		tokens []string
+	}{
+		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `th� filling`
+		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
+		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+	}
+	re := reRegularToken
+
+	for _, content := range origContent {
+		t.Run("", func(t *testing.T) {
+			t.Logf("%v - %q", content, string(content.bytes))
+
+			tokens := re.FindAll(content.bytes, -1)
+			require.Equal(t, len(content.tokens), len(tokens))
+
+			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			t.Logf("content:%q, tokens:[", newContent)
+			for i, token := range tokens {
+				t.Logf("\t%q,", string(token))
+				require.Equal(t, content.tokens[i], string(token))
+			}
+			t.Logf(" ]\n")
+		})
+	}
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
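A note on the two test patches above: the splitting behavior that TestRegexpOnInvalidUtf8 pins down can be reproduced with the standard library alone, since Go's regexp engine decodes each invalid UTF-8 byte as U+FFFD. A minimal sketch, using an illustrative stand-in pattern (the actual reRegularToken expression may differ):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Illustrative stand-in for reRegularToken; the real pattern differs.
	re := regexp.MustCompile(`[0-9A-Za-z_]+`)

	// "\xe0" is a bare Latin-1 byte and therefore invalid UTF-8. The regexp
	// engine decodes it as U+FFFD, which falls outside the character class
	// above, so the byte acts as a separator and the match splits in two,
	// consistent with the {"th", "filling"} fixture in TestRegexpOnInvalidUtf8.
	content := []byte("th\xe0 filling")
	for _, tok := range re.FindAll(content, -1) {
		fmt.Printf("%q\n", tok) // "th", then "filling"
	}
}
```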
From 48fc84a5553559dc47bdf26c0768e3bbf77a670f Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 15:26:22 +0200
Subject: [PATCH 4/7] ci: bump oniguruma version v5.x -> v6.9.1

Signed-off-by: Alexander Bezzubov
---
 .travis.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 70c82de0..004bc52f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,5 @@
 dist: trusty
-
 language: go
-
 go:
   - '1.12.x'
   - '1.11.x'
@@ -10,17 +8,13 @@ env:
   global:
     - GO_VERSION_FOR_JVM='1.11.x'
     - CGO_ENABLED=0
     - GO111MODULE=on
+    - ONIGURUMA_VERSION='6.9.1'
   matrix:
     - ONIGURUMA=0
     - ONIGURUMA=1
 matrix:
   fast_finish: true
-addons:
-  apt:
-    packages:
-      - libonig-dev
-
 stages:
   - name: test
   - name: release
@@ -31,9 +25,15 @@ jobs:
       stage: test
       install:
         - >
          if [[ "${ONIGURUMA}" -gt 0 ]]; then
-            export CGO_ENABLED=1;
-            export GO_TAGS='oniguruma';
+            export CGO_ENABLED=1
+            export GO_TAGS='oniguruma'
+            # install oniguruma manually as trusty has only ancient 5.x
+            sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627
+            wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
+            sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
+            wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
+            sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
          fi;
       script:
         - make test-coverage

From fb267d3aff20f91958949cb337f8cda25fdf7af0 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 15:29:36 +0200
Subject: [PATCH 5/7] bump src-d/go-oniguruma to v1.1.0

A result of:
```
go get github.com/src-d/go-oniguruma
go mod tidy
```

Signed-off-by: Alexander Bezzubov
---
 go.mod | 2 +-
 go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index 61916f57..52578103 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/src-d/enry/v2
 go 1.12
 
 require (
-	github.com/src-d/go-oniguruma v1.0.0
+	github.com/src-d/go-oniguruma v1.1.0
 	github.com/stretchr/testify v1.3.0
 	github.com/toqueteos/trie v1.0.0 // indirect
 	gopkg.in/toqueteos/substring.v1 v1.0.2
diff --git a/go.sum b/go.sum
index 418720b8..14c3ce6b 100644
--- a/go.sum
+++ b/go.sum
@@ -2,8 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/src-d/go-oniguruma v1.0.0 h1:JDk5PUAjreGsGAKLsoDLNmrsaryjJ5RqT3h+Si6aw/E=
-github.com/src-d/go-oniguruma v1.0.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
+github.com/src-d/go-oniguruma v1.1.0 h1:EG+Nm5n2JqWUaCjtM0NtutPxU7ZN5Tp50GWrrV8bTww=
+github.com/src-d/go-oniguruma v1.1.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
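For context before the next patch: compiling patterns with the ASCII encoding makes oniguruma match byte-wise instead of decoding the input as UTF-8, which lines up with the invalid-UTF-8 fixtures tested above. A minimal sketch, assuming go-oniguruma v1.1.0 mirrors the standard regexp API for FindAll and that liboniguruma is installed (the pattern is illustrative; building requires cgo):

```go
package main

import (
	"fmt"

	rubex "github.com/src-d/go-oniguruma"
)

func main() {
	// MustCompileASCII, which the patch below switches to, compiles the
	// pattern with the ASCII encoding, so oniguruma scans bytes directly
	// and is not tripped up by input that is not valid UTF-8, such as the
	// bare Latin-1 byte "\xe0" here.
	re := rubex.MustCompileASCII(`[0-9A-Za-z_]+`) // illustrative pattern

	fmt.Printf("%q\n", re.FindAll([]byte("th\xe0 filling"), -1)) // ["th" "filling"]
}
```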
From 9a7b370b1792da936415c300b0a7843f01603917 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 15:31:55 +0200
Subject: [PATCH 6/7] regex: in oniguruma profile, switch to ASCII matching

Signed-off-by: Alexander Bezzubov
---
 regex/oniguruma.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regex/oniguruma.go b/regex/oniguruma.go
index 2083569e..8caf644c 100644
--- a/regex/oniguruma.go
+++ b/regex/oniguruma.go
@@ -9,7 +9,7 @@ import (
 type EnryRegexp = *rubex.Regexp
 
 func MustCompile(str string) EnryRegexp {
-	return rubex.MustCompile(str)
+	return rubex.MustCompileASCII(str)
 }
 
 func QuoteMeta(s string) string {

From f3ceaa6330cebaf43b97bdf31ba67f5e3132f295 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 8 May 2019 22:17:32 +0200
Subject: [PATCH 7/7] token: refactor & simplify test fixtures

Signed-off-by: Alexander Bezzubov
---
 internal/tokenizer/tokenize_test.go | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/internal/tokenizer/tokenize_test.go b/internal/tokenizer/tokenize_test.go
index 4737c8ea..36378ef6 100644
--- a/internal/tokenizer/tokenize_test.go
+++ b/internal/tokenizer/tokenize_test.go
@@ -127,23 +127,23 @@ func TestTokenizerLatin1AsUtf8(t *testing.T) {
 
 func TestRegexpOnInvalidUtf8(t *testing.T) {
 	origContent := []struct {
-		bytes  []byte
+		text   string
 		tokens []string
 	}{
-		{[]byte("th\xe0 filling"), []string{"th", "filling"}},   // `th� filling`
-		{[]byte("th\u0100 filling"), []string{"th", "filling"}}, // `thĀ filling`
-		{[]byte("привет, как дела?"), []string{}},               // empty, no ASCII tokens
+		{"th\xe0 filling", []string{"th", "filling"}},   // `th� filling`
+		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
+		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
 	}
 	re := reRegularToken
 
 	for _, content := range origContent {
 		t.Run("", func(t *testing.T) {
-			t.Logf("%v - %q", content, string(content.bytes))
-
-			tokens := re.FindAll(content.bytes, -1)
+			t.Logf("%v - %q", content, content.text)
+			input := []byte(content.text)
+			tokens := re.FindAll(input, -1)
 			require.Equal(t, len(content.tokens), len(tokens))
 
-			newContent := re.ReplaceAll(content.bytes, []byte(` `))
+			newContent := re.ReplaceAll(input, []byte(` `))
 			t.Logf("content:%q, tokens:[", newContent)
 			for i, token := range tokens {
 				t.Logf("\t%q,", string(token))