From 0c49f18829ee89cbabb64e4b2788af5f15f5ea91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Eeden?= Date: Wed, 14 Jun 2023 12:39:08 +0200 Subject: [PATCH] charset: alias utf8 and utf8mb3 (#44655) close pingcap/tidb#26226 --- parser/charset/BUILD.bazel | 2 +- parser/charset/charset.go | 30 ++++++++++++++++++++++++++---- parser/charset/charset_test.go | 28 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/parser/charset/BUILD.bazel b/parser/charset/BUILD.bazel index f8fa92e88dc82..c437ee959cf28 100644 --- a/parser/charset/BUILD.bazel +++ b/parser/charset/BUILD.bazel @@ -42,7 +42,7 @@ go_test( ], embed = [":charset"], flaky = True, - shard_count = 7, + shard_count = 8, deps = [ "@com_github_stretchr_testify//require", "@org_golang_x_text//transform", diff --git a/parser/charset/charset.go b/parser/charset/charset.go index 6067e4f623424..668acefcac239 100644 --- a/parser/charset/charset.go +++ b/parser/charset/charset.go @@ -106,8 +106,8 @@ func GetSupportedCollations() []*Collation { // and returns a boolean. func ValidCharsetAndCollation(cs string, co string) bool { // We will use utf8 as a default charset. - if cs == "" { - cs = "utf8" + if cs == "" || cs == CharsetUTF8MB3 { + cs = CharsetUTF8 } chs, err := GetCharsetInfo(cs) if err != nil { @@ -117,7 +117,7 @@ func ValidCharsetAndCollation(cs string, co string) bool { if co == "" { return true } - co = strings.ToLower(co) + co = utf8Alias(strings.ToLower(co)) _, ok := chs.Collations[co] return ok } @@ -125,6 +125,8 @@ func ValidCharsetAndCollation(cs string, co string) bool { // GetDefaultCollationLegacy is compatible with the charset support in old version parser. func GetDefaultCollationLegacy(charset string) (string, error) { switch strings.ToLower(charset) { + case CharsetUTF8MB3: + return GetDefaultCollation(CharsetUTF8) case CharsetUTF8, CharsetUTF8MB4, CharsetASCII, CharsetLatin1, CharsetBin: return GetDefaultCollation(charset) default: @@ -148,6 +150,10 @@ func GetDefaultCharsetAndCollate() (defaultCharset string, defaultCollationName // GetCharsetInfo returns charset and collation for cs as name. func GetCharsetInfo(cs string) (*Charset, error) { + if strings.ToLower(cs) == CharsetUTF8MB3 { + cs = CharsetUTF8 + } + if c, ok := CharacterSetInfos[strings.ToLower(cs)]; ok { return c, nil } @@ -180,9 +186,23 @@ func GetCollations() []*Collation { return collations } +func utf8Alias(csname string) string { + switch csname { + case "utf8mb3_bin": + csname = "utf8_bin" + case "utf8mb3_unicode_ci": + csname = "utf8_unicode_ci" + case "utf8mb3_general_ci": + csname = "utf8_general_ci" + default: + } + return csname +} + // GetCollationByName returns the collation by name. func GetCollationByName(name string) (*Collation, error) { - collation, ok := collationsNameMap[strings.ToLower(name)] + csname := utf8Alias(strings.ToLower(name)) + collation, ok := collationsNameMap[csname] if !ok { return nil, ErrUnknownCollation.GenWithStackByArgs(name) } @@ -225,6 +245,8 @@ const ( CharsetLatin1 = "latin1" // CharsetUTF8 is the default charset for string types. CharsetUTF8 = "utf8" + // CharsetUTF8MB3 is 3 bytes utf8, a MySQL legacy encoding. "utf8" and "utf8mb3" are aliases. + CharsetUTF8MB3 = "utf8mb3" // CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go. CharsetUTF8MB4 = "utf8mb4" //revive:disable:exported diff --git a/parser/charset/charset_test.go b/parser/charset/charset_test.go index 6de594c68c54d..9f17b9588b37c 100644 --- a/parser/charset/charset_test.go +++ b/parser/charset/charset_test.go @@ -44,6 +44,10 @@ func TestValidCharset(t *testing.T) { {"UTF8MB4", "UTF8MB4_bin", true}, {"UTF8MB4", "UTF8MB4_general_ci", true}, {"Utf8", "uTf8_bIN", true}, + {"utf8mb3", "", true}, + {"utf8mb3", "utf8mb3_bin", true}, + {"utf8mb3", "utf8mb3_general_ci", true}, + {"utf8mb3", "utf8mb3_unicode_ci", true}, } for _, tt := range tests { testValidCharset(t, tt.cs, tt.co, tt.succ) @@ -145,6 +149,30 @@ func TestValidCustomCharset(t *testing.T) { } } +func TestUTF8MB3(t *testing.T) { + colname, err := GetDefaultCollationLegacy("utf8mb3") + require.NoError(t, err) + require.Equal(t, colname, "utf8_bin") + + csinfo, err := GetCharsetInfo("utf8mb3") + require.NoError(t, err) + require.Equal(t, csinfo.Name, "utf8") + + tests := []struct { + cs string + alias string + }{ + {"utf8mb3_bin", "utf8_bin"}, + {"utf8mb3_general_ci", "utf8_general_ci"}, + {"utf8mb3_unicode_ci", "utf8_unicode_ci"}, + } + for _, tt := range tests { + col, err := GetCollationByName(tt.cs) + require.NoError(t, err) + require.Equal(t, col.Name, tt.alias) + } +} + func BenchmarkGetCharsetDesc(b *testing.B) { b.ResetTimer() charsets := []string{CharsetUTF8, CharsetUTF8MB4, CharsetASCII, CharsetLatin1, CharsetBin}