Skip to content

Commit

Permalink
[Fix](Nereids) Fix datatype length wrong when string contains chinese (
Browse files Browse the repository at this point in the history
…apache#29885)

When a varchar literal contains Chinese characters, the length of the varchar type should not be the number of 
characters in the literal; it should be the number of bytes the literal actually occupies.
Chinese is represented in Unicode, and a Chinese character occupies at most 4 bytes. So if a varchar literal contains 
Chinese, we set the length to 4 * length.

For example:
>        CREATE MATERIALIZED VIEW test_varchar_literal_mv
>             BUILD IMMEDIATE REFRESH AUTO ON MANUAL
>             DISTRIBUTED BY RANDOM BUCKETS 2
>             PROPERTIES ('replication_num' = '1')
>             AS
>             select case when l_orderkey > 1 then "一二三四" else "五六七八" end as field_1 from lineitem;

mysql> desc test_varchar_literal_mv;
The definition of the materialized view is as follows:
+---------+-------------+------+-------+---------+-------+
| Field   | Type        | Null | Key   | Default | Extra |
+---------+-------------+------+-------+---------+-------+
| field_1 | VARCHAR(16) | No   | false | NULL    | NONE  |
+---------+-------------+------+-------+---------+-------+
  • Loading branch information
seawinde authored Jan 12, 2024
1 parent 7910162 commit 0c5dd1a
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.doris.catalog.AggregateType;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
Expand Down Expand Up @@ -316,6 +317,7 @@
import org.apache.doris.nereids.trees.expressions.literal.MapLiteral;
import org.apache.doris.nereids.trees.expressions.literal.NullLiteral;
import org.apache.doris.nereids.trees.expressions.literal.SmallIntLiteral;
import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral;
import org.apache.doris.nereids.trees.expressions.literal.StringLiteral;
import org.apache.doris.nereids.trees.expressions.literal.StructLiteral;
import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral;
Expand Down Expand Up @@ -406,6 +408,7 @@
import org.apache.doris.nereids.types.coercion.CharacterType;
import org.apache.doris.nereids.util.ExpressionUtils;
import org.apache.doris.nereids.util.RelationUtil;
import org.apache.doris.nereids.util.Utils;
import org.apache.doris.policy.FilterType;
import org.apache.doris.policy.PolicyTypeEnum;
import org.apache.doris.qe.ConnectContext;
Expand Down Expand Up @@ -2104,7 +2107,11 @@ public Literal visitStringLiteral(StringLiteralContext ctx) {
if (!SqlModeHelper.hasNoBackSlashEscapes()) {
s = LogicalPlanBuilderAssistant.escapeBackSlash(s);
}
return new VarcharLiteral(s);
int strLength = Utils.containChinese(s) ? s.length() * StringLikeLiteral.CHINESE_CHAR_BYTE_LENGTH : s.length();
if (strLength > ScalarType.MAX_VARCHAR_LENGTH) {
return new StringLiteral(s);
}
return new VarcharLiteral(s, strLength);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@

import java.util.Objects;

/** StringLikeLiteral. */
/**
* StringLikeLiteral.
*/
public abstract class StringLikeLiteral extends Literal {
public static final int CHINESE_CHAR_BYTE_LENGTH = 4;
public final String value;

public StringLikeLiteral(String value, DataType dataType) {
Expand Down
12 changes: 12 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -277,4 +277,16 @@ public static String normalizeName(String name, String defaultName) {
}
return CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, name);
}

/**
 * Checks whether the given text contains at least one Chinese (Han script) character.
 *
 * <p>The text is scanned by Unicode code point rather than by UTF-16 {@code char}, so Han
 * characters outside the Basic Multilingual Plane (stored as surrogate pairs) are detected too.
 *
 * @param text the text to inspect; may be {@code null}
 * @return {@code true} if any code point of {@code text} belongs to the Han script;
 *         {@code false} otherwise, including for {@code null} or empty text
 */
public static boolean containChinese(String text) {
    if (text == null || text.isEmpty()) {
        return false;
    }
    // Walk code points, not chars: the two halves of a surrogate pair are not assigned to
    // the HAN script individually, so a char-by-char scan misses supplementary Han characters.
    return text.codePoints()
            .anyMatch(codePoint -> Character.UnicodeScript.of(codePoint) == Character.UnicodeScript.HAN);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.util;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link Utils}.
 */
public class UtilsTest {
    @Test
    public void containChinese() {
        // A string mixing digits with Han characters must be reported as containing Chinese.
        Assertions.assertTrue(Utils.containChinese("123数据库"));

        // A string made only of ASCII letters and digits must not be reported.
        Assertions.assertFalse(Utils.containChinese("database123"));
    }
}
3 changes: 3 additions & 0 deletions regression-test/data/mtmv_p0/test_build_mtmv.out
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,6 @@ zhangsang 200
-- !select_union --
11 111

-- !desc_mv --
field_1 VARCHAR(16) No false \N NONE

59 changes: 57 additions & 2 deletions regression-test/suites/mtmv_p0/test_build_mtmv.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ suite("test_build_mtmv") {
id BIGINT,
username VARCHAR(20)
)
DISTRIBUTED BY HASH(id) BUCKETS 10
DISTRIBUTED BY HASH(id) BUCKETS 10
PROPERTIES (
"replication_num" = "1"
);
Expand All @@ -52,7 +52,7 @@ suite("test_build_mtmv") {
id BIGINT,
pv BIGINT
)
DISTRIBUTED BY HASH(id) BUCKETS 10
DISTRIBUTED BY HASH(id) BUCKETS 10
PROPERTIES (
"replication_num" = "1"
);
Expand Down Expand Up @@ -580,4 +580,59 @@ suite("test_build_mtmv") {
sql """
DROP MATERIALIZED VIEW ${mvName}
"""

// test build mv which containing literal varchar field
sql """
drop table if exists lineitem
"""
sql """
CREATE TABLE IF NOT EXISTS lineitem (
l_orderkey INTEGER NOT NULL,
l_partkey INTEGER NOT NULL,
l_suppkey INTEGER NOT NULL,
l_linenumber INTEGER NOT NULL,
l_quantity DECIMALV3(15,2) NOT NULL,
l_extendedprice DECIMALV3(15,2) NOT NULL,
l_discount DECIMALV3(15,2) NOT NULL,
l_tax DECIMALV3(15,2) NOT NULL,
l_returnflag CHAR(1) NOT NULL,
l_linestatus CHAR(1) NOT NULL,
l_shipdate DATE NOT NULL,
l_commitdate DATE NOT NULL,
l_receiptdate DATE NOT NULL,
l_shipinstruct CHAR(25) NOT NULL,
l_shipmode CHAR(10) NOT NULL,
l_comment VARCHAR(44) NOT NULL
)
DUPLICATE KEY(l_orderkey, l_partkey, l_suppkey, l_linenumber)
PARTITION BY RANGE(l_shipdate) (
PARTITION `day_2` VALUES LESS THAN ('2023-12-9'),
PARTITION `day_3` VALUES LESS THAN ("2023-12-11"),
PARTITION `day_4` VALUES LESS THAN ("2023-12-30")
)
DISTRIBUTED BY HASH(l_orderkey) BUCKETS 3
PROPERTIES (
"replication_num" = "1"
)
"""

sql """
insert into lineitem values
(1, 2, 3, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-08', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'),
(2, 4, 3, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-09', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'),
(3, 2, 4, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-10', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'),
(4, 3, 3, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-11', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'),
(5, 2, 3, 6, 7.5, 8.5, 9.5, 10.5, 'k', 'o', '2023-12-12', '2023-12-12', '2023-12-13', 'c', 'd', 'xxxxxxxxx');
"""

sql """DROP MATERIALIZED VIEW IF EXISTS test_varchar_literal_mv;"""
sql """
CREATE MATERIALIZED VIEW test_varchar_literal_mv
BUILD IMMEDIATE REFRESH AUTO ON MANUAL
DISTRIBUTED BY RANDOM BUCKETS 2
PROPERTIES ('replication_num' = '1')
AS
select case when l_orderkey > 1 then "一二三四" else "五六七八" end as field_1 from lineitem;
"""
qt_desc_mv """desc test_varchar_literal_mv;"""
}

0 comments on commit 0c5dd1a

Please sign in to comment.