diff --git a/dbms/src/Core/tests/gtest_block.cpp b/dbms/src/Core/tests/gtest_block.cpp
index 811261e0bc6..0f5e06b82d1 100644
--- a/dbms/src/Core/tests/gtest_block.cpp
+++ b/dbms/src/Core/tests/gtest_block.cpp
@@ -53,7 +53,7 @@ try
         "Decimal(40,2)",
         "MyDate",
         "MyDateTime",
-        "String",
+        DataTypeString::getDefaultName(),
         "FixedString(10)"};
     for (auto & type_name : all_types)
     {
@@ -80,7 +80,11 @@ try
     ArenaPtr pool = std::make_shared();
     pool->alloc(1024 * 1024);
     /// case 1, agg function not allocate memory in arena
-    std::vector types{"Int64", "String", "Nullable(Int64)", "Nullable(String)"};
+    std::vector types{
+        "Int64",
+        DataTypeString::getDefaultName(),
+        "Nullable(Int64)",
+        DataTypeString::getNullableDefaultName()};
     std::vector data_size{
         16,
         ColumnString::APPROX_STRING_SIZE * 2,
@@ -139,7 +143,7 @@ try
     String long_str(ColumnString::APPROX_STRING_SIZE * 5, 'a');
     String short_str(std::max(1, ColumnString::APPROX_STRING_SIZE / 10), 'a');
     std::vector string_values{short_str, long_str};
-    std::vector types{"String", "Nullable(String)"};
+    std::vector types{DataTypeString::getDefaultName(), DataTypeString::getNullableDefaultName()};
     for (const auto & string_value : string_values)
     {
         for (const auto & type_string : types)
diff --git a/dbms/src/Core/tests/gtest_spiller.cpp b/dbms/src/Core/tests/gtest_spiller.cpp
index c6ce401abc1..f3b6ed611d4 100644
--- a/dbms/src/Core/tests/gtest_spiller.cpp
+++ b/dbms/src/Core/tests/gtest_spiller.cpp
@@ -940,8 +940,8 @@ TEST_F(SpillerTest, SpillAndRestoreStringEnumData)
 try
 {
     NamesAndTypes spiller_schema;
-    spiller_schema.emplace_back("col0", DataTypeFactory::instance().get("String"));
-    spiller_schema.emplace_back("col1", DataTypeFactory::instance().get("Nullable(String)"));
+    spiller_schema.emplace_back("col0", DataTypeFactory::instance().get(DataTypeString::getDefaultName()));
+    spiller_schema.emplace_back("col1", DataTypeFactory::instance().get(DataTypeString::getNullableDefaultName()));
     spiller_schema.emplace_back("col2", DataTypeFactory::instance().get("Enum8('a' = 0,'b' = 1,'c' = 2)"));
     spiller_schema.emplace_back("col3", DataTypeFactory::instance().get("Nullable(Enum8('a' = 0,'b' = 1,'c' = 2))"));
     spiller_schema.emplace_back("col4", DataTypeFactory::instance().get("Enum16('a' = 0,'b' = 1,'c' = 2)"));
@@ -969,4 +969,4 @@ try
 CATCH
 
 } // namespace tests
-} // namespace DB
\ No newline at end of file
+} // namespace DB
diff --git a/dbms/src/DataTypes/DataTypeString.cpp b/dbms/src/DataTypes/DataTypeString.cpp
index 90f8e7f80ba..939e1827a8a 100644
--- a/dbms/src/DataTypes/DataTypeString.cpp
+++ b/dbms/src/DataTypes/DataTypeString.cpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 #if __SSE2__
 #include
@@ -292,9 +293,15 @@ bool DataTypeString::equals(const IDataType & rhs) const
 
 void registerDataTypeString(DataTypeFactory & factory)
 {
-    auto creator = static_cast([] { return DataTypePtr(std::make_shared()); });
+    std::function legacy_creator = [] {
+        return std::make_shared(DataTypeString::SerdesFormat::SizePrefix);
+    };
+    factory.registerSimpleDataType(DataTypeString::LegacyName, legacy_creator); // Legacy name: size-prefix format.
 
-    factory.registerSimpleDataType("String", creator);
+    std::function creator = [] { // NOTE(review): the synonyms below (e.g. LONGBLOB) reuse this, i.e. StringV2 -- confirm.
+        return std::make_shared(DataTypeString::SerdesFormat::SeparateSizeAndChars);
+    };
+    factory.registerSimpleDataType(DataTypeString::NameV2, creator); // V2 name: sizes and chars in separate streams.
 
     /// These synonims are added for compatibility.
 
@@ -310,4 +317,204 @@ void registerDataTypeString(DataTypeFactory & factory)
     factory.registerSimpleDataType("LONGBLOB", creator, DataTypeFactory::CaseInsensitive);
 }
 
+namespace
+{
+
+using Offset = ColumnString::Offset;
+
+// Returns <offsets_stream, chars_stream>.
+template
+std::pair getStream(const G & getter, IDataType::SubstreamPath & path)
+{
+    auto * chars_stream = getter(path);
+    path.emplace_back(IDataType::Substream::StringSizes);
+    auto * offsets_stream = getter(path);
+    path.pop_back(); // Leave the caller's path exactly as it was passed in.
+    return {offsets_stream, chars_stream};
+}
+// Returns the sizes of rows [begin, end): sizes[i] = chars_offsets[begin + i] - chars_offsets[begin + i - 1].
+PaddedPODArray offsetToStrSize(
+    const ColumnString::Offsets & chars_offsets,
+    const size_t begin,
+    const size_t end)
+{
+    assert(!chars_offsets.empty());
+    // The class PODArrayBase ensures chars_offsets[-1] is well defined as 0.
+    // For details, check the `pad_left` argument in PODArrayBase.
+    // In the for loop code below, when `begin` and `i` are 0:
+    // str_sizes[0] = chars_offsets[0] - chars_offsets[-1];
+    assert(chars_offsets[-1] == 0);
+
+    PaddedPODArray str_sizes(end - begin);
+    auto chars_offsets_pos = chars_offsets.begin() + begin;
+
+    // clang-format off
+    #pragma clang loop vectorize(enable)
+    // clang-format on
+    for (ssize_t i = 0; i < static_cast(str_sizes.size()); ++i)
+    {
+        str_sizes[i] = chars_offsets_pos[i] - chars_offsets_pos[i - 1];
+    }
+    return str_sizes;
+}
+// Appends one running-total offset per entry of str_sizes, continuing from the last offset in chars_offsets.
+void strSizeToOffset(const PaddedPODArray & str_sizes, ColumnString::Offsets & chars_offsets)
+{
+    // str_sizes may be empty (a read that yields less than one whole size); the code below is then a no-op.
+    assert(chars_offsets[-1] == 0);
+    const auto initial_size = chars_offsets.size();
+    chars_offsets.resize(initial_size + str_sizes.size());
+    auto chars_offsets_pos = chars_offsets.begin() + initial_size;
+    // Cannot be vectorized by the compiler because chars_offsets[i] depends on chars_offsets[i-1]
+    // #pragma clang loop vectorize(enable)
+    for (ssize_t i = 0; i < static_cast(str_sizes.size()); ++i)
+    {
+        chars_offsets_pos[i] = str_sizes[i] + chars_offsets_pos[i - 1];
+    }
+}
+// Writes the sizes of the selected rows to ostr; returns the [begin, end) byte range of those rows in `chars`.
+std::pair serializeOffsetsBinary(
+    const ColumnString::Offsets & chars_offsets,
+    WriteBuffer & ostr,
+    size_t offset,
+    size_t limit)
+{
+    // [begin, end) is the row range of `chars_offsets` to serialize; `begin` is clamped so `end - begin` cannot wrap.
+    const auto begin = offset < chars_offsets.size() ? offset : chars_offsets.size();
+    const auto end = limit != 0 && offset + limit < chars_offsets.size() ? offset + limit : chars_offsets.size();
+
+    PaddedPODArray sizes = offsetToStrSize(chars_offsets, begin, end);
+    ostr.write(reinterpret_cast(sizes.data()), sizeof(Offset) * sizes.size());
+
+    // [chars_begin, chars_end) is the byte range of `chars` that needs to be serialized.
+    const auto chars_begin = begin == 0 ? 0 : chars_offsets[begin - 1];
+    const auto chars_end = chars_offsets[end - 1];
+    return {chars_begin, chars_end};
+}
+// Writes chars[begin, end) to ostr as raw bytes.
+void serializeCharsBinary(const ColumnString::Chars_t & chars, WriteBuffer & ostr, size_t begin, size_t end)
+{
+    ostr.write(reinterpret_cast(&chars[begin]), end - begin);
+}
+// Reads up to `limit` sizes from istr, appends the matching offsets, and returns the sum of the sizes read.
+size_t deserializeOffsetsBinary(ColumnString::Offsets & chars_offsets, ReadBuffer & istr, size_t limit)
+{
+    PaddedPODArray str_sizes(limit);
+    const auto size = istr.readBig(reinterpret_cast(str_sizes.data()), sizeof(Offset) * limit);
+    str_sizes.resize(size / sizeof(Offset));
+    strSizeToOffset(str_sizes, chars_offsets);
+    return std::accumulate(str_sizes.begin(), str_sizes.end(), 0uz);
+}
+// Grows `chars` by `bytes` and fills the new tail from istr via readStrict.
+void deserializeCharsBinary(ColumnString::Chars_t & chars, ReadBuffer & istr, size_t bytes)
+{
+    const auto initial_size = chars.size();
+    chars.resize(initial_size + bytes);
+    istr.readStrict(reinterpret_cast(&chars[initial_size]), bytes);
+}
+// Serializes the column as two streams: sizes to offsets_stream, chars to chars_stream. No-op for an empty column.
+void serializeBinaryBulkV2(
+    const IColumn & column,
+    WriteBuffer & offsets_stream,
+    WriteBuffer & chars_stream,
+    size_t offset,
+    size_t limit)
+{
+    if (column.empty())
+        return;
+    const auto & column_string = typeid_cast(column);
+    const auto & chars = column_string.getChars();
+    const auto & offsets = column_string.getOffsets();
+    auto [chars_begin, chars_end] = serializeOffsetsBinary(offsets, offsets_stream, offset, limit);
+    serializeCharsBinary(chars, chars_stream, chars_begin, chars_end);
+}
+// Counterpart of serializeBinaryBulkV2: appends up to `limit` rows to the column. No-op when `limit` is 0.
+void deserializeBinaryBulkV2(IColumn & column, ReadBuffer & offsets_stream, ReadBuffer & chars_stream, size_t limit)
+{
+    if (limit == 0)
+        return;
+    auto & column_string = typeid_cast(column);
+    auto & chars = column_string.getChars();
+    auto & offsets = column_string.getOffsets();
+    auto bytes = deserializeOffsetsBinary(offsets, offsets_stream, limit);
+    deserializeCharsBinary(chars, chars_stream, bytes);
+}
+
+} // namespace
+// Reports the stream at `path`, plus a StringSizes substream when sizes and chars are stored separately.
+void DataTypeString::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+{
+    callback(path);
+    if (serdes_fmt == SerdesFormat::SeparateSizeAndChars)
+    {
+        path.emplace_back(Substream::StringSizes);
+        callback(path);
+        path.pop_back(); // Restore the path so the caller does not inherit the StringSizes element.
+    }
+}
+// Dispatches on serdes_fmt: the two-stream layout above, or the single-stream serializeBinaryBulk.
+void DataTypeString::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    const OutputStreamGetter & getter,
+    size_t offset,
+    size_t limit,
+    bool /*position_independent_encoding*/,
+    SubstreamPath & path) const
+{
+    if (serdes_fmt == SerdesFormat::SeparateSizeAndChars)
+    {
+        auto [offsets_stream, chars_stream] = getStream(getter, path);
+        serializeBinaryBulkV2(column, *offsets_stream, *chars_stream, offset, limit);
+    }
+    else
+    {
+        serializeBinaryBulk(column, *getter(path), offset, limit);
+    }
+}
+// Dispatches on serdes_fmt: the two-stream layout above, or the single-stream deserializeBinaryBulk.
+void DataTypeString::deserializeBinaryBulkWithMultipleStreams(
+    IColumn & column,
+    const InputStreamGetter & getter,
+    size_t limit,
+    double avg_value_size_hint,
+    bool /*position_independent_encoding*/,
+    SubstreamPath & path) const
+{
+    if (serdes_fmt == SerdesFormat::SeparateSizeAndChars)
+    {
+        auto [offsets_stream, chars_stream] = getStream(getter, path);
+        deserializeBinaryBulkV2(column, *offsets_stream, *chars_stream, limit);
+    }
+    else
+    {
+        deserializeBinaryBulk(column, *getter(path), limit, avg_value_size_hint);
+    }
+}
+// Identifiers below 8, or in [100, 103), map to SizePrefix; every other identifier to SeparateSizeAndChars.
+static DataTypeString::SerdesFormat getDefaultByStorageFormat(StorageFormatVersion current)
+{
+    if (current.identifier < 8 || (current.identifier >= 100 && current.identifier < 103))
+    {
+        return DataTypeString::SerdesFormat::SizePrefix;
+    }
+    return DataTypeString::SerdesFormat::SeparateSizeAndChars;
+}
+// SerdesFormat::None means: choose the format from STORAGE_FORMAT_CURRENT.
+DataTypeString::DataTypeString(SerdesFormat serdes_fmt_)
+    : serdes_fmt((serdes_fmt_ != SerdesFormat::None) ? serdes_fmt_ : getDefaultByStorageFormat(STORAGE_FORMAT_CURRENT))
+{}
+// Name of the type that a default-constructed DataTypeString reports under STORAGE_FORMAT_CURRENT.
+String DataTypeString::getDefaultName()
+{
+    // Derive the name from the same version rule the constructor uses, so the two cannot drift apart.
+    if (getDefaultByStorageFormat(STORAGE_FORMAT_CURRENT) == SerdesFormat::SizePrefix)
+        return LegacyName;
+    return NameV2;
+}
+// Wraps getDefaultName() as "Nullable(...)".
+String DataTypeString::getNullableDefaultName()
+{
+    return fmt::format("Nullable({})", getDefaultName());
+}
+
 } // namespace DB
diff --git a/dbms/src/DataTypes/DataTypeString.h b/dbms/src/DataTypes/DataTypeString.h
index 1bc4ece42dd..8687e945864 100644
--- a/dbms/src/DataTypes/DataTypeString.h
+++ b/dbms/src/DataTypes/DataTypeString.h
@@ -16,7 +16,6 @@
 
 #include
 
-
 namespace DB
 {
 class DataTypeString final : public IDataType
@@ -27,6 +26,8 @@ class DataTypeString final : public IDataType
 
     const char * getFamilyName() const override { return "String"; }
 
+    String getName() const override { return serdes_fmt == SerdesFormat::SeparateSizeAndChars ? NameV2 : LegacyName; }
+
     TypeIndex getTypeId() const override { return TypeIndex::String; }
 
     void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
@@ -64,6 +65,43 @@ class DataTypeString final : public IDataType
     bool isString() const override { return true; }
     bool isCategorial() const override { return true; }
     bool canBeInsideNullable() const override { return true; }
+
+    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
+
+    void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        const OutputStreamGetter & getter,
+        size_t offset,
+        size_t limit,
+        bool position_independent_encoding,
+        SubstreamPath & path) const override;
+
+    void deserializeBinaryBulkWithMultipleStreams(
+        IColumn & column,
+        const InputStreamGetter & getter,
+        size_t limit,
+        double avg_value_size_hint,
+        bool position_independent_encoding,
+        SubstreamPath & path) const override;
+
+    enum class SerdesFormat
+    {
+        None = 0, // Decided by STORAGE_FORMAT_CURRENT
+        SizePrefix = 1, // Legacy format, corresponding to `LegacyName`
+        SeparateSizeAndChars = 2, // New format, corresponding to `NameV2`
+    };
+
+    inline static const String LegacyName{"String"}; // For compatibility with the size-prefix format.
+    inline static const String NameV2{"StringV2"}; // The separate size and chars format.
+
+    // Both getDefaultName and getNullableDefaultName are unit-test helpers.
+    static String getDefaultName();
+    static String getNullableDefaultName();
+
+    explicit DataTypeString(SerdesFormat serdes_fmt_ = SerdesFormat::None);
+
+private:
+    const SerdesFormat serdes_fmt;
 };
 
 } // namespace DB
diff --git a/dbms/src/DataTypes/IDataType.cpp b/dbms/src/DataTypes/IDataType.cpp
index b131adca00a..fafb9f87382 100644
--- a/dbms/src/DataTypes/IDataType.cpp
+++ b/dbms/src/DataTypes/IDataType.cpp
@@ -103,6 +103,13 @@ bool IDataType::isArraySizes(const SubstreamPath & path)
     return false;
 }
 
+bool IDataType::isStringSizes(const SubstreamPath & path)
+{
+    return std::any_of(path.cbegin(), path.cend(), [](const auto & elem) {
+        return elem.type == IDataType::Substream::StringSizes;
+    });
+}
+
 String IDataType::getFileNameForStream(const String & column_name, const IDataType::SubstreamPath & path)
 {
     String nested_table_name = Nested::extractTableName(column_name);
@@ -127,6 +134,8 @@ String IDataType::getFileNameForStream(const String & column_name, const IDataTy
             /// and name is encoded as a whole.
             stream_name += "%2E" + escapeForFileName(elem.tuple_element_name);
         }
+        else if (elem.type == Substream::StringSizes)
+            stream_name += ".size";
     }
     return stream_name;
 }
diff --git a/dbms/src/DataTypes/IDataType.h b/dbms/src/DataTypes/IDataType.h
index b9540aee8f4..a5d4dc88e73 100644
--- a/dbms/src/DataTypes/IDataType.h
+++ b/dbms/src/DataTypes/IDataType.h
@@ -95,6 +95,8 @@ class IDataType : private boost::noncopyable
             NullMap,
 
             TupleElement,
+
+            StringSizes,
         };
         Type type;
 
@@ -421,6 +423,7 @@ class IDataType : private boost::noncopyable
 
     static bool isNullMap(const SubstreamPath & path);
     static bool isArraySizes(const SubstreamPath & path);
+    static bool isStringSizes(const SubstreamPath & path);
 };
 
 
diff --git a/dbms/src/DataTypes/tests/bench_data_type_string.cpp b/dbms/src/DataTypes/tests/bench_data_type_string.cpp
new file mode 100644
index 00000000000..cfc1361fcc5
--- /dev/null
+++ b/dbms/src/DataTypes/tests/bench_data_type_string.cpp
@@ -0,0 +1,829 @@
+// Copyright 2024 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace DB::bench
+{
+// Stream name for a substream path; thin wrapper over IDataType::getFileNameForStream.
+String getStreamName(const String & column_name, const IDataType::SubstreamPath & substream_path)
+{
+    return IDataType::getFileNameForStream(column_name, substream_path);
+}
+// Builds a column from `count` random strings whose lengths are drawn from roughly [0.8, 1.2] * str_size.
+ColumnPtr createColumnString(size_t str_size, size_t count)
+{
+    std::random_device rand_dev;
+    std::mt19937_64 rand_gen(rand_dev());
+    std::uniform_int_distribution rand_size(str_size * 0.8, str_size * 1.2);
+    std::vector v(count);
+    for (auto & s : v)
+        s = DB::random::randomString(rand_size(rand_gen));
+
+    return DB::tests::createColumn(v, "", 0).column;
+}
+// Pair layout is {compressing wrapper (null when method is NONE), underlying write buffer}.
+using WriteBufferPair = std::pair, std::unique_ptr>;
+WriteBufferPair createWriteBuffer(const String & stream_name, CompressionMethod method)
+{
+    auto write_buffer = std::make_unique(100 * 1024 * 1024);
+    std::unique_ptr compressed_buf;
+    if (method != CompressionMethod::NONE)
+    {
+        CompressionSetting setting{method};
+        setting.data_type = stream_name.ends_with(".size") ? CompressionDataType::Int64 : CompressionDataType::String;
+        compressed_buf = CompressedWriteBuffer<>::build(*write_buffer, CompressionSettings{setting}, false);
+    }
+    return {std::move(compressed_buf), std::move(write_buffer)};
+}
+// Pair layout is {decompressing wrapper (null unless enable_compression), read buffer over write_buffer's bytes}.
+using ReadBufferPair = std::pair, std::unique_ptr>;
+ReadBufferPair createReadBuffer(const WriteBufferFromOwnString & write_buffer, bool enable_compression)
+{
+    auto read_buffer = std::make_unique(write_buffer.stringRef().toStringView());
+    std::unique_ptr compressed_buf;
+    if (enable_compression)
+        compressed_buf = std::make_unique>(*read_buffer);
+    return {std::move(compressed_buf), std::move(read_buffer)};
+}
+// Creates one write-buffer pair per stream that `type` enumerates, keyed by stream name.
+auto initWriteStream(IDataType & type, CompressionMethod method)
+{
+    std::unordered_map write_streams;
+    auto create_write_stream = [&](const IDataType::SubstreamPath & substream_path) {
+        const auto stream_name = getStreamName("bench", substream_path);
+        write_streams.emplace(stream_name, createWriteBuffer(stream_name, method));
+    };
+    type.enumerateStreams(create_write_stream, {});
+    return write_streams;
+}
+// Number of strings in every benchmarked column.
+constexpr size_t str_count = 65535;
+// Benchmarks serializing one str_count-row column; args are (SerdesFormat, string size, CompressionMethod).
+template
+void serialize(benchmark::State & state, Args &&... args)
+{
+    auto [fmt, str_size, method] = std::make_tuple(std::move(args)...);
+    auto str_col = createColumnString(str_size, str_count);
+    DataTypeString t(fmt);
+    IDataType & type = t;
+    auto write_streams = initWriteStream(type, method);
+    auto get_write_stream = [&](const IDataType::SubstreamPath & substream_path) -> WriteBuffer * {
+        const auto stream_name = getStreamName("bench", substream_path);
+        auto & [compress_buf, write_buffer] = write_streams.at(stream_name);
+        write_buffer->restart(); // Reset to avoid write buffer overflow.
+        if (compress_buf)
+            return compress_buf.get();
+        return write_buffer.get();
+    };
+    auto flush_stream = [&](const IDataType::SubstreamPath & substream_path) {
+        const auto stream_name = getStreamName("bench", substream_path);
+        auto & [compress_buf, write_buffer] = write_streams.at(stream_name);
+        if (compress_buf)
+            compress_buf->next();
+    };
+    for (auto _ : state)
+    {
+        type.serializeBinaryBulkWithMultipleStreams(*str_col, get_write_stream, 0, str_col->size(), true, {});
+        type.enumerateStreams(flush_stream, {});
+    }
+}
+// Benchmarks deserializing str_count rows; the column is serialized once, before the timed loop.
+template
+void deserialize(benchmark::State & state, Args &&... args)
+{
+    auto [fmt, str_size, method] = std::make_tuple(std::move(args)...);
+    auto str_col = createColumnString(str_size, str_count);
+    DataTypeString t(fmt);
+    IDataType & type = t;
+    auto write_streams = initWriteStream(type, method);
+    auto get_write_stream = [&](const IDataType::SubstreamPath & substream_path) -> WriteBuffer * {
+        const auto stream_name = getStreamName("bench", substream_path);
+        auto & [compress_buf, write_buffer] = write_streams.at(stream_name);
+        if (compress_buf)
+            return compress_buf.get();
+        return write_buffer.get();
+    };
+    auto flush_stream = [&](const IDataType::SubstreamPath & substream_path) {
+        const auto stream_name = getStreamName("bench", substream_path);
+        auto & [compress_buf, write_buffer] = write_streams.at(stream_name);
+        if (compress_buf)
+            compress_buf->next();
+    };
+    type.serializeBinaryBulkWithMultipleStreams(*str_col, get_write_stream, 0, str_col->size(), true, {});
+    type.enumerateStreams(flush_stream, {});
+    // get_read_stream rebuilds the read buffers on every call, so each iteration reads the serialized bytes anew.
+    std::unordered_map read_streams;
+    auto get_read_stream = [&](const IDataType::SubstreamPath & substream_path) {
+        const auto stream_name = getStreamName("bench", substream_path);
+        auto & [compress_buf, write_buffer] = write_streams.at(stream_name);
+        read_streams[stream_name] = createReadBuffer(*write_buffer, compress_buf != nullptr);
+        auto & [compressed_read_buffer, read_buffer] = read_streams[stream_name];
+        if (compressed_read_buffer)
+            return compressed_read_buffer.get();
+        return read_buffer.get();
+    };
+    for (auto _ : state)
+    {
+        auto col = type.createColumn();
+        type.deserializeBinaryBulkWithMultipleStreams(*col, get_read_stream, str_count, str_size, true, {});
+        benchmark::DoNotOptimize(col);
+    }
+}
+// serialize -- SerdesFormat::SizePrefix, CompressionMethod::NONE, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size1_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size2_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    2,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size4_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    4,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size8_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    8,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size16_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    16,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size32_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    32,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size64_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    64,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size128_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    128,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size256_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    256,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size512_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    512,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size1024_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1024,
+    CompressionMethod::NONE);
+// serialize -- SerdesFormat::SeparateSizeAndChars, CompressionMethod::NONE, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size1_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size2_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    2,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size4_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    4,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size8_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    8,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size16_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    16,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size32_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    32,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size64_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    64,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size128_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    128,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size256_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    256,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size512_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    512,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size1024_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1024,
+    CompressionMethod::NONE);
+// deserialize -- SerdesFormat::SizePrefix, CompressionMethod::NONE, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size1_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size2_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    2,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size4_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    4,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size8_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    8,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size16_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    16,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size32_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    32,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size64_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    64,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size128_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    128,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size256_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    256,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size512_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    512,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size1024_none,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1024,
+    CompressionMethod::NONE);
+// deserialize -- SerdesFormat::SeparateSizeAndChars, CompressionMethod::NONE, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size1_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size2_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    2,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size4_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    4,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size8_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    8,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size16_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    16,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size32_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    32,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size64_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    64,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size128_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    128,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size256_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    256,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size512_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    512,
+    CompressionMethod::NONE);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size1024_none,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1024,
+    CompressionMethod::NONE);
+// serialize -- SerdesFormat::SizePrefix, CompressionMethod::LZ4, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size1_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size2_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    2,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size4_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    4,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size8_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    8,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size16_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    16,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size32_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    32,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size64_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    64,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size128_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    128,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size256_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    256,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size512_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    512,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    size_prefix_size1024_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1024,
+    CompressionMethod::LZ4);
+// serialize -- SerdesFormat::SeparateSizeAndChars, CompressionMethod::LZ4, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size1_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size2_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    2,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size4_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    4,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size8_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    8,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size16_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    16,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size32_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    32,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size64_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    64,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size128_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    128,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size256_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    256,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size512_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    512,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size1024_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1024,
+    CompressionMethod::LZ4);
+// deserialize -- SerdesFormat::SizePrefix, CompressionMethod::LZ4, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size1_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size2_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    2,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size4_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    4,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size8_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    8,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size16_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    16,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size32_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    32,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size64_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    64,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size128_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    128,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size256_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    256,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size512_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    512,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    size_prefix_size1024_lz4,
+    DataTypeString::SerdesFormat::SizePrefix,
+    1024,
+    CompressionMethod::LZ4);
+// deserialize -- SerdesFormat::SeparateSizeAndChars, CompressionMethod::LZ4, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size1_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size2_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    2,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size4_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    4,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size8_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    8,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size16_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    16,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size32_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    32,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size64_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    64,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size128_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    128,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size256_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    256,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size512_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    512,
+    CompressionMethod::LZ4);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size1024_lz4,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1024,
+    CompressionMethod::LZ4);
+// serialize -- SerdesFormat::SeparateSizeAndChars, CompressionMethod::Lightweight, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size1_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size2_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    2,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size4_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    4,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size8_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    8,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size16_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    16,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size32_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    32,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size64_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    64,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size128_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    128,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size256_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    256,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size512_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    512,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    serialize,
+    seperate_size1024_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1024,
+    CompressionMethod::Lightweight);
+// deserialize -- SerdesFormat::SeparateSizeAndChars, CompressionMethod::Lightweight, string sizes 1..1024.
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size1_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size2_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    2,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size4_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    4,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size8_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    8,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size16_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    16,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size32_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    32,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size64_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    64,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size128_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    128,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size256_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    256,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size512_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    512,
+    CompressionMethod::Lightweight);
+BENCHMARK_CAPTURE(
+    deserialize,
+    seperate_size1024_lw,
+    DataTypeString::SerdesFormat::SeparateSizeAndChars,
+    1024,
+    CompressionMethod::Lightweight);
+} // namespace DB::bench
diff --git a/dbms/src/DataTypes/tests/gtest_data_type_get_common_type.cpp b/dbms/src/DataTypes/tests/gtest_data_type_get_common_type.cpp
index 5a91ba5af80..c34213e6adc 100644
--- a/dbms/src/DataTypes/tests/gtest_data_type_get_common_type.cpp
+++ b/dbms/src/DataTypes/tests/gtest_data_type_get_common_type.cpp
@@ -14,6 +14,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -137,8 +138,8 @@ try
     ASSERT_TRUE(getLeastSupertype(typesFromString("Decimal(43,4) Int64"))->equals(*typeFromString("Decimal(43,4)")));
     ASSERT_TRUE(getLeastSupertype(typesFromString("Decimal(12,0) Int64"))->equals(*typeFromString("Decimal(19,0)")));
 
-    ASSERT_TRUE(
-        getLeastSupertype(typesFromString("String FixedString(32) FixedString(8)"))->equals(*typeFromString("String")));
+    ASSERT_TRUE(getLeastSupertype(typesFromString(DataTypeString::getDefaultName() + " FixedString(32) FixedString(8)"))
+                    ->equals(*typeFromString(DataTypeString::getDefaultName())));
 
     ASSERT_TRUE(
         getLeastSupertype(typesFromString("Array(UInt8) Array(UInt8)"))->equals(*typeFromString("Array(UInt8)")));
@@ -260,7 +261,9 @@ try
     ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("DateTime"), typeFromString("DateTime")));
     ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("Date"), typeFromString("Date")));
     ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("Decimal(10, 4)"), typeFromString("Decimal(10, 4)")));
-    ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("String"), typeFromString("String")));
+    ASSERT_TRUE(isSupportedDataTypeCast(
+        typeFromString(DataTypeString::getDefaultName()),
+        typeFromString(DataTypeString::getDefaultName())));
     ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("FixedString(16)"), typeFromString("FixedString(16)")));
// signed -> unsigned is lossy @@ -302,8 +305,10 @@ try // strings ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("FixedString(16)"), typeFromString("FixedString(100)"))); - ASSERT_FALSE(isSupportedDataTypeCast(typeFromString("String"), typeFromString("FixedString(1024)"))); - ASSERT_TRUE(isSupportedDataTypeCast(typeFromString("FixedString(16)"), typeFromString("String"))); + ASSERT_FALSE( + isSupportedDataTypeCast(typeFromString(DataTypeString::getDefaultName()), typeFromString("FixedString(1024)"))); + ASSERT_TRUE( + isSupportedDataTypeCast(typeFromString("FixedString(16)"), typeFromString(DataTypeString::getDefaultName()))); // Decimal ASSERT_FALSE(isSupportedDataTypeCast(typeFromString("Decimal(10, 4)"), typeFromString("Decimal(10, 2)"))); diff --git a/dbms/src/DataTypes/tests/gtest_data_type_string.cpp b/dbms/src/DataTypes/tests/gtest_data_type_string.cpp new file mode 100644 index 00000000000..46ede6f945e --- /dev/null +++ b/dbms/src/DataTypes/tests/gtest_data_type_string.cpp @@ -0,0 +1,169 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +namespace DB::tests +{ +class DataTypeStringTest : public ::testing::Test +{ +public: + void SetUp() override {} + + void TearDown() override {} + +protected: + static String getStreamName(const String & column_name, const IDataType::SubstreamPath & substream_path) + { + return IDataType::getFileNameForStream(column_name, substream_path); + } + + void initWriteStream() + { + auto create_write_stream = [&](const String & column_name, const IDataType::SubstreamPath & substream_path) { + const auto stream_name = getStreamName(column_name, substream_path); + write_streams.emplace(stream_name, std::make_unique(10 * 1024 * 1024)); + }; + auto create_write_stream1 = [&](const IDataType::SubstreamPath & substream_path) { + create_write_stream("1", substream_path); + }; + auto create_write_stream2 = [&](const IDataType::SubstreamPath & substream_path) { + create_write_stream("2", substream_path); + }; + str_v0.enumerateStreams(create_write_stream1, {}); + str_v1.enumerateStreams(create_write_stream2, {}); + } + + void initReadStream() + { + auto create_read_stream = [&](const String & column_name, const IDataType::SubstreamPath & substream_path) { + const auto stream_name = getStreamName(column_name, substream_path); + auto s = write_streams.at(stream_name)->stringRef().toStringView(); + read_streams.emplace(stream_name, std::make_unique(s)); + }; + auto create_read_stream1 = [&](const IDataType::SubstreamPath & substream_path) { + create_read_stream("1", substream_path); + }; + auto create_read_stream2 = [&](const IDataType::SubstreamPath & substream_path) { + create_read_stream("2", substream_path); + }; + str_v0.enumerateStreams(create_read_stream1, {}); + str_v1.enumerateStreams(create_read_stream2, {}); + } + + void serialize(const IColumn & col, size_t offset, size_t limit) + { + auto get_write_stream = [&](const String & column_name, const IDataType::SubstreamPath & substream_path) { + const auto 
stream_name = getStreamName(column_name, substream_path); + return write_streams.at(stream_name).get(); + }; + auto get_write_stream1 = [&](const IDataType::SubstreamPath & substream_path) { + return get_write_stream("1", substream_path); + }; + auto get_write_stream2 = [&](const IDataType::SubstreamPath & substream_path) { + return get_write_stream("2", substream_path); + }; + str_v0.serializeBinaryBulkWithMultipleStreams(col, get_write_stream1, offset, limit, true, {}); + str_v1.serializeBinaryBulkWithMultipleStreams(col, get_write_stream2, offset, limit, true, {}); + } + + void deserialize(IColumn & col1, IColumn & col2, size_t limit) + { + auto get_read_stream = [&](const String & column_name, const IDataType::SubstreamPath & substream_path) { + const auto stream_name = getStreamName(column_name, substream_path); + return read_streams.at(stream_name).get(); + }; + auto get_read_stream1 = [&](const IDataType::SubstreamPath & substream_path) { + return get_read_stream("1", substream_path); + }; + auto get_read_stream2 = [&](const IDataType::SubstreamPath & substream_path) { + return get_read_stream("2", substream_path); + }; + str_v0.deserializeBinaryBulkWithMultipleStreams(col1, get_read_stream1, limit, 8, true, {}); + str_v1.deserializeBinaryBulkWithMultipleStreams(col2, get_read_stream2, limit, 8, true, {}); + } + + DataTypeString a{DataTypeString::SerdesFormat::SizePrefix}; + DataTypeString b{DataTypeString::SerdesFormat::SeparateSizeAndChars}; + IDataType & str_v0 = a; + IDataType & str_v1 = b; + + std::unordered_map> write_streams; + std::unordered_map> read_streams; +}; + +TEST_F(DataTypeStringTest, BasicSerDe) +try +{ + auto str_col = DB::tests::createColumn(DM::tests::createNumberStrings(0, 65536), "", 0).column; + initWriteStream(); + ASSERT_EQ(write_streams.size(), 3); + serialize(*str_col, 0, str_col->size()); + initReadStream(); + ASSERT_EQ(read_streams.size(), 3); + auto col1 = str_v0.createColumn(); + auto col2 = str_v1.createColumn(); + 
deserialize(*col1, *col2, str_col->size()); + + ASSERT_EQ(col1->size(), str_col->size()); + ASSERT_EQ(col2->size(), str_col->size()); + for (size_t i = 0; i < col2->size(); ++i) + { + ASSERT_EQ(col1->getDataAt(i).toStringView(), str_col->getDataAt(i).toStringView()); + ASSERT_EQ(col2->getDataAt(i).toStringView(), str_col->getDataAt(i).toStringView()); + } +} +CATCH + +TEST_F(DataTypeStringTest, Concat) +try +{ + auto str_col = DB::tests::createColumn(DM::tests::createNumberStrings(0, 65536), "", 0).column; + initWriteStream(); + ASSERT_EQ(write_streams.size(), 3); + serialize(*str_col, 0, 10000); + serialize(*str_col, 10000, 20000); + serialize(*str_col, 30000, 30000); + serialize(*str_col, 60000, 40000); + + initReadStream(); + ASSERT_EQ(read_streams.size(), 3); + auto col1 = str_v0.createColumn(); + auto col2 = str_v1.createColumn(); + deserialize(*col1, *col2, 20000); + ASSERT_EQ(col1->size(), 20000); + ASSERT_EQ(col2->size(), 20000); + deserialize(*col1, *col2, 30000); + ASSERT_EQ(col1->size(), 50000); + ASSERT_EQ(col2->size(), 50000); + deserialize(*col1, *col2, 10000); + ASSERT_EQ(col1->size(), 60000); + ASSERT_EQ(col2->size(), 60000); + deserialize(*col1, *col2, 8000); + + ASSERT_EQ(col1->size(), str_col->size()); + ASSERT_EQ(col2->size(), str_col->size()); + for (size_t i = 0; i < col2->size(); ++i) + { + ASSERT_EQ(col1->getDataAt(i).toStringView(), str_col->getDataAt(i).toStringView()); + ASSERT_EQ(col2->getDataAt(i).toStringView(), str_col->getDataAt(i).toStringView()); + } +} +CATCH + +} // namespace DB::tests diff --git a/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp b/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp index 6ceb84688dd..add5518954b 100644 --- a/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp +++ b/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp @@ -20,6 +20,8 @@ #include #include +#include + namespace DB { namespace tests @@ -137,6 +139,11 @@ class PhysicalPlanTestRunner : public DB::tests::ExecutorTest LoggerPtr log 
= Logger::get("PhysicalPlanTestRunner", "test_physical_plan"); }; +String replaceStringName(String s) +{ + return boost::replace_all_copy(s, "{StringName}", DataTypeString::getDefaultName()); +} + TEST_F(PhysicalPlanTestRunner, Filter) try { @@ -144,10 +151,10 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: Filter @@ -163,10 +170,10 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: Limit, limit = 1 @@ -182,10 +189,10 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: MergeSorting, limit = 1 @@ -203,10 +210,10 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: Expression: @@ -223,10 +230,10 @@ try execute( request, - /*expected_physical_plan=*/R"( - | 
is_tidb_operator: false, schema: - | is_tidb_operator: true, schema: - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: + | is_tidb_operator: true, schema: + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: Expression: @@ -243,10 +250,10 @@ Expression: execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , , - | is_tidb_operator: true, schema: , , , - | is_tidb_operator: true, schema: , , , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , , , + | is_tidb_operator: true, schema: , , , + | is_tidb_operator: true, schema: , , , )"), /*expected_streams=*/R"( Expression: Expression: @@ -265,10 +272,10 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: true, schema: , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: true, schema: , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( MockExchangeSender Expression: @@ -284,9 +291,9 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: MockExchangeReceiver)", @@ -318,11 +325,11 @@ try auto request = get_request(false); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , , | is_tidb_operator: true, schema: , , | is_tidb_operator: true, schema: , - | is_tidb_operator: true, schema: , )", + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: 
Expression: @@ -337,11 +344,11 @@ Expression: request = get_request(true); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , , | is_tidb_operator: true, schema: , , | is_tidb_operator: true, schema: , - | is_tidb_operator: true, schema: , )", + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: Expression: @@ -362,9 +369,9 @@ try execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( Expression: MockTableScan)", @@ -386,13 +393,13 @@ try auto request = get_request(tipb::JoinType::TypeInnerJoin); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , , - | is_tidb_operator: true, schema: , , , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , , , + | is_tidb_operator: true, schema: , , , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( CreatingSets HashJoinBuild: , join_kind = Inner @@ -410,13 +417,13 @@ CreatingSets request = get_request(tipb::JoinType::TypeLeftOuterJoin); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , , - | is_tidb_operator: true, schema: , , , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | 
is_tidb_operator: false, schema: , , , + | is_tidb_operator: true, schema: , , , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( CreatingSets HashJoinBuild: , join_kind = Left @@ -434,13 +441,13 @@ CreatingSets request = get_request(tipb::JoinType::TypeRightOuterJoin); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , , - | is_tidb_operator: true, schema: , , , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , - | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , , , + | is_tidb_operator: true, schema: , , , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , + | is_tidb_operator: false, schema: , + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( CreatingSets HashJoinBuild: , join_kind = Right @@ -467,8 +474,8 @@ CreatingSets .build(context); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , , , , , , , , + /*expected_physical_plan=*/replaceStringName(R"( + | is_tidb_operator: false, schema: , , , , , , , , , | is_tidb_operator: true, schema: , , , , , , , , , | is_tidb_operator: false, schema: , , , , , | is_tidb_operator: true, schema: , , , , , @@ -481,7 +488,7 @@ CreatingSets | is_tidb_operator: false, schema: , | is_tidb_operator: true, schema: , | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( CreatingSets HashJoinBuild x 2: , join_kind = Right @@ -521,8 +528,8 @@ CreatingSets .build(context); execute( request, - /*expected_physical_plan=*/R"( - | is_tidb_operator: false, schema: , , , , , , , , , + /*expected_physical_plan=*/replaceStringName(R"( + | 
is_tidb_operator: false, schema: , , , , , , , , , | is_tidb_operator: true, schema: , , , , , , , , , | is_tidb_operator: false, schema: , , , , , | is_tidb_operator: true, schema: , , , , , @@ -535,7 +542,7 @@ CreatingSets | is_tidb_operator: false, schema: , | is_tidb_operator: true, schema: , | is_tidb_operator: false, schema: , - | is_tidb_operator: true, schema: , )", + | is_tidb_operator: true, schema: , )"), /*expected_streams=*/R"( CreatingSets HashJoinBuild x 2: , join_kind = Right diff --git a/dbms/src/Flash/tests/gtest_interpreter.out b/dbms/src/Flash/tests/gtest_interpreter.out deleted file mode 100644 index b6f18571402..00000000000 --- a/dbms/src/Flash/tests/gtest_interpreter.out +++ /dev/null @@ -1,489 +0,0 @@ -~test_suite_name: SingleQueryBlock -~result_index: 0 -~result: -Union: - SharedQuery x 10: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Filter: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - Filter: - MockTableScan -@ -~test_suite_name: SingleQueryBlock -~result_index: 1 -~result: -Union: - SharedQuery x 10: - Expression: - Expression: - Limit, limit = 10 - Union: - Limit x 10, limit = 10 - Filter: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - Filter: - MockTableScan -@ -~test_suite_name: MultipleQueryBlockWithSource -~result_index: 0 -~result: -Union: - Expression x 10: - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - MockTableScan -@ -~test_suite_name: MultipleQueryBlockWithSource -~result_index: 1 -~result: -Union: - Expression x 10: - Expression: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - MockTableScan -@ -~test_suite_name: MultipleQueryBlockWithSource -~result_index: 2 -~result: -Union: - Expression x 10: - Expression: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 
10, final: true - Expression x 10: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - MockTableScan -@ -~test_suite_name: MultipleQueryBlockWithSource -~result_index: 3 -~result: -Union: - SharedQuery x 10: - Expression: - Limit, limit = 10 - Union: - Limit x 10, limit = 10 - Expression: - Expression: - Expression: - Filter: - Expression: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - MockTableScan -@ -~test_suite_name: MultipleQueryBlockWithSource -~result_index: 4 -~result: -Union: - Expression x 10: - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - MockExchangeReceiver -@ -~test_suite_name: MultipleQueryBlockWithSource -~result_index: 5 -~result: -Union: - MockExchangeSender x 10 - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - MockExchangeReceiver -@ -~test_suite_name: Window -~result_index: 0 -~result: -Union: - Expression x 10: - Expression: - SharedQuery: - Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} - Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockTableScan -@ -~test_suite_name: Window -~result_index: 1 -~result: -Union: - Expression x 10: - Expression: - Expression: - Expression: - SharedQuery: - Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} - Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockTableScan -@ -~test_suite_name: Window -~result_index: 2 -~result: -Union: - Expression x 10: - Expression: - Expression: - Expression: - SharedQuery: - Window, function: {row_number}, frame: {type: Rows, 
boundary_begin: Current, boundary_end: Current} - Union: - Expression x 10: - Expression: - SharedQuery: - Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockTableScan -@ -~test_suite_name: FineGrainedShuffle -~result_index: 0 -~result: -Union: - Expression x 8: - Expression: - Window: , function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} - Expression: - MergeSorting: , limit = 0 - PartialSorting: : limit = 0 - Expression: - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffle -~result_index: 1 -~result: -Union: - SharedQuery x 10: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffle -~result_index: 2 -~result: -Union: - Expression x 10: - Expression: - SharedQuery: - Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} - Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffle -~result_index: 3 -~result: -Union: - SharedQuery x 10: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffleJoin -~result_index: 0 -~result: -CreatingSets - Union: - HashJoinBuild x 8: , join_kind = Left - Expression: - MockExchangeReceiver - Union: - Expression x 10: - HashJoinProbe: - Expression: - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffleJoin -~result_index: 1 -~result: -CreatingSets - Union: - HashJoinBuild x 8: , join_kind = Left - Expression: - MockExchangeReceiver - Union: - Expression x 5: - HashJoinProbe: - Expression: - MockTableScan -@ -~test_suite_name: FineGrainedShuffleJoin -~result_index: 2 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Left - Expression: - MockExchangeReceiver - Union: - Expression x 10: - 
HashJoinProbe: - Expression: - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffleAgg -~result_index: 0 -~result: -Union: - Expression x 8: - Aggregating: - MockExchangeReceiver -@ -~test_suite_name: FineGrainedShuffleAgg -~result_index: 1 -~result: -Union: - Expression x 10: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - MockExchangeReceiver x 10 -@ -~test_suite_name: Join -~result_index: 0 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Left - Expression: - MockTableScan - Union x 2: - HashJoinBuild x 10: , join_kind = Left - Expression: - HashJoinProbe: - Expression: - MockTableScan - Union: - Expression x 10: - HashJoinProbe: - Expression: - MockTableScan -@ -~test_suite_name: Join -~result_index: 1 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Left - Expression: - MockExchangeReceiver - Union x 2: - HashJoinBuild x 10: , join_kind = Left - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver - Union: - Expression x 10: - HashJoinProbe: - Expression: - MockExchangeReceiver -@ -~test_suite_name: Join -~result_index: 2 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Left - Expression: - MockExchangeReceiver - Union x 2: - HashJoinBuild x 10: , join_kind = Left - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver - Union: - MockExchangeSender x 10 - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver -@ -~test_suite_name: JoinThenAgg -~result_index: 0 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Left - Expression: - MockTableScan - Union: - Expression x 10: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - HashJoinProbe x 10: - Expression: - MockTableScan -@ -~test_suite_name: JoinThenAgg -~result_index: 1 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Right - Expression: - MockTableScan - Union: - Expression x 10: - SharedQuery: - ParallelAggregating, 
max_threads: 10, final: true - HashJoinProbe x 10: - Expression: - MockTableScan -@ -~test_suite_name: JoinThenAgg -~result_index: 2 -~result: -CreatingSets - Union: - HashJoinBuild x 20: , join_kind = Right - Expression: - MockExchangeReceiver - Union: - MockExchangeSender x 20 - SharedQuery: - Expression: - Expression: - Limit, limit = 10 - Union: - Limit x 20, limit = 10 - SharedQuery: - ParallelAggregating, max_threads: 20, final: true - HashJoinProbe x 20: - Expression: - MockExchangeReceiver -@ -~test_suite_name: ListBase -~result_index: 0 -~result: -Expression: - Limit, limit = 10 - Aggregating - Expression: - Filter: - MockTableScan -@ -~test_suite_name: ListBase -~result_index: 1 -~result: -Union: - SharedQuery x 20: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 20: limit = 10 - SharedQuery: - ParallelAggregating, max_threads: 20, final: true - Expression x 20: - Filter: - MockTableScan -@ -~test_suite_name: ExpandPlan -~result_index: 0 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Inner - Expression: - Expression: - Expression: - MockTableScan - Union: - SharedQuery x 10: - Expression: - MergeSorting, limit = 2 - Union: - PartialSorting x 10: limit = 2 - Expression: - Expression: - HashJoinProbe: - Expression: - Expression: }{}]> - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - MockExchangeReceiver x 10 -@ -~test_suite_name: Expand2Plan -~result_index: 0 -~result: -CreatingSets - Union: - HashJoinBuild x 10: , join_kind = Inner - Expression: - Expression: - Expression: - MockTableScan - Union: - SharedQuery x 10: - Expression: - MergeSorting, limit = 2 - Union: - PartialSorting x 10: limit = 2 - Expression: - Expression: - HashJoinProbe: - Expression: - Expand2: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - MockExchangeReceiver x 10 -@ diff --git a/dbms/src/Flash/tests/gtest_planner_interpreter.out b/dbms/src/Flash/tests/gtest_planner_interpreter.out 
index 2ae432cee0d..8e4532fe3d2 100644 --- a/dbms/src/Flash/tests/gtest_planner_interpreter.out +++ b/dbms/src/Flash/tests/gtest_planner_interpreter.out @@ -729,8 +729,8 @@ CreatingSets Expression: HashJoinProbe: Expression: - Expand2: + Expand2: Expression: SharedQuery: ParallelAggregating, max_threads: 10, final: true diff --git a/dbms/src/Functions/tests/gtest_ifnull.cpp b/dbms/src/Functions/tests/gtest_ifnull.cpp index d35e3264c99..5bc898af913 100644 --- a/dbms/src/Functions/tests/gtest_ifnull.cpp +++ b/dbms/src/Functions/tests/gtest_ifnull.cpp @@ -470,7 +470,7 @@ try test_type("Float64", "Float64", "Float64"); /// test type infer for string - test_type("String", "String", "String"); + test_type(DataTypeString::getDefaultName(), DataTypeString::getDefaultName(), DataTypeString::getDefaultName()); /// test type infer for decimal test_type("Decimal(5,3)", "Decimal(5,3)", "Decimal(5,3)"); diff --git a/dbms/src/Functions/tests/gtest_tidb_conversion.cpp b/dbms/src/Functions/tests/gtest_tidb_conversion.cpp index fa853f1c309..4157c467546 100644 --- a/dbms/src/Functions/tests/gtest_tidb_conversion.cpp +++ b/dbms/src/Functions/tests/gtest_tidb_conversion.cpp @@ -607,35 +607,53 @@ try /// null only cases ASSERT_COLUMN_EQ( createColumn>({{}}), - executeFunction(func_name, {createOnlyNullColumn(1), createCastTypeConstColumn("Nullable(String)")})); + executeFunction( + func_name, + {createOnlyNullColumn(1), createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); /// const cases // uint64/32/16/8 -> string ASSERT_COLUMN_EQ( createConstColumn(1, "18446744073709551615"), - executeFunction(func_name, {createConstColumn(1, MAX_UINT64), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_UINT64), createCastTypeConstColumn(DataTypeString::getDefaultName())})); ASSERT_COLUMN_EQ( createConstColumn(1, "4294967295"), - executeFunction(func_name, {createConstColumn(1, MAX_UINT32), createCastTypeConstColumn("String")})); + 
executeFunction( + func_name, + {createConstColumn(1, MAX_UINT32), createCastTypeConstColumn(DataTypeString::getDefaultName())})); ASSERT_COLUMN_EQ( createConstColumn(1, "65535"), - executeFunction(func_name, {createConstColumn(1, MAX_UINT16), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_UINT16), createCastTypeConstColumn(DataTypeString::getDefaultName())})); ASSERT_COLUMN_EQ( createConstColumn(1, "255"), - executeFunction(func_name, {createConstColumn(1, MAX_UINT8), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_UINT8), createCastTypeConstColumn(DataTypeString::getDefaultName())})); // int64/32/16/8 -> string ASSERT_COLUMN_EQ( createConstColumn(1, "9223372036854775807"), - executeFunction(func_name, {createConstColumn(1, MAX_INT64), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_INT64), createCastTypeConstColumn(DataTypeString::getDefaultName())})); ASSERT_COLUMN_EQ( createConstColumn(1, "2147483647"), - executeFunction(func_name, {createConstColumn(1, MAX_INT32), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_INT32), createCastTypeConstColumn(DataTypeString::getDefaultName())})); ASSERT_COLUMN_EQ( createConstColumn(1, "32767"), - executeFunction(func_name, {createConstColumn(1, MAX_INT16), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_INT16), createCastTypeConstColumn(DataTypeString::getDefaultName())})); ASSERT_COLUMN_EQ( createConstColumn(1, "127"), - executeFunction(func_name, {createConstColumn(1, MAX_INT8), createCastTypeConstColumn("String")})); + executeFunction( + func_name, + {createConstColumn(1, MAX_INT8), createCastTypeConstColumn(DataTypeString::getDefaultName())})); /// normal cases // uint64/32/16/8 -> string @@ -643,47 +661,51 @@ try createColumn>({"18446744073709551615", "0", {}}), 
executeFunction( func_name, - {createColumn>({MAX_UINT64, 0, {}}), createCastTypeConstColumn("Nullable(String)")})); + {createColumn>({MAX_UINT64, 0, {}}), + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); ASSERT_COLUMN_EQ( createColumn>({"4294967295", "0", {}}), executeFunction( func_name, - {createColumn>({MAX_UINT32, 0, {}}), createCastTypeConstColumn("Nullable(String)")})); + {createColumn>({MAX_UINT32, 0, {}}), + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); ASSERT_COLUMN_EQ( createColumn>({"65535", "0", {}}), executeFunction( func_name, - {createColumn>({MAX_UINT16, 0, {}}), createCastTypeConstColumn("Nullable(String)")})); + {createColumn>({MAX_UINT16, 0, {}}), + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); ASSERT_COLUMN_EQ( createColumn>({"255", "0", {}}), executeFunction( func_name, - {createColumn>({MAX_UINT8, 0, {}}), createCastTypeConstColumn("Nullable(String)")})); + {createColumn>({MAX_UINT8, 0, {}}), + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); // int64/32/16/8 -> string ASSERT_COLUMN_EQ( createColumn>({"9223372036854775807", "-9223372036854775808", "0", {}}), executeFunction( func_name, {createColumn>({MAX_INT64, MIN_INT64, 0, {}}), - createCastTypeConstColumn("Nullable(String)")})); + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); ASSERT_COLUMN_EQ( createColumn>({"2147483647", "-2147483648", "0", {}}), executeFunction( func_name, {createColumn>({MAX_INT32, MIN_INT32, 0, {}}), - createCastTypeConstColumn("Nullable(String)")})); + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); ASSERT_COLUMN_EQ( createColumn>({"32767", "-32768", "0", {}}), executeFunction( func_name, {createColumn>({MAX_INT16, MIN_INT16, 0, {}}), - createCastTypeConstColumn("Nullable(String)")})); + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); ASSERT_COLUMN_EQ( createColumn>({"127", "-128", "0", {}}), 
executeFunction( func_name, {createColumn>({MAX_INT8, MIN_INT8, 0, {}}), - createCastTypeConstColumn("Nullable(String)")})); + createCastTypeConstColumn(DataTypeString::getNullableDefaultName())})); } CATCH diff --git a/dbms/src/IO/Compression/CompressionSettings.h b/dbms/src/IO/Compression/CompressionSettings.h index a70eedff34b..7ba26ecb6e8 100644 --- a/dbms/src/IO/Compression/CompressionSettings.h +++ b/dbms/src/IO/Compression/CompressionSettings.h @@ -60,21 +60,31 @@ struct CompressionSetting : CompressionSetting(CompressionMethod::LZ4) {} - explicit CompressionSetting(CompressionMethod method_) + explicit CompressionSetting( + CompressionMethod method_, + CompressionDataType data_type_ = CompressionDataType::Unknown) : method(method_) , level(getDefaultLevel(method)) + , data_type(data_type_) , method_byte(method_byte_map[static_cast(method_)]) {} - explicit CompressionSetting(CompressionMethodByte method_byte_) + explicit CompressionSetting( + CompressionMethodByte method_byte_, + CompressionDataType data_type_ = CompressionDataType::Unknown) : method(method_map.at(method_byte_)) , level(getDefaultLevel(method)) + , data_type(data_type_) , method_byte(method_byte_) {} - CompressionSetting(CompressionMethod method_, int level_) + CompressionSetting( + CompressionMethod method_, + int level_, + CompressionDataType data_type_ = CompressionDataType::Unknown) : method(method_) , level(level_) + , data_type(data_type_) , method_byte(method_byte_map[static_cast(method_)]) {} diff --git a/dbms/src/Server/DTTool/DTToolBench.cpp b/dbms/src/Server/DTTool/DTToolBench.cpp index 37e1a543752..c5085d7b3e6 100644 --- a/dbms/src/Server/DTTool/DTToolBench.cpp +++ b/dbms/src/Server/DTTool/DTToolBench.cpp @@ -89,7 +89,7 @@ ColumnDefinesPtr createColumnDefines(size_t column_number) primitive->emplace_back(ColumnDefine{ static_cast(3 + int_num + i), fmt::format("str_{}", i), - DB::DataTypeFactory::instance().get("String")}); + 
DB::DataTypeFactory::instance().get(DataTypeString::getDefaultName())}); } return primitive; } @@ -172,7 +172,7 @@ DB::Block createBlock( { ColumnWithTypeAndName str_col( nullptr, - DB::DataTypeFactory::instance().get("String"), + DB::DataTypeFactory::instance().get(DataTypeString::getDefaultName()), fmt::format("str_{}", i), static_cast(3 + int_num + i)); IColumn::MutablePtr m_col = str_col.type->createColumn(); diff --git a/dbms/src/Storages/DeltaMerge/File/ColumnStat.h b/dbms/src/Storages/DeltaMerge/File/ColumnStat.h index b01f55fdd9c..13eb4078e01 100644 --- a/dbms/src/Storages/DeltaMerge/File/ColumnStat.h +++ b/dbms/src/Storages/DeltaMerge/File/ColumnStat.h @@ -37,8 +37,8 @@ struct ColumnStat size_t nullmap_data_bytes = 0; size_t nullmap_mark_bytes = 0; size_t index_bytes = 0; - size_t array_sizes_bytes = 0; - size_t array_sizes_mark_bytes = 0; + size_t sizes_bytes = 0; // Array sizes or String sizes, depends on the data type of this column + size_t sizes_mark_bytes = 0; std::vector vector_index; @@ -59,8 +59,8 @@ struct ColumnStat stat.set_nullmap_data_bytes(nullmap_data_bytes); stat.set_nullmap_mark_bytes(nullmap_mark_bytes); stat.set_index_bytes(index_bytes); - stat.set_array_sizes_bytes(array_sizes_bytes); - stat.set_array_sizes_mark_bytes(array_sizes_mark_bytes); + stat.set_sizes_bytes(sizes_bytes); + stat.set_sizes_mark_bytes(sizes_mark_bytes); for (const auto & vec_idx : vector_index) { @@ -86,8 +86,8 @@ struct ColumnStat nullmap_data_bytes = proto.nullmap_data_bytes(); nullmap_mark_bytes = proto.nullmap_mark_bytes(); index_bytes = proto.index_bytes(); - array_sizes_bytes = proto.array_sizes_bytes(); - array_sizes_mark_bytes = proto.array_sizes_mark_bytes(); + sizes_bytes = proto.sizes_bytes(); + sizes_mark_bytes = proto.sizes_mark_bytes(); if (proto.has_vector_index()) { diff --git a/dbms/src/Storages/DeltaMerge/File/DMFile.cpp b/dbms/src/Storages/DeltaMerge/File/DMFile.cpp index a3f65d4c6c1..47191e835d4 100644 --- 
a/dbms/src/Storages/DeltaMerge/File/DMFile.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFile.cpp @@ -243,7 +243,8 @@ size_t DMFile::colDataSize(ColId id, ColDataType type) const case ColDataType::NullMap: return itr->second.nullmap_data_bytes; case ColDataType::ArraySizes: - return itr->second.array_sizes_bytes; + case ColDataType::StringSizes: + return itr->second.sizes_bytes; } } else @@ -258,9 +259,11 @@ size_t DMFile::colDataSize(ColId id, ColDataType type) const namebase = getFileNameBase(id, {IDataType::Substream::NullMap}); break; case ColDataType::ArraySizes: + case ColDataType::StringSizes: RUNTIME_CHECK_MSG( - type != ColDataType::ArraySizes, - "Can not get array map size by filename, col_id={} path={}", + false, + "Can not get size of {} by filename, col_id={} path={}", + magic_enum::enum_name(type), id, path()); break; diff --git a/dbms/src/Storages/DeltaMerge/File/DMFile.h b/dbms/src/Storages/DeltaMerge/File/DMFile.h index 41fc7e060f2..76c28975ddb 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFile.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFile.h @@ -273,6 +273,7 @@ class DMFile : private boost::noncopyable Elements, NullMap, ArraySizes, + StringSizes, }; size_t colDataSize(ColId id, ColDataType type) const; diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileMeta.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileMeta.cpp index ec4ca7c0047..4a4169cbce1 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileMeta.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileMeta.cpp @@ -418,14 +418,14 @@ UInt64 DMFileMeta::getFileSize(ColId col_id, const String & filename) const { return itr->second.nullmap_mark_bytes; } - // Note that ".size0.dat"/".size0.mrk" must be check before ".dat"/".mrk" - else if (endsWith(filename, ".size0.dat")) + // Note that ".size0.dat"/".size0.mrk"/".size.dat"/".size.mrk" must be check before ".dat"/".mrk" + else if (endsWith(filename, ".size0.dat") || endsWith(filename, ".size.dat")) { - return itr->second.array_sizes_bytes; + return 
itr->second.sizes_bytes; } - else if (endsWith(filename, ".size0.mrk")) + else if (endsWith(filename, ".size0.mrk") || endsWith(filename, ".size.mrk")) { - return itr->second.array_sizes_mark_bytes; + return itr->second.sizes_mark_bytes; } else if (endsWith(filename, ".dat")) { diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp index 2e6ca7276c4..a5f3f9810e6 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp @@ -370,11 +370,14 @@ void DMFileMetaV2::finalizeSmallFiles( delete_file_name.emplace_back(std::move(fname)); } - // check .size0.dat - if (stat.array_sizes_bytes > 0 && stat.array_sizes_bytes <= small_file_size_threshold) + // check .size0.dat and .size.dat + if (stat.sizes_bytes > 0 && stat.sizes_bytes <= small_file_size_threshold) { - auto fname = colDataFileName(getFileNameBase(col_id, {IDataType::Substream::ArraySizes})); - auto fsize = stat.array_sizes_bytes; + auto substream = removeNullable(stat.type)->getTypeId() == TypeIndex::String + ? 
IDataType::Substream::StringSizes + : IDataType::Substream::ArraySizes; + auto fname = colDataFileName(getFileNameBase(col_id, {substream})); + auto fsize = stat.sizes_bytes; copy_file_to_cur(fname, fsize); delete_file_name.emplace_back(std::move(fname)); } diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp index 3186193e13a..78f415206a4 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp @@ -204,7 +204,8 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta const auto & cd = read_columns[col_idx]; // Save index and update column stats auto callback = [&](const IDataType::SubstreamPath & substream_path) -> void { - if (IDataType::isNullMap(substream_path) || IDataType::isArraySizes(substream_path)) + if (IDataType::isNullMap(substream_path) || IDataType::isArraySizes(substream_path) + || IDataType::isStringSizes(substream_path)) return; std::vector new_indexes; diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp index 780567b22ff..fe8a6b3d505 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp @@ -113,7 +113,8 @@ void DMFileWriter::addStreams(ColId col_id, DataTypePtr type, bool do_index) { auto callback = [&](const IDataType::SubstreamPath & substream_path) { const auto stream_name = DMFile::getFileNameBase(col_id, substream_path); - bool substream_can_index = !IDataType::isNullMap(substream_path) && !IDataType::isArraySizes(substream_path); + bool substream_can_index = !IDataType::isNullMap(substream_path) && !IDataType::isArraySizes(substream_path) + && !IDataType::isStringSizes(substream_path); auto stream = std::make_unique( dmfile, stream_name, @@ -125,7 +126,6 @@ void DMFileWriter::addStreams(ColId col_id, DataTypePtr 
type, bool do_index) do_index && substream_can_index); column_streams.emplace(stream_name, std::move(stream)); }; - type->enumerateStreams(callback, {}); } @@ -280,6 +280,7 @@ void DMFileWriter::finalizeColumn(ColId col_id, DataTypePtr type) const bool is_null = IDataType::isNullMap(substream); const bool is_array = IDataType::isArraySizes(substream); + const bool is_string_sizes = IDataType::isStringSizes(substream); // v3 if (dmfile->useMetaV2()) @@ -296,9 +297,9 @@ void DMFileWriter::finalizeColumn(ColId col_id, DataTypePtr type) { col_stat.nullmap_data_bytes = stream->plain_file->getMaterializedBytes(); } - else if (is_array) + else if (is_array || is_string_sizes) { - col_stat.array_sizes_bytes = stream->plain_file->getMaterializedBytes(); + col_stat.sizes_bytes = stream->plain_file->getMaterializedBytes(); } else { @@ -365,9 +366,9 @@ void DMFileWriter::finalizeColumn(ColId col_id, DataTypePtr type) { col_stat.nullmap_mark_bytes = mark_size; } - else if (is_array) + else if (is_array || is_string_sizes) { - col_stat.array_sizes_mark_bytes = mark_size; + col_stat.sizes_mark_bytes = mark_size; } else { diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h index b7836c21321..1871c9bef29 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h @@ -69,10 +69,7 @@ class DMFileWriter , minmaxes(do_index ? 
std::make_shared(*type) : nullptr) { assert(compression_settings.settings.size() == 1); - auto setting = CompressionSetting::create<>( - compression_settings.settings[0].method, - compression_settings.settings[0].level, - *type); + auto setting = getCompressionSetting(type, file_base_name, compression_settings.settings[0]); compressed_buf = CompressedWriteBuffer<>::build( *plain_file, CompressionSettings(setting), @@ -97,6 +94,23 @@ class DMFileWriter } } + static bool isStringSizes(const DataTypePtr & type, const String & file_base_name) + { + return removeNullable(type)->getTypeId() == TypeIndex::String && file_base_name.ends_with(".size"); + } + + static CompressionSetting getCompressionSetting( + const DataTypePtr & type, + const String & file_base_name, + const CompressionSetting & setting) + { + // Force use Lightweight compression for string sizes, since the string sizes almost always small. + // Performance of LZ4 to decompress such integers is not good. + return isStringSizes(type, file_base_name) + ? CompressionSetting{CompressionMethod::Lightweight, CompressionDataType::Int64} + : CompressionSetting::create<>(setting.method, setting.level, *type); + } + // compressed_buf -> plain_file WriteBufferFromFileBasePtr plain_file; WriteBufferPtr compressed_buf; diff --git a/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto b/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto index 1161ee6c7b4..ca2cf1dbcc3 100644 --- a/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto +++ b/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto @@ -61,8 +61,8 @@ message ColumnStat { optional uint64 nullmap_data_bytes = 7; optional uint64 nullmap_mark_bytes = 8; optional uint64 index_bytes = 9; - optional uint64 array_sizes_bytes = 10; - optional uint64 array_sizes_mark_bytes = 11; + optional uint64 sizes_bytes = 10; + optional uint64 sizes_mark_bytes = 11; // Only used in tests. Modifying other fields of ColumnStat is hard. 
optional string additional_data_for_test = 101; diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_column_filter.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_column_filter.cpp index c2ffd8ae5cd..2701ed4f11f 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_column_filter.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_column_filter.cpp @@ -37,7 +37,7 @@ class DebugBlockInputStream : public BlocksListBlockInputStream { auto cds = DMTestEnv::getDefaultColumns( is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID); - cds->push_back(ColumnDefine(100, str_col_name, DataTypeFactory::instance().get("String"))); + cds->emplace_back(100, str_col_name, DataTypeFactory::instance().get(DataTypeString::getDefaultName())); return toEmptyBlock(*cds); } diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp index cf1d3777b23..226a3bb4cea 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp @@ -2334,7 +2334,7 @@ TEST_P(DeltaMergeStoreRWTest, DDLAddColumnString) try { const String col_name_to_add = "string"; - const DataTypePtr col_type_to_add = DataTypeFactory::instance().get("String"); + const DataTypePtr col_type_to_add = DataTypeFactory::instance().get(DataTypeString::getDefaultName()); // write some rows before DDL size_t num_rows_write = 1; @@ -2761,6 +2761,152 @@ try } CATCH +namespace +{ +const ColumnDefine legacy_str_cd(2, "col2", DataTypeFactory::instance().get(DataTypeString::LegacyName)); +const ColumnDefine str_cd(2, "col2", DataTypeFactory::instance().get(DataTypeString::NameV2)); + +Block createBlock(const ColumnDefine & cd, size_t begin, size_t end) +{ + auto block = DMTestEnv::prepareSimpleWriteBlock(begin, end, false); + auto col = cd.type->createColumn(); + for (size_t i = begin; i < end; ++i) + 
col->insert(makeField(std::to_string(i))); + block.insert(ColumnWithTypeAndName{std::move(col), cd.type, cd.name, cd.id}); + return block; +} + +} // namespace + +TEST_F(DeltaMergeStoreTest, ReadLegacyStringData_CFTiny) +try +{ + // Write legacy string data to CFTiny. + { + auto table_column_defines = DMTestEnv::getDefaultColumns(); + table_column_defines->emplace_back(legacy_str_cd); + dropDataOnDisk(getTemporaryPath()); + store = reload(table_column_defines); + auto block = createBlock(legacy_str_cd, 0, 128); + store->write(*db_context, db_context->getSettingsRef(), block); + auto flush_res = store->flushCache( + *db_context, + {RowKeyRange::newAll(store->isCommonHandle(), store->getRowKeyColumnSize())}); + ASSERT_TRUE(flush_res); + ASSERT_EQ(store->segments.size(), 1); + auto seg = store->segments.begin()->second; + ASSERT_EQ(seg->delta->getMemTableSet()->getColumnFileCount(), 0); + ASSERT_EQ(seg->delta->getPersistedFileSet()->getColumnFileCount(), 1); + const auto * cf_tiny = seg->delta->getPersistedFileSet()->getFiles()[0]->tryToTinyFile(); + ASSERT_NE(cf_tiny, nullptr); + const auto & schema = cf_tiny->getSchema()->getSchema(); + auto col_type_name = schema.getByName(legacy_str_cd.name); + ASSERT_EQ(col_type_name.type->getName(), DataTypeString::LegacyName); + } + + { + // Mock that after restart, the data type has been changed to new serialize. But still can read old + // serialized format data. 
+ auto table_column_defines = DMTestEnv::getDefaultColumns(); + table_column_defines->emplace_back(str_cd); + store = reload(table_column_defines); + } + + { + auto in = store->read( + *db_context, + db_context->getSettingsRef(), + {str_cd}, + {RowKeyRange::newAll(store->isCommonHandle(), store->getRowKeyColumnSize())}, + /* num_streams= */ 1, + /* start_ts= */ std::numeric_limits::max(), + EMPTY_FILTER, + std::vector{}, + 0, + "", + /* keep_order= */ false, + /* is_fast_scan= */ false, + /* expected_block_size= */ 1024)[0]; + auto block = in->read(); + ASSERT_EQ(block.rows(), 128); + + auto col_type_name = block.getByName(str_cd.name); + ASSERT_EQ(col_type_name.name, str_cd.name); + ASSERT_EQ(col_type_name.type->getName(), DataTypeString::NameV2); + + for (size_t i = 0; i < block.rows(); i++) + { + auto s = col_type_name.column->getDataAt(i).toStringView(); + ASSERT_EQ(s, std::to_string(i)); + } + } +} +CATCH + +TEST_F(DeltaMergeStoreTest, ReadLegacyStringData_DMFile) +try +{ + // Write legacy string data to DMFile. + { + auto table_column_defines = DMTestEnv::getDefaultColumns(); + table_column_defines->emplace_back(legacy_str_cd); + dropDataOnDisk(getTemporaryPath()); + store = reload(table_column_defines); + auto block = createBlock(legacy_str_cd, 0, 128); + store->write(*db_context, db_context->getSettingsRef(), block); + + ASSERT_TRUE(store->mergeDeltaAll(*db_context)); + + ASSERT_EQ(store->segments.size(), 1); + auto seg = store->segments.begin()->second; + const auto & dmfiles = seg->stable->getDMFiles(); + ASSERT_EQ(dmfiles.size(), 1); + const auto & column_stats = dmfiles.front()->getColumnStats(); + auto itr = column_stats.find(legacy_str_cd.id); + ASSERT_NE(itr, column_stats.end()); + const auto & column_stat = itr->second; + ASSERT_EQ(column_stat.type->getName(), DataTypeString::LegacyName); + } + + { + // Mock that after restart, the data type has been changed to new serialize. But still can read old + // serialized format data. 
+ auto table_column_defines = DMTestEnv::getDefaultColumns(); + table_column_defines->emplace_back(str_cd); + store = reload(table_column_defines); + } + + { + auto in = store->read( + *db_context, + db_context->getSettingsRef(), + {str_cd}, + {RowKeyRange::newAll(store->isCommonHandle(), store->getRowKeyColumnSize())}, + /* num_streams= */ 1, + /* start_ts= */ std::numeric_limits::max(), + EMPTY_FILTER, + std::vector{}, + 0, + "", + /* keep_order= */ false, + /* is_fast_scan= */ false, + /* expected_block_size= */ 1024)[0]; + auto block = in->read(); + ASSERT_EQ(block.rows(), 128); + + auto col_type_name = block.getByName(str_cd.name); + ASSERT_EQ(col_type_name.name, str_cd.name); + ASSERT_EQ(col_type_name.type->getName(), DataTypeString::NameV2); + + for (size_t i = 0; i < block.rows(); i++) + { + auto s = col_type_name.column->getDataAt(i).toStringView(); + ASSERT_EQ(s, std::to_string(i)); + } + } +} +CATCH + TEST_P(DeltaMergeStoreRWTest, SimpleWriteReadCommonHandle) try { diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_file.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_file.cpp index cc854e73843..738eb68d0c8 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_file.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_file.cpp @@ -1830,7 +1830,7 @@ try { auto cols = DMTestEnv::getDefaultColumns(); // Prepare columns - ColumnDefine fixed_str_col(2, "str", typeFromString("String")); + ColumnDefine fixed_str_col(2, "str", typeFromString(DataTypeString::getDefaultName())); cols->push_back(fixed_str_col); reload(cols); @@ -2268,7 +2268,7 @@ try auto cols_after_ddl = std::make_shared(); *cols_after_ddl = cols_before_ddl; // A new string column - ColumnDefine new_s_col(100, "s", typeFromString("String")); + ColumnDefine new_s_col(100, "s", typeFromString(DataTypeString::getDefaultName())); cols_after_ddl->emplace_back(new_s_col); // A new int64 column with default value 5 ColumnDefine new_i_col_with_default(101, "i", typeFromString("Int64")); diff 
--git a/dbms/src/Storages/DeltaMerge/tests/gtest_version_filter.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_version_filter.cpp index 8cdb24588f0..aee463ff075 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_version_filter.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_version_filter.cpp @@ -42,7 +42,10 @@ class DebugBlockInputStream : public IProfilingBlockInputStream { auto cds = DMTestEnv::getDefaultColumns( is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID); - cds->push_back(ColumnDefine(extra_column_id, str_col_name, DataTypeFactory::instance().get("String"))); + cds->emplace_back( + extra_column_id, + str_col_name, + DataTypeFactory::instance().get(DataTypeString::getDefaultName())); return toEmptyBlock(*cds); } diff --git a/dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp b/dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp index 26fb1ab470b..611b69ed71b 100644 --- a/dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp @@ -223,7 +223,7 @@ class TableDataType "Int64", "Float32", "Float64", - "String", + DataTypeString::getDefaultName(), "MyDate", "MyDateTime", "Enum16", @@ -315,7 +315,7 @@ class ConstantTableGenerator : public TableGenerator "Int64", "Float32", "Float64", - "String", + DataTypeString::getDefaultName(), "MyDate", "MyDateTime", "Enum16", @@ -352,4 +352,4 @@ std::unique_ptr TableGenerator::create(const WorkloadOptions & o throw std::invalid_argument(fmt::format("TableGenerator::create '{}' not support.", table)); } } -} // namespace DB::DM::tests \ No newline at end of file +} // namespace DB::DM::tests diff --git a/dbms/src/Storages/FormatVersion.h b/dbms/src/Storages/FormatVersion.h index c7b39cc31a5..0137a94c0b3 100644 --- a/dbms/src/Storages/FormatVersion.h +++ b/dbms/src/Storages/FormatVersion.h @@ -151,6 +151,16 @@ inline static const StorageFormatVersion STORAGE_FORMAT_V7 = StorageFormatVersio .identifier = 7, 
}; +inline static const StorageFormatVersion STORAGE_FORMAT_V8 = StorageFormatVersion{ + // diff is DataTypeString.DefaultName + .segment = SegmentFormat::V3, + .dm_file = DMFileFormat::V3, + .stable = StableFormat::V2, + .delta = DeltaFormat::V4, + .page = PageFormat::V3, + .identifier = 8, +}; + // STORAGE_FORMAT_V100 is used for S3 only inline static const StorageFormatVersion STORAGE_FORMAT_V100 = StorageFormatVersion{ .segment = SegmentFormat::V2, @@ -181,6 +191,17 @@ inline static const StorageFormatVersion STORAGE_FORMAT_V102 = StorageFormatVers .identifier = 102, }; +// STORAGE_FORMAT_V103 is used for S3 only +inline static const StorageFormatVersion STORAGE_FORMAT_V103 = StorageFormatVersion{ + // diff is DataTypeString.DefaultName + .segment = SegmentFormat::V3, + .dm_file = DMFileFormat::V3, + .stable = StableFormat::V2, + .delta = DeltaFormat::V4, + .page = PageFormat::V4, + .identifier = 103, +}; + inline StorageFormatVersion STORAGE_FORMAT_CURRENT = STORAGE_FORMAT_V7; inline const StorageFormatVersion & toStorageFormat(UInt64 setting) diff --git a/dbms/src/Storages/KVStore/Decode/DecodingStorageSchemaSnapshot.cpp b/dbms/src/Storages/KVStore/Decode/DecodingStorageSchemaSnapshot.cpp index b43e053768f..b615e288fd8 100644 --- a/dbms/src/Storages/KVStore/Decode/DecodingStorageSchemaSnapshot.cpp +++ b/dbms/src/Storages/KVStore/Decode/DecodingStorageSchemaSnapshot.cpp @@ -108,15 +108,15 @@ DecodingStorageSchemaSnapshot::DecodingStorageSchemaSnapshot( TMTPKType getTMTPKType(const IDataType & rhs) { - static const DataTypeInt64 & dataTypeInt64 = {}; // NOLINT - static const DataTypeUInt64 & dataTypeUInt64 = {}; // NOLINT - static const DataTypeString & dataTypeString = {}; // NOLINT + static const DataTypeInt64 data_type_int64; + static const DataTypeUInt64 data_type_uint64; + static const DataTypeString data_type_string; - if (rhs.equals(dataTypeInt64)) + if (rhs.equals(data_type_int64)) return TMTPKType::INT64; - else if (rhs.equals(dataTypeUInt64)) + else 
if (rhs.equals(data_type_uint64)) return TMTPKType::UINT64; - else if (rhs.equals(dataTypeString)) + else if (rhs.equals(data_type_string)) return TMTPKType::STRING; return TMTPKType::UNSPECIFIED; } diff --git a/dbms/src/Storages/MutableSupport.cpp b/dbms/src/Storages/MutableSupport.cpp index 3e5a76b7913..d8b91220375 100644 --- a/dbms/src/Storages/MutableSupport.cpp +++ b/dbms/src/Storages/MutableSupport.cpp @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include - namespace DB { const String MutableSupport::mmt_storage_name = "MutableMergeTree"; @@ -27,7 +27,8 @@ const String MutableSupport::delmark_column_name = "_INTERNAL_DELMARK"; const String MutableSupport::extra_table_id_column_name = "_tidb_tid"; const DataTypePtr MutableSupport::tidb_pk_column_int_type = DataTypeFactory::instance().get("Int64"); -const DataTypePtr MutableSupport::tidb_pk_column_string_type = DataTypeFactory::instance().get("String"); +const DataTypePtr MutableSupport::tidb_pk_column_string_type + = DataTypeFactory::instance().get(DataTypeString::getDefaultName()); const DataTypePtr MutableSupport::version_column_type = DataTypeFactory::instance().get("UInt64"); const DataTypePtr MutableSupport::delmark_column_type = DataTypeFactory::instance().get("UInt8"); /// it should not be nullable, but TiDB does not set not null flag for extra_table_id_column_type, so has to align with TiDB diff --git a/dbms/src/TestUtils/InterpreterTestUtils.cpp b/dbms/src/TestUtils/InterpreterTestUtils.cpp index 48b6e6f5693..acf5ea639a2 100644 --- a/dbms/src/TestUtils/InterpreterTestUtils.cpp +++ b/dbms/src/TestUtils/InterpreterTestUtils.cpp @@ -21,6 +21,7 @@ #include #include +#include #include namespace DB::tests @@ -73,6 +74,7 @@ void InterpreterTestUtils::initExpectResults() assert(spilts.size() == 3); auto suite_key = fmt::format("~{}", Poco::trim(spilts[0])); auto unit_result = fmt::format("~{}", Poco::trim(spilts[2])); + 
boost::replace_all(unit_result, "{StringName}", DataTypeString::getDefaultName()); case_expect_results[suite_key].push_back(unit_result); } } diff --git a/dbms/src/TestUtils/tests/gtest_column_generator.cpp b/dbms/src/TestUtils/tests/gtest_column_generator.cpp index 873ca158048..e1d2247c46e 100644 --- a/dbms/src/TestUtils/tests/gtest_column_generator.cpp +++ b/dbms/src/TestUtils/tests/gtest_column_generator.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include @@ -34,7 +35,7 @@ try "UInt64", "Float32", "Float64", - "String", + DataTypeString::getDefaultName(), "MyDateTime", "MyDate", "Decimal"}; diff --git a/dbms/src/TiDB/Schema/tests/gtest_schema_sync.cpp b/dbms/src/TiDB/Schema/tests/gtest_schema_sync.cpp index 5ff90cbf468..03c1473c347 100644 --- a/dbms/src/TiDB/Schema/tests/gtest_schema_sync.cpp +++ b/dbms/src/TiDB/Schema/tests/gtest_schema_sync.cpp @@ -253,7 +253,7 @@ try MockTiDB::instance().newDataBase(db_name); auto cols = ColumnsDescription({ - {"col1", typeFromString("String")}, + {"col1", typeFromString(DataTypeString::getDefaultName())}, {"col2", typeFromString("Int64")}, }); // table_name, cols, pk_name @@ -295,7 +295,7 @@ try MockTiDB::instance().newDataBase(db_name); auto cols = ColumnsDescription({ - {"col1", typeFromString("String")}, + {"col1", typeFromString(DataTypeString::getDefaultName())}, {"col2", typeFromString("Int64")}, }); // table_name, cols, pk_name @@ -351,7 +351,7 @@ try MockTiDB::instance().newDataBase(db_name); auto cols = ColumnsDescription({ - {"col1", typeFromString("String")}, + {"col1", typeFromString(DataTypeString::getDefaultName())}, {"col2", typeFromString("Int64")}, }); // table_name, cols, pk_name @@ -423,7 +423,7 @@ try const String tbl_name = "mock_part_tbl"; auto cols = ColumnsDescription({ - {"col1", typeFromString("String")}, + {"col1", typeFromString(DataTypeString::getDefaultName())}, {"col2", 
typeFromString("Int64")}, }); @@ -480,7 +480,7 @@ try const String tbl_name = "mock_part_tbl"; auto cols = ColumnsDescription({ - {"col1", typeFromString("String")}, + {"col1", typeFromString(DataTypeString::getDefaultName())}, {"col2", typeFromString("Int64")}, }); @@ -577,7 +577,7 @@ try const String tbl_name = "mock_part_tbl"; auto cols = ColumnsDescription({ - {"col1", typeFromString("String")}, + {"col1", typeFromString(DataTypeString::getDefaultName())}, {"col2", typeFromString("Int64")}, }); @@ -703,7 +703,7 @@ try const String tbl_name = "mock_part_tbl"; auto cols = ColumnsDescription({ - {"col_1", typeFromString("String")}, + {"col_1", typeFromString(DataTypeString::getDefaultName())}, {"col_2", typeFromString("Int64")}, }); diff --git a/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp b/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp index 337b086e655..74b52ff57e4 100644 --- a/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp +++ b/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include using TableInfo = TiDB::TableInfo; using DBInfo = TiDB::DBInfo; @@ -296,6 +298,9 @@ struct StmtCase TEST(TiDBTableInfoTest, GenCreateTableStatement) try { + auto replace_string_name = [](String s) { + return boost::replace_all_copy(s, "{StringName}", DataTypeString::getDefaultName()); + }; // clang-format off auto cases = { @@ -340,7 +345,7 @@ try R"json({"id":2,"db_name":{"O":"db2","L":"db2"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", R"json({"id":37,"name":{"O":"mytable","L":"mytable"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"mycol","L":"mycol"},"offset":0,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":4099,"Flen":256,"Decimal":0,"Charset":"utf8","Collate":"utf8_bin","Elems":null},"state":5}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"mycol","L":"mycol"},"offset":0,"length":-1}],"is_unique":true,"is_primary":true,"state":5,"index_type":1}],"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"auto_inc_id":0,"max_col_id":1,"max_idx_id":1,"update_timestamp":404566455285710853,"ShardRowIDBits":0,"partition":null})json", // // The primary index is kept - R"stmt(CREATE TABLE `db_2`.`t_37`(`mycol` String) Engine = DeltaMerge((`mycol`), 
'{"cols":[{"id":1,"name":{"L":"mycol","O":"mycol"},"offset":0,"state":5,"type":{"Charset":"utf8","Collate":"utf8_bin","Decimal":0,"Flag":4099,"Flen":256,"Tp":15}}],"id":37,"index_info":[{"id":1,"idx_cols":[{"length":-1,"name":{"L":"mycol","O":"mycol"},"offset":0}],"idx_name":{"L":"primary","O":"primary"},"index_type":1,"is_global":false,"is_invisible":false,"is_primary":true,"is_unique":true,"state":5}],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"mytable","O":"mytable"},"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404566455285710853}', 0))stmt", // + replace_string_name(R"stmt(CREATE TABLE `db_2`.`t_37`(`mycol` {StringName}) Engine = DeltaMerge((`mycol`), '{"cols":[{"id":1,"name":{"L":"mycol","O":"mycol"},"offset":0,"state":5,"type":{"Charset":"utf8","Collate":"utf8_bin","Decimal":0,"Flag":4099,"Flen":256,"Tp":15}}],"id":37,"index_info":[{"id":1,"idx_cols":[{"length":-1,"name":{"L":"mycol","O":"mycol"},"offset":0}],"idx_name":{"L":"primary","O":"primary"},"index_type":1,"is_global":false,"is_invisible":false,"is_primary":true,"is_unique":true,"state":5}],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"mytable","O":"mytable"},"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404566455285710853}', 0))stmt"), // }, StmtCase{ 32, // @@ -361,7 +366,7 @@ try 0, R"json({"id":2,"db_name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // 
R"json({"id":546,"name":{"O":"tcfc7825f","L":"tcfc7825f"},"charset":"utf8mb4","collate":"utf8mb4_general_ci","cols":[{"id":1,"name":{"O":"col_86","L":"col_86"},"offset":0,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":252,"Flag":128,"Flen":65535,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":2,"name":{"O":"col_87","L":"col_87"},"offset":1,"default":"1994-05-0600:00:00","default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":12,"Flag":129,"Flen":19,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"col_88","L":"col_88"},"offset":2,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":16,"Flag":32,"Flen":42,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"col_89","L":"col_89"},"offset":3,"default":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000","default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":254,"Flag":129,"Flen":21,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":5,"name":{"O":"col_90","L":"col_90"},"offset":4,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":1,"Flag":4129,"Flen":3,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden"
:false,"change_state_info":null,"version":2},{"id":6,"name":{"O":"col_91","L":"col_91"},"offset":5,"default":"\u0007\u0007","default_bit":"Bwc=","default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":16,"Flag":32,"Flen":12,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":7,"name":{"O":"col_92","L":"col_92"},"offset":6,"default":"kY~6to6H4ut*QAPrj@\u0026","default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":129,"Flen":343,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":8,"name":{"O":"col_93","L":"col_93"},"offset":7,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":245,"Flag":128,"Flen":4294967295,"Decimal":0,"Charset":"binary","Collate":"binary","ElemsIsBinaryLit":null,"Array":false},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":null,"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":false,"common_handle_version":0,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":8,"max_idx_id":0,"max_fk_id":0,"max_cst_id":0,"update_timestamp":452653255976550448,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"auto_random_range_bits":0,"pre_split_regions":0,"compression":"","view":null,"sequence":null,"Lock":null,"version":5,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null,"exchange_partition_info":null,"ttl_info":null,"revision":1})json", // - R"stmt(CREATE TABLE `db_2`.`t_546`(`col_86` 
Nullable(String), `col_87` MyDateTime(0), `col_88` Nullable(UInt64), `col_89` String, `col_90` UInt8, `col_91` Nullable(UInt64), `col_92` String, `col_93` Nullable(String), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"id":1,"name":{"L":"col_86","O":"col_86"},"offset":0,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":128,"Flen":65535,"Tp":252}},{"default":"1994-05-0600:00:00","id":2,"name":{"L":"col_87","O":"col_87"},"offset":1,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":129,"Flen":19,"Tp":12}},{"id":3,"name":{"L":"col_88","O":"col_88"},"offset":2,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":32,"Flen":42,"Tp":16}},{"default":"\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000","id":4,"name":{"L":"col_89","O":"col_89"},"offset":3,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":129,"Flen":21,"Tp":254}},{"id":5,"name":{"L":"col_90","O":"col_90"},"offset":4,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":4129,"Flen":3,"Tp":1}},{"default":"\\u0007\\u0007","default_bit":"Bwc=","id":6,"name":{"L":"col_91","O":"col_91"},"offset":5,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":32,"Flen":12,"Tp":16}},{"default":"kY~6to6H4ut*QAPrj@&","id":7,"name":{"L":"col_92","O":"col_92"},"offset":6,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":129,"Flen":343,"Tp":15}},{"id":8,"name":{"L":"col_93","O":"col_93"},"offset":7,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":128,"Flen":-1,"Tp":245}}],"id":546,"index_info":[],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"tcfc7825f","O":"tcfc7825f"},"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Available":false,"Count":1},"update_timestamp":452653255976550448}', 
0))stmt", // + replace_string_name(R"stmt(CREATE TABLE `db_2`.`t_546`(`col_86` Nullable({StringName}), `col_87` MyDateTime(0), `col_88` Nullable(UInt64), `col_89` {StringName}, `col_90` UInt8, `col_91` Nullable(UInt64), `col_92` {StringName}, `col_93` Nullable({StringName}), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"id":1,"name":{"L":"col_86","O":"col_86"},"offset":0,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":128,"Flen":65535,"Tp":252}},{"default":"1994-05-0600:00:00","id":2,"name":{"L":"col_87","O":"col_87"},"offset":1,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":129,"Flen":19,"Tp":12}},{"id":3,"name":{"L":"col_88","O":"col_88"},"offset":2,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":32,"Flen":42,"Tp":16}},{"default":"\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000","id":4,"name":{"L":"col_89","O":"col_89"},"offset":3,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":129,"Flen":21,"Tp":254}},{"id":5,"name":{"L":"col_90","O":"col_90"},"offset":4,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":4129,"Flen":3,"Tp":1}},{"default":"\\u0007\\u0007","default_bit":"Bwc=","id":6,"name":{"L":"col_91","O":"col_91"},"offset":5,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":32,"Flen":12,"Tp":16}},{"default":"kY~6to6H4ut*QAPrj@&","id":7,"name":{"L":"col_92","O":"col_92"},"offset":6,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":129,"Flen":343,"Tp":15}},{"id":8,"name":{"L":"col_93","O":"col_93"},"offset":7,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":128,"Flen":-1,"Tp":245}}],"id":546,"index_info":[],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"tcfc7825f","O":"tcfc7825f"},"pk_is_handle":false,"schema_version":-1
,"state":5,"tiflash_replica":{"Available":false,"Count":1},"update_timestamp":452653255976550448}', 0))stmt"), // }, }; // clang-format on diff --git a/dbms/src/TiDB/tests/gtest_type_mapping.cpp b/dbms/src/TiDB/tests/gtest_type_mapping.cpp index f9e5b439022..a22674bb506 100644 --- a/dbms/src/TiDB/tests/gtest_type_mapping.cpp +++ b/dbms/src/TiDB/tests/gtest_type_mapping.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -73,10 +74,24 @@ try } } - column_info = reverseGetColumnInfo(NameAndTypePair{name, typeFromString("String")}, 1, default_field, true); - ASSERT_EQ(column_info.tp, TiDB::TypeString); - auto data_type = getDataTypeByColumnInfo(column_info); - ASSERT_EQ(data_type->getName(), "String"); + { + auto legacy_str_type = typeFromString(DataTypeString::LegacyName); + ASSERT_EQ(legacy_str_type->getName(), DataTypeString::LegacyName); + column_info = reverseGetColumnInfo(NameAndTypePair{name, legacy_str_type}, 1, default_field, true); + ASSERT_EQ(column_info.tp, TiDB::TypeString); + auto data_type = getDataTypeByColumnInfo(column_info); + // Get data type by column_info always returns the default type. + ASSERT_EQ(data_type->getName(), DataTypeString::getDefaultName()); + } + { + auto str_type = typeFromString(DataTypeString::NameV2); + ASSERT_EQ(str_type->getName(), DataTypeString::NameV2); + column_info = reverseGetColumnInfo(NameAndTypePair{name, str_type}, 1, default_field, true); + ASSERT_EQ(column_info.tp, TiDB::TypeString); + auto data_type = getDataTypeByColumnInfo(column_info); + // Get data type by column_info always returns the default type. + ASSERT_EQ(data_type->getName(), DataTypeString::getDefaultName()); + } // TODO: test decimal, datetime, enum }