From ff30209dbe7d1383a9cc41f5205b9f9df4438e5e Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Sun, 18 Feb 2024 10:38:12 +0800 Subject: [PATCH] [Optimize](Variant) make tablet schema more well-organized (#99) (#30922) --- be/src/olap/tablet_schema.cpp | 7 ++++++ be/src/vec/common/schema_util.cpp | 42 ++++++++++++++++++++----------- be/src/vec/json/path_in_data.h | 6 +++++ 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 3892c762c02239..79806c4703e58c 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -621,6 +621,13 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { if (!_column_path.empty()) { // CHECK_GT(_parent_col_unique_id, 0); _column_path.to_protobuf(column->mutable_column_path_info(), _parent_col_unique_id); + // Update unstable information for variant columns. Some of the fields in the tablet schema + // are irrelevant for variant sub-columns, but retaining them may lead to an excessive growth + // in the number of tablet schema cache entries. + if (_type == FieldType::OLAP_FIELD_TYPE_STRING) { + column->set_length(INT_MAX); + } + column->set_index_length(0); } for (auto& col : _sparse_cols) { ColumnPB* sparse_column = column->add_sparse_columns(); diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index e532662f6af9cc..b3fa9234cb3e0d 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -213,6 +212,7 @@ void get_column_by_type(const vectorized::DataTypePtr& data_type, const std::str } // size is not fixed when type is string or json if (WhichDataType(*data_type).is_string() || WhichDataType(*data_type).is_json()) { + column.set_length(INT_MAX); return; } if (WhichDataType(*data_type).is_simple()) { @@ -253,10 +253,10 @@ TabletColumn get_least_type_column(const TabletColumn& original, const DataTypeP return result_column; } -void update_least_schema_internal( - const std::unordered_map& subcolumns_types, - TabletSchemaSPtr& common_schema, bool update_sparse_column, int32_t variant_col_unique_id, - std::unordered_set* path_set = nullptr) { +void update_least_schema_internal(const std::map& subcolumns_types, + TabletSchemaSPtr& common_schema, bool update_sparse_column, + int32_t variant_col_unique_id, + std::set* path_set = nullptr) { PathsInData tuple_paths; DataTypes tuple_types; // Get the least common type for all paths. @@ -310,9 +310,9 @@ void update_least_schema_internal( void update_least_common_schema(const std::vector& schemas, TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, - std::unordered_set* path_set) { + std::set* path_set) { // Types of subcolumns by path from all tuples. - std::unordered_map subcolumns_types; + std::map subcolumns_types; for (const TabletSchemaSPtr& schema : schemas) { for (const TabletColumn& col : schema->columns()) { // Get subcolumns of this variant @@ -346,9 +346,9 @@ void update_least_common_schema(const std::vector& schemas, void update_least_sparse_column(const std::vector& schemas, TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, - const std::unordered_set& path_set) { + const std::set& path_set) { // Types of subcolumns by path from all tuples. - std::unordered_map subcolumns_types; + std::map subcolumns_types; for (const TabletSchemaSPtr& schema : schemas) { if (schema->field_index(variant_col_unique_id) == -1) { // maybe dropped @@ -450,7 +450,7 @@ Status get_least_common_schema(const std::vector& schemas, // schema 3: k (int) v:a (double) v:b (smallint) // result : k (int) v:a (double) v:b (bigint) v:c (string) v:d (string) for (int32_t unique_id : variant_column_unique_id) { - std::unordered_set path_set; + std::set path_set; // 1. cast extracted column to common type // path set is used to record the paths of those sparse columns that have been merged into the extracted columns, eg: v:b update_least_common_schema(schemas, output_schema, unique_id, &path_set); @@ -464,6 +464,7 @@ Status get_least_common_schema(const std::vector& schemas, return Status::DataQualityError("Reached max column size limit {}", config::variant_max_merged_tablet_schema_size); } + return Status::OK(); } @@ -592,9 +593,9 @@ void encode_variant_sparse_subcolumns(Block& block, const std::vector& vari } } -void _append_column(const TabletColumn& parent_variant, - const ColumnObject::Subcolumns::NodePtr& subcolumn, TabletSchemaSPtr& to_append, - bool is_sparse) { +static void _append_column(const TabletColumn& parent_variant, + const ColumnObject::Subcolumns::NodePtr& subcolumn, + TabletSchemaSPtr& to_append, bool is_sparse) { // If column already exist in original tablet schema, then we pick common type // and cast column to common type, and modify tablet column to common type, // otherwise it's a new column @@ -620,6 +621,17 @@ void _append_column(const TabletColumn& parent_variant, } } +// sort by paths in lexicographical order +static vectorized::ColumnObject::Subcolumns get_sorted_subcolumns( + const vectorized::ColumnObject::Subcolumns& subcolumns) { + // sort by paths in lexicographical order + vectorized::ColumnObject::Subcolumns sorted = subcolumns; + std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) { + return lhsItem->path < rhsItem->path; + }); + return sorted; +} + void rebuild_schema_and_block(const TabletSchemaSPtr& original, const std::vector& variant_positions, Block& flush_block, TabletSchemaSPtr& flush_schema) { @@ -638,7 +650,7 @@ void rebuild_schema_and_block(const TabletSchemaSPtr& original, CHECK(object_column.is_finalized()); std::shared_ptr root; // common extracted columns - for (const auto& entry : object_column.get_subcolumns()) { + for (const auto& entry : get_sorted_subcolumns(object_column.get_subcolumns())) { if (entry->path.empty()) { // root root = entry; @@ -652,7 +664,7 @@ void rebuild_schema_and_block(const TabletSchemaSPtr& original, } // add sparse columns to flush_schema - for (const auto& entry : object_column.get_sparse_subcolumns()) { + for (const auto& entry : get_sorted_subcolumns(object_column.get_sparse_subcolumns())) { _append_column(parent_column, entry, flush_schema, true); } diff --git a/be/src/vec/json/path_in_data.h b/be/src/vec/json/path_in_data.h index 267ab1fab3f761..aba700bb2f78ef 100644 --- a/be/src/vec/json/path_in_data.h +++ b/be/src/vec/json/path_in_data.h @@ -81,6 +81,12 @@ class PathInData { void to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const; void from_protobuf(const segment_v2::ColumnPathInfo& pb); + bool operator<(const PathInData& rhs) const { + return std::lexicographical_compare( + parts.begin(), parts.end(), rhs.parts.begin(), rhs.parts.end(), + [](const auto& a, const auto& b) { return a.key < b.key; }); + } + private: /// Creates full path from parts. void build_path(const Parts& other_parts);