Skip to content

Commit

Permalink
[Optimize](Variant) make tablet schema more well-organized (apache#99) (
Browse files Browse the repository at this point in the history
  • Loading branch information
eldenmoon authored Feb 18, 2024
1 parent ee6d24a commit ff30209
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 15 deletions.
7 changes: 7 additions & 0 deletions be/src/olap/tablet_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,13 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const {
if (!_column_path.empty()) {
// CHECK_GT(_parent_col_unique_id, 0);
_column_path.to_protobuf(column->mutable_column_path_info(), _parent_col_unique_id);
// Normalize the fields that are unstable for variant sub-columns. Some tablet schema fields are
// irrelevant for variant sub-columns, but keeping their varying values may lead to excessive
// growth in the number of tablet schema cache entries, so they are overwritten with fixed values.
if (_type == FieldType::OLAP_FIELD_TYPE_STRING) {
column->set_length(INT_MAX);
}
column->set_index_length(0);
}
for (auto& col : _sparse_cols) {
ColumnPB* sparse_column = column->add_sparse_columns();
Expand Down
42 changes: 27 additions & 15 deletions be/src/vec/common/schema_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
#include <memory>
#include <ostream>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -213,6 +212,7 @@ void get_column_by_type(const vectorized::DataTypePtr& data_type, const std::str
}
// size is not fixed when type is string or json
if (WhichDataType(*data_type).is_string() || WhichDataType(*data_type).is_json()) {
column.set_length(INT_MAX);
return;
}
if (WhichDataType(*data_type).is_simple()) {
Expand Down Expand Up @@ -253,10 +253,10 @@ TabletColumn get_least_type_column(const TabletColumn& original, const DataTypeP
return result_column;
}

void update_least_schema_internal(
const std::unordered_map<PathInData, DataTypes, PathInData::Hash>& subcolumns_types,
TabletSchemaSPtr& common_schema, bool update_sparse_column, int32_t variant_col_unique_id,
std::unordered_set<PathInData, PathInData::Hash>* path_set = nullptr) {
void update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
TabletSchemaSPtr& common_schema, bool update_sparse_column,
int32_t variant_col_unique_id,
std::set<PathInData>* path_set = nullptr) {
PathsInData tuple_paths;
DataTypes tuple_types;
// Get the least common type for all paths.
Expand Down Expand Up @@ -310,9 +310,9 @@ void update_least_schema_internal(

void update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
std::unordered_set<PathInData, PathInData::Hash>* path_set) {
std::set<PathInData>* path_set) {
// Types of subcolumns by path from all tuples.
std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types;
std::map<PathInData, DataTypes> subcolumns_types;
for (const TabletSchemaSPtr& schema : schemas) {
for (const TabletColumn& col : schema->columns()) {
// Get subcolumns of this variant
Expand Down Expand Up @@ -346,9 +346,9 @@ void update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,

void update_least_sparse_column(const std::vector<TabletSchemaSPtr>& schemas,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
const std::unordered_set<PathInData, PathInData::Hash>& path_set) {
const std::set<PathInData>& path_set) {
// Types of subcolumns by path from all tuples.
std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types;
std::map<PathInData, DataTypes> subcolumns_types;
for (const TabletSchemaSPtr& schema : schemas) {
if (schema->field_index(variant_col_unique_id) == -1) {
// maybe dropped
Expand Down Expand Up @@ -450,7 +450,7 @@ Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
// schema 3: k (int) v:a (double) v:b (smallint)
// result : k (int) v:a (double) v:b (bigint) v:c (string) v:d (string)
for (int32_t unique_id : variant_column_unique_id) {
std::unordered_set<PathInData, PathInData::Hash> path_set;
std::set<PathInData> path_set;
// 1. cast extracted column to common type
// The path set records the paths of sparse columns that have been merged into the extracted columns, e.g. v:b
update_least_common_schema(schemas, output_schema, unique_id, &path_set);
Expand All @@ -464,6 +464,7 @@ Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
return Status::DataQualityError("Reached max column size limit {}",
config::variant_max_merged_tablet_schema_size);
}

return Status::OK();
}

Expand Down Expand Up @@ -592,9 +593,9 @@ void encode_variant_sparse_subcolumns(Block& block, const std::vector<int>& vari
}
}

void _append_column(const TabletColumn& parent_variant,
const ColumnObject::Subcolumns::NodePtr& subcolumn, TabletSchemaSPtr& to_append,
bool is_sparse) {
static void _append_column(const TabletColumn& parent_variant,
const ColumnObject::Subcolumns::NodePtr& subcolumn,
TabletSchemaSPtr& to_append, bool is_sparse) {
// If column already exist in original tablet schema, then we pick common type
// and cast column to common type, and modify tablet column to common type,
// otherwise it's a new column
Expand All @@ -620,6 +621,17 @@ void _append_column(const TabletColumn& parent_variant,
}
}

// Returns a sorted copy of `subcolumns`: entries are arranged in ascending
// order of their `path`, as defined by the path type's operator<. Sorting is
// done on the copy, so the argument itself is not reordered.
static vectorized::ColumnObject::Subcolumns get_sorted_subcolumns(
        const vectorized::ColumnObject::Subcolumns& subcolumns) {
    // Entries are dereferenced with `->`, so compare the paths they point at.
    const auto path_less = [](const auto& left, const auto& right) {
        return left->path < right->path;
    };
    vectorized::ColumnObject::Subcolumns ordered(subcolumns);
    std::sort(ordered.begin(), ordered.end(), path_less);
    return ordered;
}

void rebuild_schema_and_block(const TabletSchemaSPtr& original,
const std::vector<int>& variant_positions, Block& flush_block,
TabletSchemaSPtr& flush_schema) {
Expand All @@ -638,7 +650,7 @@ void rebuild_schema_and_block(const TabletSchemaSPtr& original,
CHECK(object_column.is_finalized());
std::shared_ptr<vectorized::ColumnObject::Subcolumns::Node> root;
// common extracted columns
for (const auto& entry : object_column.get_subcolumns()) {
for (const auto& entry : get_sorted_subcolumns(object_column.get_subcolumns())) {
if (entry->path.empty()) {
// root
root = entry;
Expand All @@ -652,7 +664,7 @@ void rebuild_schema_and_block(const TabletSchemaSPtr& original,
}

// add sparse columns to flush_schema
for (const auto& entry : object_column.get_sparse_subcolumns()) {
for (const auto& entry : get_sorted_subcolumns(object_column.get_sparse_subcolumns())) {
_append_column(parent_column, entry, flush_schema, true);
}

Expand Down
6 changes: 6 additions & 0 deletions be/src/vec/json/path_in_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ class PathInData {
void to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const;
void from_protobuf(const segment_v2::ColumnPathInfo& pb);

// Orders two paths by comparing their `parts` sequences lexicographically.
// Only the `key` of each part is consulted; any other fields a part may
// carry do not take part in the comparison.
bool operator<(const PathInData& rhs) const {
    const auto key_less = [](const auto& left_part, const auto& right_part) {
        return left_part.key < right_part.key;
    };
    return std::lexicographical_compare(parts.begin(), parts.end(), rhs.parts.begin(),
                                        rhs.parts.end(), key_less);
}

private:
/// Creates full path from parts.
void build_path(const Parts& other_parts);
Expand Down

0 comments on commit ff30209

Please sign in to comment.