Skip to content

Commit

Permalink
Merge pull request #156 from Enmk/performance_Type_IsEqual
Browse files Browse the repository at this point in the history
Better performance for Type::IsEqual()
  • Loading branch information
Enmk authored Feb 22, 2022
2 parents bf2ad17 + 8e3970c commit f1f32f1
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 2 deletions.
63 changes: 62 additions & 1 deletion clickhouse/types/types.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
#include "types.h"

#include <cityhash/city.h>

#include <stdexcept>

namespace clickhouse {

Type::Type(const Code code) : code_(code) {}
Type::Type(const Code code)
: code_(code)
, type_unique_id_(0)
{}

std::string Type::GetName() const {
switch (code_) {
Expand Down Expand Up @@ -72,6 +77,62 @@ std::string Type::GetName() const {
return std::string();
}

uint64_t Type::GetTypeUniqueId() const {
// Helper method to optimize equality checks of types with Type::IsEqual(),
// base invariant: types with same names produce same unique id (and hence considered equal).
// As an optimization, full type name is constructed at most once, and only for complex types.
switch (code_) {
case Void:
case Int8:
case Int16:
case Int32:
case Int64:
case Int128:
case UInt8:
case UInt16:
case UInt32:
case UInt64:
case UUID:
case Float32:
case Float64:
case String:
case IPv4:
case IPv6:
case Date:
// For simple types, unique ID is the same as Type::Code
return code_;

case FixedString:
case DateTime:
case DateTime64:
case Array:
case Nullable:
case Tuple:
case Enum8:
case Enum16:
case Decimal:
case Decimal32:
case Decimal64:
case Decimal128:
case LowCardinality: {
// For complex types, exact unique ID depends on nested types and/or parameters,
// the easiest way is to lazy-compute unique ID from name once.
// Here we do not care if multiple threads are computing value simultaneosly since it is both:
// 1. going to be the same
// 2. going to be stored atomically

if (type_unique_id_ == 0) {
const auto name = GetName();
type_unique_id_ = CityHash64WithSeed(name.c_str(), name.size(), code_);
}

return type_unique_id_;
}
}
assert(false);
return 0;
}

TypeRef Type::CreateArray(TypeRef item_type) {
return TypeRef(new ArrayType(item_type));
}
Expand Down
11 changes: 10 additions & 1 deletion clickhouse/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "absl/numeric/int128.h"

#include <atomic>
#include <map>
#include <memory>
#include <string>
Expand Down Expand Up @@ -73,7 +74,12 @@ class Type {
std::string GetName() const;

/// Is given type same as current one.
bool IsEqual(const Type& other) const { return this->GetName() == other.GetName(); }
bool IsEqual(const Type& other) const {
return this == &other
// GetTypeUniqueId() is relatively heavy, so avoid calling it when comparing obviously different types.
|| (this->GetCode() == other.GetCode() && this->GetTypeUniqueId() == other.GetTypeUniqueId());
}

bool IsEqual(const TypeRef& other) const { return IsEqual(*other); }

public:
Expand Down Expand Up @@ -113,7 +119,10 @@ class Type {
static TypeRef CreateLowCardinality(TypeRef item_type);

private:
uint64_t GetTypeUniqueId() const;

const Code code_;
mutable std::atomic<uint64_t> type_unique_id_;
};

inline bool operator==(const Type & left, const Type & right) {
Expand Down
56 changes: 56 additions & 0 deletions ut/types_ut.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include <clickhouse/types/types.h>
#include <clickhouse/columns/factory.h>

#include <gtest/gtest.h>

using namespace clickhouse;
Expand Down Expand Up @@ -70,3 +72,57 @@ TEST(TypesCase, EnumTypesEmpty) {
TEST(TypesCase, DecimalTypes) {
// TODO: implement this test.
}

TEST(TypesCase, IsEqual) {
const std::string type_names[] = {
"UInt8",
"Int8",
"UInt128",
"String",
"FixedString(0)",
"FixedString(10000)",
"DateTime('UTC')",
"DateTime64(3, 'UTC')",
"Decimal(9,3)",
"Decimal(18,3)",
"Enum8()",
"Enum16()",
"Enum8('ONE' = 1)",
"Enum8('ONE' = 1, 'TWO' = 2)",
"Enum16('ONE' = 1, 'TWO' = 2, 'THREE' = 3, 'FOUR' = 4)",
"Nullable(FixedString(10000))",
"Nullable(LowCardinality(FixedString(10000)))",
"Array(Int8)",
"Array(UInt8)",
"Array(String)",
"Array(Nullable(LowCardinality(FixedString(10000))))",
"Array(Enum8('ONE' = 1, 'TWO' = 2))"
"Tuple(String, Int8, Date, DateTime)",
"Nullable(Tuple(String, Int8, Date, DateTime))",
"Array(Nullable(Tuple(String, Int8, Date, DateTime)))",
"Array(Array(Nullable(Tuple(String, Int8, Date, DateTime))))",
"Array(Array(Array(Nullable(Tuple(String, Int8, Date, DateTime)))))",
"Array(Array(Array(Array(Nullable(Tuple(String, Int8, Date, DateTime('UTC')))))))"
"Array(Array(Array(Array(Nullable(Tuple(String, Int8, Date, DateTime('UTC'), Tuple(LowCardinality(String), Enum8('READ'=1, 'WRITE'=0))))))))",
};

// Check that Type::IsEqual returns true only if:
// - same Type instance
// - same Type layout (matching outer type with all nested types and/or parameters)
for (const auto & type_name : type_names) {
SCOPED_TRACE(type_name);
const auto type = clickhouse::CreateColumnByType(type_name)->Type();

// Should be equal to itself
EXPECT_TRUE(type->IsEqual(type));
EXPECT_TRUE(type->IsEqual(*type));

for (const auto & other_type_name : type_names) {
const auto other_type = clickhouse::CreateColumnByType(other_type_name)->Type();

const auto should_be_equal = type_name == other_type_name;
EXPECT_EQ(should_be_equal, type->IsEqual(other_type))
<< "For types: " << type_name << " and " << other_type_name;
}
}
}

0 comments on commit f1f32f1

Please sign in to comment.