Skip to content

Commit

Permalink
Lazy materialization of RowType::parameters (facebookincubator#10144)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#10144

As part of optimization for reading tables with large number of columns
(> 7000), the materialization of `RowType::parameters` of table/file schema
costs a non-trivial amount of CPU and the parameters are never used.  Making it
lazy saves large amount of CPU for these tables.

As a result of laziness, `RowType` copy constructor is deleted.  This helps us
spotting some places in code base where we copy the object unnessarily and this
change cleans them up.

Reviewed By: oerling

Differential Revision: D58417885
  • Loading branch information
Yuhta authored and facebook-github-bot committed Jun 12, 2024
1 parent 9cc123f commit 5c05b57
Show file tree
Hide file tree
Showing 12 changed files with 64 additions and 15 deletions.
2 changes: 1 addition & 1 deletion velox/dwio/parquet/writer/Writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ std::shared_ptr<::arrow::Field> updateFieldNameRecursive(
const Type& type,
const std::string& name = "") {
if (type.isRow()) {
auto rowType = type.asRow();
auto& rowType = type.asRow();
auto newField = field->WithName(name);
auto structType =
std::dynamic_pointer_cast<::arrow::StructType>(newField->type());
Expand Down
2 changes: 1 addition & 1 deletion velox/examples/OpaqueType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ VectorPtr evaluate(
std::vector<VectorPtr> result{nullptr};
SelectivityVector rows{rowVector->size()};

auto rowType = rowVector->type()->as<TypeKind::ROW>();
auto& rowType = rowVector->type()->as<TypeKind::ROW>();

auto fieldAccessExprNode1 = std::make_shared<core::FieldAccessTypedExpr>(
rowType.findChild(argName1), argName1);
Expand Down
2 changes: 1 addition & 1 deletion velox/exec/benchmarks/FilterProjectBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ class FilterProjectBenchmark : public VectorTestBase {
bool shareStringDicts,
bool stringNulls) {
assert(!rows.empty());
auto type = rows[0]->type()->as<TypeKind::ROW>();
auto& type = rows[0]->type()->as<TypeKind::ROW>();
auto numColumns = rows[0]->type()->size();
for (auto column = 0; column < numColumns; ++column) {
if (type.childAt(column)->kind() == TypeKind::VARCHAR) {
Expand Down
2 changes: 1 addition & 1 deletion velox/exec/tests/MergeTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class MergeTest : public OperatorTestBase {
const std::vector<RowVectorPtr>& inputVectors,
const std::string& key1,
const std::string& key2) {
auto rowType = inputVectors[0]->type()->asRow();
auto& rowType = inputVectors[0]->type()->asRow();
auto sortingKeys = {rowType.getChildIdx(key1), rowType.getChildIdx(key2)};

std::vector<core::SortOrder> sortOrders = {
Expand Down
2 changes: 1 addition & 1 deletion velox/exec/tests/OrderByTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class OrderByTest : public OperatorTestBase {
const std::vector<RowVectorPtr>& input,
const std::string& key1,
const std::string& key2) {
auto rowType = input[0]->type()->asRow();
auto& rowType = input[0]->type()->asRow();
auto keyIndices = {rowType.getChildIdx(key1), rowType.getChildIdx(key2)};

std::vector<core::SortOrder> sortOrders = {
Expand Down
2 changes: 1 addition & 1 deletion velox/exec/tests/TopNTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class TopNTest : public OperatorTestBase {
const std::string& key1,
const std::string& key2,
int32_t limit) {
auto rowType = input[0]->type()->asRow();
auto& rowType = input[0]->type()->asRow();
auto keyIndices = {rowType.getChildIdx(key1), rowType.getChildIdx(key2)};

auto sortOrderSqls = getSortOrderSqls();
Expand Down
4 changes: 2 additions & 2 deletions velox/exec/tests/utils/QueryAssertions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ std::vector<MaterializedRow> materialize(const RowVectorPtr& vector) {
std::vector<MaterializedRow> rows;
rows.reserve(size);

auto rowType = vector->type()->as<TypeKind::ROW>();
auto& rowType = vector->type()->as<TypeKind::ROW>();

for (size_t i = 0; i < size; ++i) {
auto numColumns = rowType.size();
Expand All @@ -896,7 +896,7 @@ void DuckDbQueryRunner::createTable(
auto query = fmt::format("DROP TABLE IF EXISTS {}", name);
execute(query);

auto rowType = data[0]->type()->as<TypeKind::ROW>();
auto& rowType = data[0]->type()->as<TypeKind::ROW>();
::duckdb::Connection con(db_);
auto sql = duckdb::makeCreateTableSql(name, rowType);
auto res = con.Query(sql);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ std::vector<VectorPtr> extractArgColumns(
const core::CallTypedExprPtr& aggregateExpr,
const RowVectorPtr& input,
memory::MemoryPool* pool) {
auto type = input->type()->asRow();
auto& type = input->type()->asRow();
std::vector<VectorPtr> columns;
for (const auto& arg : aggregateExpr->inputs()) {
if (auto field = core::TypedExprs::asFieldAccess(arg)) {
Expand Down
15 changes: 12 additions & 3 deletions velox/type/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,7 @@ std::string namesAndTypesToString(
} // namespace

RowType::RowType(std::vector<std::string>&& names, std::vector<TypePtr>&& types)
: names_{std::move(names)},
children_{std::move(types)},
parameters_{createTypeParameters(children_)} {
: names_{std::move(names)}, children_{std::move(types)} {
VELOX_CHECK_EQ(
names_.size(),
children_.size(),
Expand All @@ -356,6 +354,17 @@ RowType::RowType(std::vector<std::string>&& names, std::vector<TypePtr>&& types)
}
}

RowType::~RowType() {
if (auto* parameters = parameters_.load()) {
delete parameters;
}
}

std::unique_ptr<std::vector<TypeParameter>> RowType::makeParameters() const {
return std::make_unique<std::vector<TypeParameter>>(
createTypeParameters(children_));
}

uint32_t RowType::size() const {
return children_.size();
}
Expand Down
17 changes: 15 additions & 2 deletions velox/type/Type.h
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,8 @@ class RowType : public TypeBase<TypeKind::ROW> {
std::vector<std::string>&& names,
std::vector<std::shared_ptr<const Type>>&& types);

~RowType() override;

uint32_t size() const override;

const std::shared_ptr<const Type>& childAt(uint32_t idx) const override;
Expand Down Expand Up @@ -959,13 +961,24 @@ class RowType : public TypeBase<TypeKind::ROW> {
}

const std::vector<TypeParameter>& parameters() const override {
return parameters_;
auto* parameters = parameters_.load();
if (FOLLY_UNLIKELY(!parameters)) {
parameters = makeParameters().release();
std::vector<TypeParameter>* oldParameters = nullptr;
if (!parameters_.compare_exchange_strong(oldParameters, parameters)) {
delete parameters;
parameters = oldParameters;
}
}
return *parameters;
}

private:
std::unique_ptr<std::vector<TypeParameter>> makeParameters() const;

const std::vector<std::string> names_;
const std::vector<std::shared_ptr<const Type>> children_;
const std::vector<TypeParameter> parameters_;
mutable std::atomic<std::vector<TypeParameter>*> parameters_{nullptr};
};

using RowTypePtr = std::shared_ptr<const RowType>;
Expand Down
27 changes: 27 additions & 0 deletions velox/type/tests/TypeTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,33 @@ TEST(TypeTest, emptyRow) {
testTypeSerde(row);
}

TEST(TypeTest, rowParametersMultiThreaded) {
std::vector<std::string> names;
std::vector<TypePtr> types;
for (int i = 0; i < 20'000; ++i) {
auto name = fmt::format("c{}", i);
names.push_back(name);
types.push_back(ROW({name}, {BIGINT()}));
}
auto type = ROW(std::move(names), std::move(types));
constexpr int kNumThreads = 72;
const std::vector<TypeParameter>* parameters[kNumThreads];
std::vector<std::thread> threads;
for (int i = 0; i < kNumThreads; ++i) {
threads.emplace_back([&, i] { parameters[i] = &type->parameters(); });
}
for (auto& thread : threads) {
thread.join();
}
for (int i = 1; i < kNumThreads; ++i) {
ASSERT_TRUE(parameters[i] == parameters[0]);
}
ASSERT_EQ(parameters[0]->size(), type->size());
for (int i = 0; i < parameters[0]->size(); ++i) {
ASSERT_TRUE((*parameters[0])[i].type.get() == type->childAt(i).get());
}
}

class Foo {};
class Bar {};

Expand Down
2 changes: 1 addition & 1 deletion velox/vector/BaseVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ VectorPtr BaseVector::createInternal(
switch (kind) {
case TypeKind::ROW: {
std::vector<VectorPtr> children;
auto rowType = type->as<TypeKind::ROW>();
auto& rowType = type->as<TypeKind::ROW>();
// Children are reserved the parent size and accessible for those rows.
for (int32_t i = 0; i < rowType.size(); ++i) {
children.push_back(create(rowType.childAt(i), size, pool));
Expand Down

0 comments on commit 5c05b57

Please sign in to comment.