Skip to content

Commit

Permalink
GH-40767: [C++][Parquet] Simplify PageWriter and ColumnWriter creation (
Browse files Browse the repository at this point in the history
#40768)

### Rationale for this change

Enhance style for creating page writer.

### What changes are included in this PR?

* create PageWriter with one calling
* extract `column_properties`

### Are these changes tested?

no need

### Are there any user-facing changes?

no

* GitHub Issue: #40767

Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
mapleFU authored Mar 26, 2024
1 parent 4b0f3fd commit 5e1a4fd
Showing 1 changed file with 36 additions and 65 deletions.
101 changes: 36 additions & 65 deletions cpp/src/parquet/file_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,39 +139,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
}

const int32_t column_ordinal = next_column_index_++;
const auto& path = col_meta->descr()->path();
auto meta_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
: nullptr;
auto data_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
: nullptr;
auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path) &&
properties_->statistics_enabled(path)
? page_index_builder_->GetColumnIndexBuilder(column_ordinal)
: nullptr;
auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path)
? page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
: nullptr;
auto codec_options = properties_->codec_options(path)
? properties_->codec_options(path).get()
: nullptr;

std::unique_ptr<PageWriter> pager;
if (!codec_options) {
pager = PageWriter::Open(sink_, properties_->compression(path), col_meta,
row_group_ordinal_, static_cast<int16_t>(column_ordinal),
properties_->memory_pool(), false, meta_encryptor,
data_encryptor, properties_->page_checksum_enabled(),
ci_builder, oi_builder, CodecOptions());
} else {
pager = PageWriter::Open(sink_, properties_->compression(path), col_meta,
row_group_ordinal_, static_cast<int16_t>(column_ordinal),
properties_->memory_pool(), false, meta_encryptor,
data_encryptor, properties_->page_checksum_enabled(),
ci_builder, oi_builder, *codec_options);
}
column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
column_writers_[0] = CreateColumnWriterForColumn(col_meta, column_ordinal);
return column_writers_[0].get();
}

Expand Down Expand Up @@ -288,45 +256,48 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
}

void InitColumns() {
for (int i = 0; i < num_columns(); i++) {
for (int i = 0; i < RowGroupSerializer::num_columns(); i++) {
auto col_meta = metadata_->NextColumnChunk();
const auto& path = col_meta->descr()->path();
const int32_t column_ordinal = next_column_index_++;
auto meta_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
column_writers_.push_back(CreateColumnWriterForColumn(col_meta, column_ordinal));
}
}

std::shared_ptr<ColumnWriter> CreateColumnWriterForColumn(
ColumnChunkMetaDataBuilder* col_meta, int32_t column_ordinal) const {
const auto& path = col_meta->descr()->path();
const ColumnProperties& column_properties = properties_->column_properties(path);
auto meta_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
: nullptr;
auto data_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
: nullptr;
auto ci_builder = page_index_builder_ && column_properties.page_index_enabled()
? page_index_builder_->GetColumnIndexBuilder(column_ordinal)
: nullptr;
auto data_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
auto oi_builder = page_index_builder_ && column_properties.page_index_enabled()
? page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
: nullptr;
auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path)
? page_index_builder_->GetColumnIndexBuilder(column_ordinal)
: nullptr;
auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path)
? page_index_builder_->GetOffsetIndexBuilder(column_ordinal)
: nullptr;
auto codec_options = properties_->codec_options(path)
? (properties_->codec_options(path)).get()
: nullptr;

std::unique_ptr<PageWriter> pager;
if (!codec_options) {
pager = PageWriter::Open(
sink_, properties_->compression(path), col_meta, row_group_ordinal_,
static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
buffered_row_group_, meta_encryptor, data_encryptor,
properties_->page_checksum_enabled(), ci_builder, oi_builder, CodecOptions());
} else {
pager = PageWriter::Open(
sink_, properties_->compression(path), col_meta, row_group_ordinal_,
static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
buffered_row_group_, meta_encryptor, data_encryptor,
properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options);
}
column_writers_.push_back(
ColumnWriter::Make(col_meta, std::move(pager), properties_));

const CodecOptions* codec_options = column_properties.codec_options()
? column_properties.codec_options().get()
: nullptr;
CodecOptions default_codec_options;
if (!codec_options) {
codec_options = &default_codec_options;
}
DCHECK_NE(nullptr, codec_options);
std::unique_ptr<PageWriter> pager = PageWriter::Open(
sink_, column_properties.compression(), col_meta, row_group_ordinal_,
static_cast<int16_t>(column_ordinal), properties_->memory_pool(),
buffered_row_group_, meta_encryptor, data_encryptor,
properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options);
return ColumnWriter::Make(col_meta, std::move(pager), properties_);
}

// If buffered_row_group_ is false, only column_writers_[0] is used as current writer.
// If buffered_row_group_ is true, multiple column writers are used.
std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
};

Expand Down

0 comments on commit 5e1a4fd

Please sign in to comment.