From cb4f13f579ea97891ac5df3e276ed82e90950fb9 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Tue, 17 Dec 2024 16:42:35 +0100 Subject: [PATCH 1/3] deprecate unused parameter of `NewRowGroup()` --- cpp/src/arrow/dataset/file_parquet_test.cc | 3 +-- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 10 +++++----- cpp/src/parquet/arrow/writer.cc | 4 ++-- cpp/src/parquet/arrow/writer.h | 9 +++++++-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 2c05dcd9be459..536fcdb21c107 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -85,7 +85,6 @@ class ParquetFormatHelper { static Status WriteRecordBatch(const RecordBatch& batch, parquet::arrow::FileWriter* writer) { auto schema = batch.schema(); - auto size = batch.num_rows(); if (!schema->Equals(*writer->schema(), false)) { return Status::Invalid("RecordBatch schema does not match this writer's. 
batch:'", @@ -93,7 +92,7 @@ class ParquetFormatHelper { "'"); } - RETURN_NOT_OK(writer->NewRowGroup(size)); + RETURN_NOT_OK(writer->NewRowGroup()); for (int i = 0; i < batch.num_columns(); i++) { RETURN_NOT_OK(writer->WriteColumnChunk(*batch.column(i))); } diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 856c032c3588a..cedcebbfb6e33 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -739,7 +739,7 @@ class ParquetIOTestBase : public ::testing::Test { ASSERT_OK_NO_THROW(FileWriter::Make(::arrow::default_memory_pool(), MakeWriter(schema), arrow_schema, default_arrow_writer_properties(), &writer)); - ASSERT_OK_NO_THROW(writer->NewRowGroup(values->length())); + ASSERT_OK_NO_THROW(writer->NewRowGroup()); ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*values)); ASSERT_OK_NO_THROW(writer->Close()); // writer->Close() should be idempotent @@ -1053,7 +1053,7 @@ TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) { this->MakeWriter(schema), arrow_schema, default_arrow_writer_properties(), &writer)); for (int i = 0; i < 4; i++) { - ASSERT_OK_NO_THROW(writer->NewRowGroup(chunk_size)); + ASSERT_OK_NO_THROW(writer->NewRowGroup()); std::shared_ptr sliced_array = values->Slice(i * chunk_size, chunk_size); ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*sliced_array)); } @@ -1126,7 +1126,7 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { this->MakeWriter(schema), arrow_schema, default_arrow_writer_properties(), &writer)); for (int i = 0; i < 4; i++) { - ASSERT_OK_NO_THROW(writer->NewRowGroup(chunk_size)); + ASSERT_OK_NO_THROW(writer->NewRowGroup()); std::shared_ptr sliced_array = values->Slice(i * chunk_size, chunk_size); ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*sliced_array)); } @@ -5149,7 +5149,7 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { ::arrow::default_memory_pool(), 
ParquetFileWriter::Open(this->sink_, schema_node, writer_properties), arrow_schema, default_arrow_writer_properties(), &writer)); - ASSERT_OK_NO_THROW(writer->NewRowGroup(values->length())); + ASSERT_OK_NO_THROW(writer->NewRowGroup()); ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*values)); ASSERT_OK_NO_THROW(writer->Close()); } @@ -5481,7 +5481,7 @@ TEST(TestArrowReadWrite, OperationsOnClosedWriter) { // Operations on closed writer are invalid ASSERT_OK(writer->Close()); - ASSERT_RAISES(Invalid, writer->NewRowGroup(1)); + ASSERT_RAISES(Invalid, writer->NewRowGroup()); ASSERT_RAISES(Invalid, writer->WriteColumnChunk(table->column(0), 0, 1)); ASSERT_RAISES(Invalid, writer->NewBufferedRowGroup()); ASSERT_OK_AND_ASSIGN(auto record_batch, table->CombineChunksToBatch()); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 463713df1b1aa..c6d86648c1d63 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -305,7 +305,7 @@ class FileWriterImpl : public FileWriter { default_arrow_reader_properties(), &schema_manifest_); } - Status NewRowGroup(int64_t chunk_size) override { + Status NewRowGroup() override { RETURN_NOT_OK(CheckClosed()); if (row_group_writer_ != nullptr) { PARQUET_CATCH_NOT_OK(row_group_writer_->Close()); @@ -379,7 +379,7 @@ class FileWriterImpl : public FileWriter { } auto WriteRowGroup = [&](int64_t offset, int64_t size) { - RETURN_NOT_OK(NewRowGroup(size)); + RETURN_NOT_OK(NewRowGroup()); for (int i = 0; i < table.num_columns(); i++) { RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size)); } diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index 4e1ddafd9a082..e36b8f252c750 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -87,9 +87,14 @@ class PARQUET_EXPORT FileWriter { /// \brief Start a new row group. /// /// Returns an error if not all columns have been written. 
+ virtual ::arrow::Status NewRowGroup() = 0; + + /// \brief Start a new row group. /// - /// \param chunk_size the number of rows in the next row group. - virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0; + /// \deprecated Deprecated in 19.0.0. + ARROW_DEPRECATED( + "Deprecated in 19.0.0. Use NewRowGroup() without the `chunk_size` argument.") + virtual ::arrow::Status NewRowGroup(int64_t chunk_size) { return NewRowGroup(); } /// \brief Write ColumnChunk in row group using an array. virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0; From 49bb54a34df1b82fe8b4929395068d7ddcb15945 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Tue, 17 Dec 2024 16:57:53 +0100 Subject: [PATCH 2/3] remove argument usage from c_glib and pyarrow --- c_glib/parquet-glib/arrow-file-writer.cpp | 2 +- python/pyarrow/_parquet.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index 2b8e2bdeac026..d46a41c98a42c 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -590,7 +590,7 @@ gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); return garrow::check(error, - parquet_arrow_file_writer->NewRowGroup(chunk_size), + parquet_arrow_file_writer->NewRowGroup(), "[parquet][arrow][file-writer][new-row-group]"); } diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index d6aebd8284f4a..71e93ce0a4c51 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -556,7 +556,7 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: const shared_ptr[ArrowWriterProperties]& arrow_properties) CStatus WriteTable(const CTable& table, int64_t chunk_size) - CStatus NewRowGroup(int64_t chunk_size) + CStatus NewRowGroup() CStatus Close() CStatus 
AddKeyValueMetadata(const shared_ptr[const CKeyValueMetadata]& key_value_metadata) From 789d57045e97037406d04d964365b0c2014d1958 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 18 Dec 2024 09:13:38 +0100 Subject: [PATCH 3/3] address review comments --- c_glib/parquet-glib/arrow-file-writer.cpp | 5 +---- python/pyarrow/_parquet.pxd | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index d46a41c98a42c..738fb4fd824c8 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -574,7 +574,6 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, /** * gparquet_arrow_file_writer_new_row_group: * @writer: A #GParquetArrowFileWriter. - * @chunk_size: The max number of rows in a row group. * @error: (nullable): Return location for a #GError or %NULL. * * Start a new row group. @@ -584,9 +583,7 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, * Since: 18.0.0 */ gboolean -gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, - gsize chunk_size, - GError **error) +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); return garrow::check(error, diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 71e93ce0a4c51..dba25e4a3f18a 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -556,6 +556,7 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: const shared_ptr[ArrowWriterProperties]& arrow_properties) CStatus WriteTable(const CTable& table, int64_t chunk_size) + CStatus NewRowGroup(int64_t chunk_size) CStatus NewRowGroup() CStatus Close() CStatus AddKeyValueMetadata(const shared_ptr[const CKeyValueMetadata]& key_value_metadata)