From 450410a3f9ca81bbcf6a6049443331fc6ec23941 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 3 Dec 2024 19:02:35 +0100 Subject: [PATCH 1/2] GH-41667: [C++][Parquet] Refuse writing non-nullable column that contains nulls A non-nullable column that contains nulls would result in an invalid Parquet file. --- .../parquet/arrow/arrow_reader_writer_test.cc | 21 +++++++++++++++++++ cpp/src/parquet/column_writer.cc | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f8e639176aba3..856c032c3588a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3349,6 +3349,27 @@ TEST(TestArrowWrite, CheckChunkSize) { WriteTable(*table, ::arrow::default_memory_pool(), sink, chunk_size)); } +void CheckWritingNonNullableColumnWithNulls(std::shared_ptr<::arrow::Field> field, + std::string json_batch) { + ARROW_SCOPED_TRACE("field = ", field, ", json_batch = ", json_batch); + auto schema = ::arrow::schema({field}); + auto table = ::arrow::TableFromJSON(schema, {json_batch}); + auto sink = CreateOutputStream(); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("is declared non-nullable but contains nulls"), + WriteTable(*table, ::arrow::default_memory_pool(), sink)); +} + +TEST(TestArrowWrite, InvalidSchema) { + // GH-41667: nulls in non-nullable column + CheckWritingNonNullableColumnWithNulls( + ::arrow::field("a", ::arrow::int32(), /*nullable=*/false), + R"([{"a": 456}, {"a": null}])"); + CheckWritingNonNullableColumnWithNulls( + ::arrow::field("a", ::arrow::utf8(), /*nullable=*/false), + R"([{"a": "foo"}, {"a": null}])"); +} + void DoNestedValidate(const std::shared_ptr<::arrow::DataType>& inner_type, const std::shared_ptr<::arrow::Field>& outer_field, const std::shared_ptr& buffer, diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 94c301f918544..d3e0fdfa811c0 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1301,6 +1301,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< bool single_nullable_element = (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) && leaf_field_nullable; + if (!leaf_field_nullable && leaf_array.null_count() != 0) { + return Status::Invalid("Column '", descr_->name(), + "' is declared non-nullable but contains nulls"); + } bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element; if (maybe_parent_nulls) { ARROW_ASSIGN_OR_RAISE( From bb71157c032057a7d916fc19075db89bdc5e67d1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 4 Dec 2024 10:16:13 +0100 Subject: [PATCH 2/2] Try fix for JNI test failure --- .../java/org/apache/arrow/dataset/TestAllTypes.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java index eb73663191414..935edcd8b3585 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java @@ -95,11 +95,9 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { // DenseUnion List childFields = new ArrayList<>(); childFields.add( - new Field( - "int-child", new FieldType(false, new ArrowType.Int(32, true), null, null), null)); + new Field("int-child", FieldType.notNullable(new ArrowType.Int(32, true)), null)); Field structField = - new Field( - "struct", new FieldType(true, ArrowType.Struct.INSTANCE, null, null), childFields); + new Field("struct", FieldType.nullable(ArrowType.Struct.INSTANCE), childFields); Field[] fields = new Field[] { Field.nullablePrimitive("null", ArrowType.Null.INSTANCE), @@ -239,7 +237,11 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { largeListWriter.integer().writeInt(1); largeListWriter.endList(); - ((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class).set(1, 1); + var intChildVector = + ((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class); + // set a non-null value at index 0 since the field is not nullable + intChildVector.set(0, 0); + intChildVector.set(1, 1); return root; }