From 4d591e58edf70d220c44306c2f7db36927784c9f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 13:45:29 +0200 Subject: [PATCH 01/16] Stricter equality check --- cpp/src/arrow/extension/json.cc | 25 ++++++++++++---------- cpp/src/arrow/extension/json.h | 4 +++- cpp/src/arrow/extension/json_test.cc | 14 ++++++++++++ cpp/src/parquet/arrow/arrow_schema_test.cc | 6 +++--- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index d793233c2b573..5e396738e62df 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -28,17 +28,13 @@ namespace arrow::extension { bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { - return other.extension_name() == this->extension_name(); + return other.extension_name() == this->extension_name() && + other.storage_type()->Equals(storage_type_); } Result> JsonExtensionType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { - if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && - storage_type->id() != Type::LARGE_STRING) { - return Status::Invalid("Invalid storage type for JsonExtensionType: ", - storage_type->ToString()); - } - return std::make_shared(storage_type); + return JsonExtensionType::Make(storage_type); } std::string JsonExtensionType::Serialize() const { return ""; } @@ -51,11 +47,18 @@ std::shared_ptr JsonExtensionType::MakeArray( return std::make_shared(data); } -std::shared_ptr json(const std::shared_ptr storage_type) { - ARROW_CHECK(storage_type->id() != Type::STRING || - storage_type->id() != Type::STRING_VIEW || - storage_type->id() != Type::LARGE_STRING); +Result> JsonExtensionType::Make( + const std::shared_ptr& storage_type) { + if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && + storage_type->id() != Type::LARGE_STRING) { + return Status::Invalid("Invalid storage type for JsonExtensionType: ", + storage_type->ToString()); + } return std::make_shared(storage_type); } +std::shared_ptr json(const std::shared_ptr& storage_type) { + return JsonExtensionType::Make(storage_type).ValueOrDie(); +} + } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index 4793ab2bc9b36..4d475536cff59 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -45,12 +45,14 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType { std::shared_ptr MakeArray(std::shared_ptr data) const override; + static Result> Make(const std::shared_ptr& storage_type); + private: std::shared_ptr storage_type_; }; /// \brief Return a JsonExtensionType instance. ARROW_EXPORT std::shared_ptr json( - std::shared_ptr storage_type = utf8()); + const std::shared_ptr& storage_type = utf8()); } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json_test.cc b/cpp/src/arrow/extension/json_test.cc index 143e4f9ceeac7..b938ddb2cfef3 100644 --- a/cpp/src/arrow/extension/json_test.cc +++ b/cpp/src/arrow/extension/json_test.cc @@ -80,4 +80,18 @@ TEST_F(TestJsonExtensionType, InvalidUTF8) { } } +TEST_F(TestJsonExtensionType, StorageTypeValidation) { + ASSERT_TRUE(json(utf8())->Equals(json(utf8()))); + ASSERT_FALSE(json(large_utf8())->Equals(json(utf8()))); + ASSERT_FALSE(json(utf8_view())->Equals(json(utf8()))); + ASSERT_FALSE(json(utf8_view())->Equals(json(large_utf8()))); + + for (const auto& storage_type : {int16(), binary(), float64(), null()}) { + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Invalid storage type for JsonExtensionType: " + + storage_type->ToString(), + extension::JsonExtensionType::Make(storage_type)); + } +} + } // namespace arrow diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 31ead461aa6e2..334e2919d46ac 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -763,7 +763,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { props.set_arrow_extensions_enabled(true); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); @@ -780,7 +780,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata; @@ -798,7 +798,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata; From 60eda4a18bd347e8bc72b624db5ef75642cbc397 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 14:57:59 +0200 Subject: [PATCH 02/16] Review feedback --- cpp/src/parquet/arrow/arrow_schema_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 334e2919d46ac..31ead461aa6e2 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -763,7 +763,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { props.set_arrow_extensions_enabled(true); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), true)}); std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); @@ -780,7 +780,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), true)}); std::shared_ptr metadata; @@ -798,7 +798,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), true)}); std::shared_ptr metadata; From 1ba3ba798cf9ee9bb4cda53695397fc477e0e1b1 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 22:07:05 +0200 Subject: [PATCH 03/16] Change how extension type storage is restored after reading parquet --- cpp/src/arrow/extension/json.cc | 2 +- cpp/src/arrow/extension/json.h | 2 +- cpp/src/parquet/arrow/arrow_schema_test.cc | 13 ++++++------- cpp/src/parquet/arrow/schema.cc | 4 +++- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index 5e396738e62df..4fd33a0c899ee 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -57,7 +57,7 @@ Result> JsonExtensionType::Make( return std::make_shared(storage_type); } -std::shared_ptr json(const std::shared_ptr& storage_type) { +std::shared_ptr json(std::shared_ptr storage_type) { return JsonExtensionType::Make(storage_type).ValueOrDie(); } diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index 4d475536cff59..106da900c361f 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -53,6 +53,6 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType { /// \brief Return a JsonExtensionType instance. ARROW_EXPORT std::shared_ptr json( - const std::shared_ptr& storage_type = utf8()); + std::shared_ptr storage_type = utf8()); } // namespace arrow::extension diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 31ead461aa6e2..9350922b2053e 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -757,14 +757,13 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { { // Parquet file does not contain Arrow schema. - // If Arrow extensions are enabled, both fields should be treated as json() extension - // fields. + // If Arrow extensions are enabled, fields will be interpreted as json(utf8()) + // extension fields. ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), - true)}); + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); CheckFlatSchema(arrow_schema); @@ -772,8 +771,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { { // Parquet file contains Arrow schema. - // Both json_1 and json_2 should be returned as a json() field - // even though extensions are not enabled. + // json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8()) + // fields even though extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); std::shared_ptr field_metadata = @@ -791,7 +790,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { { // Parquet file contains Arrow schema. Extensions are enabled. - // Both json_1 and json_2 should be returned as a json() field + // json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8()). ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); std::shared_ptr field_metadata = diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 1623d80dcb0e4..ef175668b0bcc 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1017,7 +1017,9 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type())) { + if (ex_type.storage_type()->Equals(*inferred->field->type()) || + (ex_type.extension_name() == "arrow.json" && + !ex_type.storage_type()->Equals(*inferred->field->type()))) { inferred->field = inferred->field->WithType(origin_type); } } From 94006f92085377fc4b72c2a2da844e36fb4d86d3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 22:15:33 +0200 Subject: [PATCH 04/16] lint --- cpp/src/arrow/extension/json.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index 106da900c361f..f5edb286be123 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -45,7 +45,8 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType { std::shared_ptr MakeArray(std::shared_ptr data) const override; - static Result> Make(const std::shared_ptr& storage_type); + static Result> Make( + const std::shared_ptr& storage_type); private: std::shared_ptr storage_type_; From e522efb7a0d291573e8c44548b68e9d120e70328 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 11:12:58 +0200 Subject: [PATCH 05/16] Add test case --- cpp/src/parquet/arrow/arrow_schema_test.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 9350922b2053e..c60565b25594e 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -767,6 +767,17 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); CheckFlatSchema(arrow_schema); + + // If original data was e.g. json(large_utf8()) it will be interpreted as json(utf8()) + // in absence of Arrow schema. + arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + metadata = std::shared_ptr{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + EXPECT_TRUE(result_schema_->field(1)->type()->Equals( + ::arrow::extension::json(::arrow::utf8()))); } { From b1e92413a604a432e8c4c0994265d4077fdfe642 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 11:29:47 +0200 Subject: [PATCH 06/16] Add another assertion --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index c60565b25594e..09532cf3e8bc0 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -776,6 +776,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { true)}); metadata = std::shared_ptr{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + EXPECT_FALSE(result_schema_->field(1)->type()->Equals( + arrow_schema->field(1)->type())); EXPECT_TRUE(result_schema_->field(1)->type()->Equals( ::arrow::extension::json(::arrow::utf8()))); } From 4806d9f914d9e068f4f56a910de4d9e8e8cb731d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 11:49:07 +0200 Subject: [PATCH 07/16] lint --- cpp/src/parquet/arrow/arrow_schema_test.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 09532cf3e8bc0..3b88ac09ccdc5 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -776,10 +776,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { true)}); metadata = std::shared_ptr{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); - EXPECT_FALSE(result_schema_->field(1)->type()->Equals( - arrow_schema->field(1)->type())); - EXPECT_TRUE(result_schema_->field(1)->type()->Equals( - ::arrow::extension::json(::arrow::utf8()))); + EXPECT_FALSE( + result_schema_->field(1)->type()->Equals(arrow_schema->field(1)->type())); } { From 77c964b04972fc08f04957190866fb475b453c3c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 11:55:19 +0200 Subject: [PATCH 08/16] Reintroduce assert --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 3b88ac09ccdc5..0dac2508ca0e1 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -776,6 +776,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { true)}); metadata = std::shared_ptr{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + EXPECT_TRUE(result_schema_->field(1)->type()->Equals( + ::arrow::extension::json(::arrow::utf8()))); EXPECT_FALSE( result_schema_->field(1)->type()->Equals(arrow_schema->field(1)->type())); } From 3364c0f0f70a37c84e134043537c4ee4d55fbbd6 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 17:23:52 +0200 Subject: [PATCH 09/16] Review feedback --- cpp/src/parquet/arrow/schema.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index ef175668b0bcc..f33f4681512c0 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1018,8 +1018,10 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type if (ex_type.storage_type()->Equals(*inferred->field->type()) || - (ex_type.extension_name() == "arrow.json" && - !ex_type.storage_type()->Equals(*inferred->field->type()))) { + ((ex_type.extension_name() == "arrow.json") && + (inferred->field->type()->storage_id() == ::arrow::Type::STRING || + inferred->field->type()->storage_id() == ::arrow::Type::LARGE_STRING || + inferred->field->type()->storage_id() == ::arrow::Type::STRING_VIEW))) { inferred->field = inferred->field->WithType(origin_type); } } From b2bca963a947b5f43a9064a1c0167a53861b6836 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 17:52:31 +0200 Subject: [PATCH 10/16] Update cpp/src/arrow/extension/json.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/json.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index 4fd33a0c899ee..7f3d99cc2a9b8 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -58,7 +58,7 @@ Result> JsonExtensionType::Make( } std::shared_ptr json(std::shared_ptr storage_type) { - return JsonExtensionType::Make(storage_type).ValueOrDie(); + return JsonExtensionType::Make(std::move(storage_type)).ValueOrDie(); } } // namespace arrow::extension From fdf8fb6cb05e2a0efcce1937a2ed33ccb886810f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 17:55:54 +0200 Subject: [PATCH 11/16] Review feedback --- cpp/src/arrow/extension/json.cc | 2 +- cpp/src/arrow/extension/json.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index 7f3d99cc2a9b8..d42b8e202c330 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -48,7 +48,7 @@ std::shared_ptr JsonExtensionType::MakeArray( } Result> JsonExtensionType::Make( - const std::shared_ptr& storage_type) { + std::shared_ptr storage_type) { if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && storage_type->id() != Type::LARGE_STRING) { return Status::Invalid("Invalid storage type for JsonExtensionType: ", diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index f5edb286be123..85f9092d62b1a 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -45,8 +45,7 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType { std::shared_ptr MakeArray(std::shared_ptr data) const override; - static Result> Make( - const std::shared_ptr& storage_type); + static Result> Make(std::shared_ptr storage_type); private: std::shared_ptr storage_type_; From 99d8ca7536c8233a2f237af5a1feb19d7c44275e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 18:57:53 +0200 Subject: [PATCH 12/16] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/json.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index d42b8e202c330..6c924a47b841f 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -34,7 +34,7 @@ bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { Result> JsonExtensionType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { - return JsonExtensionType::Make(storage_type); + return JsonExtensionType::Make(std::move(storage_type)); } std::string JsonExtensionType::Serialize() const { return ""; } @@ -54,7 +54,7 @@ Result> JsonExtensionType::Make( return Status::Invalid("Invalid storage type for JsonExtensionType: ", storage_type->ToString()); } - return std::make_shared(storage_type); + return std::make_shared(std::move(storage_type)); } std::shared_ptr json(std::shared_ptr storage_type) { From 6dff127adf94bf94972672036dde15f7e905f29e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 26 Sep 2024 13:31:53 +0200 Subject: [PATCH 13/16] Review feedback --- cpp/src/arrow/extension/json.cc | 8 ++++++-- cpp/src/arrow/extension/json.h | 2 ++ cpp/src/parquet/arrow/schema.cc | 10 ++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index 6c924a47b841f..9dc1cfe6a3617 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -47,10 +47,14 @@ std::shared_ptr JsonExtensionType::MakeArray( return std::make_shared(data); } +bool JsonExtensionType::IsSupportedStorageType(Type::type type_id) { + return type_id == Type::STRING || type_id == Type::STRING_VIEW || + type_id == Type::LARGE_STRING; +} + Result> JsonExtensionType::Make( std::shared_ptr storage_type) { - if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && - storage_type->id() != Type::LARGE_STRING) { + if (!IsSupportedStorageType(storage_type->id())) { return Status::Invalid("Invalid storage type for JsonExtensionType: ", storage_type->ToString()); } diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index 85f9092d62b1a..89976c8073fac 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -47,6 +47,8 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType { static Result> Make(std::shared_ptr storage_type); + static bool IsSupportedStorageType(Type::type type_id); + private: std::shared_ptr storage_type_; }; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index f33f4681512c0..bd0b342caccf9 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -997,9 +997,8 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer const auto& ex_type = checked_cast(*origin_type); if (inferred_type->id() != ::arrow::Type::EXTENSION && ex_type.extension_name() == std::string("arrow.json") && - (inferred_type->id() == ::arrow::Type::STRING || - inferred_type->id() == ::arrow::Type::LARGE_STRING || - inferred_type->id() == ::arrow::Type::STRING_VIEW)) { + ::arrow::extension::JsonExtensionType::IsSupportedStorageType( + inferred_type->id())) { // Schema mismatch. // // Arrow extensions are DISABLED in Parquet. @@ -1019,9 +1018,8 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // from the Parquet type if (ex_type.storage_type()->Equals(*inferred->field->type()) || ((ex_type.extension_name() == "arrow.json") && - (inferred->field->type()->storage_id() == ::arrow::Type::STRING || - inferred->field->type()->storage_id() == ::arrow::Type::LARGE_STRING || - inferred->field->type()->storage_id() == ::arrow::Type::STRING_VIEW))) { + ::arrow::extension::JsonExtensionType::IsSupportedStorageType( + inferred->field->type()->storage_id()))) { inferred->field = inferred->field->WithType(origin_type); } } From 6d156d1daf09c7f15fb25f95b81a45cce1dc3c71 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 2 Oct 2024 18:38:47 +0200 Subject: [PATCH 14/16] Review feedback --- cpp/src/parquet/arrow/schema.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index bd0b342caccf9..208427dfbd314 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1017,9 +1017,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type if (ex_type.storage_type()->Equals(*inferred->field->type()) || - ((ex_type.extension_name() == "arrow.json") && - ::arrow::extension::JsonExtensionType::IsSupportedStorageType( - inferred->field->type()->storage_id()))) { + (ex_type.extension_name() == "arrow.json")) { inferred->field = inferred->field->WithType(origin_type); } } From 25f666145b3400f6192a80042b4e781c9863d0b8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 2 Oct 2024 22:14:09 +0200 Subject: [PATCH 15/16] Review feedback --- cpp/src/parquet/arrow/schema.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 208427dfbd314..51191281320fb 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1008,6 +1008,20 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Origin type is restored as Arrow should be considered the source of truth. inferred->field = inferred->field->WithType(origin_type); RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + } else if (inferred_type->id() == ::arrow::Type::EXTENSION && + ex_type.extension_name() == std::string("arrow.json") && + ::arrow::extension::JsonExtensionType::IsSupportedStorageType( + inferred_type->storage_id())) { + // Potential schema mismatch. + // + // Arrow extensions are ENABLED in Parquet. + // origin_type is arrow::extension::json(...) + // inferred_type is arrow::extension::json(arrow::utf8()) + auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + + // Apply metadata recursively to storage type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + inferred->field = inferred->field->WithType(origin_type); } else { auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); @@ -1016,8 +1030,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type()) || - (ex_type.extension_name() == "arrow.json")) { + if (ex_type.storage_type()->Equals(*inferred->field->type())) { inferred->field = inferred->field->WithType(origin_type); } } From 8b308bb0e6308e065405d2f409bb53514ca7e2fd Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 8 Oct 2024 11:43:45 +0200 Subject: [PATCH 16/16] Review feedback --- cpp/src/parquet/arrow/schema.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 51191281320fb..0d009c8d4f1e1 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1009,9 +1009,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer inferred->field = inferred->field->WithType(origin_type); RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); } else if (inferred_type->id() == ::arrow::Type::EXTENSION && - ex_type.extension_name() == std::string("arrow.json") && - ::arrow::extension::JsonExtensionType::IsSupportedStorageType( - inferred_type->storage_id())) { + ex_type.extension_name() == std::string("arrow.json")) { // Potential schema mismatch. // // Arrow extensions are ENABLED in Parquet.