diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc index d793233c2b573..9dc1cfe6a3617 100644 --- a/cpp/src/arrow/extension/json.cc +++ b/cpp/src/arrow/extension/json.cc @@ -28,17 +28,13 @@ namespace arrow::extension { bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { - return other.extension_name() == this->extension_name(); + return other.extension_name() == this->extension_name() && + other.storage_type()->Equals(storage_type_); } Result> JsonExtensionType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { - if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && - storage_type->id() != Type::LARGE_STRING) { - return Status::Invalid("Invalid storage type for JsonExtensionType: ", - storage_type->ToString()); - } - return std::make_shared(storage_type); + return JsonExtensionType::Make(std::move(storage_type)); } std::string JsonExtensionType::Serialize() const { return ""; } @@ -51,11 +47,22 @@ std::shared_ptr JsonExtensionType::MakeArray( return std::make_shared(data); } -std::shared_ptr json(const std::shared_ptr storage_type) { - ARROW_CHECK(storage_type->id() != Type::STRING || - storage_type->id() != Type::STRING_VIEW || - storage_type->id() != Type::LARGE_STRING); - return std::make_shared(storage_type); +bool JsonExtensionType::IsSupportedStorageType(Type::type type_id) { + return type_id == Type::STRING || type_id == Type::STRING_VIEW || + type_id == Type::LARGE_STRING; +} + +Result> JsonExtensionType::Make( + std::shared_ptr storage_type) { + if (!IsSupportedStorageType(storage_type->id())) { + return Status::Invalid("Invalid storage type for JsonExtensionType: ", + storage_type->ToString()); + } + return std::make_shared(std::move(storage_type)); +} + +std::shared_ptr json(std::shared_ptr storage_type) { + return JsonExtensionType::Make(std::move(storage_type)).ValueOrDie(); } } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index 4793ab2bc9b36..89976c8073fac 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -45,6 +45,10 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType { std::shared_ptr MakeArray(std::shared_ptr data) const override; + static Result> Make(std::shared_ptr storage_type); + + static bool IsSupportedStorageType(Type::type type_id); + private: std::shared_ptr storage_type_; }; diff --git a/cpp/src/arrow/extension/json_test.cc b/cpp/src/arrow/extension/json_test.cc index 143e4f9ceeac7..b938ddb2cfef3 100644 --- a/cpp/src/arrow/extension/json_test.cc +++ b/cpp/src/arrow/extension/json_test.cc @@ -80,4 +80,18 @@ TEST_F(TestJsonExtensionType, InvalidUTF8) { } } +TEST_F(TestJsonExtensionType, StorageTypeValidation) { + ASSERT_TRUE(json(utf8())->Equals(json(utf8()))); + ASSERT_FALSE(json(large_utf8())->Equals(json(utf8()))); + ASSERT_FALSE(json(utf8_view())->Equals(json(utf8()))); + ASSERT_FALSE(json(utf8_view())->Equals(json(large_utf8()))); + + for (const auto& storage_type : {int16(), binary(), float64(), null()}) { + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Invalid storage type for JsonExtensionType: " + + storage_type->ToString(), + extension::JsonExtensionType::Make(storage_type)); + } +} + } // namespace arrow diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 31ead461aa6e2..0dac2508ca0e1 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -757,23 +757,35 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { { // Parquet file does not contain Arrow schema. - // If Arrow extensions are enabled, both fields should be treated as json() extension - // fields. + // If Arrow extensions are enabled, fields will be interpreted as json(utf8()) + // extension fields. ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), - true)}); + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); CheckFlatSchema(arrow_schema); + + // If original data was e.g. json(large_utf8()) it will be interpreted as json(utf8()) + // in absence of Arrow schema. + arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + metadata = std::shared_ptr{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + EXPECT_TRUE(result_schema_->field(1)->type()->Equals( + ::arrow::extension::json(::arrow::utf8()))); + EXPECT_FALSE( + result_schema_->field(1)->type()->Equals(arrow_schema->field(1)->type())); } { // Parquet file contains Arrow schema. - // Both json_1 and json_2 should be returned as a json() field - // even though extensions are not enabled. + // json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8()) + // fields even though extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); std::shared_ptr field_metadata = @@ -791,7 +803,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { { // Parquet file contains Arrow schema. Extensions are enabled. - // Both json_1 and json_2 should be returned as a json() field + // json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8()). ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); std::shared_ptr field_metadata = diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 1623d80dcb0e4..0d009c8d4f1e1 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -997,9 +997,8 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer const auto& ex_type = checked_cast(*origin_type); if (inferred_type->id() != ::arrow::Type::EXTENSION && ex_type.extension_name() == std::string("arrow.json") && - (inferred_type->id() == ::arrow::Type::STRING || - inferred_type->id() == ::arrow::Type::LARGE_STRING || - inferred_type->id() == ::arrow::Type::STRING_VIEW)) { + ::arrow::extension::JsonExtensionType::IsSupportedStorageType( + inferred_type->id())) { // Schema mismatch. // // Arrow extensions are DISABLED in Parquet. @@ -1009,6 +1008,18 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Origin type is restored as Arrow should be considered the source of truth. inferred->field = inferred->field->WithType(origin_type); RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + } else if (inferred_type->id() == ::arrow::Type::EXTENSION && + ex_type.extension_name() == std::string("arrow.json")) { + // Potential schema mismatch. + // + // Arrow extensions are ENABLED in Parquet. + // origin_type is arrow::extension::json(...) + // inferred_type is arrow::extension::json(arrow::utf8()) + auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + + // Apply metadata recursively to storage type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + inferred->field = inferred->field->WithType(origin_type); } else { auto origin_storage_field = origin_field.WithType(ex_type.storage_type());