Skip to content

Commit

Permalink
Change how extension type storage is restored after reading parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
rok committed Sep 24, 2024
1 parent 60eda4a commit 1ba3ba7
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 10 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/extension/json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ Result<std::shared_ptr<DataType>> JsonExtensionType::Make(
return std::make_shared<JsonExtensionType>(storage_type);
}

std::shared_ptr<DataType> json(const std::shared_ptr<DataType>& storage_type) {
/// \brief Return a JsonExtensionType instance wrapping `storage_type`.
///
/// The storage type is taken by value and moved into
/// JsonExtensionType::Make, so no additional shared_ptr copy (atomic
/// refcount increment/decrement) is made on the way through.
///
/// \param storage_type storage type to wrap in the JSON extension type
/// \return the type produced by JsonExtensionType::Make; ValueOrDie() is
///         applied to the Result, so a failed Make is not returned to the
///         caller as an error status.
std::shared_ptr<DataType> json(std::shared_ptr<DataType> storage_type) {
  return JsonExtensionType::Make(std::move(storage_type)).ValueOrDie();
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/extension/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,6 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType {

/// \brief Return a JsonExtensionType instance.
ARROW_EXPORT std::shared_ptr<DataType> json(
const std::shared_ptr<DataType>& storage_type = utf8());
std::shared_ptr<DataType> storage_type = utf8());

} // namespace arrow::extension
13 changes: 6 additions & 7 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -757,23 +757,22 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {

{
// Parquet file does not contain Arrow schema.
// If Arrow extensions are enabled, both fields should be treated as json() extension
// fields.
// If Arrow extensions are enabled, fields will be interpreted as json(utf8())
// extension fields.
ArrowReaderProperties props;
props.set_arrow_extensions_enabled(true);
auto arrow_schema = ::arrow::schema(
{::arrow::field("json_1", ::arrow::extension::json(), true),
::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()),
true)});
::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)});
std::shared_ptr<KeyValueMetadata> metadata{};
ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
CheckFlatSchema(arrow_schema);
}

{
// Parquet file contains Arrow schema.
// Both json_1 and json_2 should be returned as a json() field
// even though extensions are not enabled.
// json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8())
// fields even though extensions are not enabled.
ArrowReaderProperties props;
props.set_arrow_extensions_enabled(false);
std::shared_ptr<KeyValueMetadata> field_metadata =
Expand All @@ -791,7 +790,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {

{
// Parquet file contains Arrow schema. Extensions are enabled.
// Both json_1 and json_2 should be returned as a json() field
// json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8()).
ArrowReaderProperties props;
props.set_arrow_extensions_enabled(true);
std::shared_ptr<KeyValueMetadata> field_metadata =
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,9 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer

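// Restore extension type if the storage type is the same as inferred from
// the Parquet type. The "arrow.json" extension type is restored regardless,
// even when its storage type differs from the inferred one.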
if (ex_type.storage_type()->Equals(*inferred->field->type())) {
if (ex_type.storage_type()->Equals(*inferred->field->type()) ||
(ex_type.extension_name() == "arrow.json" &&
!ex_type.storage_type()->Equals(*inferred->field->type()))) {
inferred->field = inferred->field->WithType(origin_type);
}
}
Expand Down

0 comments on commit 1ba3ba7

Please sign in to comment.