Skip to content

Commit

Permalink
Change how extension type storage is restored after reading parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
rok committed Sep 24, 2024
1 parent 60eda4a commit 0c265ab
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
13 changes: 6 additions & 7 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -757,23 +757,22 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {

{
// Parquet file does not contain Arrow schema.
// If Arrow extensions are enabled, both fields should be treated as json() extension
// fields.
// If Arrow extensions are enabled, fields will be interpreted as json(utf8())
// extension fields.
ArrowReaderProperties props;
props.set_arrow_extensions_enabled(true);
auto arrow_schema = ::arrow::schema(
{::arrow::field("json_1", ::arrow::extension::json(), true),
::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()),
true)});
::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)});
std::shared_ptr<KeyValueMetadata> metadata{};
ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
CheckFlatSchema(arrow_schema);
}

{
// Parquet file contains Arrow schema.
// Both json_1 and json_2 should be returned as a json() field
// even though extensions are not enabled.
// json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8())
// fields even though extensions are not enabled.
ArrowReaderProperties props;
props.set_arrow_extensions_enabled(false);
std::shared_ptr<KeyValueMetadata> field_metadata =
Expand All @@ -791,7 +790,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {

{
// Parquet file contains Arrow schema. Extensions are enabled.
// Both json_1 and json_2 should be returned as a json() field
// json_1 and json_2 will be interpreted as json(utf8()) and json(large_utf8()).
ArrowReaderProperties props;
props.set_arrow_extensions_enabled(true);
std::shared_ptr<KeyValueMetadata> field_metadata =
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,9 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer

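// Restore extension type if the storage type is the same as inferred
// from the Parquet type. An arrow.json extension type is also restored
// when its storage type differs from the inferred one.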
if (ex_type.storage_type()->Equals(*inferred->field->type())) {
if (ex_type.storage_type()->Equals(*inferred->field->type()) ||
(ex_type.extension_name() == "arrow.json" &&
!ex_type.storage_type()->Equals(*inferred->field->type()))) {
inferred->field = inferred->field->WithType(origin_type);
}
}
Expand Down

0 comments on commit 0c265ab

Please sign in to comment.