Fix handling of duplicate column names in parquet reader
Parquet files may contain duplicate column names when written by
case-sensitive tools. In this scenario we read the first
case-insensitive match from the file.
raunaqmorarka committed Aug 27, 2024
1 parent 6db2db4 commit 3b1eb2f
Showing 3 changed files with 12 additions and 5 deletions.
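
Note: the fix hinges on the three-argument overload of Guava's toImmutableMap collector. The two-argument form rejects duplicate keys outright, while a merge function of (oldValue, newValue) -> oldValue quietly keeps the first occurrence. A minimal, self-contained sketch of that behavior (the column names and the lower-casing key function are hypothetical, used here only to manufacture a duplicate key):

    import static com.google.common.collect.ImmutableMap.toImmutableMap;

    import java.util.List;
    import java.util.Locale;
    import java.util.Map;
    import java.util.function.Function;

    public class FirstWinsDemo
    {
        public static void main(String[] args)
        {
            // Two physical columns whose names differ only in case, as a
            // case-sensitive writer could produce (hypothetical example)
            List<String> columnNames = List.of("UPPER_CASE_COLUMN", "Upper_Case_Column");

            // Normalizing case collapses both names onto one key; the merge
            // function keeps the first match instead of throwing
            Map<String, String> byLowerCaseName = columnNames.stream()
                    .collect(toImmutableMap(
                            name -> name.toLowerCase(Locale.ENGLISH),
                            Function.identity(),
                            (oldValue, newValue) -> oldValue));

            System.out.println(byLowerCaseName); // {upper_case_column=UPPER_CASE_COLUMN}
        }
    }

With the two-argument overload, the same stream would fail with IllegalArgumentException ("Multiple entries with same key").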
@@ -113,7 +113,9 @@ public static Map<List<String>, ColumnDescriptor> getDescriptors(MessageType fil
                 .stream()
                 .collect(toImmutableMap(
                         columnIO -> Arrays.asList(columnIO.getFieldPath()),
-                        PrimitiveColumnIO::getColumnDescriptor));
+                        PrimitiveColumnIO::getColumnDescriptor,
+                        // Same column name may occur more than once when the file is written by case-sensitive tools
+                        (oldValue, _) -> oldValue));
     }
 
     @SuppressWarnings("deprecation")
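
For context, this is the failure mode the merge function guards against: with the previous two-argument collector, a duplicate path key was fatal when the descriptors map was built. A standalone reproduction, where the paths are hypothetical stand-ins for colliding columnIO.getFieldPath() results:

    import static com.google.common.collect.ImmutableMap.toImmutableMap;

    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;

    public class DuplicateKeyDemo
    {
        public static void main(String[] args)
        {
            // Two field paths that resolve to the same key (hypothetical)
            List<String[]> fieldPaths = List.of(
                    new String[] {"upper_case_column"},
                    new String[] {"upper_case_column"});

            try {
                // Pre-fix shape of the collector: no merge function
                Map<List<String>, String[]> descriptors = fieldPaths.stream()
                        .collect(toImmutableMap(Arrays::asList, path -> path));
                System.out.println(descriptors);
            }
            catch (IllegalArgumentException e) {
                // Guava reports "Multiple entries with same key: ..."
                System.out.println(e.getMessage());
            }
        }
    }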
@@ -38,7 +38,11 @@ public static PrunedBlockMetadata createPrunedColumnsMetadata(BlockMetadata bloc
     {
         Set<List<String>> requiredPaths = descriptorsByPath.keySet();
         Map<List<String>, ColumnChunkMetadata> columnMetadataByPath = blockMetadata.columns().stream()
-                .collect(toImmutableMap(column -> asList(column.getPath().toArray()), identity()));
+                .collect(toImmutableMap(
+                        column -> asList(column.getPath().toArray()),
+                        identity(),
+                        // Same column name may occur more than once when the file is written by case-sensitive tools
+                        (oldValue, _) -> oldValue));
         ImmutableMap.Builder<List<String>, ColumnChunkMetadata> columnMetadataByPathBuilder = ImmutableMap.builderWithExpectedSize(requiredPaths.size());
         for (Map.Entry<List<String>, ColumnDescriptor> entry : descriptorsByPath.entrySet()) {
             List<String> requiredPath = entry.getKey();
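
The rest of createPrunedColumnsMetadata falls outside this hunk, but the visible builder suggests the pruning pattern: index every column chunk by path with first-wins semantics, then copy over only the required paths. A rough sketch under that assumption, with plain strings standing in for ColumnChunkMetadata and hypothetical, pre-normalized paths:

    import static com.google.common.collect.ImmutableMap.toImmutableMap;

    import com.google.common.collect.ImmutableMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    public class PruneByPathSketch
    {
        public static void main(String[] args)
        {
            // Column chunks keyed by path; the two case-variant columns
            // collide on the same (already normalized) key
            List<Map.Entry<List<String>, String>> fileColumns = List.of(
                    Map.entry(List.of("upper_case_column"), "chunk for UPPER_CASE_COLUMN"),
                    Map.entry(List.of("upper_case_column"), "chunk for Upper_Case_Column"),
                    Map.entry(List.of("other_column"), "chunk for other_column"));

            // First-wins index, mirroring the merge function in the diff above
            Map<List<String>, String> columnMetadataByPath = fileColumns.stream()
                    .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue,
                            (oldValue, newValue) -> oldValue));

            // Keep only the paths the query actually reads
            Set<List<String>> requiredPaths = Set.of(List.of("upper_case_column"));
            ImmutableMap.Builder<List<String>, String> pruned =
                    ImmutableMap.builderWithExpectedSize(requiredPaths.size());
            for (List<String> path : requiredPaths) {
                pruned.put(path, columnMetadataByPath.get(path));
            }
            // Prints {[upper_case_column]=chunk for UPPER_CASE_COLUMN}
            System.out.println(pruned.buildOrThrow());
        }
    }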
@@ -616,10 +616,11 @@ public void testParquetPageSourceSchemaEvolution(int rowCount)
     public void testParquetCaseSensitivity(int rowCount)
             throws Exception
     {
-        TestColumn writeColumn = new TestColumn("UPPER_CASE_COLUMN", createVarcharType(4), new HiveVarchar("test", 4), utf8Slice("test"));
-        TestColumn readColumn = new TestColumn("uppeR_casE_columN", createVarcharType(4), new HiveVarchar("test", 4), utf8Slice("test"));
+        TestColumn writeColumnA = new TestColumn("UPPER_CASE_COLUMN", createVarcharType(5), new HiveVarchar("testA", 5), utf8Slice("testA"));
+        TestColumn writeColumnB = new TestColumn("Upper_Case_Column", createVarcharType(5), new HiveVarchar("testB", 5), utf8Slice("testB"));
+        TestColumn readColumn = new TestColumn("uppeR_casE_columN", createVarcharType(5), new HiveVarchar("testA", 5), utf8Slice("testA"));
         assertThatFileFormat(PARQUET)
-                .withWriteColumns(ImmutableList.of(writeColumn))
+                .withWriteColumns(ImmutableList.of(writeColumnA, writeColumnB))
                 .withReadColumns(ImmutableList.of(readColumn))
                 .withSession(PARQUET_SESSION_USE_NAME)
                 .withRowsCount(rowCount)
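
The test change exercises the new behavior end to end: the file is now written with two columns whose names differ only in case ("UPPER_CASE_COLUMN" holding "testA" and "Upper_Case_Column" holding "testB"), and a case-insensitive read of "uppeR_casE_columN" is expected to return "testA", the values of the first matching column, consistent with the first-wins merge functions above.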
