Fix handling of duplicate column names in parquet reader
Parquet files may contain duplicate column names when written by
case-sensitive tools. In this scenario we read the first
case-insensitive match from the file.
raunaqmorarka committed Aug 27, 2024
1 parent 6db2db4 commit 3b1eb2f
Showing 3 changed files with 12 additions and 5 deletions.
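
Note: the fix hinges on the three-argument overload of Guava's toImmutableMap collector. The two-argument form rejects duplicate keys outright, while a merge function of (oldValue, newValue) -> oldValue quietly keeps the first occurrence. A minimal, self-contained sketch of that behavior (the column names and the lower-casing key function are hypothetical, used here only to manufacture a duplicate key):

    import static com.google.common.collect.ImmutableMap.toImmutableMap;

    import java.util.List;
    import java.util.Locale;
    import java.util.Map;
    import java.util.function.Function;

    public class FirstWinsDemo
    {
        public static void main(String[] args)
        {
            // Two physical columns whose names differ only in case, as a
            // case-sensitive writer could produce (hypothetical example)
            List<String> columnNames = List.of("UPPER_CASE_COLUMN", "Upper_Case_Column");

            // Normalizing case collapses both names onto one key; the merge
            // function keeps the first match instead of throwing
            Map<String, String> byLowerCaseName = columnNames.stream()
                    .collect(toImmutableMap(
                            name -> name.toLowerCase(Locale.ENGLISH),
                            Function.identity(),
                            (oldValue, newValue) -> oldValue));

            System.out.println(byLowerCaseName); // {upper_case_column=UPPER_CASE_COLUMN}
        }
    }

With the two-argument overload, the same stream would fail with IllegalArgumentException ("Multiple entries with same key").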
@@ -113,7 +113,9 @@ public static Map<List<String>, ColumnDescriptor> getDescriptors(MessageType fil
                 .stream()
                 .collect(toImmutableMap(
                         columnIO -> Arrays.asList(columnIO.getFieldPath()),
-                        PrimitiveColumnIO::getColumnDescriptor));
+                        PrimitiveColumnIO::getColumnDescriptor,
+                        // Same column name may occur more than once when the file is written by case-sensitive tools
+                        (oldValue, _) -> oldValue));
     }
 
     @SuppressWarnings("deprecation")
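
For context, this is the failure mode the merge function guards against: with the previous two-argument collector, a duplicate path key was fatal when the descriptors map was built. A standalone reproduction, where the paths are hypothetical stand-ins for colliding columnIO.getFieldPath() results:

    import static com.google.common.collect.ImmutableMap.toImmutableMap;

    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;

    public class DuplicateKeyDemo
    {
        public static void main(String[] args)
        {
            // Two field paths that resolve to the same key (hypothetical)
            List<String[]> fieldPaths = List.of(
                    new String[] {"upper_case_column"},
                    new String[] {"upper_case_column"});

            try {
                // Pre-fix shape of the collector: no merge function
                Map<List<String>, String[]> descriptors = fieldPaths.stream()
                        .collect(toImmutableMap(Arrays::asList, path -> path));
                System.out.println(descriptors);
            }
            catch (IllegalArgumentException e) {
                // Guava reports "Multiple entries with same key: ..."
                System.out.println(e.getMessage());
            }
        }
    }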
@@ -38,7 +38,11 @@ public static PrunedBlockMetadata createPrunedColumnsMetadata(BlockMetadata bloc
     {
         Set<List<String>> requiredPaths = descriptorsByPath.keySet();
         Map<List<String>, ColumnChunkMetadata> columnMetadataByPath = blockMetadata.columns().stream()
-                .collect(toImmutableMap(column -> asList(column.getPath().toArray()), identity()));
+                .collect(toImmutableMap(
+                        column -> asList(column.getPath().toArray()),
+                        identity(),
+                        // Same column name may occur more than once when the file is written by case-sensitive tools
+                        (oldValue, _) -> oldValue));
         ImmutableMap.Builder<List<String>, ColumnChunkMetadata> columnMetadataByPathBuilder = ImmutableMap.builderWithExpectedSize(requiredPaths.size());
         for (Map.Entry<List<String>, ColumnDescriptor> entry : descriptorsByPath.entrySet()) {
             List<String> requiredPath = entry.getKey();
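
The rest of createPrunedColumnsMetadata falls outside this hunk, but the visible builder suggests the pruning pattern: index every column chunk by path with first-wins semantics, then copy over only the required paths. A rough sketch under that assumption, with plain strings standing in for ColumnChunkMetadata and hypothetical, pre-normalized paths:

    import static com.google.common.collect.ImmutableMap.toImmutableMap;

    import com.google.common.collect.ImmutableMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    public class PruneByPathSketch
    {
        public static void main(String[] args)
        {
            // Column chunks keyed by path; the two case-variant columns
            // collide on the same (already normalized) key
            List<Map.Entry<List<String>, String>> fileColumns = List.of(
                    Map.entry(List.of("upper_case_column"), "chunk for UPPER_CASE_COLUMN"),
                    Map.entry(List.of("upper_case_column"), "chunk for Upper_Case_Column"),
                    Map.entry(List.of("other_column"), "chunk for other_column"));

            // First-wins index, mirroring the merge function in the diff above
            Map<List<String>, String> columnMetadataByPath = fileColumns.stream()
                    .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue,
                            (oldValue, newValue) -> oldValue));

            // Keep only the paths the query actually reads
            Set<List<String>> requiredPaths = Set.of(List.of("upper_case_column"));
            ImmutableMap.Builder<List<String>, String> pruned =
                    ImmutableMap.builderWithExpectedSize(requiredPaths.size());
            for (List<String> path : requiredPaths) {
                pruned.put(path, columnMetadataByPath.get(path));
            }
            // Prints {[upper_case_column]=chunk for UPPER_CASE_COLUMN}
            System.out.println(pruned.buildOrThrow());
        }
    }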
@@ -616,10 +616,11 @@ public void testParquetPageSourceSchemaEvolution(int rowCount)
     public void testParquetCaseSensitivity(int rowCount)
             throws Exception
     {
-        TestColumn writeColumn = new TestColumn("UPPER_CASE_COLUMN", createVarcharType(4), new HiveVarchar("test", 4), utf8Slice("test"));
-        TestColumn readColumn = new TestColumn("uppeR_casE_columN", createVarcharType(4), new HiveVarchar("test", 4), utf8Slice("test"));
+        TestColumn writeColumnA = new TestColumn("UPPER_CASE_COLUMN", createVarcharType(5), new HiveVarchar("testA", 5), utf8Slice("testA"));
+        TestColumn writeColumnB = new TestColumn("Upper_Case_Column", createVarcharType(5), new HiveVarchar("testB", 5), utf8Slice("testB"));
+        TestColumn readColumn = new TestColumn("uppeR_casE_columN", createVarcharType(5), new HiveVarchar("testA", 5), utf8Slice("testA"));
         assertThatFileFormat(PARQUET)
-                .withWriteColumns(ImmutableList.of(writeColumn))
+                .withWriteColumns(ImmutableList.of(writeColumnA, writeColumnB))
                 .withReadColumns(ImmutableList.of(readColumn))
                 .withSession(PARQUET_SESSION_USE_NAME)
                 .withRowsCount(rowCount)
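
The test change exercises the new behavior end to end: the file is now written with two columns whose names differ only in case ("UPPER_CASE_COLUMN" holding "testA" and "Upper_Case_Column" holding "testB"), and a case-insensitive read of "uppeR_casE_columN" is expected to return "testA", the values of the first matching column, consistent with the first-wins merge functions above.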
