[HUDI-5533] Support Spark column comments #8683
@@ -45,7 +45,7 @@ private[sql] object SchemaConverters {
  *
  * @since 2.4.0
  */
-case class SchemaType(dataType: DataType, nullable: Boolean)
+case class SchemaType(dataType: DataType, nullable: Boolean, doc: Option[String])

 /**
  * Converts an Avro schema to a corresponding Spark SQL schema.

@@ -59,32 +59,32 @@ private[sql] object SchemaConverters {
   private val unionFieldMemberPrefix = "member"

   private def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = {
-    avroSchema.getType match {
-      case INT => avroSchema.getLogicalType match {
-        case _: Date => SchemaType(DateType, nullable = false)
-        case _ => SchemaType(IntegerType, nullable = false)
+    (avroSchema.getType, Option(avroSchema.getDoc)) match {
Reviewer: The conversion tool is copied from Spark (https://github.com/apache/spark/blob/dd4db21cb69a9a9c3715360673a76e6f150303d4/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala#L58); just noticed that Spark also does not support keeping comments from Avro fields while doing the conversion.

Author: Spark likely has the same limitation when retrieving a schema from Avro, but Spark doesn't usually infer its SQL schema from Avro. Hudi does, and that's the reason for this patch.

Reviewer: Can you write a test case for it, especially for creating a table?

Author: Sure.

Author: @danny0405 added a test.

Reviewer: Kind of feel there is no need to change each match for every data type; can we write another method similar with …

Author: Any column, including nested columns, may have a comment, so I don't see why we shouldn't look through all of the Avro content for docs. That would mean walking the Avro schema twice, plus a complex merge of the results. Am I missing something?
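Before the diff resumes, it may help to sketch the alternative the reviewer seems to suggest (entirely hypothetical; `copyDocs` and its case handling are assumptions, not code from the PR or from Hudi): a second pass that walks the Avro schema and the already-converted Spark type in parallel and grafts the docs on. The author's objection is visible in the shape of the code: it re-traverses the schema and has to mirror every structural case, including unions the conversion has already flattened.

import org.apache.avro.Schema
import org.apache.spark.sql.types._

import scala.collection.JavaConverters._

// Hypothetical second pass: copy Avro field docs onto an already-converted
// Spark DataType. Not from the PR; an illustration of the rejected approach.
def copyDocs(avro: Schema, dataType: DataType): DataType = (avro.getType, dataType) match {
  case (Schema.Type.RECORD, st: StructType) =>
    StructType(st.fields.toSeq.zip(avro.getFields.asScala).map { case (f, af) =>
      val nested = f.copy(dataType = copyDocs(af.schema(), f.dataType))
      if (af.doc() != null) {
        nested.copy(metadata = new MetadataBuilder()
          .withMetadata(nested.metadata).putString("comment", af.doc()).build())
      } else nested
    })
  case (Schema.Type.ARRAY, ArrayType(elem, containsNull)) =>
    ArrayType(copyDocs(avro.getElementType, elem), containsNull)
  case (Schema.Type.MAP, MapType(keyType, valueType, valueContainsNull)) =>
    MapType(keyType, copyDocs(avro.getValueType, valueType), valueContainsNull)
  // Unions, logical types, etc. are already flattened by the conversion,
  // so each of them would need a dedicated (and fragile) case here.
  case _ => dataType
}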
+      case (INT, doc) => avroSchema.getLogicalType match {
+        case _: Date => SchemaType(DateType, nullable = false, doc)
+        case _ => SchemaType(IntegerType, nullable = false, doc)
       }
-      case STRING => SchemaType(StringType, nullable = false)
-      case BOOLEAN => SchemaType(BooleanType, nullable = false)
-      case BYTES | FIXED => avroSchema.getLogicalType match {
+      case (STRING, doc) => SchemaType(StringType, nullable = false, doc)
+      case (BOOLEAN, doc) => SchemaType(BooleanType, nullable = false, doc)
+      case (BYTES | FIXED, doc) => avroSchema.getLogicalType match {
         // For FIXED type, if the precision requires more bytes than fixed size, the logical
         // type will be null, which is handled by Avro library.
-        case d: Decimal => SchemaType(DecimalType(d.getPrecision, d.getScale), nullable = false)
-        case _ => SchemaType(BinaryType, nullable = false)
+        case d: Decimal => SchemaType(DecimalType(d.getPrecision, d.getScale), nullable = false, doc)
+        case _ => SchemaType(BinaryType, nullable = false, doc)
       }

-      case DOUBLE => SchemaType(DoubleType, nullable = false)
-      case FLOAT => SchemaType(FloatType, nullable = false)
-      case LONG => avroSchema.getLogicalType match {
-        case _: TimestampMillis | _: TimestampMicros => SchemaType(TimestampType, nullable = false)
-        case _ => SchemaType(LongType, nullable = false)
+      case (DOUBLE, doc) => SchemaType(DoubleType, nullable = false, doc)
+      case (FLOAT, doc) => SchemaType(FloatType, nullable = false, doc)
+      case (LONG, doc) => avroSchema.getLogicalType match {
+        case _: TimestampMillis | _: TimestampMicros => SchemaType(TimestampType, nullable = false, doc)
+        case _ => SchemaType(LongType, nullable = false, doc)
       }

-      case ENUM => SchemaType(StringType, nullable = false)
+      case (ENUM, doc) => SchemaType(StringType, nullable = false, doc)

-      case NULL => SchemaType(NullType, nullable = true)
+      case (NULL, doc) => SchemaType(NullType, nullable = true, doc)

-      case RECORD =>
+      case (RECORD, doc) =>
         if (existingRecordNames.contains(avroSchema.getFullName)) {
           throw new IncompatibleSchemaException(
             s"""
@@ -95,24 +95,25 @@ private[sql] object SchemaConverters {
         val newRecordNames = existingRecordNames + avroSchema.getFullName
         val fields = avroSchema.getFields.asScala.map { f =>
           val schemaType = toSqlTypeHelper(f.schema(), newRecordNames)
-          StructField(f.name, schemaType.dataType, schemaType.nullable)
+          val metadata = if (f.doc != null) new MetadataBuilder().putString("comment", f.doc).build() else Metadata.empty
+          StructField(f.name, schemaType.dataType, schemaType.nullable, metadata)
         }

-        SchemaType(StructType(fields.toSeq), nullable = false)
+        SchemaType(StructType(fields.toSeq), nullable = false, doc)

-      case ARRAY =>
+      case (ARRAY, doc) =>
         val schemaType = toSqlTypeHelper(avroSchema.getElementType, existingRecordNames)
         SchemaType(
           ArrayType(schemaType.dataType, containsNull = schemaType.nullable),
-          nullable = false)
+          nullable = false, doc)

-      case MAP =>
+      case (MAP, doc) =>
         val schemaType = toSqlTypeHelper(avroSchema.getValueType, existingRecordNames)
         SchemaType(
           MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable),
-          nullable = false)
+          nullable = false, doc)

-      case UNION =>
+      case (UNION, doc) =>
         if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) {
           // In case of a union with null, eliminate it and make a recursive call
           val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL)

@@ -126,20 +127,21 @@ private[sql] object SchemaConverters {
           case Seq(t1) =>
             toSqlTypeHelper(avroSchema.getTypes.get(0), existingRecordNames)
           case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) =>
-            SchemaType(LongType, nullable = false)
+            SchemaType(LongType, nullable = false, doc)
           case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) =>
-            SchemaType(DoubleType, nullable = false)
+            SchemaType(DoubleType, nullable = false, doc)
           case _ =>
             // Convert complex unions to struct types where field names are member0, member1, etc.
             // This is consistent with the behavior when converting between Avro and Parquet.
             val fields = avroSchema.getTypes.asScala.zipWithIndex.map {
               case (s, i) =>
                 val schemaType = toSqlTypeHelper(s, existingRecordNames)
                 // All fields are nullable because only one of them is set at a time
-                StructField(s"$unionFieldMemberPrefix$i", schemaType.dataType, nullable = true)
+                val metadata = if (schemaType.doc.isDefined) new MetadataBuilder().putString("comment", schemaType.doc.get).build() else Metadata.empty
+                StructField(s"$unionFieldMemberPrefix$i", schemaType.dataType, nullable = true, metadata)
             }

-            SchemaType(StructType(fields.toSeq), nullable = false)
+            SchemaType(StructType(fields.toSeq), nullable = false, doc)
         }

       case other => throw new IncompatibleSchemaException(s"Unsupported type $other")
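With the whole conversion in view, here is a hedged round-trip sketch (assuming the copied `SchemaConverters.toSqlType` entry point matches the Spark file it came from; illustrative, not part of the PR): an Avro field `doc` should now surface as the `comment` metadata key on the corresponding `StructField`, which also covers the nested columns discussed above.

import org.apache.avro.SchemaBuilder
import org.apache.spark.sql.types.StructType

// Build an Avro record whose fields carry doc strings, one of them nested.
val avroSchema = SchemaBuilder.record("example").fields()
  .name("id").doc("primary key").`type`().stringType().noDefault()
  .name("nested").doc("inner payload").`type`()
    .record("inner").fields()
      .name("ts").doc("event time").`type`().longType().noDefault()
    .endRecord().noDefault()
  .endRecord()

// After this patch, the converted Spark schema keeps the docs as comments.
val sparkSchema = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
assert(sparkSchema("id").metadata.getString("comment") == "primary key")
val inner = sparkSchema("nested").dataType.asInstanceOf[StructType]
assert(inner("ts").metadata.getString("comment") == "event time")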
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.functional
+
+import org.apache.spark.sql._
+import org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+import org.apache.spark.SparkContext
+import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest
+import org.apache.hudi.DataSourceWriteOptions
+import org.apache.hudi.config.HoodieWriteConfig
+import org.apache.hudi.common.model.HoodieTableType
+import org.apache.spark.sql.types.StructType
+import org.junit.jupiter.api.Assertions.assertEquals
+import org.junit.jupiter.api.BeforeEach
+import org.junit.jupiter.params.ParameterizedTest
+import org.junit.jupiter.params.provider.EnumSource
+
+class TestColumnComments {
+  var spark: SparkSession = _
+  var sqlContext: SQLContext = _
+  var sc: SparkContext = _
+
+  def initSparkContext(): Unit = {
+    val sparkConf = getSparkConfForTest(getClass.getSimpleName)
+    spark = SparkSession.builder()
+      .withExtensions(new HoodieSparkSessionExtension)
+      .config(sparkConf)
+      .getOrCreate()
+    sc = spark.sparkContext
+    sc.setLogLevel("ERROR")
+    sqlContext = spark.sqlContext
+  }
+
+  @BeforeEach
+  def setUp(): Unit = {
+    initSparkContext()
+  }
+
+  @ParameterizedTest
+  @EnumSource(value = classOf[HoodieTableType], names = Array("COPY_ON_WRITE", "MERGE_ON_READ"))
+  def testColumnCommentWithSparkDatasource(tableType: HoodieTableType): Unit = {
+    val basePath = java.nio.file.Files.createTempDirectory("hoodie_comments_path").toAbsolutePath.toString
+    val opts = Map(
+      HoodieWriteConfig.TBL_NAME.key -> "hoodie_comments",
+      DataSourceWriteOptions.TABLE_TYPE.key -> tableType.toString,
+      DataSourceWriteOptions.OPERATION.key -> "bulk_insert",
+      DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
+      DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition"
+    )
+    val inputDF = spark.sql("select '0' as _row_key, '1' as content, '2' as partition, '3' as ts")
+    val struct = new StructType()
+      .add("_row_key", "string", true, "dummy comment")
+      .add("content", "string", true)
+      .add("partition", "string", true)
+      .add("ts", "string", true)
+    spark.createDataFrame(inputDF.rdd, struct)
+      .write.format("hudi")
+      .options(opts)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+    spark.read.format("hudi").load(basePath).registerTempTable("test_tbl")
+
+    // now confirm the comment is present at read time
+    assertEquals(1, spark.sql("desc extended test_tbl")
+      .filter("col_name = '_row_key' and comment = 'dummy comment'").count)
+  }
+}
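Since the reviewer asked specifically about the create-table path, a hypothetical companion test could exercise the SQL route as well (not part of this PR; the table name and `tblproperties` follow common Hudi SQL conventions, and `org.junit.jupiter.api.Test` would need to be imported):

@Test
def testColumnCommentWithSparkSql(): Unit = {
  // Declare the comment in DDL and check it survives the create-table path.
  spark.sql(
    """create table hudi_comments_sql_tbl (
      |  id int comment 'record id',
      |  name string,
      |  ts long
      |) using hudi
      |tblproperties (primaryKey = 'id', preCombineField = 'ts')
      |""".stripMargin)

  assertEquals(1, spark.sql("desc hudi_comments_sql_tbl")
    .filter("col_name = 'id' and comment = 'record id'").count)
}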
@@ -24,6 +24,7 @@
 import org.apache.hudi.hive.SchemaDifference;
 import org.apache.hudi.hive.util.HiveSchemaUtil;
 import org.apache.hudi.sync.common.HoodieSyncTool;
+import org.apache.hudi.sync.common.model.FieldSchema;
 import org.apache.hudi.sync.common.model.PartitionEvent;
 import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
 import org.apache.hudi.sync.common.util.ConfigUtils;

@@ -212,8 +213,9 @@ private void syncSchema(String tableName, boolean tableExists, boolean useRealTi
     Map<String, String> tableProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES));
     Map<String, String> serdeProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES));
     if (config.getBoolean(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE)) {
+      List<FieldSchema> fromStorage = syncClient.getStorageFieldSchemas();
       Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
-          config.getString(META_SYNC_SPARK_VERSION), config.getInt(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema);
+          config.getString(META_SYNC_SPARK_VERSION), config.getInt(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema, fromStorage);
       Map<String, String> sparkSerdeProperties = SparkDataSourceTableUtils.getSparkSerdeProperties(readAsOptimized, config.getString(META_SYNC_BASE_PATH));
       tableProperties.putAll(sparkTableProperties);
       serdeProperties.putAll(sparkSerdeProperties);
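For context on why the sync path needs the storage field schemas here: Spark persists a data source table's schema as JSON in table properties, and a column comment rides along in the field's metadata. The sketch below shows the JSON shape (standard Spark behaviour; the `spark.sql.sources.schema.part.N` property layout is Spark's usual convention, not something introduced by this PR):

import org.apache.spark.sql.types._

// A column comment lives in the StructField metadata under the "comment" key.
val schema = StructType(Seq(
  StructField("id", StringType, nullable = true,
    new MetadataBuilder().putString("comment", "record id").build())))

// This JSON is what ends up in the spark.sql.sources.schema.part.N
// table properties the sync tool writes out (wrapped here for readability):
println(schema.json)
// {"type":"struct","fields":[{"name":"id","type":"string","nullable":true,
//  "metadata":{"comment":"record id"}}]}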
@@ -65,7 +65,7 @@ public void runSQL(String s) {
     try {
       stmt = connection.createStatement();
       LOG.info("Executing SQL " + s);
-      stmt.execute(s);
+      stmt.execute(escapeAntiSlash(s));
Reviewer: Is the escape related to this change?

Author: Yes. Both the JDBC and HMS paths generate SQL, and the anti-slash has to be double-escaped, otherwise it is lost. The anti-slash is used to escape double quotes in the comment DDL.
     } catch (SQLException e) {
       throw new HoodieHiveSyncException("Failed in executing SQL " + s, e);
     } finally {
@@ -37,6 +37,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Matcher;

 import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
 import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;

@@ -220,5 +221,16 @@ private List<String> constructChangePartitions(String tableName, List<String> pa
     }
     return changePartitions;
   }
+
+  /**
+   * SQL statements should be escaped in order to preserve the anti-slash.
+   *
Reviewer: Every sentence should end up with …
+   * For example, \foo should be transformed into \\\foo.
+   * @param sql
+   * @return
+   */
+  protected String escapeAntiSlash(String sql) {
+    return sql.replaceAll("\\\\", Matcher.quoteReplacement("\\\\\\"));
+  }
 }
Reviewer: Collections.emptyList()?
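A quick sanity check of the replacement in `escapeAntiSlash` above (a self-contained sketch; the method is re-declared only so the snippet runs standalone): the regex `\\\\` matches one literal backslash, and `Matcher.quoteReplacement` keeps the three-backslash replacement string from being reinterpreted as escape sequences.

import java.util.regex.Matcher

// Same logic as the Java method above, re-declared for the sketch.
def escapeAntiSlash(sql: String): String =
  sql.replaceAll("\\\\", Matcher.quoteReplacement("\\\\\\"))

// "\foo" becomes "\\\foo", matching the Javadoc example.
assert(escapeAntiSlash("\\foo") == "\\\\\\foo")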