Skip to content

Commit

Permalink
[VL] Support Int64 Timestamp in parquet reader
Browse files Browse the repository at this point in the history
  • Loading branch information
zml1206 committed Jan 7, 2025
1 parent e56ec1e commit 1aeff6b
Show file tree
Hide file tree
Showing 6 changed files with 4 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -155,18 +155,12 @@ object VeloxBackendSettings extends BackendSettingsApi {

format match {
case ParquetReadFormat =>
val typeValidator: PartialFunction[StructField, String] = {
// Parquet timestamp is not fully supported yet
case StructField(_, TimestampType, _, _)
if GlutenConfig.get.forceParquetTimestampTypeScanFallbackEnabled =>
"TimestampType(force fallback)"
}
val parquetOptions = new ParquetOptions(CaseInsensitiveMap(properties), SQLConf.get)
if (parquetOptions.mergeSchema) {
// https://github.com/apache/incubator-gluten/issues/7174
Some(s"not support when merge schema is true")
} else {
validateTypes(typeValidator)
None
}
case DwrfReadFormat => None
case OrcReadFormat =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -917,8 +917,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox only support read Timestamp with INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
Expand All @@ -927,9 +925,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
Expand All @@ -938,9 +933,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -720,8 +720,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox only support read Timestamp with INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
// Velox parquet reader does not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
Expand All @@ -731,10 +730,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
.exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// newly added in spark-3.3 and needs a fix later; random failure may be caused by memory free
.exclude("SPARK-39833: pushed filters with project without filter columns")
.exclude("SPARK-39833: pushed filters with count()")
Expand All @@ -746,10 +741,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
.exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -715,9 +715,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox only support read Timestamp with INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
.exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
// Velox parquet reader does not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
Expand All @@ -728,10 +726,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
.exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// newly added in spark-3.3 and needs a fix later; random failure may be caused by memory free
.exclude("SPARK-39833: pushed filters with project without filter columns")
.exclude("SPARK-39833: pushed filters with count()")
Expand All @@ -744,10 +738,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
.exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -726,9 +726,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox only support read Timestamp with INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
.exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
// Velox parquet reader does not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
Expand All @@ -739,10 +737,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
.exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// newly added in spark-3.3 and needs a fix later; random failure may be caused by memory free
.exclude("SPARK-39833: pushed filters with project without filter columns")
.exclude("SPARK-39833: pushed filters with count()")
Expand All @@ -755,10 +749,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
// Timestamp is read as INT96.
.exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
.exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
.exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,6 @@ class GlutenConfig(conf: SQLConf) extends Logging {
def forceOrcCharTypeScanFallbackEnabled: Boolean =
conf.getConf(VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK)

def forceParquetTimestampTypeScanFallbackEnabled: Boolean =
conf.getConf(VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK)

def scanFileSchemeValidationEnabled: Boolean =
conf.getConf(VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED)

Expand Down Expand Up @@ -2184,13 +2181,6 @@ object GlutenConfig {
.booleanConf
.createWithDefault(true)

val VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK =
buildConf("spark.gluten.sql.parquet.timestampType.scan.fallback.enabled")
.internal()
.doc("Force fallback for parquet timestamp type scan.")
.booleanConf
.createWithDefault(false)

val VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED =
buildConf("spark.gluten.sql.scan.fileSchemeValidation.enabled")
.internal()
Expand Down

0 comments on commit 1aeff6b

Please sign in to comment.