[GLUTEN-8307][VL] Support Int64 Timestamp in parquet reader #8308

Merged: 1 commit, merged on Jan 7, 2025
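
This change lets the Velox parquet reader scan timestamp columns stored as parquet INT64 (TIMESTAMP_MILLIS / TIMESTAMP_MICROS), so such scans no longer need a forced fallback to vanilla Spark. A minimal sketch of the user-visible effect, assuming a standard Gluten-on-Spark setup (the plugin and shuffle-manager classes, off-heap settings, and paths below are illustrative defaults, not taken from this PR):

```scala
import org.apache.spark.sql.SparkSession

object Int64TimestampScan {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("int64-timestamp-scan")
      // Typical Gluten enablement; adjust to your deployment.
      .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
      .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
      .config("spark.memory.offHeap.enabled", "true")
      .config("spark.memory.offHeap.size", "2g")
      .getOrCreate()

    // Write the timestamp as INT64 micros instead of legacy INT96.
    spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
    spark.sql("SELECT timestamp'2025-01-07 00:00:00' AS ts")
      .write.mode("overwrite").parquet("/tmp/ts_int64")

    // With this PR the scan should stay on the native Velox path;
    // previously it could be forced back to vanilla Spark.
    spark.read.parquet("/tmp/ts_int64").explain()
  }
}
```

The diff below removes the force-fallback config and its type validator from the backend settings, and re-enables the previously excluded Spark timestamp UTs in each copy of VeloxTestSettings.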
@@ -155,18 +155,12 @@ object VeloxBackendSettings extends BackendSettingsApi {
 
     format match {
       case ParquetReadFormat =>
-        val typeValidator: PartialFunction[StructField, String] = {
-          // Parquet timestamp is not fully supported yet
-          case StructField(_, TimestampType, _, _)
-              if GlutenConfig.get.forceParquetTimestampTypeScanFallbackEnabled =>
-            "TimestampType(force fallback)"
-        }
         val parquetOptions = new ParquetOptions(CaseInsensitiveMap(properties), SQLConf.get)
         if (parquetOptions.mergeSchema) {
           // https://github.com/apache/incubator-gluten/issues/7174
           Some(s"not support when merge schema is true")
         } else {
-          validateTypes(typeValidator)
+          None
         }
       case DwrfReadFormat => None
       case OrcReadFormat =>
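
Before this PR, the block above installed a type validator: a PartialFunction from StructField to a fallback reason, applied over the scan schema, so that with the (now removed) flag on, any TimestampType column produced a reason string and the whole scan fell back to vanilla Spark. A minimal sketch of that validator pattern, with `validateTypes` as a stand-in for Gluten's real helper:

```scala
import org.apache.spark.sql.types.{StructField, TimestampType}

object TypeValidatorSketch {
  // Returns the first matching fallback reason, or None if every field passes.
  def validateTypes(
      fields: Seq[StructField],
      validator: PartialFunction[StructField, String]): Option[String] =
    fields.collectFirst(validator)

  def main(args: Array[String]): Unit = {
    val forceFallback = true // stand-in for the removed GlutenConfig flag
    val typeValidator: PartialFunction[StructField, String] = {
      case StructField(_, TimestampType, _, _) if forceFallback =>
        "TimestampType(force fallback)"
    }
    val schema = Seq(StructField("ts", TimestampType))
    // Prints Some(TimestampType(force fallback)); with the flag off it is None.
    println(validateTypes(schema, typeValidator))
  }
}
```

Dropping the validator means the parquet path now reports no type-based fallback reason at all for timestamps, hence the plain `None` above.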
@@ -917,8 +917,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -927,9 +925,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
@@ -938,9 +933,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
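
The excludes dropped above re-enable Spark's INT64-timestamp UTs for this suite, e.g. the INT96 to TIMESTAMP_MICROS migration test and SPARK-10365. In miniature, those tests do a round-trip like the following, now expected to pass on the Velox backend (session setup and paths are illustrative; the real assertions live in Spark's parquet suites that Gluten re-enables here):

```scala
import org.apache.spark.sql.SparkSession

object TimestampEncodingRoundTrip {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ts-roundtrip").getOrCreate()
    import spark.implicits._

    val df = Seq("2025-01-07 12:34:56.789").toDF("s")
      .selectExpr("CAST(s AS TIMESTAMP) AS ts")

    // Write the same value under all three physical parquet encodings.
    for (encoding <- Seq("INT96", "TIMESTAMP_MILLIS", "TIMESTAMP_MICROS")) {
      spark.conf.set("spark.sql.parquet.outputTimestampType", encoding)
      val path = s"/tmp/ts_$encoding"
      df.write.mode("overwrite").parquet(path)
      // All three must read back identically on the Velox backend.
      spark.read.parquet(path).show(truncate = false)
    }
  }
}
```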
@@ -720,8 +720,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
+    // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -731,10 +730,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // new added in spark-3.3 and need fix later, random failure may caused by memory free
     .exclude("SPARK-39833: pushed filters with project without filter columns")
     .exclude("SPARK-39833: pushed filters with count()")
@@ -746,10 +741,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
@@ -715,9 +715,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
-    .exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
+    // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -728,10 +726,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // new added in spark-3.3 and need fix later, random failure may caused by memory free
     .exclude("SPARK-39833: pushed filters with project without filter columns")
     .exclude("SPARK-39833: pushed filters with count()")
@@ -744,10 +738,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
@@ -726,9 +726,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
-    .exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
+    // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -739,10 +737,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // new added in spark-3.3 and need fix later, random failure may caused by memory free
     .exclude("SPARK-39833: pushed filters with project without filter columns")
     .exclude("SPARK-39833: pushed filters with count()")
@@ -755,10 +749,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
@@ -126,9 +126,6 @@ class GlutenConfig(conf: SQLConf) extends Logging {
   def forceOrcCharTypeScanFallbackEnabled: Boolean =
     conf.getConf(VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK)
 
-  def forceParquetTimestampTypeScanFallbackEnabled: Boolean =
-    conf.getConf(VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK)
-
   def scanFileSchemeValidationEnabled: Boolean =
     conf.getConf(VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED)
 
@@ -2184,13 +2181,6 @@ object GlutenConfig {
       .booleanConf
       .createWithDefault(true)
 
-  val VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK =
-    buildConf("spark.gluten.sql.parquet.timestampType.scan.fallback.enabled")
-      .internal()
-      .doc("Force fallback for parquet timestamp type scan.")
-      .booleanConf
-      .createWithDefault(false)
-
   val VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED =
     buildConf("spark.gluten.sql.scan.fileSchemeValidation.enabled")
       .internal()
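
With the definition deleted, `spark.gluten.sql.parquet.timestampType.scan.fallback.enabled` no longer maps to any GlutenConfig entry. For jobs that set it, e.g. in a spark-shell, the line becomes inert and can simply be dropped (a sketch, assuming an existing Gluten-enabled session):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
// No-op after this PR: the key is no longer read anywhere, so parquet
// timestamp scans are offloaded (or not) purely on their own merits.
spark.conf.set("spark.gluten.sql.parquet.timestampType.scan.fallback.enabled", "true")
```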