[Spark] Skip collecting commit stats to prevent computing Snapshot State #2718

Merged (2 commits) on Apr 17, 2024
@@ -1874,6 +1874,13 @@ trait OptimisticTransactionImpl extends TransactionalWrite
val info = currentTransactionInfo.commitInfo
.map(_.copy(readVersion = None, isolationLevel = None)).orNull
setNeedsCheckpoint(attemptVersion, postCommitSnapshot)
+      val doCollectCommitStats =
+        needsCheckpoint || spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_FORCE_ALL_COMMIT_STATS)
+
+      // Stats that force an expensive snapshot state reconstruction:
+      val numFilesTotal = if (doCollectCommitStats) postCommitSnapshot.numOfFiles else -1L
+      val sizeInBytesTotal = if (doCollectCommitStats) postCommitSnapshot.sizeInBytes else -1L
+
val stats = CommitStats(
startVersion = snapshot.version,
commitVersion = attemptVersion,
@@ -1887,8 +1894,8 @@ trait OptimisticTransactionImpl extends TransactionalWrite
numRemove = numRemove,
numSetTransaction = numSetTransaction,
bytesNew = bytesNew,
-      numFilesTotal = postCommitSnapshot.numOfFiles,
-      sizeInBytesTotal = postCommitSnapshot.sizeInBytes,
+      numFilesTotal = numFilesTotal,
+      sizeInBytesTotal = sizeInBytesTotal,
numCdcFiles = numCdcFiles,
cdcBytesNew = cdcBytesNew,
protocol = postCommitSnapshot.protocol,
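The gist of this hunk: numFilesTotal and sizeInBytesTotal are the only CommitStats fields that require a full snapshot state reconstruction, so they are now collected only when a checkpoint is due anyway (the state gets computed regardless) or when the new DELTA_FORCE_ALL_COMMIT_STATS escape hatch is set; otherwise -1L is recorded as a "not collected" sentinel. A minimal sketch of the gating pattern, with illustrative names rather than the actual Delta internals:

// Sketch of the gating pattern (names are illustrative, not Delta's API).
case class Totals(numFiles: Long, sizeInBytes: Long)

def commitTotals(needsCheckpoint: Boolean, forceAllCommitStats: Boolean)
                (computeTotals: => Totals): Totals = {
  val doCollect = needsCheckpoint || forceAllCommitStats
  if (doCollect) computeTotals                   // may trigger state reconstruction
  else Totals(numFiles = -1L, sizeInBytes = -1L) // sentinel: stats skipped
}

The by-name computeTotals parameter keeps the expensive work lazy, so the skipped branch never touches the snapshot state.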
6 changes: 5 additions & 1 deletion spark/src/main/scala/org/apache/spark/sql/delta/RowId.scala
@@ -114,7 +114,11 @@ object RowId
* Extracts the high watermark of row IDs from a snapshot.
*/
private[delta] def extractHighWatermark(snapshot: Snapshot): Option[Long] =
-    RowTrackingMetadataDomain.fromSnapshot(snapshot).map(_.rowIdHighWaterMark)
+    if (isSupported(snapshot.protocol)) {
+      RowTrackingMetadataDomain.fromSnapshot(snapshot).map(_.rowIdHighWaterMark)
+    } else {
+      None
+    }

/** Base Row ID column name */
val BASE_ROW_ID = "base_row_id"
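The protocol check here is cheap (it reads the in-memory Protocol action), whereas RowTrackingMetadataDomain.fromSnapshot reads domain metadata out of the snapshot state, so short-circuiting on tables without row tracking avoids the expensive path entirely. A minimal sketch of the guard, with a hypothetical stand-in for the Delta types:

// Illustrative guard: evaluate the cheap check first and only run the
// snapshot-backed lookup (passed by name) when it can possibly succeed.
def guardedLookup(supported: Boolean)(lookup: => Option[Long]): Option[Long] =
  if (supported) lookup else None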
@@ -83,6 +83,17 @@ trait DeltaSQLConfBase
.stringConf
.createOptional

+  val DELTA_FORCE_ALL_COMMIT_STATS =
+    buildConf("commitStats.force")
+      .internal()
+      .doc(
+        """When true, forces commit statistics to be collected for logging purposes.
+          | Enabling this feature requires the Snapshot State to be computed, which is
+          | potentially expensive.
+        """.stripMargin)
+      .booleanConf
+      .createWithDefault(false)
+
val DELTA_CONVERT_USE_METADATA_LOG =
buildConf("convert.useMetadataLog")
.doc(
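For ad-hoc debugging, the new knob can be flipped per session. Assuming buildConf applies the usual spark.databricks.delta. key prefix, the full property name would be spark.databricks.delta.commitStats.force; in a session where spark is the active SparkSession:

// Hypothetical usage: opt back into full commit stats for one session,
// accepting the snapshot state reconstruction cost on every commit.
spark.conf.set("spark.databricks.delta.commitStats.force", "true")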
@@ -326,6 +326,8 @@ abstract class DeleteSuiteBase extends QueryTest
test("schema pruning on data condition") {
val input = Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value")
append(input, Nil)
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
checkDelete(Some("key = 2"),
@@ -347,6 +349,8 @@ abstract class DeleteSuiteBase extends QueryTest
val input = Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value")
.select(struct("key", "value").alias("nested"))
append(input, Nil)
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
checkDelete(Some("nested.key = 2"),
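The deltaLog.update().stateDF lines added to these tests materialize (and cache) the snapshot state before plan capture begins, so the plans captured by withPhysicalPlansCaptured come only from the DELETE under test and not from a lazily triggered state reconstruction. A hypothetical helper that makes the warm-up explicit (not part of this PR; stateDF is the same Snapshot member used in the diff above):

import org.apache.spark.sql.delta.DeltaLog

// Illustrative test helper: touch stateDF so the state reconstruction
// runs and is cached before any physical plans are captured.
def warmSnapshotState(deltaLog: DeltaLog): Unit = {
  deltaLog.update().stateDF
}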
@@ -888,4 +888,22 @@ class OptimisticTransactionSuite
}
}
}

test("Append does not trigger snapshot state computation") {
withTempDir { tableDir =>
val df = Seq((1, 0), (2, 1)).toDF("key", "value")
df.write.format("delta").mode("append").save(tableDir.getCanonicalPath)

val deltaLog = DeltaLog.forTable(spark, tableDir)
val preCommitSnapshot = deltaLog.update()
assert(!preCommitSnapshot.stateReconstructionTriggered)

df.write.format("delta").mode("append").save(tableDir.getCanonicalPath)

val postCommitSnapshot = deltaLog.update()
assert(!preCommitSnapshot.stateReconstructionTriggered)
assert(!postCommitSnapshot.stateReconstructionTriggered)
}
}

}
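A hypothetical counterpart to the test above (not part of this PR, and reusing its df, tableDir, and deltaLog names): with the force flag enabled, collecting full commit stats should touch the snapshot state again, assuming the spark.databricks.delta. key prefix and the withSQLConf helper from Spark's SQLTestUtils:

// Sketch only: with commitStats.force on, the append is expected to
// trigger state reconstruction on the post-commit snapshot.
withSQLConf("spark.databricks.delta.commitStats.force" -> "true") {
  df.write.format("delta").mode("append").save(tableDir.getCanonicalPath)
  assert(deltaLog.update().stateReconstructionTriggered)
}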
@@ -696,6 +696,8 @@ abstract class UpdateSuiteBase

test("schema pruning on finding files to update") {
append(Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value"))
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
checkUpdate(condition = Some("key = 2"), setClauses = "key = 1, value = 3",
@@ -717,6 +719,8 @@ abstract class UpdateSuiteBase
test("nested schema pruning on finding files to update") {
append(Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value")
.select(struct("key", "value").alias("nested")))
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
checkUpdate(condition = Some("nested.key = 2"),