[Spark] Propagate catalog table through DeltaSink #2109

Closed
@@ -553,7 +553,18 @@ class DeltaAnalysis(session: SparkSession)
} else deltaMerge
d.copy(target = stripTempViewForMergeWrapper(d.target))

case streamWrite: WriteToStream =>
case origStreamWrite: WriteToStream =>
// The command could have Delta as source and/or sink. We need to look at both.
val streamWrite = origStreamWrite match {
case WriteToStream(_, _, sink @ DeltaSink(_, _, _, _, _, None), _, _, _, _, Some(ct)) =>
// The command has a catalog table, but the DeltaSink does not. This happens because
// DeltaDataSource.createSink (Spark API) didn't have access to the catalog table when it
// created the DeltaSink. Fortunately we can fix it up here.
origStreamWrite.copy(sink = sink.copy(catalogTable = Some(ct)))
case _ => origStreamWrite
}

// We also need to validate the source schema location, if the command has a Delta source.
verifyDeltaSourceSchemaLocation(
streamWrite.inputQuery, streamWrite.resolvedCheckpointLocation)
streamWrite
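
The fix-up above relies on two things the case-class machinery provides: a compiler-generated extractor, so the rule can match a WriteToStream whose DeltaSink still has catalogTable = None, and copy, which rebuilds both nodes with the missing field filled in. The following minimal, self-contained sketch illustrates just that pattern; CatalogTable, Sink and WriteToStream here are simplified stand-ins, not the real Spark classes.

// Simplified stand-ins, used only to illustrate the extractor + copy technique
// from the analyzer rule above.
object CatalogFixupSketch {
  case class CatalogTable(identifier: String)
  case class Sink(path: String, catalogTable: Option[CatalogTable] = None)
  case class WriteToStream(sink: Sink, catalogTable: Option[CatalogTable])

  // If the plan carries a catalog table but its sink does not, copy the table
  // into the sink; otherwise return the plan unchanged.
  def fixUp(write: WriteToStream): WriteToStream = write match {
    case WriteToStream(sink @ Sink(_, None), Some(ct)) =>
      write.copy(sink = sink.copy(catalogTable = Some(ct)))
    case _ => write
  }

  def main(args: Array[String]): Unit = {
    val before = WriteToStream(Sink("/tmp/delta"), Some(CatalogTable("default.tab")))
    val after = fixUp(before)
    assert(after.sink.catalogTable.contains(CatalogTable("default.tab")))
  }
}
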
@@ -169,6 +169,8 @@ class DeltaDataSource
throw DeltaErrors.outputModeNotSupportedException(getClass.getName, outputMode.toString)
}
val deltaOptions = new DeltaOptions(parameters, sqlContext.sparkSession.sessionState.conf)
// NOTE: Spark API doesn't give access to the CatalogTable here, but DeltaAnalysis will pick
// that info out of the containing WriteToStream (if present), and update the sink there.
new DeltaSink(sqlContext, new Path(path), partitionColumns, outputMode, deltaOptions)
}

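
For context on the NOTE above: DeltaDataSource implements Spark's StreamSinkProvider, and that hook (as of Spark 3.x) simply has no parameter through which a CatalogTable could arrive, which is why the fix-up has to happen later in DeltaAnalysis. The trait is reproduced below for reference only; it lives in org.apache.spark.sql.sources and should not be redeclared in real code.

// Reference copy of Spark's sink-creation hook
// (org.apache.spark.sql.sources.StreamSinkProvider); nothing in this
// signature carries catalog information.
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.streaming.OutputMode

trait StreamSinkProvider {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink
}
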
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
@@ -39,12 +40,13 @@ import org.apache.spark.util.Utils
/**
* A streaming sink that writes data into a Delta Table.
*/
class DeltaSink(
case class DeltaSink(
sqlContext: SQLContext,
path: Path,
partitionColumns: Seq[String],
outputMode: OutputMode,
options: DeltaOptions)
options: DeltaOptions,
catalogTable: Option[CatalogTable] = None)
extends Sink
with ImplicitMetadataOperation
with DeltaLogging {
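
Turning DeltaSink from a plain class into a case class is what enables both halves of the fix-up: the generated unapply backs the DeltaSink(_, _, _, _, _, None) pattern in DeltaAnalysis, and the generated copy backs sink.copy(catalogTable = Some(ct)); the defaulted last parameter keeps the existing five-argument construction in DeltaDataSource compiling unchanged. A tiny standalone illustration, with a hypothetical SimpleSink in place of the real DeltaSink:

// Hypothetical SimpleSink standing in for DeltaSink, showing what the
// case-class conversion provides: an extractor, copy, and a defaulted field.
case class SimpleSink(
    path: String,
    partitionColumns: Seq[String],
    catalogTable: Option[String] = None)

object SimpleSinkDemo {
  def main(args: Array[String]): Unit = {
    val sink = SimpleSink("/tmp/delta", Seq("date"))          // default: catalogTable = None
    val fixed = sink.copy(catalogTable = Some("default.tab")) // generated copy

    // Generated extractor: an analyzer-style rule can select only sinks
    // that still lack a catalog table.
    fixed match {
      case SimpleSink(_, _, None)    => println("no catalog table")
      case SimpleSink(_, _, Some(t)) => println(s"catalog table: $t")
    }
  }
}
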
@@ -21,7 +21,7 @@ import java.util.Locale

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.delta.actions.CommitInfo
import org.apache.spark.sql.delta.sources.DeltaSQLConf
import org.apache.spark.sql.delta.sources.{DeltaSQLConf, DeltaSink}
import org.apache.spark.sql.delta.test.{DeltaColumnMappingSelectedTestMixin, DeltaSQLCommandTest}
import org.apache.commons.io.FileUtils
import org.scalatest.time.SpanSugar._
@@ -31,7 +31,8 @@ import org.apache.spark.sql._
import org.apache.spark.sql.execution.DataSourceScanExec
import org.apache.spark.sql.execution.datasources._

import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.execution.streaming.{MemoryStream, MicroBatchExecution, StreamingQueryWrapper}
import org.apache.spark.sql.execution.streaming.sources.WriteToMicroBatchDataSourceV1
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.types._
@@ -433,6 +434,47 @@ class DeltaSinkSuite
}
}

private def verifyDeltaSinkCatalog(f: DataStreamWriter[_] => StreamingQuery): Unit = {
// Create a Delta sink whose target table is defined by our caller.
val input = MemoryStream[Int]
val streamWriter = input.toDF
.writeStream
.format("delta")
.option(
"checkpointLocation",
Utils.createTempDir(namePrefix = "tahoe-test").getCanonicalPath)
val q = f(streamWriter).asInstanceOf[StreamingQueryWrapper]

// WARNING: Only the query execution thread is allowed to initialize the logical plan (enforced
// by an assertion in MicroBatchExecution.scala). To avoid flaky failures, run the stream to
// completion, to guarantee the query execution thread ran before we try to access the plan.
try {
input.addData(1, 2, 3)
q.processAllAvailable()
} finally {
q.stop()
}

val plan = q.streamingQuery.logicalPlan
val WriteToMicroBatchDataSourceV1(catalogTable, sink: DeltaSink, _, _, _, _, _) = plan
assert(catalogTable === sink.catalogTable)
}

test("DeltaSink.catalogTable is correctly populated - catalog-based table") {
withTable("tab") {
verifyDeltaSinkCatalog(_.toTable("tab"))
}
}

test("DeltaSink.catalogTable is correctly populated - path-based table") {
withTempDir { tempDir =>
if (tempDir.exists()) {
assert(tempDir.delete())
}
verifyDeltaSinkCatalog(_.start(tempDir.getCanonicalPath))
}
}

test("can't write out with all columns being partition columns") {
withTempDirs { (outputDir, checkpointDir) =>
val inputData = MemoryStream[Int]
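
The two new tests exercise the two write shapes a user can choose, sketched below against a local SparkSession (the paths, table name and Delta session configuration here are assumptions of this example, not part of the patch). Writing with toTable resolves the target through the session catalog, so WriteToStream carries a CatalogTable for DeltaAnalysis to push into the sink; writing with start(path) is purely path-based and DeltaSink.catalogTable stays None.

// Hedged usage sketch; assumes delta-spark on the classpath and the usual
// Delta SQL extension/catalog configuration.
import org.apache.spark.sql.SparkSession

object DeltaStreamWriteShapes {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("delta-sink-catalog-demo")
      .master("local[2]")
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      .getOrCreate()

    val df = spark.readStream.format("rate").load()

    // Catalog-based target: the resolved sink ends up with catalogTable = Some(...).
    val byName = df.writeStream
      .format("delta")
      .option("checkpointLocation", "/tmp/chk-by-name") // made-up path
      .toTable("demo_tab")                              // made-up table name

    // Path-based target: no catalog entry, so catalogTable stays None.
    val byPath = df.writeStream
      .format("delta")
      .option("checkpointLocation", "/tmp/chk-by-path")
      .start("/tmp/demo_path_tab")

    byName.awaitTermination(5000)
    byPath.awaitTermination(5000)
    byName.stop()
    byPath.stop()
    spark.stop()
  }
}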