
Commit 01c16af

[SPARK-51769][SQL] Add maxRecordsPerOutputBatch to limit the number of records per Arrow output batch
### What changes were proposed in this pull request?

This patch adds a new config `maxRecordsPerOutputBatch` to limit the number of records per output Arrow batch.

### Why are the changes needed?

When a columnar-based operator for Spark takes its input from an Arrow-based evaluation operator, the number of records per output batch is currently unlimited. For such columnar-based operators we sometimes want to cap the size of each input batch, but there is no existing way to limit the batch size in rows.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Unit test

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #50301 from viirya/arrow_output_size.

Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Liang-Chi Hsieh <[email protected]>
1 parent 9bc8cd5 commit 01c16af
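
As a usage sketch (not part of the commit itself), the new limit can be combined with an Arrow-based map. The column name `v`, the identity UDF, and the session setup below are assumptions for illustration; the two config names come from this change:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Python returns Arrow batches of up to 10 records; before each batch reaches
# the downstream operator it is re-sliced into batches of at most 3 records.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "10")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerOutputBatch", "3")

def passthrough(batches):
    # Identity UDF: yields each incoming pyarrow.RecordBatch unchanged.
    for batch in batches:
        yield batch

df = spark.range(100).toDF("v")
df.mapInArrow(passthrough, df.schema).collect()
```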

File tree

3 files changed: +149 −12 lines


python/pyspark/sql/tests/arrow/test_arrow_map.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -208,6 +208,14 @@ def setUpClass(cls):
         cls.spark.conf.set("spark.sql.execution.arrow.maxBytesPerBatch", "10")
 
 
+class MapInArrowWithOutputArrowBatchSlicingTests(MapInArrowTests):
+    @classmethod
+    def setUpClass(cls):
+        MapInArrowTests.setUpClass()
+        cls.spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "10")
+        cls.spark.conf.set("spark.sql.execution.arrow.maxRecordsPerOutputBatch", "3")
+
+
 if __name__ == "__main__":
     from pyspark.sql.tests.arrow.test_arrow_map import *  # noqa: F401
```

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 15 additions & 0 deletions
```diff
@@ -3551,6 +3551,19 @@ object SQLConf {
       .intConf
       .createWithDefault(10000)
 
+  val ARROW_EXECUTION_MAX_RECORDS_PER_OUTPUT_BATCH =
+    buildConf("spark.sql.execution.arrow.maxRecordsPerOutputBatch")
+      .doc("When using Apache Arrow, limit the maximum number of records that can be output " +
+        "in a single ArrowRecordBatch to the downstream operator. If set to zero or negative " +
+        "there is no limit. Note that the complete ArrowRecordBatch is actually created but " +
+        "the number of records is limited when sending it to the downstream operator. This is " +
+        "used to avoid large batches being sent to the downstream operator including " +
+        "the columnar-based operator implemented by third-party libraries.")
+      .version("4.1.0")
+      .internal()
+      .intConf
+      .createWithDefault(-1)
+
   val ARROW_EXECUTION_MAX_BYTES_PER_BATCH =
     buildConf("spark.sql.execution.arrow.maxBytesPerBatch")
       .internal()
@@ -6553,6 +6566,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
 
   def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH)
 
+  def arrowMaxRecordsPerOutputBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_OUTPUT_BATCH)
+
   def arrowMaxBytesPerBatch: Long = getConf(ARROW_EXECUTION_MAX_BYTES_PER_BATCH)
 
   def arrowTransformWithStateInPandasMaxRecordsPerBatch: Int =
```
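
A tiny Python mirror of the semantics described in the config doc string above (an illustration, not code from this commit; the helper name is hypothetical): a zero or negative value disables the limit, otherwise a fully built batch of `batch_rows` records is handed downstream in chunks of at most `limit` records.

```python
def output_slice_sizes(batch_rows: int, limit: int) -> list[int]:
    # Zero or negative limit: the whole batch is passed through unchanged.
    if limit <= 0:
        return [batch_rows]
    # Positive limit: emit chunks of at most `limit` rows.
    return [min(limit, batch_rows - start) for start in range(0, batch_rows, limit)]

assert output_slice_sizes(10, -1) == [10]        # default (-1): no limit
assert output_slice_sizes(10, 3) == [3, 3, 3, 1]
```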

sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala

Lines changed: 126 additions & 12 deletions
```diff
@@ -27,6 +27,7 @@ import org.apache.arrow.vector.ipc.ArrowStreamReader
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python.{BasePythonRunner, PythonWorker, SpecialLengths}
 import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector}
@@ -43,6 +44,8 @@ private[python] trait PythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[
 
   protected def deserializeColumnarBatch(batch: ColumnarBatch, schema: StructType): OUT
 
+  protected def arrowMaxRecordsPerOutputBatch: Int = SQLConf.get.arrowMaxRecordsPerOutputBatch
+
   protected def newReaderIterator(
       stream: DataInputStream,
       writer: Writer,
@@ -62,7 +65,7 @@ private[python] trait PythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[
       private var reader: ArrowStreamReader = _
       private var root: VectorSchemaRoot = _
       private var schema: StructType = _
-      private var vectors: Array[ColumnVector] = _
+      private var processor: ArrowOutputProcessor = _
 
       context.addTaskCompletionListener[Unit] { _ =>
         if (reader != null) {
@@ -84,17 +87,12 @@ private[python] trait PythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[
         }
         try {
           if (reader != null && batchLoaded) {
-            val bytesReadStart = reader.bytesRead()
-            batchLoaded = reader.loadNextBatch()
+            batchLoaded = processor.loadBatch()
             if (batchLoaded) {
-              val batch = new ColumnarBatch(vectors)
-              val rowCount = root.getRowCount
-              batch.setNumRows(root.getRowCount)
-              val bytesReadEnd = reader.bytesRead()
-              pythonMetrics("pythonNumRowsReceived") += rowCount
-              pythonMetrics("pythonDataReceived") += bytesReadEnd - bytesReadStart
+              val batch = processor.produceBatch()
               deserializeColumnarBatch(batch, schema)
             } else {
+              processor.close()
               reader.close(false)
               allocator.close()
               // Reach end of stream. Call `read()` again to read control data.
@@ -106,9 +104,14 @@ private[python] trait PythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[
                 reader = new ArrowStreamReader(stream, allocator)
                 root = reader.getVectorSchemaRoot()
                 schema = ArrowUtils.fromArrowSchema(root.getSchema())
-                vectors = root.getFieldVectors().asScala.map { vector =>
-                  new ArrowColumnVector(vector)
-                }.toArray[ColumnVector]
+
+                if (arrowMaxRecordsPerOutputBatch > 0) {
+                  processor = new SliceArrowOutputProcessorImpl(
+                    reader, pythonMetrics, arrowMaxRecordsPerOutputBatch)
+                } else {
+                  processor = new ArrowOutputProcessorImpl(reader, pythonMetrics)
+                }
+
                 read()
               case SpecialLengths.TIMING_DATA =>
                 handleTimingData()
@@ -133,3 +136,114 @@ private[python] trait BasicPythonArrowOutput extends PythonArrowOutput[ColumnarB
       batch: ColumnarBatch,
       schema: StructType): ColumnarBatch = batch
 }
+
+trait ArrowOutputProcessor {
+  def loadBatch(): Boolean
+  protected def getRoot: VectorSchemaRoot
+  protected def getVectors(root: VectorSchemaRoot): Array[ColumnVector]
+  def produceBatch(): ColumnarBatch
+  def close(): Unit
+}
+
+class ArrowOutputProcessorImpl(reader: ArrowStreamReader, pythonMetrics: Map[String, SQLMetric])
+  extends ArrowOutputProcessor {
+  protected val root = reader.getVectorSchemaRoot()
+  protected val schema: StructType = ArrowUtils.fromArrowSchema(root.getSchema())
+  private val vectors: Array[ColumnVector] = root.getFieldVectors().asScala.map { vector =>
+    new ArrowColumnVector(vector)
+  }.toArray[ColumnVector]
+
+  protected var rowCount = -1
+
+  override def loadBatch(): Boolean = {
+    val bytesReadStart = reader.bytesRead()
+    val batchLoaded = reader.loadNextBatch()
+    if (batchLoaded) {
+      rowCount = root.getRowCount
+      val bytesReadEnd = reader.bytesRead()
+      pythonMetrics("pythonNumRowsReceived") += rowCount
+      pythonMetrics("pythonDataReceived") += bytesReadEnd - bytesReadStart
+    }
+    batchLoaded
+  }
+
+  protected override def getRoot: VectorSchemaRoot = root
+  protected override def getVectors(root: VectorSchemaRoot): Array[ColumnVector] = vectors
+  override def produceBatch(): ColumnarBatch = {
+    val batchRoot = getRoot
+    val vectors = getVectors(batchRoot)
+    val batch = new ColumnarBatch(vectors)
+    batch.setNumRows(batchRoot.getRowCount)
+    batch
+  }
+  override def close(): Unit = {
+    vectors.foreach(_.close())
+    root.close()
+  }
+}
+
+class SliceArrowOutputProcessorImpl(
+    reader: ArrowStreamReader,
+    pythonMetrics: Map[String, SQLMetric],
+    arrowMaxRecordsPerOutputBatch: Int)
+  extends ArrowOutputProcessorImpl(reader, pythonMetrics) {
+
+  private var currentRowIdx = -1
+  private var prevRoot: VectorSchemaRoot = null
+  private var prevVectors: Array[ColumnVector] = _
+
+  override def produceBatch(): ColumnarBatch = {
+    val batchRoot = getRoot
+
+    if (batchRoot != prevRoot) {
+      if (prevRoot != null) {
+        prevVectors.foreach(_.close())
+        prevRoot.close()
+      }
+      prevRoot = batchRoot
+    }
+
+    val vectors = getVectors(batchRoot)
+    prevVectors = vectors
+
+    val batch = new ColumnarBatch(vectors)
+    batch.setNumRows(batchRoot.getRowCount)
+    batch
+  }
+
+  override def loadBatch(): Boolean = {
+    if (rowCount > 0 && currentRowIdx < rowCount) {
+      true
+    } else {
+      val loaded = super.loadBatch()
+      currentRowIdx = 0
+      loaded
+    }
+  }
+
+  protected override def getRoot: VectorSchemaRoot = {
+    val remainingRows = rowCount - currentRowIdx
+    val rootSlice = if (remainingRows > arrowMaxRecordsPerOutputBatch) {
+      root.slice(currentRowIdx, arrowMaxRecordsPerOutputBatch)
+    } else {
+      root
+    }
+
+    currentRowIdx = currentRowIdx + rootSlice.getRowCount
+
+    rootSlice
+  }
+
+  protected override def getVectors(root: VectorSchemaRoot): Array[ColumnVector] = {
+    root.getFieldVectors.asScala.map { vector =>
+      new ArrowColumnVector(vector)
+    }.toArray[ColumnVector]
+  }
+
+  override def close(): Unit = {
+    if (prevRoot != null) {
+      prevVectors.foreach(_.close())
+      prevRoot.close()
+    }
+  }
+}
```
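
For intuition only, here is a hedged PyArrow sketch of the slicing idea that `SliceArrowOutputProcessorImpl` implements on the JVM side with `VectorSchemaRoot.slice`: the complete batch is still materialized, and zero-copy slices of at most `limit` rows are handed out one at a time. The function name and data below are assumptions for illustration, not part of this change.

```python
import pyarrow as pa

def sliced_batches(batch: pa.RecordBatch, limit: int):
    # No positive limit: hand the whole batch downstream unchanged.
    if limit <= 0:
        yield batch
        return
    offset = 0
    while offset < batch.num_rows:
        # slice() is zero-copy: the slices share the parent batch's buffers.
        yield batch.slice(offset, min(limit, batch.num_rows - offset))
        offset += limit

full = pa.record_batch([pa.array(range(8))], names=["v"])
assert [b.num_rows for b in sliced_batches(full, 3)] == [3, 3, 2]
```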
