ARROW-11616: [Rust][DataFusion] Add collect_partitioned on DataFrame
The DataFrame API has a `collect` method which invokes the `collect(plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>>` function, which collects all records into a single vector of `RecordBatch`es, removing any partitioning via `MergeExec`.

This PR adds the DataFrame `collect_partitioned` method so that partitioning can be maintained. This allows the partitioned result to be passed easily into a new `MemTable`.
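The difference between the two methods can be pictured with a small stand-alone sketch. This is a toy model, not DataFusion code: plain `Vec<i32>` partitions stand in for each partition's `Vec<RecordBatch>`, and the two functions mimic only the shapes of the results.

```rust
// Toy model: each inner Vec stands in for one partition's Vec<RecordBatch>.
fn collect_partitioned(partitions: Vec<Vec<i32>>) -> Vec<Vec<i32>> {
    // Partition boundaries are preserved as-is.
    partitions
}

// Toy counterpart of `collect`: like MergeExec, it merges every partition
// into a single flat vector, losing the partition boundaries.
fn collect(partitions: Vec<Vec<i32>>) -> Vec<i32> {
    partitions.into_iter().flatten().collect()
}

fn main() {
    let parts = vec![vec![1, 2], vec![3], vec![4, 5]];
    assert_eq!(collect_partitioned(parts.clone()).len(), 3); // three partitions kept
    assert_eq!(collect(parts), vec![1, 2, 3, 4, 5]); // merged into one vector
}
```

The key point is purely one of result shape: `Vec<Vec<RecordBatch>>` keeps one inner vector per input partition, while `Vec<RecordBatch>` has been merged.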

@andygrove

Closes apache#9485 from seddonm1/expose_collect_partitioned

Authored-by: Mike Seddon <[email protected]>
Signed-off-by: Jorge C. Leitao <[email protected]>
seddonm1 authored and jorgecarleitao committed Feb 16, 2021
1 parent ab30736 commit 96dbeec
Showing 2 changed files with 31 additions and 2 deletions.
16 changes: 16 additions & 0 deletions rust/datafusion/src/dataframe.rs
@@ -206,6 +206,22 @@ pub trait DataFrame: Send + Sync {
/// ```
async fn collect(&self) -> Result<Vec<RecordBatch>>;

/// Executes this DataFrame and collects all results into a vector of vectors of `RecordBatch`,
/// maintaining the input partitioning.
///
/// ```
/// # use datafusion::prelude::*;
/// # use datafusion::error::Result;
/// # #[tokio::main]
/// # async fn main() -> Result<()> {
/// let mut ctx = ExecutionContext::new();
/// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
/// let batches = df.collect_partitioned().await?;
/// # Ok(())
/// # }
/// ```
async fn collect_partitioned(&self) -> Result<Vec<Vec<RecordBatch>>>;

/// Returns the schema describing the output of this DataFrame in terms of columns returned,
/// where each column has a name, data type, and nullability attribute.
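The `MemTable` hand-off mentioned in the description can also be sketched stand-alone. `ToyMemTable` and its `try_new` below are illustrative stand-ins for DataFusion's `MemTable` (whose real constructor also takes a schema), and `i32` again stands in for `RecordBatch`; the point is that the `Vec<Vec<_>>` shape returned by `collect_partitioned` is exactly what a partition-oriented in-memory table wants to ingest.

```rust
// Illustrative stand-in for a partition-oriented in-memory table: it stores
// data partition-by-partition, so the Vec<Vec<_>> shape produced by
// collect_partitioned feeds into it directly, with no re-partitioning.
struct ToyMemTable {
    partitions: Vec<Vec<i32>>,
}

impl ToyMemTable {
    // Mirrors the shape (not the signature) of a MemTable constructor:
    // the partitioned result is accepted as-is.
    fn try_new(partitions: Vec<Vec<i32>>) -> Result<Self, String> {
        if partitions.is_empty() {
            return Err("at least one partition is required".to_string());
        }
        Ok(ToyMemTable { partitions })
    }

    fn partition_count(&self) -> usize {
        self.partitions.len()
    }
}

fn main() {
    // Pretend this came from a partitioned collect of a two-partition plan.
    let collected = vec![vec![1, 2], vec![3, 4]];
    let table = ToyMemTable::try_new(collected).unwrap();
    assert_eq!(table.partition_count(), 2); // partitioning survived the round trip
}
```

Had the plain `collect` been used instead, the partition count would always collapse to one.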
17 changes: 15 additions & 2 deletions rust/datafusion/src/execution/dataframe_impl.rs
@@ -19,14 +19,17 @@
use std::sync::{Arc, Mutex};

-use crate::dataframe::*;
+use crate::arrow::record_batch::RecordBatch;
 use crate::error::Result;
 use crate::execution::context::{ExecutionContext, ExecutionContextState};
 use crate::logical_plan::{
     col, DFSchema, Expr, FunctionRegistry, JoinType, LogicalPlan, LogicalPlanBuilder,
     Partitioning,
 };
-use crate::{arrow::record_batch::RecordBatch, physical_plan::collect};
+use crate::{
+    dataframe::*,
+    physical_plan::{collect, collect_partitioned},
+};

use async_trait::async_trait;

@@ -137,6 +140,16 @@ impl DataFrame for DataFrameImpl {
Ok(collect(plan).await?)
}

// Convert the logical plan represented by this DataFrame into a physical plan and
// execute it
async fn collect_partitioned(&self) -> Result<Vec<Vec<RecordBatch>>> {
let state = self.ctx_state.lock().unwrap().clone();
let ctx = ExecutionContext::from(Arc::new(Mutex::new(state)));
let plan = ctx.optimize(&self.plan)?;
let plan = ctx.create_physical_plan(&plan)?;
Ok(collect_partitioned(plan).await?)
}

/// Returns the schema from the logical plan
fn schema(&self) -> &DFSchema {
self.plan.schema()
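Under the hood, the `physical_plan::collect_partitioned` helper invoked above can be pictured as draining each output partition independently and keeping the per-partition results separate. The sketch below is a stand-alone model of that idea, not DataFusion's implementation: boxed closures stand in for the per-partition record-batch streams an `ExecutionPlan` would produce.

```rust
// Each boxed closure stands in for one partition's stream of batches.
fn collect_partitioned_model(streams: Vec<Box<dyn Fn() -> Vec<i32>>>) -> Vec<Vec<i32>> {
    // Drain every partition independently; results stay separated per partition,
    // so the outer Vec has exactly one entry per input partition.
    streams.iter().map(|stream| stream()).collect()
}

fn main() {
    let streams: Vec<Box<dyn Fn() -> Vec<i32>>> =
        vec![Box::new(|| vec![1, 2]), Box::new(|| vec![3])];
    assert_eq!(collect_partitioned_model(streams), vec![vec![1, 2], vec![3]]);
}
```

In this model, the plain `collect` path corresponds to running the same loop and then flattening the outer vector, which is exactly the step this PR makes optional.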
