
Commit 4442c54

Merge branch 'main' into case_reduce_filtering

2 parents: f22e7bc + 35b2e35

File tree

21 files changed: +405 -283 lines

.github/workflows/rust.yml

Lines changed: 13 additions & 0 deletions
@@ -353,6 +353,19 @@ jobs:
         with:
           save-if: ${{ github.ref_name == 'main' }}
           shared-key: "amd-ci-linux-test-example"
+      - name: Remove unnecessary preinstalled software
+        run: |
+          echo "Disk space before cleanup:"
+          df -h
+          apt-get clean
+          rm -rf /__t/CodeQL
+          rm -rf /__t/PyPy
+          rm -rf /__t/Java_Temurin-Hotspot_jdk
+          rm -rf /__t/Python
+          rm -rf /__t/go
+          rm -rf /__t/Ruby
+          echo "Disk space after cleanup:"
+          df -h
       - name: Run examples
         run: |
           # test datafusion-sql examples

Cargo.lock

Lines changed: 13 additions & 0 deletions
Some generated files are not rendered by default.

benchmarks/src/imdb/run.rs

Lines changed: 1 addition & 1 deletion
@@ -534,7 +534,7 @@ mod tests {
         let plan = ctx.sql(&query).await?;
         let plan = plan.into_optimized_plan()?;
         let bytes = logical_plan_to_bytes(&plan)?;
-        let plan2 = logical_plan_from_bytes(&bytes, &ctx)?;
+        let plan2 = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
         let plan_formatted = format!("{}", plan.display_indent());
         let plan2_formatted = format!("{}", plan2.display_indent());
         assert_eq!(plan_formatted, plan2_formatted);

benchmarks/src/tpch/run.rs

Lines changed: 1 addition & 1 deletion
@@ -387,7 +387,7 @@ mod tests {
         let plan = ctx.sql(&query).await?;
         let plan = plan.into_optimized_plan()?;
         let bytes = logical_plan_to_bytes(&plan)?;
-        let plan2 = logical_plan_from_bytes(&bytes, &ctx)?;
+        let plan2 = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
         let plan_formatted = format!("{}", plan.display_indent());
         let plan2_formatted = format!("{}", plan2.display_indent());
         assert_eq!(plan_formatted, plan2_formatted);
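Both benchmark tests change the same way: `logical_plan_from_bytes` now takes the task context rather than the whole `SessionContext`. A minimal round-trip sketch under that post-change signature; the `t` table and `data.csv` path are illustrative, not from this commit:

    use datafusion::prelude::{CsvReadOptions, SessionContext};
    use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes};

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        let ctx = SessionContext::new();
        // illustrative table registration; `data.csv` is a placeholder path
        ctx.register_csv("t", "data.csv", CsvReadOptions::new()).await?;

        let plan = ctx.sql("SELECT * FROM t").await?.into_optimized_plan()?;
        let bytes = logical_plan_to_bytes(&plan)?;

        // post-change: deserialization needs only the task context
        let plan2 = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
        assert_eq!(
            format!("{}", plan.display_indent()),
            format!("{}", plan2.display_indent())
        );
        Ok(())
    }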

datafusion/common/src/lib.rs

Lines changed: 6 additions & 0 deletions
@@ -108,6 +108,12 @@ pub use error::{
 // The HashMap and HashSet implementations that should be used as the uniform defaults
 pub type HashMap<K, V, S = DefaultHashBuilder> = hashbrown::HashMap<K, V, S>;
 pub type HashSet<T, S = DefaultHashBuilder> = hashbrown::HashSet<T, S>;
+pub mod hash_map {
+    pub use hashbrown::hash_map::Entry;
+}
+pub mod hash_set {
+    pub use hashbrown::hash_set::Entry;
+}
 
 /// Downcast an Arrow Array to a concrete type, return an `DataFusionError::Internal` if the cast is
 /// not possible. In normal usage of DataFusion the downcast should always succeed.
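The new `hash_map` / `hash_set` modules re-export hashbrown's `Entry` types so downstream crates can use the entry API through `datafusion-common` instead of depending on hashbrown directly. A minimal usage sketch against the re-export; the `count` helper is illustrative, not part of the change:

    use datafusion_common::hash_map::Entry;
    use datafusion_common::HashMap;

    /// Count occurrences via the re-exported hashbrown entry API
    fn count<'a>(words: &[&'a str]) -> HashMap<&'a str, usize> {
        let mut counts = HashMap::default();
        for &word in words {
            match counts.entry(word) {
                Entry::Occupied(mut e) => *e.get_mut() += 1,
                Entry::Vacant(e) => {
                    e.insert(1);
                }
            }
        }
        counts
    }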

datafusion/functions-aggregate/src/min_max/min_max_bytes.rs

Lines changed: 20 additions & 30 deletions
@@ -20,7 +20,8 @@ use arrow::array::{
     LargeBinaryBuilder, LargeStringBuilder, StringBuilder, StringViewBuilder,
 };
 use arrow::datatypes::DataType;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::hash_map::Entry;
+use datafusion_common::{internal_err, HashMap, Result};
 use datafusion_expr::{EmitTo, GroupsAccumulator};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::apply_filter_as_nulls;
 use std::mem::size_of;
@@ -391,14 +392,6 @@ struct MinMaxBytesState {
     total_data_bytes: usize,
 }
 
-#[derive(Debug, Clone, Copy)]
-enum MinMaxLocation<'a> {
-    /// the min/max value is stored in the existing `min_max` array
-    ExistingMinMax,
-    /// the min/max value is stored in the input array at the given index
-    Input(&'a [u8]),
-}
-
 /// Implement the MinMaxBytesAccumulator with a comparison function
 /// for comparing strings
 impl MinMaxBytesState {
@@ -450,7 +443,7 @@ impl MinMaxBytesState {
         // Minimize value copies by calculating the new min/maxes for each group
         // in this batch (either the existing min/max or the new input value)
         // and updating the owned values in `self.min_maxes` at most once
-        let mut locations = vec![MinMaxLocation::ExistingMinMax; total_num_groups];
+        let mut locations = HashMap::<usize, &[u8]>::with_capacity(group_indices.len());
 
         // Figure out the new min value for each group
         for (new_val, group_index) in iter.into_iter().zip(group_indices.iter()) {
@@ -459,32 +452,29 @@ impl MinMaxBytesState {
                 continue; // skip nulls
             };
 
-            let existing_val = match locations[group_index] {
-                // previous input value was the min/max, so compare it
-                MinMaxLocation::Input(existing_val) => existing_val,
-                MinMaxLocation::ExistingMinMax => {
-                    let Some(existing_val) = self.min_max[group_index].as_ref() else {
-                        // no existing min/max, so this is the new min/max
-                        locations[group_index] = MinMaxLocation::Input(new_val);
-                        continue;
-                    };
-                    existing_val.as_ref()
+            match locations.entry(group_index) {
+                Entry::Occupied(mut occupied_entry) => {
+                    if cmp(new_val, occupied_entry.get()) {
+                        occupied_entry.insert(new_val);
+                    }
+                }
+                Entry::Vacant(vacant_entry) => {
+                    if let Some(old_val) = self.min_max[group_index].as_ref() {
+                        if cmp(new_val, old_val) {
+                            vacant_entry.insert(new_val);
+                        }
+                    } else {
+                        vacant_entry.insert(new_val);
+                    }
                 }
             };
-
-            // Compare the new value to the existing value, replacing if necessary
-            if cmp(new_val, existing_val) {
-                locations[group_index] = MinMaxLocation::Input(new_val);
-            }
         }
 
         // Update self.min_max with any new min/max values we found in the input
-        for (group_index, location) in locations.iter().enumerate() {
-            match location {
-                MinMaxLocation::ExistingMinMax => {}
-                MinMaxLocation::Input(new_val) => self.set_value(group_index, new_val),
-            }
+        for (group_index, location) in locations.iter() {
+            self.set_value(*group_index, location);
         }
+
         Ok(())
     }
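The rewrite replaces a dense `Vec` sized to `total_num_groups` with a map keyed only by the group indices the batch actually touches, so per-batch allocation scales with batch size instead of total group count. A self-contained sketch of that pattern for a byte-slice minimum; `batch_minima` and its names are illustrative, not the accumulator's API:

    use std::collections::hash_map::Entry;
    use std::collections::HashMap;

    /// For one batch, find the candidate minimum per touched group.
    /// Memory is proportional to the groups seen in this batch, not to
    /// the total number of groups the accumulator tracks.
    fn batch_minima<'a>(
        values: &[&'a [u8]],
        group_indices: &[usize],
    ) -> HashMap<usize, &'a [u8]> {
        let mut locations: HashMap<usize, &'a [u8]> =
            HashMap::with_capacity(group_indices.len());
        for (&val, &group) in values.iter().zip(group_indices) {
            match locations.entry(group) {
                // group already seen in this batch: keep the smaller value
                Entry::Occupied(mut e) => {
                    if val < *e.get() {
                        e.insert(val);
                    }
                }
                // first time this batch touches the group
                Entry::Vacant(e) => {
                    e.insert(val);
                }
            }
        }
        locations
    }

    fn main() {
        let vals: Vec<&[u8]> = vec![b"b", b"a", b"c"];
        let groups = vec![0, 0, 7];
        let minima = batch_minima(&vals, &groups);
        assert_eq!(minima[&0], b"a".as_slice()); // min of "b" and "a"
        assert_eq!(minima[&7], b"c".as_slice());
        println!("{} groups touched", minima.len());
    }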

datafusion/functions-nested/src/set_ops.rs

Lines changed: 53 additions & 10 deletions
@@ -29,9 +29,7 @@ use arrow::datatypes::{DataType, Field, FieldRef};
 use arrow::row::{RowConverter, SortField};
 use datafusion_common::cast::{as_large_list_array, as_list_array};
 use datafusion_common::utils::ListCoercion;
-use datafusion_common::{
-    exec_err, internal_err, plan_err, utils::take_function_args, Result,
-};
+use datafusion_common::{exec_err, internal_err, utils::take_function_args, Result};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
 };
@@ -289,13 +287,7 @@ impl ScalarUDFImpl for ArrayDistinct {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match &arg_types[0] {
-            List(field) => Ok(DataType::new_list(field.data_type().clone(), true)),
-            LargeList(field) => {
-                Ok(DataType::new_large_list(field.data_type().clone(), true))
-            }
-            arg_type => plan_err!("{} does not support type {arg_type}", self.name()),
-        }
+        Ok(arg_types[0].clone())
     }
 
     fn invoke_with_args(
@@ -563,3 +555,54 @@ fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
         array.nulls().cloned(),
     )?))
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::{
+        array::{Int32Array, ListArray},
+        buffer::OffsetBuffer,
+        datatypes::{DataType, Field},
+    };
+    use datafusion_common::{config::ConfigOptions, DataFusionError};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+
+    use crate::set_ops::array_distinct_udf;
+
+    #[test]
+    fn test_array_distinct_inner_nullability_result_type_match_return_type(
+    ) -> Result<(), DataFusionError> {
+        let udf = array_distinct_udf();
+
+        for inner_nullable in [true, false] {
+            let inner_field = Field::new_list_field(DataType::Int32, inner_nullable);
+            let input_field =
+                Field::new_list("input", Arc::new(inner_field.clone()), true);
+
+            // [[1, 1, 2]]
+            let input_array = ListArray::new(
+                inner_field.into(),
+                OffsetBuffer::new(vec![0, 3].into()),
+                Arc::new(Int32Array::new(vec![1, 1, 2].into(), None)),
+                None,
+            );
+
+            let input_array = ColumnarValue::Array(Arc::new(input_array));
+
+            let result = udf.invoke_with_args(ScalarFunctionArgs {
+                args: vec![input_array],
+                arg_fields: vec![input_field.clone().into()],
+                number_rows: 1,
+                return_field: input_field.clone().into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })?;
+
+            assert_eq!(
+                result.data_type(),
+                udf.return_type(&[input_field.data_type().clone()])?
+            );
+        }
+        Ok(())
+    }
+}
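The `return_type` fix makes `array_distinct` echo its argument type back instead of rebuilding a list type with a hard-coded nullable inner field, and the new test pins the planned type to the runtime type. A small sketch of the mismatch the old code produced, using plain arrow type constructors; `old_return_type` is an illustrative stand-in for the removed branch:

    use arrow::datatypes::DataType;

    /// What the old implementation did: rebuild the list type and
    /// force the inner field to be nullable.
    fn old_return_type(arg: &DataType) -> DataType {
        match arg {
            DataType::List(field) => DataType::new_list(field.data_type().clone(), true),
            other => other.clone(),
        }
    }

    fn main() {
        // a list whose items are declared non-nullable
        let input = DataType::new_list(DataType::Int32, false);

        // old behavior: inner nullability flipped to `true`, so the planned
        // type disagreed with the array the kernel actually returned
        assert_ne!(old_return_type(&input), input);
        // new behavior is `Ok(arg_types[0].clone())`, which matches by construction
    }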

datafusion/optimizer/src/replace_distinct_aggregate.rs

Lines changed: 39 additions & 4 deletions
@@ -25,7 +25,7 @@ use datafusion_common::tree_node::Transformed;
 use datafusion_common::{Column, Result};
 use datafusion_expr::expr_rewriter::normalize_cols;
 use datafusion_expr::utils::expand_wildcard;
-use datafusion_expr::{col, ExprFunctionExt, LogicalPlanBuilder};
+use datafusion_expr::{col, lit, ExprFunctionExt, Limit, LogicalPlanBuilder};
 use datafusion_expr::{Aggregate, Distinct, DistinctOn, Expr, LogicalPlan};
 
 /// Optimizer that replaces logical [[Distinct]] with a logical [[Aggregate]]
@@ -54,6 +54,17 @@ use datafusion_expr::{Aggregate, Distinct, DistinctOn, Expr, LogicalPlan};
 /// )
 /// ORDER BY a DESC
 /// ```
+///
+/// In case there are no columns, the [[Distinct]] is replaced by a [[Limit]]
+///
+/// ```text
+/// SELECT DISTINCT * FROM empty_table
+/// ```
+///
+/// Into
+/// ```text
+/// SELECT * FROM empty_table LIMIT 1
+/// ```
 #[derive(Default, Debug)]
 pub struct ReplaceDistinctWithAggregate {}
 
@@ -78,6 +89,16 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
             LogicalPlan::Distinct(Distinct::All(input)) => {
                 let group_expr = expand_wildcard(input.schema(), &input, None)?;
 
+                if group_expr.is_empty() {
+                    // Special case: there are no columns to group by, so we can't replace it by a group by
+                    // however, we can replace it by LIMIT 1 because there is either no output or a single empty row
+                    return Ok(Transformed::yes(LogicalPlan::Limit(Limit {
+                        skip: None,
+                        fetch: Some(Box::new(lit(1i64))),
+                        input,
+                    })));
+                }
+
                 let field_count = input.schema().fields().len();
                 for dep in input.schema().functional_dependencies().iter() {
                     // If distinct is exactly the same with a previous GROUP BY, we can
@@ -184,15 +205,17 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::replace_distinct_aggregate::ReplaceDistinctWithAggregate;
     use crate::test::*;
+    use arrow::datatypes::{Fields, Schema};
+    use std::sync::Arc;
 
     use crate::OptimizerContext;
     use datafusion_common::Result;
-    use datafusion_expr::{col, logical_plan::builder::LogicalPlanBuilder, Expr};
+    use datafusion_expr::{
+        col, logical_plan::builder::LogicalPlanBuilder, table_scan, Expr,
+    };
     use datafusion_functions_aggregate::sum::sum;
 
     macro_rules! assert_optimized_plan_equal {
@@ -274,4 +297,16 @@ mod tests {
         TableScan: test
         ")
     }
+
+    #[test]
+    fn use_limit_1_when_no_columns() -> Result<()> {
+        let plan = table_scan(Some("test"), &Schema::new(Fields::empty()), None)?
+            .distinct()?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Limit: skip=0, fetch=1
+        TableScan: test
+        ")
+    }
 }
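The zero-column special case is sound because every row of a zero-column relation is the same empty tuple, so `DISTINCT` yields at most one row and `LIMIT 1` is equivalent. A hedged sketch of driving the rule directly, assuming the derived `Default` constructor and the `OptimizerRule::rewrite` entry point:

    use arrow::datatypes::{Fields, Schema};
    use datafusion_expr::table_scan;
    use datafusion_optimizer::replace_distinct_aggregate::ReplaceDistinctWithAggregate;
    use datafusion_optimizer::{OptimizerContext, OptimizerRule};

    fn main() -> datafusion_common::Result<()> {
        // DISTINCT over a relation with no columns...
        let plan = table_scan(Some("test"), &Schema::new(Fields::empty()), None)?
            .distinct()?
            .build()?;

        // ...should rewrite to LIMIT 1 rather than an empty GROUP BY
        let rewritten = ReplaceDistinctWithAggregate::default()
            .rewrite(plan, &OptimizerContext::new())?;
        println!("{}", rewritten.data.display_indent());
        // expected:
        //   Limit: skip=0, fetch=1
        //     TableScan: test
        Ok(())
    }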

datafusion/proto/Cargo.toml

Lines changed: 19 additions & 3 deletions
@@ -40,15 +40,31 @@ name = "datafusion_proto"
 [features]
 default = ["parquet"]
 json = ["pbjson", "serde", "serde_json", "datafusion-proto-common/json"]
-parquet = ["datafusion/parquet", "datafusion-common/parquet"]
-avro = ["datafusion/avro", "datafusion-common/avro"]
+parquet = ["datafusion-datasource-parquet", "datafusion-common/parquet", "datafusion/parquet"]
+avro = ["datafusion-datasource-avro", "datafusion-common/avro"]
+
+# Note to developers: do *not* add `datafusion` as a dependency in
+# this crate. See https://github.com/apache/datafusion/issues/17713
+# for additional information.
 
 [dependencies]
 arrow = { workspace = true }
 chrono = { workspace = true }
-datafusion = { workspace = true, default-features = false }
+datafusion-catalog = { workspace = true }
+datafusion-catalog-listing = { workspace = true }
 datafusion-common = { workspace = true }
+datafusion-datasource = { workspace = true }
+datafusion-datasource-arrow = { workspace = true }
+datafusion-datasource-avro = { workspace = true, optional = true }
+datafusion-datasource-csv = { workspace = true }
+datafusion-datasource-json = { workspace = true }
+datafusion-datasource-parquet = { workspace = true, optional = true }
+datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-functions-table = { workspace = true }
+datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
+datafusion-physical-plan = { workspace = true }
 datafusion-proto-common = { workspace = true }
 object_store = { workspace = true }
 pbjson = { workspace = true, optional = true }