From c0dbb911d1c4ddf85d55f3c1c22f07a72f8c280e Mon Sep 17 00:00:00 2001 From: Andrew Gazelka Date: Wed, 20 Nov 2024 16:31:25 -0800 Subject: [PATCH] [FEAT]: connect: `df.distinct()` --- .../src/translation/logical_plan.rs | 10 +++-- .../translation/logical_plan/deduplicate.rs | 41 +++++++++++++++++++ tests/connect/test_distinct.py | 21 ++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 src/daft-connect/src/translation/logical_plan/deduplicate.rs create mode 100644 tests/connect/test_distinct.py diff --git a/src/daft-connect/src/translation/logical_plan.rs b/src/daft-connect/src/translation/logical_plan.rs index 152a6f9510..c4858a6ce8 100644 --- a/src/daft-connect/src/translation/logical_plan.rs +++ b/src/daft-connect/src/translation/logical_plan.rs @@ -5,12 +5,13 @@ use spark_connect::{relation::RelType, Limit, Relation}; use tracing::warn; use crate::translation::logical_plan::{ - aggregate::aggregate, drop::drop, filter::filter, local_relation::local_relation, - project::project, range::range, read::read, set_op::set_op, to_df::to_df, - with_columns::with_columns, + aggregate::aggregate, deduplicate::deduplicate, drop::drop, filter::filter, + local_relation::local_relation, project::project, range::range, read::read, set_op::set_op, + to_df::to_df, with_columns::with_columns, }; mod aggregate; +mod deduplicate; mod drop; mod filter; mod local_relation; @@ -87,6 +88,9 @@ pub async fn to_logical_plan(relation: Relation) -> eyre::Result { RelType::SetOp(s) => set_op(*s) .await .wrap_err("Failed to apply set_op to logical plan"), + RelType::Deduplicate(d) => deduplicate(*d) + .await + .wrap_err("Failed to apply deduplicate to logical plan"), plan => bail!("Unsupported relation type: {plan:?}"), } } diff --git a/src/daft-connect/src/translation/logical_plan/deduplicate.rs b/src/daft-connect/src/translation/logical_plan/deduplicate.rs new file mode 100644 index 0000000000..512b81620c --- /dev/null +++ b/src/daft-connect/src/translation/logical_plan/deduplicate.rs @@ -0,0 +1,41 @@ +use eyre::{bail, ensure, WrapErr}; +use tracing::warn; + +use crate::translation::{to_logical_plan, Plan}; + +pub async fn deduplicate(deduplicate: spark_connect::Deduplicate) -> eyre::Result { + let spark_connect::Deduplicate { + input, + column_names, + all_columns_as_keys, + within_watermark, + } = deduplicate; + + let Some(input) = input else { + bail!("Input is required"); + }; + + if !column_names.is_empty() { + warn!("Ignoring column_names: {column_names:?}; not yet implemented"); + } + + let all_columns_as_keys = all_columns_as_keys.unwrap_or(false); + + ensure!( + all_columns_as_keys, + "only implemented for all_columns_as_keys=true" + ); + + if let Some(within_watermark) = within_watermark { + warn!("Ignoring within_watermark: {within_watermark:?}; not yet implemented"); + } + + let mut plan = Box::pin(to_logical_plan(*input)).await?; + + plan.builder = plan + .builder + .distinct() + .wrap_err("Failed to apply distinct to logical plan")?; + + Ok(plan) +} diff --git a/tests/connect/test_distinct.py b/tests/connect/test_distinct.py new file mode 100644 index 0000000000..9e4b861d36 --- /dev/null +++ b/tests/connect/test_distinct.py @@ -0,0 +1,21 @@ +from __future__ import annotations + + +def test_distinct(spark_session): + # Create ranges using Spark - with overlap + range1 = spark_session.range(7) # Creates DataFrame with numbers 0 to 6 + range2 = spark_session.range(3, 10) # Creates DataFrame with numbers 3 to 9 + + # Union the two ranges and get distinct values + unioned = range1.union(range2).distinct() + + # Collect results + results = unioned.collect() + + # Verify the DataFrame has expected values + # Distinct removes duplicates, so length should be 10 (0-9) + assert len(results) == 10, "DataFrame should have 10 unique rows" + + # Check that all expected values are present, with no duplicates + values = [row.id for row in results] + assert sorted(values) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "Values should match expected sequence without duplicates"