-
Notifications
You must be signed in to change notification settings - Fork 175
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f28dfd4
commit 78418ca
Showing
3 changed files
with
68 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
src/daft-connect/src/translation/logical_plan/deduplicate.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
use eyre::{bail, ensure, WrapErr}; | ||
use tracing::warn; | ||
|
||
use crate::translation::{to_logical_plan, Plan}; | ||
|
||
pub async fn deduplicate(deduplicate: spark_connect::Deduplicate) -> eyre::Result<Plan> { | ||
let spark_connect::Deduplicate { | ||
input, | ||
column_names, | ||
all_columns_as_keys, | ||
within_watermark, | ||
} = deduplicate; | ||
|
||
let Some(input) = input else { | ||
bail!("Input is required"); | ||
}; | ||
|
||
if !column_names.is_empty() { | ||
warn!("Ignoring column_names: {column_names:?}; not yet implemented"); | ||
} | ||
|
||
let all_columns_as_keys = all_columns_as_keys.unwrap_or(false); | ||
|
||
ensure!( | ||
all_columns_as_keys, | ||
"only implemented for all_columns_as_keys=true" | ||
); | ||
|
||
if let Some(within_watermark) = within_watermark { | ||
warn!("Ignoring within_watermark: {within_watermark:?}; not yet implemented"); | ||
} | ||
|
||
let mut plan = Box::pin(to_logical_plan(*input)).await?; | ||
|
||
plan.builder = plan | ||
.builder | ||
.distinct() | ||
.wrap_err("Failed to apply distinct to logical plan")?; | ||
|
||
Ok(plan) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from __future__ import annotations | ||
|
||
|
||
def test_distinct(spark_session): | ||
# Create ranges using Spark - with overlap | ||
range1 = spark_session.range(7) # Creates DataFrame with numbers 0 to 6 | ||
range2 = spark_session.range(3, 10) # Creates DataFrame with numbers 3 to 9 | ||
|
||
# Union the two ranges and get distinct values | ||
unioned = range1.union(range2).distinct() | ||
|
||
# Collect results | ||
results = unioned.collect() | ||
|
||
# Verify the DataFrame has expected values | ||
# Distinct removes duplicates, so length should be 10 (0-9) | ||
assert len(results) == 10, "DataFrame should have 10 unique rows" | ||
|
||
# Check that all expected values are present, with no duplicates | ||
values = [row.id for row in results] | ||
assert sorted(values) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "Values should match expected sequence without duplicates" |