Skip to content

Commit

Permalink
feat(python,rust,cli): support left and right anti/semi joins from the SQL interface (#11501)
Browse files Browse the repository at this point in the history

Co-authored-by: ritchie <[email protected]>
  • Loading branch information
alexander-beedie and ritchie46 authored Oct 4, 2023
1 parent fcdb244 commit 62ff49d
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 18 deletions.
78 changes: 65 additions & 13 deletions crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,38 @@ impl LazyFrame {
}
}

/// Performs a left anti join of this query against another lazy query.
///
/// `left_on` and `right_on` are each converted into a single key expression and
/// handed to [`join`](LazyFrame::join) together with `JoinType::Anti`. When one
/// key per side is not enough, use [`join`](LazyFrame::join) or
/// [`join_builder`](LazyFrame::join_builder) directly.
///
/// # Example
///
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
/// fn anti_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
///     ldf.anti_join(other, col("foo"), col("bar").cast(DataType::Utf8))
/// }
/// ```
#[cfg(feature = "semi_anti_join")]
pub fn anti_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame {
    // One key expression per side; `join` receives them as one-element arrays.
    let left_key: Expr = left_on.into();
    let right_key: Expr = right_on.into();
    let args = JoinArgs::new(JoinType::Anti);
    self.join(other, [left_key], [right_key], args)
}

/// Builds the cartesian product of this query and `other`, preserving the order
/// of the left keys.
///
/// Delegates to [`join`](LazyFrame::join) with empty key lists on both sides and
/// `JoinType::Cross`.
#[cfg(feature = "cross_join")]
pub fn cross_join(self, other: LazyFrame) -> LazyFrame {
    // A cross join has no join keys, so both key lists stay empty.
    let args = JoinArgs::new(JoinType::Cross);
    self.join(other, Vec::new(), Vec::new(), args)
}

/// Left join this query with another lazy query.
///
/// Matches on the values of the expressions `left_on` and `right_on`. For more
Expand All @@ -1011,7 +1043,7 @@ impl LazyFrame {
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
/// fn join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
/// fn left_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
/// ldf
/// .left_join(other, col("foo"), col("bar"))
/// }
Expand All @@ -1025,6 +1057,31 @@ impl LazyFrame {
)
}

/// Performs an inner join of this query against another lazy query.
///
/// `left_on` and `right_on` are each converted into a single key expression and
/// handed to [`join`](LazyFrame::join) together with `JoinType::Inner`. When one
/// key per side is not enough, use [`join`](LazyFrame::join) or
/// [`join_builder`](LazyFrame::join_builder) directly.
///
/// # Example
///
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
/// fn inner_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
///     ldf.inner_join(other, col("foo"), col("bar").cast(DataType::Utf8))
/// }
/// ```
pub fn inner_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame {
    // One key expression per side; `join` receives them as one-element arrays.
    let left_key: Expr = left_on.into();
    let right_key: Expr = right_on.into();
    let args = JoinArgs::new(JoinType::Inner);
    self.join(other, [left_key], [right_key], args)
}

/// Outer join this query with another lazy query.
///
/// Matches on the values of the expressions `left_on` and `right_on`. For more
Expand All @@ -1036,7 +1093,7 @@ impl LazyFrame {
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
/// fn join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
/// fn outer_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
/// ldf
/// .outer_join(other, col("foo"), col("bar"))
/// }
Expand All @@ -1050,7 +1107,7 @@ impl LazyFrame {
)
}

/// Inner join this query with another lazy query.
/// Left semi join this query with another lazy query.
///
/// Matches on the values of the expressions `left_on` and `right_on`. For more
/// flexible join logic, see [`join`](LazyFrame::join) or
Expand All @@ -1061,26 +1118,21 @@ impl LazyFrame {
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
/// fn join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
/// fn semi_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
/// ldf
/// .inner_join(other, col("foo"), col("bar").cast(DataType::Utf8))
/// .semi_join(other, col("foo"), col("bar").cast(DataType::Utf8))
/// }
/// ```
pub fn inner_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame {
#[cfg(feature = "semi_anti_join")]
pub fn semi_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame {
self.join(
other,
[left_on.into()],
[right_on.into()],
JoinArgs::new(JoinType::Inner),
JoinArgs::new(JoinType::Semi),
)
}

/// Creates the cartesian product from both frames, preserving the order of the left keys.
#[cfg(feature = "cross_join")]
pub fn cross_join(self, other: LazyFrame) -> LazyFrame {
self.join(other, vec![], vec![], JoinArgs::new(JoinType::Cross))
}

/// Generic function to join two LazyFrames.
///
/// `join` can join on multiple columns, given as two list of expressions, and with a
Expand Down
1 change: 1 addition & 0 deletions crates/polars-sql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ json = ["polars-lazy/json"]
default = []
ipc = ["polars-lazy/ipc"]
parquet = ["polars-lazy/parquet"]
semi_anti_join = ["polars-lazy/semi_anti_join"]
30 changes: 27 additions & 3 deletions crates/polars-sql/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,22 +271,46 @@ impl SQLContext {
for tbl in &tbl_expr.joins {
let (join_tbl_name, join_tbl) = self.get_table(&tbl.relation)?;
lf = match &tbl.join_operator {
JoinOperator::CrossJoin => lf.cross_join(join_tbl),
JoinOperator::FullOuter(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
lf.outer_join(join_tbl, left_on, right_on)
},
JoinOperator::Inner(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
lf.inner_join(join_tbl, left_on, right_on)
},
#[cfg(feature = "semi_anti_join")]
JoinOperator::LeftAnti(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
lf.anti_join(join_tbl, left_on, right_on)
},
JoinOperator::LeftOuter(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
lf.left_join(join_tbl, left_on, right_on)
},
JoinOperator::FullOuter(constraint) => {
#[cfg(feature = "semi_anti_join")]
JoinOperator::LeftSemi(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
lf.outer_join(join_tbl, left_on, right_on)
lf.semi_join(join_tbl, left_on, right_on)
},
#[cfg(feature = "semi_anti_join")]
JoinOperator::RightAnti(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
join_tbl.anti_join(lf, right_on, left_on)
},
#[cfg(feature = "semi_anti_join")]
JoinOperator::RightSemi(constraint) => {
let (left_on, right_on) =
process_join_constraint(constraint, &tbl_name, &join_tbl_name)?;
join_tbl.semi_join(lf, right_on, left_on)
},
JoinOperator::CrossJoin => lf.cross_join(join_tbl),
join_type => {
polars_bail!(
InvalidOperation:
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-sql/src/keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ pub fn all_keywords() -> Vec<&'static str> {
use sqlparser::keywords;
let sql_keywords = &[
keywords::AND,
keywords::ANTI,
keywords::ARRAY,
keywords::AS,
keywords::AS,
keywords::ASC,
keywords::BOOLEAN,
keywords::BY,
Expand Down Expand Up @@ -51,6 +51,7 @@ pub fn all_keywords() -> Vec<&'static str> {
keywords::OUTER,
keywords::RIGHT,
keywords::SELECT,
keywords::SEMI,
keywords::SHOW,
keywords::TABLE,
keywords::TABLES,
Expand Down
2 changes: 1 addition & 1 deletion crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ product = ["polars-core/product"]
unique_counts = ["polars-core/unique_counts", "polars-lazy?/unique_counts"]
log = ["polars-ops/log", "polars-lazy?/log"]
partition_by = ["polars-core/partition_by"]
semi_anti_join = ["polars-lazy?/semi_anti_join", "polars-ops/semi_anti_join"]
semi_anti_join = ["polars-lazy?/semi_anti_join", "polars-ops/semi_anti_join", "polars-sql?/semi_anti_join"]
list_eval = ["polars-lazy?/list_eval"]
cumulative_eval = ["polars-lazy?/cumulative_eval"]
chunked_ids = ["polars-lazy?/chunked_ids", "polars-core/chunked_ids", "polars-ops/chunked_ids"]
Expand Down
59 changes: 59 additions & 0 deletions py-polars/tests/unit/sql/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,65 @@ def test_sql_limit_offset() -> None:
assert len(out) == min(limit, n_values - offset)


def test_sql_join_anti_semi() -> None:
    """Check LEFT/RIGHT SEMI and ANTI joins issued through ``pl.SQLContext``.

    Each case selects everything from ``tbl_a``, chains two join clauses, and
    compares the eagerly executed result against the expected frame.
    """
    ctx = pl.SQLContext(
        {
            "tbl_a": pl.DataFrame(
                {"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}
            ),
            "tbl_b": pl.DataFrame(
                {"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}
            ),
            "tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
        },
        eager_execution=True,
    )

    # (SQL query, expected result columns), checked in this order.
    cases = [
        (
            "\nSELECT *\nFROM tbl_a\n"
            "LEFT SEMI JOIN tbl_b USING (b)\n"
            "LEFT SEMI JOIN tbl_c USING (c)\n",
            {"a": [1, 3], "b": [4, 6], "c": ["w", "z"]},
        ),
        (
            "\nSELECT *\nFROM tbl_a\n"
            "LEFT ANTI JOIN tbl_b USING (b)\n"
            "LEFT SEMI JOIN tbl_c USING (c)\n",
            {"a": [2], "b": [0], "c": ["y"]},
        ),
        (
            "\nSELECT *\nFROM tbl_a\n"
            "RIGHT ANTI JOIN tbl_b USING (b)\n"
            "LEFT SEMI JOIN tbl_c USING (c)\n",
            {"a": [2], "b": [5], "c": ["y"]},
        ),
        (
            "\nSELECT *\nFROM tbl_a\n"
            "RIGHT SEMI JOIN tbl_b USING (b)\n"
            "RIGHT SEMI JOIN tbl_c USING (c)\n",
            {"c": ["z"], "d": [25.5]},
        ),
        (
            "\nSELECT *\nFROM tbl_a\n"
            "RIGHT SEMI JOIN tbl_b USING (b)\n"
            "RIGHT ANTI JOIN tbl_c USING (c)\n",
            {"c": ["w", "y"], "d": [10.5, -50.0]},
        ),
    ]

    for query, expected in cases:
        result = ctx.execute(query)
        assert_frame_equal(pl.DataFrame(expected), result)


def test_sql_join_inner(foods_ipc_path: Path) -> None:
lf = pl.scan_ipc(foods_ipc_path)

Expand Down

0 comments on commit 62ff49d

Please sign in to comment.