From 62ff49da26d80f20e36182f3407684b856a3aec5 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 4 Oct 2023 16:04:04 +0400 Subject: [PATCH] feat(python,rust,cli): support left and right anti/semi joins from the SQL interface (#11501) Co-authored-by: ritchie --- crates/polars-lazy/src/frame/mod.rs | 78 +++++++++++++++++++++++----- crates/polars-sql/Cargo.toml | 1 + crates/polars-sql/src/context.rs | 30 +++++++++-- crates/polars-sql/src/keywords.rs | 3 +- crates/polars/Cargo.toml | 2 +- py-polars/tests/unit/sql/test_sql.py | 59 +++++++++++++++++++++ 6 files changed, 155 insertions(+), 18 deletions(-) diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index aada259d13f2..ca3978723188 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -1000,6 +1000,38 @@ impl LazyFrame { } } + /// Left anti join this query with another lazy query. + /// + /// Matches on the values of the expressions `left_on` and `right_on`. For more + /// flexible join logic, see [`join`](LazyFrame::join) or + /// [`join_builder`](LazyFrame::join_builder). + /// + /// # Example + /// + /// ```rust + /// use polars_core::prelude::*; + /// use polars_lazy::prelude::*; + /// fn anti_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { + /// ldf + /// .anti_join(other, col("foo"), col("bar").cast(DataType::Utf8)) + /// } + /// ``` + #[cfg(feature = "semi_anti_join")] + pub fn anti_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame { + self.join( + other, + [left_on.into()], + [right_on.into()], + JoinArgs::new(JoinType::Anti), + ) + } + + /// Creates the cartesian product from both frames, preserving the order of the left keys. + #[cfg(feature = "cross_join")] + pub fn cross_join(self, other: LazyFrame) -> LazyFrame { + self.join(other, vec![], vec![], JoinArgs::new(JoinType::Cross)) + } + /// Left join this query with another lazy query. 
/// /// Matches on the values of the expressions `left_on` and `right_on`. For more @@ -1011,7 +1043,7 @@ impl LazyFrame { /// ```rust /// use polars_core::prelude::*; /// use polars_lazy::prelude::*; - /// fn join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { + /// fn left_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { /// ldf /// .left_join(other, col("foo"), col("bar")) /// } @@ -1025,6 +1057,31 @@ impl LazyFrame { ) } + /// Inner join this query with another lazy query. + /// + /// Matches on the values of the expressions `left_on` and `right_on`. For more + /// flexible join logic, see [`join`](LazyFrame::join) or + /// [`join_builder`](LazyFrame::join_builder). + /// + /// # Example + /// + /// ```rust + /// use polars_core::prelude::*; + /// use polars_lazy::prelude::*; + /// fn inner_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { + /// ldf + /// .inner_join(other, col("foo"), col("bar").cast(DataType::Utf8)) + /// } + /// ``` + pub fn inner_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame { + self.join( + other, + [left_on.into()], + [right_on.into()], + JoinArgs::new(JoinType::Inner), + ) + } + /// Outer join this query with another lazy query. /// /// Matches on the values of the expressions `left_on` and `right_on`. For more @@ -1036,7 +1093,7 @@ impl LazyFrame { /// ```rust /// use polars_core::prelude::*; /// use polars_lazy::prelude::*; - /// fn join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { + /// fn outer_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { /// ldf /// .outer_join(other, col("foo"), col("bar")) /// } @@ -1050,7 +1107,7 @@ impl LazyFrame { ) } - /// Inner join this query with another lazy query. + /// Left semi join this query with another lazy query. /// /// Matches on the values of the expressions `left_on` and `right_on`. 
For more /// flexible join logic, see [`join`](LazyFrame::join) or @@ -1061,26 +1118,21 @@ impl LazyFrame { /// ```rust /// use polars_core::prelude::*; /// use polars_lazy::prelude::*; - /// fn join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { + /// fn semi_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { /// ldf - /// .inner_join(other, col("foo"), col("bar").cast(DataType::Utf8)) + /// .semi_join(other, col("foo"), col("bar").cast(DataType::Utf8)) /// } /// ``` - pub fn inner_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame { + #[cfg(feature = "semi_anti_join")] + pub fn semi_join<E: Into<Expr>>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame { self.join( other, [left_on.into()], [right_on.into()], - JoinArgs::new(JoinType::Inner), + JoinArgs::new(JoinType::Semi), ) } - /// Creates the cartesian product from both frames, preserving the order of the left keys. - #[cfg(feature = "cross_join")] - pub fn cross_join(self, other: LazyFrame) -> LazyFrame { - self.join(other, vec![], vec![], JoinArgs::new(JoinType::Cross)) - } - /// Generic function to join two LazyFrames. 
/// /// `join` can join on multiple columns, given as two list of expressions, and with a diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml index 09b0fbbbdc60..2595a45b410c 100644 --- a/crates/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -26,3 +26,4 @@ json = ["polars-lazy/json"] default = [] ipc = ["polars-lazy/ipc"] parquet = ["polars-lazy/parquet"] +semi_anti_join = ["polars-lazy/semi_anti_join"] diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 5f161084a2d3..fd038be62e3d 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -271,22 +271,46 @@ impl SQLContext { for tbl in &tbl_expr.joins { let (join_tbl_name, join_tbl) = self.get_table(&tbl.relation)?; lf = match &tbl.join_operator { + JoinOperator::CrossJoin => lf.cross_join(join_tbl), + JoinOperator::FullOuter(constraint) => { + let (left_on, right_on) = + process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; + lf.outer_join(join_tbl, left_on, right_on) + }, JoinOperator::Inner(constraint) => { let (left_on, right_on) = process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; lf.inner_join(join_tbl, left_on, right_on) }, + #[cfg(feature = "semi_anti_join")] + JoinOperator::LeftAnti(constraint) => { + let (left_on, right_on) = + process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; + lf.anti_join(join_tbl, left_on, right_on) + }, JoinOperator::LeftOuter(constraint) => { let (left_on, right_on) = process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; lf.left_join(join_tbl, left_on, right_on) }, - JoinOperator::FullOuter(constraint) => { + #[cfg(feature = "semi_anti_join")] + JoinOperator::LeftSemi(constraint) => { let (left_on, right_on) = process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; - lf.outer_join(join_tbl, left_on, right_on) + lf.semi_join(join_tbl, left_on, right_on) + }, + #[cfg(feature = "semi_anti_join")] + 
JoinOperator::RightAnti(constraint) => { + let (left_on, right_on) = + process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; + join_tbl.anti_join(lf, right_on, left_on) + }, + #[cfg(feature = "semi_anti_join")] + JoinOperator::RightSemi(constraint) => { + let (left_on, right_on) = + process_join_constraint(constraint, &tbl_name, &join_tbl_name)?; + join_tbl.semi_join(lf, right_on, left_on) }, - JoinOperator::CrossJoin => lf.cross_join(join_tbl), join_type => { polars_bail!( InvalidOperation: diff --git a/crates/polars-sql/src/keywords.rs b/crates/polars-sql/src/keywords.rs index 53e1044fd162..aea00fb54152 100644 --- a/crates/polars-sql/src/keywords.rs +++ b/crates/polars-sql/src/keywords.rs @@ -16,9 +16,9 @@ pub fn all_keywords() -> Vec<&'static str> { use sqlparser::keywords; let sql_keywords = &[ keywords::AND, + keywords::ANTI, keywords::ARRAY, keywords::AS, - keywords::AS, keywords::ASC, keywords::BOOLEAN, keywords::BY, @@ -51,6 +51,7 @@ pub fn all_keywords() -> Vec<&'static str> { keywords::OUTER, keywords::RIGHT, keywords::SELECT, + keywords::SEMI, keywords::SHOW, keywords::TABLE, keywords::TABLES, diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 7943cec18545..5976ea644dbe 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -159,7 +159,7 @@ product = ["polars-core/product"] unique_counts = ["polars-core/unique_counts", "polars-lazy?/unique_counts"] log = ["polars-ops/log", "polars-lazy?/log"] partition_by = ["polars-core/partition_by"] -semi_anti_join = ["polars-lazy?/semi_anti_join", "polars-ops/semi_anti_join"] +semi_anti_join = ["polars-lazy?/semi_anti_join", "polars-ops/semi_anti_join", "polars-sql?/semi_anti_join"] list_eval = ["polars-lazy?/list_eval"] cumulative_eval = ["polars-lazy?/cumulative_eval"] chunked_ids = ["polars-lazy?/chunked_ids", "polars-core/chunked_ids", "polars-ops/chunked_ids"] diff --git a/py-polars/tests/unit/sql/test_sql.py b/py-polars/tests/unit/sql/test_sql.py index 
ce888f97e984..ca29917029a5 100644 --- a/py-polars/tests/unit/sql/test_sql.py +++ b/py-polars/tests/unit/sql/test_sql.py @@ -540,6 +540,65 @@ def test_sql_limit_offset() -> None: assert len(out) == min(limit, n_values - offset) +def test_sql_join_anti_semi() -> None: + frames = { + "tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}), + "tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}), + "tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}), + } + c = pl.SQLContext(frames, eager_execution=True) + + out = c.execute( + """ + SELECT * + FROM tbl_a + LEFT SEMI JOIN tbl_b USING (b) + LEFT SEMI JOIN tbl_c USING (c) + """ + ) + assert_frame_equal(pl.DataFrame({"a": [1, 3], "b": [4, 6], "c": ["w", "z"]}), out) + + out = c.execute( + """ + SELECT * + FROM tbl_a + LEFT ANTI JOIN tbl_b USING (b) + LEFT SEMI JOIN tbl_c USING (c) + """ + ) + assert_frame_equal(pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}), out) + + out = c.execute( + """ + SELECT * + FROM tbl_a + RIGHT ANTI JOIN tbl_b USING (b) + LEFT SEMI JOIN tbl_c USING (c) + """ + ) + assert_frame_equal(pl.DataFrame({"a": [2], "b": [5], "c": ["y"]}), out) + + out = c.execute( + """ + SELECT * + FROM tbl_a + RIGHT SEMI JOIN tbl_b USING (b) + RIGHT SEMI JOIN tbl_c USING (c) + """ + ) + assert_frame_equal(pl.DataFrame({"c": ["z"], "d": [25.5]}), out) + + out = c.execute( + """ + SELECT * + FROM tbl_a + RIGHT SEMI JOIN tbl_b USING (b) + RIGHT ANTI JOIN tbl_c USING (c) + """ + ) + assert_frame_equal(pl.DataFrame({"c": ["w", "y"], "d": [10.5, -50.0]}), out) + + def test_sql_join_inner(foods_ipc_path: Path) -> None: lf = pl.scan_ipc(foods_ipc_path)