Skip to content

Commit

Permalink
fixing compare matching rows for spark compares and SF compare (#378)
Browse files Browse the repository at this point in the history
* fixing compare matching rows for spark compares and SF compare

* adding new test case

* bump version to 0.16.2

---------

Co-authored-by: Faisal <[email protected]>
  • Loading branch information
rhaffar and fdosani authored Feb 12, 2025
1 parent e80fb12 commit 966b193
Show file tree
Hide file tree
Showing 13 changed files with 78 additions and 4 deletions.
2 changes: 1 addition & 1 deletion datacompy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
Then extended to carry that functionality over to Spark Dataframes.
"""

__version__ = "0.16.1"
__version__ = "0.16.2"

import platform
from warnings import warn
Expand Down
2 changes: 1 addition & 1 deletion datacompy/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ def count_matching_rows(self) -> int:
" and ".join(conditions)
).count()
else:
match_columns_count = 0
match_columns_count = self.intersect_rows.count()
return match_columns_count

def intersect_rows_match(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion datacompy/spark/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ def count_matching_rows(self) -> int:
.shape[0]
)
else:
match_columns_count = 0
match_columns_count = self.intersect_rows.shape[0]
return match_columns_count

def intersect_rows_match(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion datacompy/spark/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ def count_matching_rows(self) -> int:
" and ".join(conditions)
).count()
else:
match_columns_count = 0
match_columns_count = self.intersect_rows.count()
return match_columns_count

def intersect_rows_match(self) -> bool:
Expand Down
7 changes: 7 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,13 @@ def test_index_with_joins_with_ignore_case():
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches():
df1 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = datacompy.Compare(df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns():
df1 = pd.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
df2 = pd.DataFrame([{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}])
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ def test_count_matching_rows_duckdb(count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
df1,
df1_copy,
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
df1,
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_fugue_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,15 @@ def test_count_matching_rows_native(count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
count_matching_rows_df[0],
count_matching_rows_df[0].copy(),
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
count_matching_rows_df[0],
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_fugue_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ def test_count_matching_rows_polars(count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
df1,
df1.clone(),
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
df1,
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_fugue_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,15 @@ def test_count_matching_rows_spark(spark_session, count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
df1,
df1_copy,
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
df1,
Expand Down
7 changes: 7 additions & 0 deletions tests/test_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,13 @@ def test_joins_with_ignore_case():
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches():
df1 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = PolarsCompare(df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns():
df1 = pl.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
df2 = pl.DataFrame([{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}])
Expand Down
9 changes: 9 additions & 0 deletions tests/test_snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,15 @@ def test_joins_with_sensitive_lowercase_cols(snowpark_session):
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches(snowpark_session):
df1 = snowpark_session.createDataFrame([{"A": 1, "B": 2}, {"A": 1, "B": 2}])
df2 = snowpark_session.createDataFrame([{"A": 1, "B": 2}, {"A": 1, "B": 2}])
compare = SnowflakeCompare(
snowpark_session, df1, df2, ["A", "B"], ignore_spaces=False
)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns(snowpark_session):
df1 = snowpark_session.createDataFrame(
[{"A": "HI", "B": "A"}, {"A": "BYE", "B": "A"}]
Expand Down
8 changes: 8 additions & 0 deletions tests/test_spark/test_pandas_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,14 @@ def test_joins_with_ignore_case():
assert compare.intersect_rows_match()


@pandas_version
def test_full_join_counts_all_matches():
df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = SparkPandasCompare(df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


@pandas_version
def test_strings_with_ignore_spaces_and_join_columns():
df1 = ps.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
Expand Down
7 changes: 7 additions & 0 deletions tests/test_spark/test_sql_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,13 @@ def test_joins_with_ignore_case(spark_session):
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches(spark_session):
df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = SparkSQLCompare(spark_session, df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns(spark_session):
df1 = spark_session.createDataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
df2 = spark_session.createDataFrame(
Expand Down

0 comments on commit 966b193

Please sign in to comment.