Skip to content

Commit

Permalink
fixing compare matching rows for spark compares and SF compare (#378)
Browse files Browse the repository at this point in the history
* fixing compare matching rows for spark compares and SF compare

* adding new test case

* bump version to 0.16.2

---------

Co-authored-by: Faisal <[email protected]>
  • Loading branch information
rhaffar and fdosani authored Feb 12, 2025
1 parent e80fb12 commit 966b193
Show file tree
Hide file tree
Showing 13 changed files with 78 additions and 4 deletions.
2 changes: 1 addition & 1 deletion datacompy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
Then extended to carry that functionality over to Spark Dataframes.
"""

__version__ = "0.16.1"
__version__ = "0.16.2"

import platform
from warnings import warn
Expand Down
2 changes: 1 addition & 1 deletion datacompy/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ def count_matching_rows(self) -> int:
" and ".join(conditions)
).count()
else:
match_columns_count = 0
match_columns_count = self.intersect_rows.count()
return match_columns_count

def intersect_rows_match(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion datacompy/spark/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ def count_matching_rows(self) -> int:
.shape[0]
)
else:
match_columns_count = 0
match_columns_count = self.intersect_rows.shape[0]
return match_columns_count

def intersect_rows_match(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion datacompy/spark/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ def count_matching_rows(self) -> int:
" and ".join(conditions)
).count()
else:
match_columns_count = 0
match_columns_count = self.intersect_rows.count()
return match_columns_count

def intersect_rows_match(self) -> bool:
Expand Down
7 changes: 7 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,13 @@ def test_index_with_joins_with_ignore_case():
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches():
df1 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = datacompy.Compare(df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns():
df1 = pd.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
df2 = pd.DataFrame([{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}])
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ def test_count_matching_rows_duckdb(count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
df1,
df1_copy,
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
df1,
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_fugue_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,15 @@ def test_count_matching_rows_native(count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
count_matching_rows_df[0],
count_matching_rows_df[0].copy(),
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
count_matching_rows_df[0],
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_fugue_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ def test_count_matching_rows_polars(count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
df1,
df1.clone(),
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
df1,
Expand Down
9 changes: 9 additions & 0 deletions tests/test_fugue/test_fugue_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,15 @@ def test_count_matching_rows_spark(spark_session, count_matching_rows_df):
)
== 100
)
assert (
count_matching_rows(
df1,
df1_copy,
join_columns=["a", "b"],
parallelism=2,
)
== 100
)
assert (
count_matching_rows(
df1,
Expand Down
7 changes: 7 additions & 0 deletions tests/test_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,13 @@ def test_joins_with_ignore_case():
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches():
df1 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = PolarsCompare(df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns():
df1 = pl.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
df2 = pl.DataFrame([{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}])
Expand Down
9 changes: 9 additions & 0 deletions tests/test_snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,15 @@ def test_joins_with_sensitive_lowercase_cols(snowpark_session):
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches(snowpark_session):
df1 = snowpark_session.createDataFrame([{"A": 1, "B": 2}, {"A": 1, "B": 2}])
df2 = snowpark_session.createDataFrame([{"A": 1, "B": 2}, {"A": 1, "B": 2}])
compare = SnowflakeCompare(
snowpark_session, df1, df2, ["A", "B"], ignore_spaces=False
)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns(snowpark_session):
df1 = snowpark_session.createDataFrame(
[{"A": "HI", "B": "A"}, {"A": "BYE", "B": "A"}]
Expand Down
8 changes: 8 additions & 0 deletions tests/test_spark/test_pandas_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,14 @@ def test_joins_with_ignore_case():
assert compare.intersect_rows_match()


@pandas_version
def test_full_join_counts_all_matches():
df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = SparkPandasCompare(df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


@pandas_version
def test_strings_with_ignore_spaces_and_join_columns():
df1 = ps.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
Expand Down
7 changes: 7 additions & 0 deletions tests/test_spark/test_sql_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,13 @@ def test_joins_with_ignore_case(spark_session):
assert compare.intersect_rows_match()


def test_full_join_counts_all_matches(spark_session):
df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}])
compare = SparkSQLCompare(spark_session, df1, df2, ["a", "b"], ignore_spaces=False)
assert compare.count_matching_rows() == 2


def test_strings_with_ignore_spaces_and_join_columns(spark_session):
df1 = spark_session.createDataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
df2 = spark_session.createDataFrame(
Expand Down

0 comments on commit 966b193

Please sign in to comment.