Skip to content

Commit

Permalink
[v0.9.2] updated dataframe comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume227 committed May 13, 2024
1 parent 3feea56 commit cd8a249
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 21 deletions.
77 changes: 57 additions & 20 deletions exetest/dataframe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self,
ignore_cols=None,
filter_cols=None,
verbose: bool = True,
num_diffs: int=10,
num_diffs: int = 10,
**np_close_kwargs):
"""
:param ignore_cols: columns to ignore during comparison
Expand All @@ -40,7 +40,7 @@ def __init__(self,
self.filter_cols = filter_cols or []
self.verbose = verbose
self.np_close_kwargs = np_close_kwargs
self.num_diffs = num_diffs
self.num_diffs_to_display = num_diffs

def description(self) -> str:
if self.ignore_cols:
Expand Down Expand Up @@ -77,6 +77,10 @@ def compare_dataframes(self, df1, df2):
if shape_differs or columns_differ:
return False

# treat NaNs as 0 in both frames so that NaN-vs-NaN cells compare equal (side effect: a NaN also compares equal to a genuine 0)
df1 = df1.fillna(0)
df2 = df2.fillna(0)

cols_with_diffs = []
for col in df1.columns:
if df1[col].dtype != 'category' and np.issubdtype(df1[col].dtype, np.number) \
Expand All @@ -91,7 +95,7 @@ def compare_dataframes(self, df1, df2):
if cols_with_diffs:
if self.verbose:
print('====================================')
print(f'Showing first {self.num_diffs} in cols with diff {cols_with_diffs}:')
print(f'Showing first {self.num_diffs_to_display} in cols with diff {cols_with_diffs}:')
numerical_diff_cols = []
non_numerical_diff_cols = []
for col in cols_with_diffs:
Expand All @@ -102,23 +106,56 @@ def compare_dataframes(self, df1, df2):
non_numerical_diff_cols.append(col)

if numerical_diff_cols:
print('numerical diffs:')
df1_with_diff = df1[numerical_diff_cols]
df2_with_diff = df2[numerical_diff_cols]
diff_mask = ~(df1_with_diff - df2_with_diff).apply(
functools.partial(is_close, b=0, **self.np_close_kwargs))
print(pd.concat([df1_with_diff[diff_mask],
df2_with_diff[diff_mask]
], axis=1).head(self.num_diffs))

if non_numerical_diff_cols:
print('non numerical diffs:')
df1_with_diff = df1[non_numerical_diff_cols]
df2_with_diff = df2[non_numerical_diff_cols]
diff_mask = df1_with_diff != df2_with_diff
print(pd.concat([df1_with_diff[diff_mask],
df2_with_diff[diff_mask]
], axis=1).head(self.num_diffs))
float_format = pd.options.display.float_format
pd.options.display.float_format = "{:.2f}"
print(f'correlation of numerical cols:')
print(df1[numerical_diff_cols].corrwith(df2[numerical_diff_cols]).to_string())
print()
pd.options.display.float_format = float_format # restore format

if self.num_diffs_to_display:
if self.num_diffs_to_display > 0:
print(f'Showing first {self.num_diffs_to_display} rows in cols with diff:')
func_name = 'head'
else:
print(f'Showing last {abs(self.num_diffs_to_display)} rows in cols with diff:')
func_name = 'tail'

if numerical_diff_cols:

df1_with_diff = df1[numerical_diff_cols]
df2_with_diff = df2[numerical_diff_cols]

diff_mask = ~(df1_with_diff - df2_with_diff).apply(
functools.partial(is_close, b=0, **self.np_close_kwargs))

diff_mask = diff_mask.any(axis=1)
print(f'{diff_mask.shape[0]} numerical diffs:')

masked_df1 = getattr(df1_with_diff.reset_index()[diff_mask], func_name)(abs(self.num_diffs_to_display))
masked_df2 = getattr(df2_with_diff[diff_mask], func_name)(abs(self.num_diffs_to_display))

diff_df = pd.DataFrame(masked_df1['index'])
for col_name in masked_df2:
diff_df = pd.concat([diff_df, masked_df1[col_name], masked_df2[col_name]], axis=1)

print(diff_df)

if non_numerical_diff_cols:
df1_with_diff = df1[non_numerical_diff_cols]
df2_with_diff = df2[non_numerical_diff_cols]
diff_mask = (df1_with_diff != df2_with_diff).any(axis=1)
masked_df1 = getattr(df1_with_diff.reset_index()[diff_mask], func_name)(abs(self.num_diffs_to_display))
masked_df2 = getattr(df2_with_diff[diff_mask], func_name)(abs(self.num_diffs_to_display))

print(f'{diff_mask.shape[0]} non numerical diffs:')

diff_df = pd.DataFrame(masked_df1['index'])
for col_name in masked_df2:
diff_df = pd.concat([diff_df, masked_df1[col_name], masked_df2[col_name]], axis=1)

print(diff_df)

return False

return True
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
setup(
name='exetest', # How you named your package folder (MyLib)
packages=['exetest'], # Choose the same as "name"
version='0.9.1', # Start with a small number and increase it with every change you make
version='0.9.2', # Start with a small number and increase it with every change you make
license='MIT', # Choose a license from here: https://help.github.com/articles/licensing-a-repository
description='A pytest-based test framework for black-box approach to testing executables', # Give a short description about your library
author='Guillaume227', # Type in your name
Expand Down

0 comments on commit cd8a249

Please sign in to comment.