diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9be26ab..a37c641 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,9 @@
 import unittest
 
+import pandas as pd
+
 from tests import d1_path
-from valentine.data_sources.utils import get_encoding, get_delimiter, is_date
+from valentine.data_sources.utils import get_encoding, get_delimiter, is_date, add_noise_to_df_column
 from valentine.utils.utils import is_sorted, convert_data_type
 
 
@@ -30,3 +32,20 @@ def test_get_delimiter(self):
     def test_is_date(self):
         date_str = "2019-04-26 18:03:50.941332"
         assert is_date(date_str)
+
+    def test_add_noise_to_df_column(self):
+        """Noise level 0.0 leaves a column untouched; higher levels must change it."""
+        # 60 rows instead of 3: the helper draws once per row, so with 3 rows a
+        # noise level of 0.5 skipped every draw in 0.5 ** 3 = 12.5% of the runs
+        # and the "must differ" assertions failed spuriously.
+        repeats = 20
+        assert_df = pd.DataFrame({'a': [1.0, 2.0, 3.0] * repeats,
+                                  'b': ['abcdefg', 'hijklmn', 'opqrst'] * repeats})
+        for column in ('a', 'b'):
+            # Each call gets its own copy because the helper assigns the noisy
+            # values back into the frame it receives.
+            unchanged_df = add_noise_to_df_column(assert_df.copy(), column, 0.0)
+            assert unchanged_df[column].equals(assert_df[column])
+            for noise_level in (0.5, 0.99999):
+                noisy_df = add_noise_to_df_column(assert_df.copy(), column, noise_level)
+                assert not noisy_df[column].equals(assert_df[column])
diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py
index 03d5a0a..28c9686 100644
--- a/valentine/data_sources/utils.py
+++ b/valentine/data_sources/utils.py
@@ -65,7 +65,4 @@ def add_noise_to_df_column(df, column_name, noise_level):
     for _ in range(df[column_name].shape[0]):
         if np.random.rand() < noise_level:
             df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x)))))
     return df
-
-# if __name__ == "__main__":
-#     add_noise_to_df_column(pd.DataFrame({'a': [1, 2, 3], 'b': ['abcdefg', 'hijklmn', 'opqrst']}), 'b', 0.99)