diff --git a/jobs/generic/compare_pandas_job.py b/jobs/generic/compare_pandas_job.py
index b3ae9a7a..194b795e 100644
--- a/jobs/generic/compare_pandas_job.py
+++ b/jobs/generic/compare_pandas_job.py
@@ -34,7 +34,7 @@ def transform(self, tableA, tableB):
             print(message)
             return pd.DataFrame()
 
-        # Comparing dataset content, fuzzy 
+        # Comparing dataset content, fuzzy
         pks1 = self.jargs.inputs['tableA']['pk']
         pks2 = self.jargs.inputs['tableB']['pk']
         compare1 = list(set(tableA.columns) - set(pks1))
diff --git a/yaetos/etl_utils.py b/yaetos/etl_utils.py
index 4dcb7f6e..4e7986f7 100644
--- a/yaetos/etl_utils.py
+++ b/yaetos/etl_utils.py
@@ -775,7 +775,7 @@ def check_pk(df, pks, df_type='spark'):
         logger.info("Given fields ({}) are PKs (i.e. unique). count=count_pk={}".format(pks, count))
         return True
     elif df_type == 'pandas':
-        count =len(df)
+        count = len(df)
         count_pk = len(df[pks].drop_duplicates())
         if count != count_pk:
             logger.error("Given fields ({}) are not PKs since not unique. count={}, count_pk={}".format(pks, count, count_pk))
@@ -786,7 +786,6 @@
     else:
         raise Exception(f"shouldn't get here, set df_type to 'spark' or 'pandas'. It is set in {df_type}")
 
-
 def identify_non_unique_pks(self, df, pks):
     return su.identify_non_unique_pks(df, pks)
 
diff --git a/yaetos/libs/generic_jobs/compare_pandas_job.py b/yaetos/libs/generic_jobs/compare_pandas_job.py
index b3ae9a7a..194b795e 100644
--- a/yaetos/libs/generic_jobs/compare_pandas_job.py
+++ b/yaetos/libs/generic_jobs/compare_pandas_job.py
@@ -34,7 +34,7 @@ def transform(self, tableA, tableB):
             print(message)
             return pd.DataFrame()
 
-        # Comparing dataset content, fuzzy 
+        # Comparing dataset content, fuzzy
        pks1 = self.jargs.inputs['tableA']['pk']
         pks2 = self.jargs.inputs['tableB']['pk']
         compare1 = list(set(tableA.columns) - set(pks1))
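For context on the etl_utils.py hunks: a minimal, standalone sketch of the pandas branch of check_pk that the whitespace fix touches. The helper name check_pk_pandas and the sample frame are illustrative only (not part of the library), and the library's logger calls are replaced with plain prints.

    import pandas as pd

    def check_pk_pandas(df, pks):
        # A set of columns is a valid primary key when deduplicating on those
        # columns keeps every row, i.e. no two rows share the same key values.
        count = len(df)
        count_pk = len(df[pks].drop_duplicates())
        if count != count_pk:
            print(f"Given fields ({pks}) are not PKs since not unique. count={count}, count_pk={count_pk}")
            return False
        print(f"Given fields ({pks}) are PKs (i.e. unique). count=count_pk={count}")
        return True

    # Illustrative data: 'id' is unique across rows, 'group' is not.
    df = pd.DataFrame({'id': [1, 2, 3], 'group': ['a', 'a', 'b']})
    assert check_pk_pandas(df, ['id']) is True
    assert check_pk_pandas(df, ['group']) is False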