Route merge_all_tables through __concat__ and thread join suffixes through
the join helpers (notebook source and exported module kept in sync).

Review fix: the filtered-join branch of __concat__ forwards the caller's
lsuffix/rsuffix instead of hard-coding empty strings, so overlapping column
names still get their suffix when 'undefined' research_stage rows are dropped.

NOTE(review): this patch was recovered from a whitespace-collapsed copy; the
indentation of context lines is reconstructed and should be verified against
the target files (or applied with whitespace-tolerant matching).
NOTE(review): the first line of the condition guarding the filtered branch is
outside the hunks shown; confirm the or/and grouping there honours
keep_undefined_research_stage as intended.

diff --git a/nbs/05_pheno_loader.ipynb b/nbs/05_pheno_loader.ipynb
index 925e4ac..e217c53 100644
--- a/nbs/05_pheno_loader.ipynb
+++ b/nbs/05_pheno_loader.ipynb
@@ -518,7 +518,9 @@
     "                continue\n",
     "            \n",
     "            if table_name == 'age_sex':\n",
+    "                # The 'age_sex' table does not contain 'undefined', so the merge will not cause a Cartesian product\n",
     "                keep_undefined = True\n",
+    "                # Left join to keep only rows with real data points\n",
     "                how = 'left'\n",
     "            else: \n",
     "                keep_undefined = keep_undefined_research_stage\n",
@@ -581,13 +583,13 @@
     "        return False\n",
     "    \n",
     "    @staticmethod\n",
-    "    def join_and_filter_undefined_research_stage(df1, df2, how='outer'):\n",
+    "    def join_and_filter_undefined_research_stage(df1, df2, how='outer', lsuffix='', rsuffix=''):\n",
     "        df1_defined = df1[df1.index.get_level_values('research_stage') != 'undefined']\n",
     "        df2_defined = df2[df2.index.get_level_values('research_stage') != 'undefined']\n",
     "\n",
-    "        return df1_defined.join(df2_defined, how=how)\n",
+    "        return df1_defined.join(df2_defined, how=how, lsuffix=lsuffix, rsuffix=rsuffix)\n",
     "\n",
-    "    def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer'):\n",
+    "    def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer', lsuffix='', rsuffix=''):\n",
     "\n",
     "        if df1.empty:\n",
     "            return df2\n",
@@ -598,10 +600,10 @@
     "           self.is_value_in_index(df2, 'undefined', 'research_stage') and not keep_undefined_research_stage:\n",
     "            \n",
     "            warnings.warn('filtering \"undefined\" research_stage..')\n",
-    "            df = self.join_and_filter_undefined_research_stage(df1, df2, how)\n",
+    "            df = self.join_and_filter_undefined_research_stage(df1, df2, how, lsuffix=lsuffix, rsuffix=rsuffix)\n",
     "            return df\n",
     "        \n",
-    "        return df1.join(df2, how=how)\n",
+    "        return df1.join(df2, how=how, lsuffix=lsuffix, rsuffix=rsuffix)\n",
     "        \n",
     "    def merge_all_tables(self) -> pd.DataFrame:\n",
     "        # merge all tables in self.dfs dictionary\n",
@@ -610,8 +612,8 @@
     "            if align_df is None:\n",
     "                align_df = df\n",
     "            else:\n",
-    "                align_df = pd.merge(align_df, df, left_index=True, right_index=True, how='outer', suffixes=('', name))\n",
-    "        \n",
+    "                # Join the table with an 'undefined' research_stage to keep the maximum number of data points\n",
+    "                align_df = self.__concat__(align_df, df, keep_undefined_research_stage=True, how='outer', lsuffix='', rsuffix=name)\n",
     "        return align_df\n",
     "\n",
     "    def __load_age_sex__(self) -> None:\n",
diff --git a/pheno_utils/pheno_loader.py b/pheno_utils/pheno_loader.py
index f302792..b4f0528 100644
--- a/pheno_utils/pheno_loader.py
+++ b/pheno_utils/pheno_loader.py
@@ -467,7 +467,9 @@ def get(self, fields: Union[str,List[str]], flexible: bool=None, not_bulk_field=
                 continue
             
             if table_name == 'age_sex':
+                # The 'age_sex' table does not contain 'undefined', so the merge will not cause a Cartesian product
                 keep_undefined = True
+                # Left join to keep only rows with real data points
                 how = 'left'
             else: 
                 keep_undefined = keep_undefined_research_stage
@@ -530,13 +532,13 @@ def is_value_in_index(self, df, value, index_name):
         return False
     
     @staticmethod
-    def join_and_filter_undefined_research_stage(df1, df2, how='outer'):
+    def join_and_filter_undefined_research_stage(df1, df2, how='outer', lsuffix='', rsuffix=''):
         df1_defined = df1[df1.index.get_level_values('research_stage') != 'undefined']
         df2_defined = df2[df2.index.get_level_values('research_stage') != 'undefined']
 
-        return df1_defined.join(df2_defined, how=how)
+        return df1_defined.join(df2_defined, how=how, lsuffix=lsuffix, rsuffix=rsuffix)
 
-    def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer'):
+    def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer', lsuffix='', rsuffix=''):
 
         if df1.empty:
             return df2
@@ -547,10 +549,10 @@ def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer')
            self.is_value_in_index(df2, 'undefined', 'research_stage') and not keep_undefined_research_stage:
             
             warnings.warn('filtering "undefined" research_stage..')
-            df = self.join_and_filter_undefined_research_stage(df1, df2, how)
+            df = self.join_and_filter_undefined_research_stage(df1, df2, how, lsuffix=lsuffix, rsuffix=rsuffix)
             return df
         
-        return df1.join(df2, how=how)
+        return df1.join(df2, how=how, lsuffix=lsuffix, rsuffix=rsuffix)
         
     def merge_all_tables(self) -> pd.DataFrame:
         # merge all tables in self.dfs dictionary
@@ -559,8 +561,8 @@ def merge_all_tables(self) -> pd.DataFrame:
             if align_df is None:
                 align_df = df
             else:
-                align_df = pd.merge(align_df, df, left_index=True, right_index=True, how='outer', suffixes=('', name))
-        
+                # Join the table with an 'undefined' research_stage to keep the maximum number of data points
+                align_df = self.__concat__(align_df, df, keep_undefined_research_stage=True, how='outer', lsuffix='', rsuffix=name)
         return align_df
 
     def __load_age_sex__(self) -> None: