From b6d319ddbc7b8e27296519cb735f1eab4a7499cb Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 29 Jul 2024 11:50:41 -0500 Subject: [PATCH 1/8] Decoupled ensemble step and updated ml tests --- Snakefile | 22 ++++++++++++++++++---- spras/analysis/ml.py | 14 ++++++++------ test/ml/test_ml.py | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/Snakefile b/Snakefile index 71a8a6ed..d64bf631 100644 --- a/Snakefile +++ b/Snakefile @@ -100,7 +100,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. 
@@ -291,13 +291,11 @@ rule ml_analysis: hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']), hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']), hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']), - ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) run: summary_df = ml.summarize_networks(input.pathways) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) - ml.ensemble_network(summary_df, output.ensemble_network_file) def collect_pathways_per_algo(wildcards): filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] @@ -314,12 +312,28 @@ rule ml_analysis_aggregate_algo: hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']), hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']), hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']), - ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt']) run: summary_df = ml.summarize_networks(input.pathways) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) + +rule ensemble: + input: + pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) + output: + ensemble_network_file = 
SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) + ml.ensemble_network(summary_df, output.ensemble_network_file) + +rule ensemble_per_algo: + input: + pathways = collect_pathways_per_algo + output: + ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) ml.ensemble_network(summary_df, output.ensemble_network_file) # Remove the output directory diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 4fa1fd1d..49a59266 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -86,18 +86,17 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') - # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1 - if concated_df.empty: + return concated_df + +def df_error(dataframe: pd.DataFrame): + if dataframe.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") - if min(concated_df.shape) <= 1: + if min(dataframe.shape) <= 1: raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. 
" f"The ml post-processing requires more than one pathway, but currently " f"there are only {min(concated_df.shape)} pathways.") - return concated_df - - def create_palette(column_names): """ Generates a dictionary mapping each column name (algorithm name) @@ -121,6 +120,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: @param components: the number of principal components to calculate (Default is 2) @param labels: determines if labels will be included in the scatterplot (Default is True) """ + df_error(dataframe) df = dataframe.reset_index(drop=True) columns = dataframe.columns column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-' @@ -222,6 +222,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ + df_error(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if metric not in distance_metrics: @@ -280,6 +281,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ + df_error(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if linkage == "ward": diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 3010179d..51610646 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -42,13 +42,39 @@ def test_summarize_networks_wrong_direction(self): with pytest.raises(ValueError): ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt']) - def test_summarize_networks_empty(self): + # TODO: Can I move everything into one test? 
+ def test_empty_pca(self): with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt', + OUT_DIR + 'pca-empty-coordinates.tsv') - def test_single_line(self): + def test_empty_hac_horizontal(self): + with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt') + + def test_empty_hac_vertical(self): + with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt') + + # TODO: Can I move everything into one test? + def test_single_line_pca(self): + with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt', + OUT_DIR + 'pca-single-line-coordinates.tsv') + + def test_single_line_hac_horizontal(self): + with pytest.raises(ValueError): # raises error if single line in file s.t. 
single row in dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt') + + def test_single_line_hac_vertical(self): with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing - ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt') def test_pca(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) From b6610ae62899dd3620f51dbebe2a47a0a4ae16bd Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 29 Jul 2024 14:09:24 -0500 Subject: [PATCH 2/8] cleaned up tests and precommit --- spras/analysis/ml.py | 2 +- test/ml/test_ml.py | 30 ++++++++---------------------- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 49a59266..451a5a45 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -95,7 +95,7 @@ def df_error(dataframe: pd.DataFrame): if min(dataframe.shape) <= 1: raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. 
" f"The ml post-processing requires more than one pathway, but currently " - f"there are only {min(concated_df.shape)} pathways.") + f"there are only {min(dataframe.shape)} pathways.") def create_palette(column_names): """ diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 51610646..d897eb5f 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -42,38 +42,24 @@ def test_summarize_networks_wrong_direction(self): with pytest.raises(ValueError): ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt']) - # TODO: Can I move everything into one test? - def test_empty_pca(self): + def test_empty(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt', OUT_DIR + 'pca-empty-coordinates.tsv') - - def test_empty_hac_horizontal(self): - with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt') - - def test_empty_hac_vertical(self): - with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt') - # TODO: Can I move everything into one test? 
- def test_single_line_pca(self): + def test_single_line(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt', OUT_DIR + 'pca-single-line-coordinates.tsv') - - def test_single_line_hac_horizontal(self): - with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt') - - def test_single_line_hac_vertical(self): - with pytest.raises(ValueError): # raises error if single line in file s.t. 
single row in dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt') def test_pca(self): From 8c71ad895b8d2167e2bc2e5de546dcde8ec40d7f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 13:35:06 -0500 Subject: [PATCH 3/8] added more tests for ensembling --- .../expected-ensemble-network-empty.tsv | 1 + .../expected-ensemble-network-single.tsv | 2 ++ test/ml/test_ml.py | 22 +++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 test/ml/expected/expected-ensemble-network-empty.tsv create mode 100644 test/ml/expected/expected-ensemble-network-single.tsv diff --git a/test/ml/expected/expected-ensemble-network-empty.tsv b/test/ml/expected/expected-ensemble-network-empty.tsv new file mode 100644 index 00000000..754d8377 --- /dev/null +++ b/test/ml/expected/expected-ensemble-network-empty.tsv @@ -0,0 +1 @@ +Node1 Node2 Frequency Direction diff --git a/test/ml/expected/expected-ensemble-network-single.tsv b/test/ml/expected/expected-ensemble-network-single.tsv new file mode 100644 index 00000000..5f1276f5 --- /dev/null +++ b/test/ml/expected/expected-ensemble-network-single.tsv @@ -0,0 +1,2 @@ +Node1 Node2 Frequency Direction +L M 1.0 U diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index d897eb5f..c23535a2 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -95,3 +95,25 @@ def test_ensemble_network(self): expected = expected.round(5) assert en.equals(expected) + + def test_ensemble_network_single_line(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) + ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-single.tsv') + + en = pd.read_table(OUT_DIR + 'ensemble-network-single.tsv') + en = en.round(5) + expected = pd.read_table(EXPECT_DIR + 
'expected-ensemble-network-single.tsv') + expected = expected.round(5) + + assert en.equals(expected) + + def test_ensemble_network_empty(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv') + + en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') + en = en.round(5) + expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') + expected = expected.round(5) + + assert en.equals(expected) From a9c823804a22f45fb5fe70004e0afd7b39447d4c Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 13:50:03 -0500 Subject: [PATCH 4/8] cleaned up Snakefile, added comments, removed unused parameters --- Snakefile | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/Snakefile b/Snakefile index d64bf631..dcc6666d 100644 --- a/Snakefile +++ b/Snakefile @@ -93,14 +93,14 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_ml_aggregate_algo: - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -297,10 +297,22 @@ rule ml_analysis: ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) +# Ensemble the output pathways for each dataset +rule ensemble: + input: + pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) + output: + ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) + ml.ensemble_network(summary_df, output.ensemble_network_file) + +# Returns all pathways for a specific algorithm def collect_pathways_per_algo(wildcards): filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params) +# Cluster the output pathways for each dataset per algorithm rule ml_analysis_aggregate_algo: input: pathways = collect_pathways_per_algo @@ -318,15 +330,7 @@ rule ml_analysis_aggregate_algo: ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, 
output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) -rule ensemble: - input: - pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) - output: - ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) - run: - summary_df = ml.summarize_networks(input.pathways) - ml.ensemble_network(summary_df, output.ensemble_network_file) - +# Ensemble the output pathways for each dataset per algorithm rule ensemble_per_algo: input: pathways = collect_pathways_per_algo From d5c5865aad789d67cd0b44bf404843dc63e9464a Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 13:54:00 -0500 Subject: [PATCH 5/8] added a comment to df_error --- spras/analysis/ml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 451a5a45..17a19f82 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -89,6 +89,9 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra return concated_df def df_error(dataframe: pd.DataFrame): + """ + Raises an error if the dataframe is empty or contains one pathway (one row) + """ if dataframe.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") From ce47c2835ba6c65861689ff96068f9261734483e Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 4 Sep 2024 18:09:57 -0500 Subject: [PATCH 6/8] update code based on comments --- spras/analysis/ml.py | 8 ++++---- test/ml/test_ml.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 17a19f82..b82e845b 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -88,7 +88,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra return concated_df -def 
df_error(dataframe: pd.DataFrame): +def validate_df(dataframe: pd.DataFrame): """ Raises an error if the dataframe is empty or contains one pathway (one row) """ @@ -123,7 +123,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: @param components: the number of principal components to calculate (Default is 2) @param labels: determines if labels will be included in the scatterplot (Default is True) """ - df_error(dataframe) + validate_df(dataframe) df = dataframe.reset_index(drop=True) columns = dataframe.columns column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-' @@ -225,7 +225,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ - df_error(dataframe) + validate_df(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if metric not in distance_metrics: @@ -284,7 +284,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ - df_error(dataframe) + validate_df(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if linkage == "ward": diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index c23535a2..d38810d6 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -112,7 +112,6 @@ def test_ensemble_network_empty(self): ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv') en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') - en = en.round(5) expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') expected = expected.round(5) From 
4f78d8011b44cd11f1a1475e56cb34f099f337f0 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 10 Sep 2024 17:03:24 -0500 Subject: [PATCH 7/8] update to the config logic with aggregate_per_algorithm --- spras/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spras/config.py b/spras/config.py index 4f5f3c6b..a7463a0b 100644 --- a/spras/config.py +++ b/spras/config.py @@ -226,7 +226,7 @@ def process_config(self, raw_config): self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - if 'aggregate_per_algorithm' not in self.ml_params: - self.analysis_include_ml_aggregate_algo = False - else: + if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml == True: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + else: + self.analysis_include_ml_aggregate_algo = False From d079fa3a0b90b4b5ad0f8dc7e1eaa06e5fa1e1e7 Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 12 Sep 2024 14:09:01 -0500 Subject: [PATCH 8/8] Formatting changes --- spras/analysis/ml.py | 3 +++ spras/config.py | 6 ++++-- test/ml/test_ml.py | 2 -- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index b82e845b..3dad8775 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -88,9 +88,11 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra return concated_df + def validate_df(dataframe: pd.DataFrame): """ Raises an error if the dataframe is empty or contains one pathway (one row) + @param dataframe: dataframe of pathways to validate """ if dataframe.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") @@ -100,6 +102,7 @@ def validate_df(dataframe: pd.DataFrame): f"The ml post-processing requires more than one pathway, but currently " f"there are only 
{min(dataframe.shape)} pathways.") + def create_palette(column_names): """ Generates a dictionary mapping each column name (algorithm name) diff --git a/spras/config.py b/spras/config.py index e3bdde77..14f1a926 100644 --- a/spras/config.py +++ b/spras/config.py @@ -80,11 +80,13 @@ def __init__(self, raw_config): # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params = None # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. - self.algorithm_directed = None + self.algorithm_directed = None # A dict with the analysis settings self.analysis_params = None # A dict with the ML settings self.ml_params = None + # A Boolean specifying whether to run ML analysis for individual algorithms + self.analysis_include_ml_aggregate_algo = None # A dict with the PCA settings self.pca_params = None # A dict with the hierarchical clustering settings @@ -254,7 +256,7 @@ def process_config(self, raw_config): raise ValueError("Evaluation analysis cannot run as gold standard data not provided. 
" "Please set evaluation include to false or provide gold standard data.") - if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml == True: + if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 020bcce8..2b5720ae 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -97,7 +97,6 @@ def test_pca_robustness(self): assert coord.equals(expected) - def test_hac_horizontal(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) ml.hac_horizontal(dataframe, OUT_DIR + 'hac-horizontal.png', OUT_DIR + 'hac-clusters-horizontal.txt') @@ -138,6 +137,5 @@ def test_ensemble_network_empty(self): en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') - expected = expected.round(5) assert en.equals(expected)