From b6d319ddbc7b8e27296519cb735f1eab4a7499cb Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 29 Jul 2024 11:50:41 -0500 Subject: [PATCH 1/8] Decoupled ensemble step and updated ml tests --- Snakefile | 22 ++++++++++++++++++---- spras/analysis/ml.py | 14 ++++++++------ test/ml/test_ml.py | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/Snakefile b/Snakefile index 71a8a6ed..d64bf631 100644 --- a/Snakefile +++ b/Snakefile @@ -100,7 +100,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. 
@@ -291,13 +291,11 @@ rule ml_analysis: hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']), hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']), hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']), - ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) run: summary_df = ml.summarize_networks(input.pathways) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) - ml.ensemble_network(summary_df, output.ensemble_network_file) def collect_pathways_per_algo(wildcards): filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] @@ -314,12 +312,28 @@ rule ml_analysis_aggregate_algo: hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']), hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']), hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']), - ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt']) run: summary_df = ml.summarize_networks(input.pathways) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) + +rule ensemble: + input: + pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) + output: + ensemble_network_file = 
SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) + ml.ensemble_network(summary_df, output.ensemble_network_file) + +rule ensemble_per_algo: + input: + pathways = collect_pathways_per_algo + output: + ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) ml.ensemble_network(summary_df, output.ensemble_network_file) # Remove the output directory diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 4fa1fd1d..49a59266 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -86,18 +86,17 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') - # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1 - if concated_df.empty: + return concated_df + +def df_error(dataframe: pd.DataFrame): + if dataframe.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") - if min(concated_df.shape) <= 1: + if min(dataframe.shape) <= 1: raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. 
" f"The ml post-processing requires more than one pathway, but currently " f"there are only {min(concated_df.shape)} pathways.") - return concated_df - - def create_palette(column_names): """ Generates a dictionary mapping each column name (algorithm name) @@ -121,6 +120,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: @param components: the number of principal components to calculate (Default is 2) @param labels: determines if labels will be included in the scatterplot (Default is True) """ + df_error(dataframe) df = dataframe.reset_index(drop=True) columns = dataframe.columns column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-' @@ -222,6 +222,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ + df_error(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if metric not in distance_metrics: @@ -280,6 +281,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ + df_error(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if linkage == "ward": diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 3010179d..51610646 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -42,13 +42,39 @@ def test_summarize_networks_wrong_direction(self): with pytest.raises(ValueError): ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt']) - def test_summarize_networks_empty(self): + # TODO: Can I move everything into one test? 
+ def test_empty_pca(self): with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt', + OUT_DIR + 'pca-empty-coordinates.tsv') - def test_single_line(self): + def test_empty_hac_horizontal(self): + with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt') + + def test_empty_hac_vertical(self): + with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt') + + # TODO: Can I move everything into one test? + def test_single_line_pca(self): + with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt', + OUT_DIR + 'pca-single-line-coordinates.tsv') + + def test_single_line_hac_horizontal(self): + with pytest.raises(ValueError): # raises error if single line in file s.t. 
single row in dataframe is used for post processing + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt') + + def test_single_line_hac_vertical(self): with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing - ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt') def test_pca(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) From b6610ae62899dd3620f51dbebe2a47a0a4ae16bd Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 29 Jul 2024 14:09:24 -0500 Subject: [PATCH 2/8] cleaned up tests and precommit --- spras/analysis/ml.py | 2 +- test/ml/test_ml.py | 30 ++++++++---------------------- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 49a59266..451a5a45 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -95,7 +95,7 @@ def df_error(dataframe: pd.DataFrame): if min(dataframe.shape) <= 1: raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. 
" f"The ml post-processing requires more than one pathway, but currently " - f"there are only {min(concated_df.shape)} pathways.") + f"there are only {min(dataframe.shape)} pathways.") def create_palette(column_names): """ diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 51610646..d897eb5f 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -42,38 +42,24 @@ def test_summarize_networks_wrong_direction(self): with pytest.raises(ValueError): ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt']) - # TODO: Can I move everything into one test? - def test_empty_pca(self): + def test_empty(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt', OUT_DIR + 'pca-empty-coordinates.tsv') - - def test_empty_hac_horizontal(self): - with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt') - - def test_empty_hac_vertical(self): - with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt') - # TODO: Can I move everything into one test? 
- def test_single_line_pca(self): + def test_single_line(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt', OUT_DIR + 'pca-single-line-coordinates.tsv') - - def test_single_line_hac_horizontal(self): - with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt') - - def test_single_line_hac_vertical(self): - with pytest.raises(ValueError): # raises error if single line in file s.t. 
single row in dataframe is used for post processing - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt') def test_pca(self): From 8c71ad895b8d2167e2bc2e5de546dcde8ec40d7f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 13:35:06 -0500 Subject: [PATCH 3/8] added more tests for ensembling --- .../expected-ensemble-network-empty.tsv | 1 + .../expected-ensemble-network-single.tsv | 2 ++ test/ml/test_ml.py | 22 +++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 test/ml/expected/expected-ensemble-network-empty.tsv create mode 100644 test/ml/expected/expected-ensemble-network-single.tsv diff --git a/test/ml/expected/expected-ensemble-network-empty.tsv b/test/ml/expected/expected-ensemble-network-empty.tsv new file mode 100644 index 00000000..754d8377 --- /dev/null +++ b/test/ml/expected/expected-ensemble-network-empty.tsv @@ -0,0 +1 @@ +Node1 Node2 Frequency Direction diff --git a/test/ml/expected/expected-ensemble-network-single.tsv b/test/ml/expected/expected-ensemble-network-single.tsv new file mode 100644 index 00000000..5f1276f5 --- /dev/null +++ b/test/ml/expected/expected-ensemble-network-single.tsv @@ -0,0 +1,2 @@ +Node1 Node2 Frequency Direction +L M 1.0 U diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index d897eb5f..c23535a2 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -95,3 +95,25 @@ def test_ensemble_network(self): expected = expected.round(5) assert en.equals(expected) + + def test_ensemble_network_single_line(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) + ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-single.tsv') + + en = pd.read_table(OUT_DIR + 'ensemble-network-single.tsv') + en = en.round(5) + expected = pd.read_table(EXPECT_DIR + 
'expected-ensemble-network-single.tsv') + expected = expected.round(5) + + assert en.equals(expected) + + def test_ensemble_network_empty(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv') + + en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') + en = en.round(5) + expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') + expected = expected.round(5) + + assert en.equals(expected) From a9c823804a22f45fb5fe70004e0afd7b39447d4c Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 13:50:03 -0500 Subject: [PATCH 4/8] cleaned up Snakefile, added comments, removed unused parameters --- Snakefile | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/Snakefile b/Snakefile index d64bf631..dcc6666d 100644 --- a/Snakefile +++ b/Snakefile @@ -93,14 +93,14 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_ml_aggregate_algo: - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -297,10 +297,22 @@ rule ml_analysis: ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) +# Ensemble the output pathways for each dataset +rule ensemble: + input: + pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) + output: + ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) + ml.ensemble_network(summary_df, output.ensemble_network_file) + +# Returns all pathways for a specific algorithm def collect_pathways_per_algo(wildcards): filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params) +# Cluster the output pathways for each dataset per algorithm rule ml_analysis_aggregate_algo: input: pathways = collect_pathways_per_algo @@ -318,15 +330,7 @@ rule ml_analysis_aggregate_algo: ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, 
output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) -rule ensemble: - input: - pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) - output: - ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) - run: - summary_df = ml.summarize_networks(input.pathways) - ml.ensemble_network(summary_df, output.ensemble_network_file) - +# Ensemble the output pathways for each dataset per algorithm rule ensemble_per_algo: input: pathways = collect_pathways_per_algo From d5c5865aad789d67cd0b44bf404843dc63e9464a Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 13:54:00 -0500 Subject: [PATCH 5/8] added a comment to df_error --- spras/analysis/ml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 451a5a45..17a19f82 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -89,6 +89,9 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra return concated_df def df_error(dataframe: pd.DataFrame): + """ + Raises an error if the dataframe is empty or contains one pathway (one row) + """ if dataframe.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") From ce47c2835ba6c65861689ff96068f9261734483e Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 4 Sep 2024 18:09:57 -0500 Subject: [PATCH 6/8] update code based on comments --- spras/analysis/ml.py | 8 ++++---- test/ml/test_ml.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 17a19f82..b82e845b 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -88,7 +88,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra return concated_df -def 
df_error(dataframe: pd.DataFrame): +def validate_df(dataframe: pd.DataFrame): """ Raises an error if the dataframe is empty or contains one pathway (one row) """ @@ -123,7 +123,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: @param components: the number of principal components to calculate (Default is 2) @param labels: determines if labels will be included in the scatterplot (Default is True) """ - df_error(dataframe) + validate_df(dataframe) df = dataframe.reset_index(drop=True) columns = dataframe.columns column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-' @@ -225,7 +225,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ - df_error(dataframe) + validate_df(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if metric not in distance_metrics: @@ -284,7 +284,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l @param linkage: methods for calculating the distance between clusters @param metric: used for distance computation between instances of clusters """ - df_error(dataframe) + validate_df(dataframe) if linkage not in linkage_methods: raise ValueError(f"linkage={linkage} must be one of {linkage_methods}") if linkage == "ward": diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index c23535a2..d38810d6 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -112,7 +112,6 @@ def test_ensemble_network_empty(self): ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv') en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') - en = en.round(5) expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') expected = expected.round(5) From 
4f78d8011b44cd11f1a1475e56cb34f099f337f0 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 10 Sep 2024 17:03:24 -0500 Subject: [PATCH 7/8] update to the config logic with aggregate_per_algorithm --- spras/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spras/config.py b/spras/config.py index 4f5f3c6b..a7463a0b 100644 --- a/spras/config.py +++ b/spras/config.py @@ -226,7 +226,7 @@ def process_config(self, raw_config): self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - if 'aggregate_per_algorithm' not in self.ml_params: - self.analysis_include_ml_aggregate_algo = False - else: + if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml == True: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + else: + self.analysis_include_ml_aggregate_algo = False From d079fa3a0b90b4b5ad0f8dc7e1eaa06e5fa1e1e7 Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 12 Sep 2024 14:09:01 -0500 Subject: [PATCH 8/8] Formatting changes --- spras/analysis/ml.py | 3 +++ spras/config.py | 6 ++++-- test/ml/test_ml.py | 2 -- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index b82e845b..3dad8775 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -88,9 +88,11 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra return concated_df + def validate_df(dataframe: pd.DataFrame): """ Raises an error if the dataframe is empty or contains one pathway (one row) + @param dataframe: dataframe of pathways to validate """ if dataframe.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") @@ -100,6 +102,7 @@ def validate_df(dataframe: pd.DataFrame): f"The ml post-processing requires more than one pathway, but currently " f"there are only 
{min(dataframe.shape)} pathways.") + def create_palette(column_names): """ Generates a dictionary mapping each column name (algorithm name) diff --git a/spras/config.py b/spras/config.py index e3bdde77..14f1a926 100644 --- a/spras/config.py +++ b/spras/config.py @@ -80,11 +80,13 @@ def __init__(self, raw_config): # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params = None # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. - self.algorithm_directed = None + self.algorithm_directed = None # A dict with the analysis settings self.analysis_params = None # A dict with the ML settings self.ml_params = None + # A Boolean specifying whether to run ML analysis for individual algorithms + self.analysis_include_ml_aggregate_algo = None # A dict with the PCA settings self.pca_params = None # A dict with the hierarchical clustering settings @@ -254,7 +256,7 @@ def process_config(self, raw_config): raise ValueError("Evaluation analysis cannot run as gold standard data not provided. 
" "Please set evaluation include to false or provide gold standard data.") - if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml == True: + if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 020bcce8..2b5720ae 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -97,7 +97,6 @@ def test_pca_robustness(self): assert coord.equals(expected) - def test_hac_horizontal(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) ml.hac_horizontal(dataframe, OUT_DIR + 'hac-horizontal.png', OUT_DIR + 'hac-clusters-horizontal.txt') @@ -138,6 +137,5 @@ def test_ensemble_network_empty(self): en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') - expected = expected.round(5) assert en.equals(expected)