Merge pull request #175 from ntalluri/decouple-ensemble

Decouple Ensembling

agitter authored Sep 22, 2024
2 parents 7b07916 + d079fa3 commit 624cbb4
Showing 6 changed files with 86 additions and 25 deletions.
39 changes: 28 additions & 11 deletions Snakefile
@@ -95,14 +95,14 @@ def make_final_input(wildcards):
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

    if _config.config.analysis_include_ml_aggregate_algo:
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

    if _config.config.analysis_include_evaluation:
        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
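
The algorithm_params arguments dropped in this hunk never appear in the target patterns. A minimal illustration of why they were dead weight, assuming Snakemake's documented product semantics for expand (the dataset and parameter labels here are invented):

from snakemake.io import expand

# expand() takes the product over every supplied wildcard, even one the
# pattern never references, so an unused wildcard only duplicates targets:
expand('{dataset}-ml/pca.png', dataset=['d1'], algorithm_params=['p1', 'p2'])
# -> ['d1-ml/pca.png', 'd1-ml/pca.png']

# Dropping the unused wildcard leaves one target per dataset:
expand('{dataset}-ml/pca.png', dataset=['d1'])
# -> ['d1-ml/pca.png']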
@@ -311,19 +311,28 @@ rule ml_analysis:
        hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']),
        hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']),
        hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']),
        ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
    run:
        summary_df = ml.summarize_networks(input.pathways)
        ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
        ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
        ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble the output pathways for each dataset
rule ensemble:
    input:
        pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
    output:
        ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
    run:
        summary_df = ml.summarize_networks(input.pathways)
        ml.ensemble_network(summary_df, output.ensemble_network_file)
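
The Frequency column in ensemble-pathway.txt can be read as the fraction of input pathways that contain each edge. A minimal sketch of that computation (not the actual ml.ensemble_network implementation), assuming summarize_networks returns one row per edge, indexed by node pair and direction, with one binary column per pathway:

import pandas as pd

def ensemble_sketch(summary_df: pd.DataFrame, output_file: str) -> None:
    # Row mean over the binary pathway columns = fraction of pathways
    # that contain the edge.
    frequency = summary_df.mean(axis=1)
    ensemble = pd.DataFrame({
        'Node1': [edge[0] for edge in summary_df.index],
        'Node2': [edge[1] for edge in summary_df.index],
        'Frequency': frequency.to_numpy(),
        'Direction': [edge[2] for edge in summary_df.index],  # e.g. 'U' for undirected
    })
    ensemble.to_csv(output_file, sep='\t', index=False)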

# Returns all pathways for a specific algorithm
def collect_pathways_per_algo(wildcards):
    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
    return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)
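
The filter above is a plain substring match on the algorithm name. A hypothetical illustration (these labels are invented; the real ones come from the configured algorithms_with_params):

# With wildcards.algorithm == 'pathlinker', the comprehension keeps the first two:
algorithms_with_params = ['pathlinker-params-ABC', 'pathlinker-params-DEF', 'omicsintegrator1-params-XYZ']
filtered = [p for p in algorithms_with_params if 'pathlinker' in p]
# -> ['pathlinker-params-ABC', 'pathlinker-params-DEF']

One caveat of substring matching: an algorithm whose name is contained in another algorithm's name would also collect that algorithm's pathways.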

# Cluster the output pathways per algorithm for each dataset
# Cluster the output pathways for each dataset per algorithm
rule ml_analysis_aggregate_algo:
    input:
        pathways = collect_pathways_per_algo
@@ -335,12 +344,20 @@ rule ml_analysis_aggregate_algo:
        hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']),
        hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']),
        hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']),
        ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
    run:
        summary_df = ml.summarize_networks(input.pathways)
        ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
        ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
        ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble the output pathways for each dataset per algorithm
rule ensemble_per_algo:
    input:
        pathways = collect_pathways_per_algo
    output:
        ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
    run:
        summary_df = ml.summarize_networks(input.pathways)
        ml.ensemble_network(summary_df, output.ensemble_network_file)
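
Note the asymmetry introduced in make_final_input above: the per-algorithm clustering targets still expand over algorithms_mult_param_combos, while the per-algorithm ensemble targets now expand over the full algorithms list, so this rule also produces an ensemble file for an algorithm with only one parameter combination.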

# Return the gold standard pickle file for a specific gold standard
20 changes: 14 additions & 6 deletions spras/analysis/ml.py
@@ -86,16 +86,21 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFrame:
    concated_df = concated_df.fillna(0)
    concated_df = concated_df.astype('int64')

    # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
    if concated_df.empty:
        raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
                         "suggest setting ml include: false in the configuration file to avoid this error.")
    if min(concated_df.shape) <= 1:
        raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
                         f"The ml post-processing requires more than one pathway, but currently "
                         f"there are only {min(concated_df.shape)} pathways.")

    return concated_df


def validate_df(dataframe: pd.DataFrame):
    """
    Raises an error if the dataframe is empty or contains one pathway (one row)
    @param dataframe: dataframe of pathways to validate
    """
    if dataframe.empty:
        raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
                         "suggest setting ml include: false in the configuration file to avoid this error.")
    if min(dataframe.shape) <= 1:
        raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
                         f"The ml post-processing requires more than one pathway, but currently "
                         f"there are only {min(dataframe.shape)} pathways.")
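
A quick illustration of the new guard with assumed shapes (summarize_networks produces one binary column per pathway):

import pandas as pd

single = pd.DataFrame({'data1-pathlinker-params-ABC': [1, 1, 0]})  # one pathway
try:
    validate_df(single)
except ValueError:
    pass  # expected: min(single.shape) == 1

pair = pd.DataFrame({'p1': [1, 0], 'p2': [1, 1]})  # two pathways, two edges
validate_df(pair)  # passes, so ML post-processing can proceed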


def create_palette(column_names):
@@ -121,6 +126,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: str, components: int = 2, labels: bool = True):
    @param components: the number of principal components to calculate (Default is 2)
    @param labels: determines if labels will be included in the scatterplot (Default is True)
    """
    validate_df(dataframe)
    df = dataframe.reset_index(drop=True)
    columns = dataframe.columns
    column_names = [element.split('-')[-3] for element in columns]  # assume algorithm names do not contain '-'
@@ -222,6 +228,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, linkage: str = 'ward', metric: str = 'euclidean'):
    @param linkage: methods for calculating the distance between clusters
    @param metric: used for distance computation between instances of clusters
    """
    validate_df(dataframe)
    if linkage not in linkage_methods:
        raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
    if metric not in distance_metrics:
@@ -280,6 +287,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, linkage: str = 'ward', metric: str = 'euclidean'):
    @param linkage: methods for calculating the distance between clusters
    @param metric: used for distance computation between instances of clusters
    """
    validate_df(dataframe)
    if linkage not in linkage_methods:
        raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
    if linkage == "ward":
10 changes: 6 additions & 4 deletions spras/config.py
@@ -80,11 +80,13 @@ def __init__(self, raw_config):
        # Only includes algorithms that are set to be run with 'include: true'.
        self.algorithm_params = None
        # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs.
        self.algorithm_directed = None
        self.algorithm_directed = None
        # A dict with the analysis settings
        self.analysis_params = None
        # A dict with the ML settings
        self.ml_params = None
        # A Boolean specifying whether to run ML analysis for individual algorithms
        self.analysis_include_ml_aggregate_algo = None
        # A dict with the PCA settings
        self.pca_params = None
        # A dict with the hierarchical clustering settings
@@ -254,7 +256,7 @@ def process_config(self, raw_config):
            raise ValueError("Evaluation analysis cannot run as gold standard data not provided. "
                             "Please set evaluation include to false or provide gold standard data.")

        if 'aggregate_per_algorithm' not in self.ml_params:
            self.analysis_include_ml_aggregate_algo = False
        else:
            self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
        if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml:
            self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
        else:
            self.analysis_include_ml_aggregate_algo = False
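
For reference, a hypothetical configuration fragment, shown as the parsed raw_config dict with only the keys read above (key names follow the lookups in this file and the "ml include: false" error message in ml.py):

raw_config = {
    "analysis": {
        "ml": {
            "include": True,                  # drives analysis_include_ml
            "aggregate_per_algorithm": True,  # drives analysis_include_ml_aggregate_algo
        }
    }
}
# With the new logic, aggregate_per_algorithm only takes effect when ML
# analysis itself is enabled; otherwise the flag falls back to False.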
1 change: 1 addition & 0 deletions test/ml/expected/expected-ensemble-network-empty.tsv
@@ -0,0 +1 @@
Node1 Node2 Frequency Direction
2 changes: 2 additions & 0 deletions test/ml/expected/expected-ensemble-network-single.tsv
@@ -0,0 +1,2 @@
Node1 Node2 Frequency Direction
L M 1.0 U
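
Both new expected files share the ensemble output header (Node1, Node2, Frequency, Direction). In the single-network case the lone L-M edge appears in the only input pathway, hence Frequency 1.0; the U value presumably marks the edge as undirected.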
39 changes: 35 additions & 4 deletions test/ml/test_ml.py
@@ -42,13 +42,25 @@ def test_summarize_networks_wrong_direction(self):
        with pytest.raises(ValueError):
            ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt'])

    def test_summarize_networks_empty(self):
    def test_empty(self):
        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
        with pytest.raises(ValueError):  # raises error if empty dataframe is used for post processing
            ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
            ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt',
                   OUT_DIR + 'pca-empty-coordinates.tsv')
        with pytest.raises(ValueError):
            ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt')
        with pytest.raises(ValueError):
            ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt')

    def test_summarize_networks_single_line(self):
    def test_single_line(self):
        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
        with pytest.raises(ValueError):  # raises error if single line in file s.t. single row in dataframe is used for post processing
            ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
            ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt',
                   OUT_DIR + 'pca-single-line-coordinates.tsv')
        with pytest.raises(ValueError):
            ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt')
        with pytest.raises(ValueError):
            ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt')

    def test_pca(self):
        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
@@ -85,7 +97,6 @@ def test_pca_robustness(self):

        assert coord.equals(expected)


    def test_hac_horizontal(self):
        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
        ml.hac_horizontal(dataframe, OUT_DIR + 'hac-horizontal.png', OUT_DIR + 'hac-clusters-horizontal.txt')
@@ -108,3 +119,23 @@ def test_ensemble_network(self):
        expected = expected.round(5)

        assert en.equals(expected)

    def test_ensemble_network_single_line(self):
        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
        ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-single.tsv')

        en = pd.read_table(OUT_DIR + 'ensemble-network-single.tsv')
        en = en.round(5)
        expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-single.tsv')
        expected = expected.round(5)

        assert en.equals(expected)

    def test_ensemble_network_empty(self):
        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
        ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv')

        en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv')
        expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')

        assert en.equals(expected)
