Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decouple Ensembling #175

Merged
merged 9 commits into from
Sep 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,14 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_ml_aggregate_algo:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
agitter marked this conversation as resolved.
Show resolved Hide resolved
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

if _config.config.analysis_include_evaluation:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
Expand Down Expand Up @@ -311,19 +311,28 @@ rule ml_analysis:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
agitter marked this conversation as resolved.
Show resolved Hide resolved
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble the output pathways for each dataset
rule ensemble:
input:
pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
output:
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
ml.ensemble_network(summary_df, output.ensemble_network_file)
agitter marked this conversation as resolved.
Show resolved Hide resolved

# Returns all pathways for a specific algorithm
def collect_pathways_per_algo(wildcards):
filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)

# Cluster the output pathways per algorithm for each dataset
# Cluster the output pathways for each dataset per algorithm
rule ml_analysis_aggregate_algo:
input:
pathways = collect_pathways_per_algo
Expand All @@ -335,12 +344,20 @@ rule ml_analysis_aggregate_algo:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble the output pathways for each dataset per algorithm
rule ensemble_per_algo:
input:
pathways = collect_pathways_per_algo
output:
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Return the gold standard pickle file for a specific gold standard
Expand Down
20 changes: 14 additions & 6 deletions spras/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,21 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

# don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
if concated_df.empty:
return concated_df


def validate_df(dataframe: pd.DataFrame):
"""
Raises an error if the dataframe is empty or contains one pathway (one row)
@param dataframe: datafrom of pathways to validate
"""
if dataframe.empty:
raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
"suggest setting ml include: false in the configuration file to avoid this error.")
if min(concated_df.shape) <= 1:
if min(dataframe.shape) <= 1:
raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
f"The ml post-processing requires more than one pathway, but currently "
f"there are only {min(concated_df.shape)} pathways.")

return concated_df
f"there are only {min(dataframe.shape)} pathways.")


def create_palette(column_names):
Expand All @@ -121,6 +126,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
@param components: the number of principal components to calculate (Default is 2)
@param labels: determines if labels will be included in the scatterplot (Default is True)
"""
validate_df(dataframe)
df = dataframe.reset_index(drop=True)
columns = dataframe.columns
column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-'
Expand Down Expand Up @@ -222,6 +228,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin
@param linkage: methods for calculating the distance between clusters
@param metric: used for distance computation between instances of clusters
"""
validate_df(dataframe)
if linkage not in linkage_methods:
raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
if metric not in distance_metrics:
Expand Down Expand Up @@ -280,6 +287,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
@param linkage: methods for calculating the distance between clusters
@param metric: used for distance computation between instances of clusters
"""
validate_df(dataframe)
if linkage not in linkage_methods:
raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
if linkage == "ward":
Expand Down
10 changes: 6 additions & 4 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,13 @@ def __init__(self, raw_config):
# Only includes algorithms that are set to be run with 'include: true'.
self.algorithm_params = None
# Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs.
self.algorithm_directed = None
self.algorithm_directed = None
# A dict with the analysis settings
self.analysis_params = None
# A dict with the ML settings
self.ml_params = None
# A Boolean specifying whether to run ML analysis for individual algorithms
self.analysis_include_ml_aggregate_algo = None
# A dict with the PCA settings
self.pca_params = None
# A dict with the hierarchical clustering settings
Expand Down Expand Up @@ -254,7 +256,7 @@ def process_config(self, raw_config):
raise ValueError("Evaluation analysis cannot run as gold standard data not provided. "
"Please set evaluation include to false or provide gold standard data.")

if 'aggregate_per_algorithm' not in self.ml_params:
self.analysis_include_ml_aggregate_algo = False
else:
if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml:
self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
else:
self.analysis_include_ml_aggregate_algo = False
1 change: 1 addition & 0 deletions test/ml/expected/expected-ensemble-network-empty.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Node1 Node2 Frequency Direction
2 changes: 2 additions & 0 deletions test/ml/expected/expected-ensemble-network-single.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Node1 Node2 Frequency Direction
L M 1.0 U
39 changes: 35 additions & 4 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,25 @@ def test_summarize_networks_wrong_direction(self):
with pytest.raises(ValueError):
ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt'])

def test_summarize_networks_empty(self):
def test_empty(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing
ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt',
OUT_DIR + 'pca-empty-coordinates.tsv')
with pytest.raises(ValueError):
ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt')
with pytest.raises(ValueError):
ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt')

def test_single_line(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing
ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt',
OUT_DIR + 'pca-single-line-coordinates.tsv')
with pytest.raises(ValueError):
ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt')
with pytest.raises(ValueError):
ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt')

def test_pca(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
Expand Down Expand Up @@ -85,7 +97,6 @@ def test_pca_robustness(self):

assert coord.equals(expected)


def test_hac_horizontal(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
ml.hac_horizontal(dataframe, OUT_DIR + 'hac-horizontal.png', OUT_DIR + 'hac-clusters-horizontal.txt')
Expand All @@ -108,3 +119,23 @@ def test_ensemble_network(self):
expected = expected.round(5)

assert en.equals(expected)

def test_ensemble_network_single_line(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-single.tsv')

en = pd.read_table(OUT_DIR + 'ensemble-network-single.tsv')
en = en.round(5)
expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-single.tsv')
expected = expected.round(5)

assert en.equals(expected)

def test_ensemble_network_empty(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv')

en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv')
expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')

assert en.equals(expected)
Loading