Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decouple Ensembling #175

Merged
merged 9 commits into from
Sep 22, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,14 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_ml_aggregate_algo:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
agitter marked this conversation as resolved.
Show resolved Hide resolved
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -291,18 +291,28 @@ rule ml_analysis:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
agitter marked this conversation as resolved.
Show resolved Hide resolved
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble the output pathways for each dataset.
# Combines every algorithm's pathway for a dataset into one frequency-annotated network.
rule ensemble:
    input:
        pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
    output:
        ensemble_network_file = SEP.join([out_dir, '{dataset}-ml', 'ensemble-pathway.txt'])
    run:
        # Condense all pathways into one edge-membership dataframe, then write
        # the per-edge frequency across algorithms as the ensemble network
        summary_df = ml.summarize_networks(input.pathways)
        ml.ensemble_network(summary_df, output.ensemble_network_file)

# Returns all pathways for a specific algorithm
def collect_pathways_per_algo(wildcards):
    """
    Snakemake input function: return the pathway files produced by the algorithm
    named in wildcards.algorithm, across all of its parameter combinations.
    """
    # Match the algorithm name as a '-'-terminated prefix rather than a raw substring so an
    # algorithm whose name is contained in another algorithm's name (or in a params hash) is
    # not incorrectly included.
    # NOTE(review): assumes algorithms_with_params entries look like '<algorithm>-<params hash>',
    # consistent with the '{dataset}-{algorithm_params}' output pattern — confirm against config.
    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if algo_param.startswith(wildcards.algorithm + '-')]
    return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)

# Cluster the output pathways for each dataset per algorithm
rule ml_analysis_aggregate_algo:
input:
pathways = collect_pathways_per_algo
Expand All @@ -314,12 +324,20 @@ rule ml_analysis_aggregate_algo:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble the output pathways for each dataset per algorithm.
# Like the ensemble rule, but restricted to one algorithm's parameter combinations.
rule ensemble_per_algo:
    input:
        pathways = collect_pathways_per_algo
    output:
        ensemble_network_file = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
    run:
        summary_df = ml.summarize_networks(input.pathways)
        ml.ensemble_network(summary_df, output.ensemble_network_file)

# Remove the output directory
Expand Down
19 changes: 12 additions & 7 deletions spras/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,19 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

# don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
if concated_df.empty:
return concated_df

def validate_df(dataframe: pd.DataFrame) -> None:
    """
    Raises an error if the dataframe is empty or contains one pathway (one row)

    @param dataframe: the summarized network dataframe (edges x pathways) from summarize_networks
    @raise ValueError: if the dataframe is empty or has fewer than two pathways
    """
    if dataframe.empty:
        raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
                         "suggest setting ml include: false in the configuration file to avoid this error.")
    # PCA and clustering need at least two samples, so a single row (or column) is rejected
    if min(dataframe.shape) <= 1:
        raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
                         f"The ml post-processing requires more than one pathway, but currently "
                         f"there are only {min(dataframe.shape)} pathways.")

def create_palette(column_names):
"""
Expand All @@ -121,6 +123,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
@param components: the number of principal components to calculate (Default is 2)
@param labels: determines if labels will be included in the scatterplot (Default is True)
"""
validate_df(dataframe)
df = dataframe.reset_index(drop=True)
columns = dataframe.columns
column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-'
Expand Down Expand Up @@ -222,6 +225,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin
@param linkage: methods for calculating the distance between clusters
@param metric: used for distance computation between instances of clusters
"""
validate_df(dataframe)
if linkage not in linkage_methods:
raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
if metric not in distance_metrics:
Expand Down Expand Up @@ -280,6 +284,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
@param linkage: methods for calculating the distance between clusters
@param metric: used for distance computation between instances of clusters
"""
validate_df(dataframe)
if linkage not in linkage_methods:
raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
if linkage == "ward":
Expand Down
1 change: 1 addition & 0 deletions test/ml/expected/expected-ensemble-network-empty.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Node1 Node2 Frequency Direction
2 changes: 2 additions & 0 deletions test/ml/expected/expected-ensemble-network-single.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Node1 Node2 Frequency Direction
L M 1.0 U
39 changes: 36 additions & 3 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,25 @@ def test_summarize_networks_wrong_direction(self):
with pytest.raises(ValueError):
ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt'])

def test_empty(self):
    """Every ML post-processing step should raise ValueError on an empty dataframe"""
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
    with pytest.raises(ValueError):  # raises error if empty dataframe is used for post processing
        ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt',
               OUT_DIR + 'pca-empty-coordinates.tsv')
    with pytest.raises(ValueError):
        ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png', OUT_DIR + 'hac-empty-clusters-horizontal.txt')
    with pytest.raises(ValueError):
        ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png', OUT_DIR + 'hac-empty-clusters-vertical.txt')

def test_single_line(self):
    """Every ML post-processing step should raise ValueError on a single-pathway dataframe"""
    # load the single-pathway input (not the empty one) so the single-row case is what is exercised
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
    with pytest.raises(ValueError):  # raises error if single line in file s.t. single row in dataframe is used for post processing
        ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt',
               OUT_DIR + 'pca-single-line-coordinates.tsv')
    with pytest.raises(ValueError):
        ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png', OUT_DIR + 'hac-single-line-clusters-horizontal.txt')
    with pytest.raises(ValueError):
        ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png', OUT_DIR + 'hac-single-line-clusters-vertical.txt')

def test_pca(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
Expand Down Expand Up @@ -83,3 +95,24 @@ def test_ensemble_network(self):
expected = expected.round(5)

assert en.equals(expected)

def test_ensemble_network_single_line(self):
    """Ensembling a single pathway should match the expected single-pathway frequency table"""
    summary = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
    ml.ensemble_network(summary, OUT_DIR + 'ensemble-network-single.tsv')

    # round both tables before comparing to avoid float representation noise
    actual = pd.read_table(OUT_DIR + 'ensemble-network-single.tsv').round(5)
    expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-single.tsv').round(5)
    assert actual.equals(expected)

def test_ensemble_network_empty(self):
    """Ensembling an empty pathway set should produce a header-only ensemble file"""
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
    ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network-empty.tsv')

    en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv')
    en = en.round(5)  # round both frames for consistency with the other ensemble tests
    expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')
    expected = expected.round(5)

    assert en.equals(expected)