Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decouple Ensembling #175

Merged
merged 9 commits into from
Sep 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
agitter marked this conversation as resolved.
Show resolved Hide resolved
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -291,13 +291,11 @@ rule ml_analysis:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
agitter marked this conversation as resolved.
Show resolved Hide resolved
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

def collect_pathways_per_algo(wildcards):
filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
Expand All @@ -314,12 +312,28 @@ rule ml_analysis_aggregate_algo:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Ensemble over every algorithm/parameter combination of one dataset.
# The pathway files are summarized with ml.summarize_networks and the summary is
# handed to ml.ensemble_network together with the output path.
rule ensemble:
    input:
        pathways = expand(
            '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt',
            out_dir=out_dir,
            sep=SEP,
            algorithm_params=algorithms_with_params
        )
    output:
        ensemble_network_file = SEP.join([out_dir, '{dataset}-ml', 'ensemble-pathway.txt'])
    run:
        network_summary = ml.summarize_networks(input.pathways)
        ml.ensemble_network(network_summary, output.ensemble_network_file)

# Ensemble restricted to a single algorithm of one dataset.
# The input function collect_pathways_per_algo supplies the pathway files for the
# {algorithm} wildcard; they are summarized and passed to ml.ensemble_network
# together with the per-algorithm output path.
rule ensemble_per_algo:
    input:
        pathways = collect_pathways_per_algo
    output:
        ensemble_network_file = SEP.join(
            [out_dir, '{dataset}-ml', '{algorithm}-ensemble-pathway.txt']
        )
    run:
        network_summary = ml.summarize_networks(input.pathways)
        ml.ensemble_network(network_summary, output.ensemble_network_file)

# Remove the output directory
Expand Down
14 changes: 8 additions & 6 deletions spras/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,17 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

# don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
if concated_df.empty:
return concated_df

def df_error(dataframe: pd.DataFrame) -> None:
    """
    Validate that a summarized network dataframe can be used for ML post-processing.

    @param dataframe: the dataframe to check before running an ML analysis
    @raises ValueError: if the dataframe is empty, or if its smallest dimension is <= 1
    """
    if dataframe.empty:
        raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
                         "suggest setting ml include: false in the configuration file to avoid this error.")
    # Compute the size from the argument itself: this function has no access to the
    # caller's local dataframe, so the message must not refer to any other name.
    smallest_dim = min(dataframe.shape)
    if smallest_dim <= 1:
        raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
                         f"The ml post-processing requires more than one pathway, but currently "
                         f"there are only {smallest_dim} pathways.")

return concated_df


def create_palette(column_names):
"""
Generates a dictionary mapping each column name (algorithm name)
Expand All @@ -121,6 +120,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
@param components: the number of principal components to calculate (Default is 2)
@param labels: determines if labels will be included in the scatterplot (Default is True)
"""
df_error(dataframe)
df = dataframe.reset_index(drop=True)
columns = dataframe.columns
column_names = [element.split('-')[-3] for element in columns] # assume algorithm names do not contain '-'
Expand Down Expand Up @@ -222,6 +222,7 @@ def hac_vertical(dataframe: pd.DataFrame, output_png: str, output_file: str, lin
@param linkage: methods for calculating the distance between clusters
@param metric: used for distance computation between instances of clusters
"""
df_error(dataframe)
if linkage not in linkage_methods:
raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
if metric not in distance_metrics:
Expand Down Expand Up @@ -280,6 +281,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
@param linkage: methods for calculating the distance between clusters
@param metric: used for distance computation between instances of clusters
"""
df_error(dataframe)
if linkage not in linkage_methods:
raise ValueError(f"linkage={linkage} must be one of {linkage_methods}")
if linkage == "ward":
Expand Down
34 changes: 30 additions & 4 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,39 @@ def test_summarize_networks_wrong_direction(self):
with pytest.raises(ValueError):
ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt'])

def test_summarize_networks_empty(self):
# TODO: Can I move everything into one test?
def test_empty_pca(self):
    """PCA must raise a ValueError when given the summary of the empty test file."""
    # Build the dataframe once, outside pytest.raises, so that only an error raised
    # by ml.pca itself can satisfy the test.
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
    with pytest.raises(ValueError):  # raises error if empty dataframe is used for post processing
        ml.pca(dataframe, OUT_DIR + 'pca-empty.png', OUT_DIR + 'pca-empty-variance.txt',
               OUT_DIR + 'pca-empty-coordinates.tsv')

def test_single_line(self):
def test_empty_hac_horizontal(self):
    """Horizontal HAC must raise a ValueError when given the summary of the empty test file."""
    # Build the dataframe outside pytest.raises so that only an error raised by
    # ml.hac_horizontal itself can satisfy the test.
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
    with pytest.raises(ValueError):  # raises error if empty dataframe is used for post processing
        ml.hac_horizontal(dataframe, OUT_DIR + 'hac-empty-horizontal.png',
                          OUT_DIR + 'hac-empty-clusters-horizontal.txt')

def test_empty_hac_vertical(self):
    """Vertical HAC must raise a ValueError when given the summary of the empty test file."""
    # Build the dataframe outside pytest.raises so that only an error raised by
    # ml.hac_vertical itself can satisfy the test.
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
    with pytest.raises(ValueError):  # raises error if empty dataframe is used for post processing
        ml.hac_vertical(dataframe, OUT_DIR + 'hac-empty-vertical.png',
                        OUT_DIR + 'hac-empty-clusters-vertical.txt')

# TODO: Can I move everything into one test?
def test_single_line_pca(self):
    """PCA must raise a ValueError when given the summary of the single-line test file."""
    # Use the single-line fixture (not the empty one) so the single-row case is
    # really exercised, and build the dataframe outside pytest.raises so that only
    # an error raised by ml.pca itself can satisfy the test.
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
    with pytest.raises(ValueError):  # raises error if single line in file s.t. single row in dataframe is used for post processing
        ml.pca(dataframe, OUT_DIR + 'pca-single-line.png', OUT_DIR + 'pca-single-line-variance.txt',
               OUT_DIR + 'pca-single-line-coordinates.tsv')

def test_single_line_hac_horizontal(self):
    """Horizontal HAC must raise a ValueError when given the summary of the single-line test file."""
    # Use the single-line fixture (not the empty one) so the single-row case is
    # really exercised, and build the dataframe outside pytest.raises so that only
    # an error raised by ml.hac_horizontal itself can satisfy the test.
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
    with pytest.raises(ValueError):  # raises error if single line in file s.t. single row in dataframe is used for post processing
        ml.hac_horizontal(dataframe, OUT_DIR + 'hac-single-line-horizontal.png',
                          OUT_DIR + 'hac-single-line-clusters-horizontal.txt')

def test_single_line_hac_vertical(self):
    """Vertical HAC must raise a ValueError when given the summary of the single-line test file."""
    # Keep the dataframe built from the single-line fixture instead of discarding it
    # and substituting the empty fixture, and build it outside pytest.raises so that
    # only an error raised by ml.hac_vertical itself can satisfy the test.
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
    with pytest.raises(ValueError):  # raises error if single line in file s.t. single row in dataframe is used for post processing
        ml.hac_vertical(dataframe, OUT_DIR + 'hac-single-line-vertical.png',
                        OUT_DIR + 'hac-single-line-clusters-vertical.txt')

def test_pca(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
Expand Down