Skip to content

Commit

Permalink
Merge pull request #119 from blab/cluster-by-distances
Browse files Browse the repository at this point in the history
Run HDBSCAN directly on genetic distances and compare clusters to those from embeddings
  • Loading branch information
huddlej authored Aug 7, 2024
2 parents 8a20d87 + 3b182c6 commit d12aee2
Show file tree
Hide file tree
Showing 103 changed files with 19,459 additions and 13,975 deletions.
29 changes: 28 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ EMBEDDING_NAME_BY_METHOD = {
"mds": "MDS",
"t-sne": "t-SNE",
"umap": "UMAP",
"genetic": "genetic",
}

DISTANCE_THRESHOLDS = [
Expand All @@ -37,6 +38,32 @@ DISTANCE_THRESHOLDS = [
6.0,
6.5,
7.0,
7.5,
8.0,
8.5,
9.0,
9.5,
10.0,
10.5,
11.0,
11.5,
12.0,
12.5,
13.0,
13.5,
14.0,
14.5,
15.0,
15.5,
16.0,
16.5,
17.0,
17.5,
18.0,
18.5,
19.0,
19.5,
20.0,
]
CLUSTER_MIN_SIZE = 10
CLUSTER_MIN_SAMPLES = 5
Expand All @@ -63,7 +90,7 @@ wildcard_constraints:
ha_concatenated="(ha|na|concatenated)",
ha_concat="(ha|concatenated)",
clade_membership="(Nextstrain_clade|Nextclade_pango|Nextclade_pango_collapsed)",
replicate="\d+",
replicate="[0-9]+",
subsampling_scheme="(even|random)",

# Define final outputs for the workflow.
Expand Down
44 changes: 36 additions & 8 deletions ha-na-nextstrain/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,28 @@ rule seasonal_flu_reassortment_cluster_with_optimal_parameters:
--output-figure {output.clustered_embedding_figure}
"""

rule seasonal_flu_reassortment_cluster_distances_with_optimal_parameters:
# Assign cluster labels by running `pathogen-cluster` on a distance matrix
# (via --distance-matrix) and write one label per strain.
#
# Inputs:
#   distances  - distance matrix CSV for the requested {ha_concatenated} dataset.
#   parameters - CSV with `method` and `distance_threshold` columns. It is read
#                from the seasonal-flu-nextstrain results directory rather than
#                from this analysis' own results directory.
# Output:
#   clustered  - CSV restricted to the `strain` and
#                `genetic_{ha_concatenated}_label` columns.
#
# NOTE(review): the output path has the same shape as the
# cluster_embed_{method}_{ha_concatenated}.csv files consumed by other rules;
# presumably no other rule can also produce it for method == "genetic" — confirm.
input:
distances="ha-na-nextstrain/results/distance_matrix_{ha_concatenated}.csv",
parameters="seasonal-flu-nextstrain/results/optimal_cluster_accuracy_and_parameters.csv",
output:
clustered="ha-na-nextstrain/results/cluster_embed_genetic_{ha_concatenated}.csv",
conda: "../cartography.yml"
params:
# Both values come from constants defined outside this rule.
min_size=CLUSTER_MIN_SIZE,
min_samples=CLUSTER_MIN_SAMPLES,
# The distance threshold is looked up at run time: csvtk keeps the rows of the
# parameters table where method == "genetic", selects the distance_threshold
# column, and drops the header so only the value is substituted.
# NOTE(review): this assumes exactly one "genetic" row; several matching rows
# would put several lines into the --distance-threshold argument — confirm.
# pathogen-cluster writes its table to /dev/stdout so tsv-select can trim it to
# the strain and label columns before it is saved to the output file.
shell:
"""
pathogen-cluster \
--distance-matrix {input.distances} \
--label-attribute "genetic_{wildcards.ha_concatenated}_label" \
--min-size {params.min_size} \
--min-samples {params.min_samples} \
--distance-threshold "$(csvtk filter2 -f '$method=="genetic"' {input.parameters} | csvtk cut -f distance_threshold | csvtk del-header)" \
--output-dataframe /dev/stdout \
| tsv-select -H --delimiter "," -f strain,genetic_{wildcards.ha_concatenated}_label > {output.clustered}
"""

rule seasonal_flu_reassortment_create_node_output:
input:
dataframe = "ha-na-nextstrain/results/cluster_embed_{method}_{ha_concatenated}.csv"
Expand All @@ -477,6 +499,8 @@ rule seasonal_flu_reassortment_export:
mccs = "ha-na-nextstrain/results/mccs.json",
embeddings = expand("ha-na-nextstrain/results/cluster_embed_{embedding}_ha.json", embedding=EMBEDDING_METHODS),
embeddings_concat = expand("ha-na-nextstrain/results/cluster_embed_{embedding}_concatenated.json", embedding=EMBEDDING_METHODS),
genetic_clusters_ha="ha-na-nextstrain/results/cluster_embed_genetic_ha.json",
genetic_clusters_concat="ha-na-nextstrain/results/cluster_embed_genetic_concatenated.json",
auspice_config = seasonal_flu_reassortment_files.auspice_config,
clades = "ha-na-nextstrain/results/clades_ha.json"
output:
Expand All @@ -488,7 +512,7 @@ rule seasonal_flu_reassortment_export:
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.clades} {input.nt_muts} {input.aa_muts} {input.embeddings} {input.embeddings_concat} {input.mccs} \
--node-data {input.branch_lengths} {input.clades} {input.nt_muts} {input.aa_muts} {input.embeddings} {input.embeddings_concat} {input.mccs} {input.genetic_clusters_ha} {input.genetic_clusters_concat} \
--auspice-config {input.auspice_config} \
--include-root-sequence \
--minify-json \
Expand Down Expand Up @@ -524,10 +548,12 @@ rule seasonal_flu_reassortment_create_distance_dataframe:
dataframe_mds = "ha-na-nextstrain/results/cluster_embed_mds_ha.csv",
dataframe_tsne = "ha-na-nextstrain/results/cluster_embed_t-sne_ha.csv",
dataframe_umap = "ha-na-nextstrain/results/cluster_embed_umap_ha.csv",
dataframe_genetic = "ha-na-nextstrain/results/cluster_embed_genetic_ha.csv",
dataframe_pca_na = "ha-na-nextstrain/results/cluster_embed_pca_concatenated.csv",
dataframe_mds_na = "ha-na-nextstrain/results/cluster_embed_mds_concatenated.csv",
dataframe_tsne_na = "ha-na-nextstrain/results/cluster_embed_t-sne_concatenated.csv",
dataframe_umap_na = "ha-na-nextstrain/results/cluster_embed_umap_concatenated.csv"
dataframe_umap_na = "ha-na-nextstrain/results/cluster_embed_umap_concatenated.csv",
dataframe_genetic_na = "ha-na-nextstrain/results/cluster_embed_genetic_concatenated.csv",
output:
metadata = "ha-na-nextstrain/results/ha_concatenated_data.csv"
conda: "../cartography.yml"
Expand All @@ -553,7 +579,7 @@ rule seasonal_flu_reassortment_infer_cluster_labels:
columns=lambda wildcards: [
f"{method}_{ha_concatenated}_label"
for ha_concatenated in HA_CONCAT
for method in EMBEDDING_METHODS
for method in EMBEDDING_METHODS + ["genetic"]
],
shell:
"""
Expand All @@ -576,12 +602,12 @@ rule seasonal_flu_reassortment_find_monophyletic_clusters:
methods=lambda wildcards: [
EMBEDDING_NAME_BY_METHOD[method] + " (" + ("HA only" if ha_concatenated == "ha" else "HA/NA") + ")"
for ha_concatenated in HA_CONCAT
for method in EMBEDDING_METHODS
for method in EMBEDDING_METHODS + ["genetic"]
],
attributes=lambda wildcards: [
f"{method}_{ha_concatenated}_label"
for ha_concatenated in HA_CONCAT
for method in EMBEDDING_METHODS
for method in EMBEDDING_METHODS + ["genetic"]
],
shell:
"""
Expand Down Expand Up @@ -634,6 +660,7 @@ rule seasonal_flu_reassortment_cluster_accuracy:
conda: "../cartography.yml"
params:
clade_column="MCC",
ignored_clusters="unassigned",
shell:
"""
python3 notebooks/scripts/metadata_HDBSCAN.py \
Expand All @@ -642,12 +669,13 @@ rule seasonal_flu_reassortment_cluster_accuracy:
--true-clusters-column {params.clade_column:q} \
--predicted-clusters {input.embedding} \
--predicted-clusters-column "{wildcards.method}_{wildcards.ha_concatenated}_label" \
--ignored-clusters {params.ignored_clusters:q} \
--output {output.dataframe}
"""

rule seasonal_flu_reassortment_concat_HDBSCAN_table:
input:
dataframes = expand("ha-na-nextstrain/results/cluster_accuracy_{method}_{{ha_concatenated}}.csv", method=EMBEDDING_METHODS)
dataframes = expand("ha-na-nextstrain/results/cluster_accuracy_{method}_{{ha_concatenated}}.csv", method=EMBEDDING_METHODS + ["genetic"])
output:
metadata = "ha-na-nextstrain/results/full_HDBSCAN_metadata_{ha_concatenated}.csv"
params:
Expand Down Expand Up @@ -693,7 +721,7 @@ rule seasonal_flu_reassortment_create_mutation_table:

rule seasonal_flu_reassortment_concat_mutation_tables:
input:
mutation_tables=expand("ha-na-nextstrain/results/mutation_table_{method}_{segment}.csv", method=EMBEDDING_METHODS, segment=SEGMENTS),
mutation_tables=expand("ha-na-nextstrain/results/mutation_table_{method}_{segment}.csv", method=EMBEDDING_METHODS + ["genetic"], segment=SEGMENTS),
output:
metadata = "ha-na-nextstrain/results/mutation_table.csv",
params:
Expand Down Expand Up @@ -727,7 +755,7 @@ rule seasonal_flu_reassortment_within_between_stats:

rule seasonal_flu_reassortment_concat_within_between_stats:
input:
tables=expand("ha-na-nextstrain/results/within_between_stats/{group_column}.csv", group_column=["MCC"] + [f"{method}_concatenated_label" for method in EMBEDDING_METHODS])
tables=expand("ha-na-nextstrain/results/within_between_stats/{group_column}.csv", group_column=["MCC"] + [f"{method}_concatenated_label" for method in EMBEDDING_METHODS + ["genetic"]])
output:
metadata = "ha-na-nextstrain/results/full_within_between_stats.csv"
conda: "../cartography.yml"
Expand Down
12 changes: 12 additions & 0 deletions ha-na-nextstrain/config/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
"title": "HA/NA UMAP cluster",
"type": "categorical"
},
{
"key": "genetic_concatenated_label",
"title": "HA/NA genetic distance cluster",
"type": "categorical"
},
{
"key": "pca_ha_label",
"title": "HA PCA cluster",
Expand All @@ -51,6 +56,11 @@
"title": "HA UMAP cluster",
"type": "categorical"
},
{
"key": "genetic_ha_label",
"title": "HA genetic distance cluster",
"type": "categorical"
},
{
"key": "gt",
"title": "Genotype",
Expand Down Expand Up @@ -126,10 +136,12 @@
"mds_ha_label",
"t-sne_ha_label",
"umap_ha_label",
"genetic_ha_label",
"pca_concatenated_label",
"mds_concatenated_label",
"t-sne_concatenated_label",
"umap_concatenated_label",
"genetic_concatenated_label",
"clade_membership"
],

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
normalized_vi,method,predicted_clusters_column,n_predicted_clusters,n_predicted_cluster_samples,n_ignored_predicted_clusters,n_true_cluster_samples,n_ignored_true_clusters,n_vi_cluster_samples
0.06,t-sne,t-sne_concatenated_label,17,1608,0,1049,558,1049
0.11,mds,mds_concatenated_label,17,1608,0,1049,558,1049
0.11,umap,umap_concatenated_label,8,1608,0,1049,558,1049
0.11,genetic,genetic_concatenated_label,24,1608,0,1049,558,1049
0.13,pca,pca_concatenated_label,9,1608,0,1049,558,1049
6 changes: 6 additions & 0 deletions ha-na-nextstrain/results/full_HDBSCAN_metadata_ha.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
normalized_vi,method,predicted_clusters_column,n_predicted_clusters,n_predicted_cluster_samples,n_ignored_predicted_clusters,n_true_cluster_samples,n_ignored_true_clusters,n_vi_cluster_samples
0.11,t-sne,t-sne_ha_label,15,1608,0,1049,558,1049
0.14,umap,umap_ha_label,7,1608,0,1049,558,1049
0.18,pca,pca_ha_label,6,1608,0,1049,558,1049
0.18,mds,mds_ha_label,8,1608,0,1049,558,1049
0.2,genetic,genetic_ha_label,8,1608,0,1049,558,1049
13 changes: 13 additions & 0 deletions ha-na-nextstrain/results/full_within_between_stats.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
comparison,mean,median,std,group,dataset_name
between,52.62120724379293,50.0,17.461483780312808,MCC,seasonal-flu-h3n2-ha-na-2016-2018
within,25.03519395094363,18.0,20.824024544878867,MCC,seasonal-flu-h3n2-ha-na-2016-2018
between,49.03775789707037,45.0,17.646502082489178,pca_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
within,21.23224366722899,21.0,9.176271138084312,pca_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
between,46.55477667234857,43.0,18.296165389636226,mds_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
within,20.15751214404641,15.0,15.433420831086393,mds_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
between,47.26407165506307,43.0,18.0524801799966,t-sne_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
within,16.530859919452336,16.0,6.823419674837644,t-sne_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
between,48.764280944914695,45.0,17.634841931563823,umap_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
within,21.2107974170994,20.0,10.580981662906307,umap_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
between,45.97074825359922,42.0,18.648389707399943,genetic_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
within,18.72484394799831,14.0,13.85786212071416,genetic_concatenated_label,seasonal-flu-h3n2-ha-na-2016-2018
11 changes: 11 additions & 0 deletions ha-na-nextstrain/results/monophyletic_clusters.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
dataset,method,n_clusters,n_cluster_transitions,n_extra_transitions,clusters,transitions
seasonal-flu-h3n2-ha-na-2016-2018,PCA (HA only),7,6,0,"['-1', '0', '1', '2', '3', '4', '5']","[('0', '2'), ('2', '1'), ('2', '3'), ('3', '5'), ('5', '3'), ('5', '4')]"
seasonal-flu-h3n2-ha-na-2016-2018,MDS (HA only),9,8,0,"['-1', '0', '1', '2', '3', '4', '5', '6', '7']","[('-1', '0'), ('-1', '2'), ('2', '3'), ('-1', '6'), ('-1', '1'), ('6', '7'), ('6', '4'), ('-1', '5')]"
seasonal-flu-h3n2-ha-na-2016-2018,t-SNE (HA only),16,14,0,"['-1', '0', '1', '10', '11', '12', '13', '14', '2', '3', '4', '5', '6', '7', '8', '9']","[('1', '5'), ('1', '0'), ('5', '7'), ('7', '4'), ('7', '8'), ('7', '6'), ('6', '3'), ('6', '14'), ('14', '13'), ('14', '12'), ('12', '10'), ('12', '11'), ('14', '9'), ('14', '2')]"
seasonal-flu-h3n2-ha-na-2016-2018,UMAP (HA only),7,6,0,"['0', '1', '2', '3', '4', '5', '6']","[('3', '5'), ('3', '2'), ('5', '6'), ('5', '1'), ('5', '4'), ('4', '0')]"
seasonal-flu-h3n2-ha-na-2016-2018,genetic (HA only),9,8,0,"['-1', '0', '1', '2', '3', '4', '5', '6', '7']","[('-1', '1'), ('-1', '0'), ('-1', '2'), ('-1', '3'), ('-1', '4'), ('-1', '5'), ('3', '7'), ('3', '6')]"
seasonal-flu-h3n2-ha-na-2016-2018,PCA (HA/NA),10,9,1,"['-1', '0', '1', '2', '3', '4', '5', '6', '7', '8']","[('0', '8'), ('0', '1'), ('8', '7'), ('8', '6'), ('8', '6'), ('6', '2'), ('2', '3'), ('-1', '5'), ('5', '4')]"
seasonal-flu-h3n2-ha-na-2016-2018,MDS (HA/NA),18,17,2,"['-1', '0', '1', '10', '11', '12', '13', '14', '15', '16', '2', '3', '4', '5', '6', '7', '8', '9']","[('-1', '0'), ('-1', '2'), ('2', '1'), ('-1', '8'), ('-1', '6'), ('6', '5'), ('-1', '11'), ('-1', '10'), ('-1', '4'), ('4', '3'), ('-1', '14'), ('14', '15'), ('15', '12'), ('14', '7'), ('7', '9'), ('7', '9'), ('7', '9')]"
seasonal-flu-h3n2-ha-na-2016-2018,t-SNE (HA/NA),18,19,3,"['-1', '0', '1', '10', '11', '12', '13', '14', '15', '16', '2', '3', '4', '5', '6', '7', '8', '9']","[('6', '1'), ('6', '5'), ('1', '14'), ('14', '0'), ('14', '13'), ('14', '11'), ('14', '10'), ('14', '2'), ('2', '3'), ('11', '16'), ('16', '15'), ('16', '9'), ('15', '12'), ('11', '4'), ('4', '7'), ('4', '8'), ('4', '8'), ('4', '7'), ('4', '7')]"
seasonal-flu-h3n2-ha-na-2016-2018,UMAP (HA/NA),8,11,3,"['0', '1', '2', '3', '4', '5', '6', '7']","[('0', '3'), ('3', '1'), ('3', '4'), ('4', '2'), ('3', '5'), ('5', '7'), ('7', '6'), ('7', '6'), ('7', '6'), ('7', '6'), ('6', '7')]"
seasonal-flu-h3n2-ha-na-2016-2018,genetic (HA/NA),25,40,10,"['-1', '0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '3', '4', '5', '6', '7', '8', '9']","[('-1', '0'), ('-1', '1'), ('1', '2'), ('-1', '3'), ('-1', '13'), ('13', '14'), ('13', '14'), ('-1', '5'), ('5', '6'), ('-1', '3'), ('-1', '11'), ('-1', '15'), ('15', '16'), ('15', '16'), ('15', '16'), ('-1', '19'), ('19', '17'), ('19', '20'), ('19', '7'), ('19', '21'), ('19', '18'), ('19', '12'), ('19', '10'), ('19', '12'), ('19', '21'), ('21', '20'), ('-1', '10'), ('-1', '18'), ('21', '20'), ('20', '18'), ('19', '22'), ('19', '20'), ('20', '19'), ('20', '23'), ('20', '23'), ('-1', '8'), ('8', '9'), ('8', '9'), ('-1', '7'), ('8', '4')]"
Loading

0 comments on commit d12aee2

Please sign in to comment.