Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Synthetic data benchmarking pipeline #8

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2b034cc
added preprocessing files
ctrlaltaf Oct 29, 2024
38add56
added initial namespace files
ctrlaltaf Oct 30, 2024
01dd30d
updated mapper
ctrlaltaf Oct 30, 2024
26c9690
updated docs
ctrlaltaf Oct 30, 2024
8dd5fb2
initial snakemake workflow
ctrlaltaf Nov 4, 2024
1e4c523
setup figure generation
ctrlaltaf Nov 4, 2024
90eb258
added stats calculation
ctrlaltaf Nov 5, 2024
59b669e
added edge overlap stats
ctrlaltaf Nov 5, 2024
0eebecf
added scores stats
ctrlaltaf Nov 5, 2024
935d19b
added auc images
ctrlaltaf Nov 5, 2024
1faff04
refactoring
ctrlaltaf Nov 5, 2024
65bf277
refactored code
ctrlaltaf Nov 5, 2024
3110659
updated structure to take algorithm and pathway combo
ctrlaltaf Nov 5, 2024
e4a9843
initial structure for algorithm auc
ctrlaltaf Nov 5, 2024
56ebd9d
cleaned up code
ctrlaltaf Nov 5, 2024
23ee963
combined auc for algorithm and pathway category
ctrlaltaf Nov 5, 2024
6c53479
cleaned up code
ctrlaltaf Nov 5, 2024
d40371b
refactored outputs
ctrlaltaf Nov 6, 2024
6197b92
added heatmap
ctrlaltaf Nov 6, 2024
a0c39dd
added heatmap
ctrlaltaf Nov 6, 2024
5e65ab5
added new sample dataset
ctrlaltaf Nov 6, 2024
0819762
updated config
ctrlaltaf Nov 6, 2024
69833a4
refactoring
ctrlaltaf Nov 7, 2024
8373ceb
modularized heatmap w/ algorithm pathway combo
ctrlaltaf Nov 8, 2024
8b399e4
updated config and cleaned up code
ctrlaltaf Nov 8, 2024
b3aa213
refactored heatmap
ctrlaltaf Nov 8, 2024
a578c0b
removed obsolete files
ctrlaltaf Nov 8, 2024
f5571b4
updated documentation
ctrlaltaf Nov 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
refactored heatmap
ctrlaltaf committed Nov 8, 2024
commit b3aa2134bf9dd2363fd9297db4e77317e851a205
72 changes: 48 additions & 24 deletions synthetic-benchmarking/benchmarking-pipeline/Snakefile
Original file line number Diff line number Diff line change
@@ -9,44 +9,58 @@ def generate_paths(algorithms, pathways):
for pathway in pathways[algorithm]:
paths.append(f"output/images/network/{algorithm}_{pathway}_spras.png")
paths.append(f"output/images/network/{algorithm}_{pathway}_panther.png")
paths.append(f"output/stats/edges/{algorithm}_{pathway}_stats.txt")
paths.append(f"output/stats/score/{algorithm}_{pathway}.txt")
paths.append(f"output/stats/network/{algorithm}_{pathway}_stats.txt")
paths.append(f"output/stats/score/edges/{algorithm}_{pathway}.txt")
paths.append(f"output/stats/score/nodes/{algorithm}_{pathway}.txt")

return paths

def score_files(algorithms, pathways):
    """Build the per-algorithm score-file inputs and AUC-image outputs.

    Parameters:
        algorithms: iterable of algorithm names.
        pathways: mapping of algorithm name -> list of pathway names.

    Returns:
        A 3-tuple (edge_inputs, node_inputs, outputs) where
        edge_inputs/node_inputs are the edge and node score files under
        output/stats/score/, and outputs are the per-algorithm AUC images
        under output/images/auc/.  Output paths repeat once per pathway of
        the algorithm (the diff's old single-score version, including its
        debug print, was removed).
    """
    edge_inputs = []
    node_inputs = []
    outputs = []
    for algorithm in algorithms:
        for pathway in pathways[algorithm]:
            edge_score_file = f"output/stats/score/edges/{algorithm}_{pathway}.txt"
            node_score_file = f"output/stats/score/nodes/{algorithm}_{pathway}.txt"

            edge_inputs.append(edge_score_file)
            node_inputs.append(node_score_file)
            outputs.append(f"output/images/auc/edges/{algorithm}_alg.png")
            outputs.append(f"output/images/auc/nodes/{algorithm}_alg.png")
    return edge_inputs, node_inputs, outputs

def pathway_files(algorithms, pathways):
    """Build the per-pathway score-file inputs and AUC-image outputs.

    Parameters:
        algorithms: iterable of algorithm names.
        pathways: mapping of algorithm name -> list of pathway names.

    Returns:
        A 3-tuple (edge_inputs, node_inputs, pathway_outputs): the edge and
        node score files under output/stats/score/, and the per-pathway AUC
        images under output/images/auc/.  The superseded single-score
        version (which built `pathway_inputs`) was removed.
    """
    edge_inputs = []
    node_inputs = []
    pathway_outputs = []
    for algorithm in algorithms:
        for pathway in pathways[algorithm]:
            edge_score_file = f"output/stats/score/edges/{algorithm}_{pathway}.txt"
            node_score_file = f"output/stats/score/nodes/{algorithm}_{pathway}.txt"

            edge_inputs.append(edge_score_file)
            node_inputs.append(node_score_file)
            pathway_outputs.append(f"output/images/auc/edges/{pathway}_pathway.png")
            pathway_outputs.append(f"output/images/auc/nodes/{pathway}_pathway.png")

    return edge_inputs, node_inputs, pathway_outputs

def edges_files(algorithms, pathways):
    """Return the per-(algorithm, pathway) network stats files.

    Parameters:
        algorithms: iterable of algorithm names.
        pathways: mapping of algorithm name -> list of pathway names.

    Returns:
        List of "output/stats/network/{algorithm}_{pathway}_stats.txt" paths.
        (The old "output/stats/edges/..." path from the superseded diff line
        was dropped; the rule pathway_stats writes to stats/network/.)
    """
    edge_inputs = []
    for algorithm in algorithms:
        for pathway in pathways[algorithm]:
            edge_file = f"output/stats/network/{algorithm}_{pathway}_stats.txt"
            edge_inputs.append(edge_file)
    return edge_inputs

rule all:
input:
generate_paths(algorithms, pathways) + score_files(algorithms, pathways)[1] + pathway_files(algorithms, pathways)[1],
"output/images/heatmap/heatmap.png"

generate_paths(algorithms, pathways),
score_files(algorithms, pathways)[2],
pathway_files(algorithms, pathways)[2],
"output/images/heatmap/heatmap_edge.png",
"output/images/heatmap/heatmap_node.png"

rule pathway_image:
input:
@@ -63,7 +77,7 @@ rule pathway_stats:
spras=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/spras.txt",
panther=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/panther.txt"
output:
stats="output/stats/edges/{algorithms}_{pathways}_stats.txt"
stats="output/stats/network/{algorithms}_{pathways}_stats.txt"
script:
"scripts/generate_stats.py"

@@ -72,30 +86,40 @@ rule generate_scores:
spras=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/spras.txt",
panther=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/panther.txt"
output:
score="output/stats/score/{algorithms}_{pathways}.txt",
edge_score="output/stats/score/edges/{algorithms}_{pathways}.txt",
node_score="output/stats/score/nodes/{algorithms}_{pathways}.txt",

script:
"scripts/generate_scores_file.py"

rule generate_algorithm_auc:
input:
scores=score_files(algorithms, pathways)[0]
edge_scores=score_files(algorithms, pathways)[0],
node_scores=score_files(algorithms, pathways)[1]

output:
auc_image="output/images/auc/{algorithm}_alg.png"
auc_edge_image="output/images/auc/edges/{algorithm}_alg.png",
auc_node_image="output/images/auc/nodes/{algorithm}_alg.png"
script:
"scripts/generate_auc_figures.py"

rule generate_pathway_auc:
input:
scores=pathway_files(algorithms, pathways)[0]
edge_scores=pathway_files(algorithms, pathways)[0],
node_scores=pathway_files(algorithms, pathways)[1]
output:
auc_image="output/images/auc/{pathway}_pathway.png"
auc_edge_image="output/images/auc/edges/{pathway}_pathway.png",
auc_node_image="output/images/auc/nodes/{pathway}_pathway.png"

script:
"scripts/generate_auc_figures.py"

rule generate_heatmap:
input:
scores=edges_files(algorithms, pathways)
output:
"output/images/heatmap/heatmap.png"
edge_heatmap="output/images/heatmap/heatmap_edge.png",
node_heatmap="output/images/heatmap/heatmap_node.png"

script:
"scripts/generate_heatmap.py"
2 changes: 1 addition & 1 deletion synthetic-benchmarking/benchmarking-pipeline/config.yaml
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ algorithm_pathways:
- egfr
- wnt
OmicsIntegrator1:
- egfr
# - egfr
- wnt
BTB:
- egfr
6 changes: 5 additions & 1 deletion synthetic-benchmarking/benchmarking-pipeline/environment.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: spras-benchmarking
name: benchmarking-pipeline
channels:
- bioconda
- conda-forge
@@ -129,6 +129,7 @@ dependencies:
- packaging=24.1=pyhd8ed1ab_0
- pandas=2.2.3=py312h98e817e_1
- pango=1.54.0=h115fe74_1
- patsy=0.5.6=pyhd8ed1ab_0
- pcre2=10.43=h0ad2156_0
- perl=5.32.1=7_h10d778d_perl5
- pillow=10.3.0=py312h0c923fa_0
@@ -160,6 +161,8 @@ dependencies:
- samtools=1.19.2=hd510865_1
- scikit-learn=1.5.2=py312h9d777eb_1
- scipy=1.14.1=py312h888eae2_1
- seaborn=0.13.2=hd8ed1ab_2
- seaborn-base=0.13.2=pyhd8ed1ab_2
- setuptools=75.3.0=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- smart_open=7.0.5=pyhd8ed1ab_1
@@ -169,6 +172,7 @@ dependencies:
- snakemake-interface-report-plugins=1.1.0=pyhdfd78af_0
- snakemake-interface-storage-plugins=3.3.0=pyhdfd78af_0
- snakemake-minimal=8.25.1=pyhdfd78af_0
- statsmodels=0.14.4=py312h3a11e2b_0
- tabulate=0.9.0=pyhd8ed1ab_1
- threadpoolctl=3.5.0=pyhc1e730c_0
- throttler=1.2.2=pyhd8ed1ab_0
Original file line number Diff line number Diff line change
@@ -11,3 +11,6 @@ E F 1 D
E G 1 D
H G 1 D
F G 1 D
G P 1 D
P N 1 D
N F 1 D
Original file line number Diff line number Diff line change
@@ -6,3 +6,6 @@ D C 1 D
C E 1 D
C F 1 D
F G 1 D
G H 1 D
H Z 1 D
Z A 1 D
Original file line number Diff line number Diff line change
@@ -4,6 +4,9 @@ A E 1 D
A F 1 D
B C 1 D
C E 1 D
C J 1 D
J M 1 D
M N 1 D
C F 1 D
D A 1 D
D E 1 D
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Node1 Node2 Rank Direction
A C 1 D
A Z 1 D
A D 1 D
B C 1 D
D C 1 D
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Node1 Node2 Rank Direction
A D 1 D
A K 1 D
A J 1 D
B D 1 D
B C 1 D
C D 1 D
Original file line number Diff line number Diff line change
@@ -5,4 +5,6 @@ B C 1 D
D C 1 D
C E 1 D
C F 1 D
C P 1 D
C K 1 D
F G 1 D
Original file line number Diff line number Diff line change
@@ -5,15 +5,17 @@
import networkx as nx
from sklearn.metrics import auc, precision_recall_curve, roc_curve

input_files = snakemake.input.scores
output_pr_path = snakemake.output[0]
edge_input_files = snakemake.input.edge_scores
node_input_files = snakemake.input.node_scores
output_edge_path = snakemake.output[0]
output_node_path = snakemake.output[1]

auc_type = output_pr_path.split("/")[-1].split("_")[-1].split(".")[0]
algorithm = output_pr_path.split("/")[-1].split("_")[0]
auc_type = output_edge_path.split("/")[-1].split("_")[-1].split(".")[0]
algorithm = output_edge_path.split("/")[-1].split("_")[0]

filtered_inputs = []
i = 0
for input in input_files:
for input in edge_input_files:
current_alg = ""
if auc_type == "alg":
current_alg = input.split("/")[-1].split("_")[0]
@@ -61,4 +63,59 @@
plt.legend()

plt.tight_layout()
plt.savefig(output_pr_path, format="png", dpi=300)
plt.savefig(output_edge_path, format="png", dpi=300)

auc_type = output_edge_path.split("/")[-1].split("_")[-1].split(".")[0]
algorithm = output_edge_path.split("/")[-1].split("_")[0]

filtered_inputs = []
i = 0
for input in node_input_files:
current_alg = ""
if auc_type == "alg":
current_alg = input.split("/")[-1].split("_")[0]
else:
current_alg = input.split("/")[-1].split("_")[-1].split(".")[0]
if current_alg == algorithm:
filtered_inputs.append(input)

plt.figure(figsize=(10, 5))
colors = ["b", "g", "r", "c", "m", "y", "k", "orange"]

for idx, file in enumerate(filtered_inputs):
data = []
y_true = []
y_score = []
with open(file, "r") as f:
next(f)
for line in f:
line = line.strip()
col = line.split("\t")
y_true.append(int(col[1]))
y_score.append(int(col[2]))

precision, recall, _ = precision_recall_curve(y_true, y_score)
fpr, tpr, _ = roc_curve(y_true, y_score)

plt.subplot(1, 2, 1)
plt.plot(
recall, precision, color=colors[idx % len(colors)], label=os.path.basename(file)
)

plt.subplot(1, 2, 2)
plt.plot(fpr, tpr, color=colors[idx % len(colors)], label=os.path.basename(file))

plt.subplot(1, 2, 1)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()

plt.subplot(1, 2, 2)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

plt.tight_layout()
plt.savefig(output_node_path, format="png", dpi=300)
Original file line number Diff line number Diff line change
@@ -6,7 +6,8 @@
import seaborn as sns

scores_path = snakemake.input.scores
output_path = snakemake.output[0]
output_1_path = snakemake.output[0]
output_2_path = snakemake.output[1]

data = []
algorithms = []
@@ -25,27 +26,49 @@
next(f)
for line in f:
cols = line.split("\t")
data.append((parts[0], parts[1], cols[4]))
data.append((parts[0], parts[1], cols[4], cols[5]))

jaccard_indices_list = []
jaccard_edges_indices_list = []
jaccard_nodes_indices_list = []

for algorithm in algorithms:
current = []
current_edge_list = []
current_node_list = []
for pathway in pathways:
appended = False
for entry in data:
if entry[0] == algorithm and entry[1] == pathway:
appended = True
current.append(float(entry[2]))
current_edge_list.append(float(entry[2]))
current_node_list.append(float(entry[3]))
if appended == False:
current.append(np.nan)
jaccard_indices_list.append(current)
current_edge_list.append(np.nan)
current_node_list.append(np.nan)

jaccard_indices = np.array(jaccard_indices_list, dtype=float)
jaccard_edges_indices_list.append(current_edge_list)
jaccard_nodes_indices_list.append(current_node_list)


jaccard_edges_indices = np.array(jaccard_edges_indices_list, dtype=float)
jaccard_nodes_indices = np.array(jaccard_nodes_indices_list, dtype=float)

plt.figure(figsize=(10, 8))
sns.heatmap(
jaccard_edges_indices,
annot=True,
cmap="viridis",
xticklabels=pathways,
yticklabels=algorithms,
)

plt.xlabel("Pathways")
plt.ylabel("Algorithms")
plt.title("Jaccard Index Edge Heatmap")
plt.savefig(output_1_path, format="png", dpi=300)

plt.figure(figsize=(10, 8))
sns.heatmap(
jaccard_indices,
jaccard_nodes_indices,
annot=True,
cmap="viridis",
xticklabels=pathways,
@@ -54,5 +77,5 @@

plt.xlabel("Pathways")
plt.ylabel("Algorithms")
plt.title("Jaccard Index Heatmap")
plt.savefig(output_path, format="png", dpi=300)
plt.title("Jaccard Index Node Heatmap")
plt.savefig(output_2_path, format="png", dpi=300)
Original file line number Diff line number Diff line change
@@ -5,7 +5,8 @@

spras_path = snakemake.input[0]
panther_path = snakemake.input[1]
output_path = snakemake.output[0]
output_1_path = snakemake.output[0]
output_2_path = snakemake.output[1]

spras_df = pd.read_csv(spras_path, sep="\t")
panther_df = pd.read_csv(panther_path, sep="\t")
@@ -15,14 +16,40 @@

all_edges = pd.concat([spras_df, panther_df]).drop_duplicates()
edge_list = set(zip(all_edges["Node1"], all_edges["Node2"]))
ground_truth = {edge: 1 for edge in zip(panther_df["Node1"], panther_df["Node2"])}
ground_truth_edge = {edge: 1 for edge in zip(panther_df["Node1"], panther_df["Node2"])}

y_true = []
y_scores = []
y_true_edge = []
y_scores_edge = []

f = open(output_path, "w+")

f = open(output_1_path, "w+")
f.write("Node1\tNode2\ty_true\ty_score\n")
for edge in edge_list:
y_true.append(ground_truth.get(edge, 0)) # 1 if edge in file 1, else 0
y_scores.append(1 if edge in zip(spras_df["Node1"], spras_df["Node2"]) else 0)
f.write(f"{edge[0]}\t{edge[1]}\t{y_true[-1]}\t{y_scores[-1]}\n")
y_true_edge.append(ground_truth_edge.get(edge, 0)) # 1 if edge in file 1, else 0
y_scores_edge.append(1 if edge in zip(spras_df["Node1"], spras_df["Node2"]) else 0)
f.write(f"{edge[0]}\t{edge[1]}\t{y_true_edge[-1]}\t{y_scores_edge[-1]}\n")


all_nodes = pd.concat([spras_df, panther_df]).drop_duplicates()
node_list = pd.unique(all_nodes[["Node1", "Node2"]].values.ravel())
spras_node_list = pd.unique(spras_df[["Node1", "Node2"]].values.ravel())
panther_node_list = pd.unique(panther_df[["Node1", "Node2"]].values.ravel())

print(spras_path)
print(f"spras nodes {spras_node_list}")
print(f"spras nodes {panther_node_list}")
print(f"all nodes {node_list}")

ground_truth_node = {node: 1 for node in panther_node_list}
print(ground_truth_node)

y_true_node = []
y_scores_node = []

g = open(output_2_path, "w+")
g.write("Node1\ty_true\ty_score\n")
for node in node_list:
y_true_node.append(ground_truth_node.get(node, 0)) # 1 if node in file 1, else 0
y_scores_node.append(1 if node in spras_node_list else 0)
g.write(f"{node}\t{y_true_node[-1]}\t{y_scores_node[-1]}\n")

Original file line number Diff line number Diff line change
@@ -13,20 +13,29 @@
G = nx.from_pandas_edgelist(spras_df, source="Node1", target="Node2")
H = nx.from_pandas_edgelist(panther_df, source="Node1", target="Node2")

G_set = set(G.edges())
H_set = set(H.edges())
union = G_set | H_set
intersection = G_set & H_set
jaccard_index = len(intersection) / len(union)
overlap_ratio_set1 = (len(intersection) / len(G_set)) * 100
overlap_ratio_set2 = (len(intersection) / len(H_set)) * 100
G_set_edges = set(G.edges())
H_set_edges = set(H.edges())
edge_union = G_set_edges | H_set_edges
edge_intersection = G_set_edges & H_set_edges
jaccard_edge_index = len(edge_intersection) / len(edge_union)
overlap_ratio_edge1 = (len(edge_intersection) / len(G_set_edges)) * 100
overlap_ratio_edge2 = (len(edge_intersection) / len(H_set_edges)) * 100

G_set_nodes = set(G.nodes())
H_set_nodes = set(H.nodes())
node_union = G_set_nodes | H_set_nodes
node_intersection = G_set_nodes & H_set_nodes
jaccard_node_index = len(node_intersection) / len(node_union)
overlap_ratio_node1 = (len(node_intersection) / len(G_set_nodes)) * 100
overlap_ratio_node2 = (len(node_intersection) / len(H_set_nodes)) * 100

columns = [
"spras_nodes",
"panther_nodes",
"spras_edges",
"panther_edges",
"jaccard_index",
"jaccard_edge_index",
"jaccard_node_index",
"panther_spras_edge_overlap",
"spras_panther_edge_overlap",
]
@@ -36,9 +45,10 @@
str(len(H.nodes())),
str(len(G.edges())),
str(len(H.edges())),
str(jaccard_index),
str(overlap_ratio_set1),
str(overlap_ratio_set2),
str(jaccard_edge_index),
str(jaccard_node_index),
str(overlap_ratio_edge1),
str(overlap_ratio_edge2),
]

f = open(output_path, "w+")