Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Synthetic data benchmarking pipeline #8

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2b034cc
added preprocessing files
ctrlaltaf Oct 29, 2024
38add56
added initial namespace files
ctrlaltaf Oct 30, 2024
01dd30d
updated mapper
ctrlaltaf Oct 30, 2024
26c9690
updated docs
ctrlaltaf Oct 30, 2024
8dd5fb2
initial snakemake workflow
ctrlaltaf Nov 4, 2024
1e4c523
setup figure generation
ctrlaltaf Nov 4, 2024
90eb258
added stats calculation
ctrlaltaf Nov 5, 2024
59b669e
added edge overlap stats
ctrlaltaf Nov 5, 2024
0eebecf
added scores stats
ctrlaltaf Nov 5, 2024
935d19b
added auc images
ctrlaltaf Nov 5, 2024
1faff04
refactoring
ctrlaltaf Nov 5, 2024
65bf277
refactored code
ctrlaltaf Nov 5, 2024
3110659
updated structure to take algorithm and pathway combo
ctrlaltaf Nov 5, 2024
e4a9843
initial structure for algorithm auc
ctrlaltaf Nov 5, 2024
56ebd9d
cleaned up code
ctrlaltaf Nov 5, 2024
23ee963
combined auc for algorithm and pathway category
ctrlaltaf Nov 5, 2024
6c53479
cleaned up code
ctrlaltaf Nov 5, 2024
d40371b
refactored outputs
ctrlaltaf Nov 6, 2024
6197b92
added heatmap
ctrlaltaf Nov 6, 2024
a0c39dd
added heatmap
ctrlaltaf Nov 6, 2024
5e65ab5
added new sample dataset
ctrlaltaf Nov 6, 2024
0819762
updated config
ctrlaltaf Nov 6, 2024
69833a4
refactoring
ctrlaltaf Nov 7, 2024
8373ceb
modularized heatmap w/ algorithm pathway combo
ctrlaltaf Nov 8, 2024
8b399e4
updated config and cleaned up code
ctrlaltaf Nov 8, 2024
b3aa213
refactored heatmap
ctrlaltaf Nov 8, 2024
a578c0b
removed obsolete files
ctrlaltaf Nov 8, 2024
f5571b4
updated documentation
ctrlaltaf Nov 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
refactored heatmap
ctrlaltaf committed Nov 8, 2024
commit b3aa2134bf9dd2363fd9297db4e77317e851a205
72 changes: 48 additions & 24 deletions synthetic-benchmarking/benchmarking-pipeline/Snakefile
Original file line number Diff line number Diff line change
@@ -9,44 +9,58 @@ def generate_paths(algorithms, pathways):
for pathway in pathways[algorithm]:
paths.append(f"output/images/network/{algorithm}_{pathway}_spras.png")
paths.append(f"output/images/network/{algorithm}_{pathway}_panther.png")
paths.append(f"output/stats/edges/{algorithm}_{pathway}_stats.txt")
paths.append(f"output/stats/score/{algorithm}_{pathway}.txt")
paths.append(f"output/stats/network/{algorithm}_{pathway}_stats.txt")
paths.append(f"output/stats/score/edges/{algorithm}_{pathway}.txt")
paths.append(f"output/stats/score/nodes/{algorithm}_{pathway}.txt")

return paths

def score_files(algorithms, pathways):
    """Build the per-algorithm score-file inputs and AUC-image outputs.

    Parameters:
        algorithms: iterable of algorithm names.
        pathways: mapping of algorithm name -> list of pathway names.

    Returns:
        A 3-tuple (edge_inputs, node_inputs, outputs) where
        edge_inputs/node_inputs are the edge and node score files under
        output/stats/score/, and outputs are the per-algorithm AUC images
        under output/images/auc/.  Output paths repeat once per pathway of
        the algorithm (the diff's old single-score version, including its
        debug print, was removed).
    """
    edge_inputs = []
    node_inputs = []
    outputs = []
    for algorithm in algorithms:
        for pathway in pathways[algorithm]:
            edge_score_file = f"output/stats/score/edges/{algorithm}_{pathway}.txt"
            node_score_file = f"output/stats/score/nodes/{algorithm}_{pathway}.txt"

            edge_inputs.append(edge_score_file)
            node_inputs.append(node_score_file)
            outputs.append(f"output/images/auc/edges/{algorithm}_alg.png")
            outputs.append(f"output/images/auc/nodes/{algorithm}_alg.png")
    return edge_inputs, node_inputs, outputs

def pathway_files(algorithms, pathways):
    """Build the per-pathway score-file inputs and AUC-image outputs.

    Parameters:
        algorithms: iterable of algorithm names.
        pathways: mapping of algorithm name -> list of pathway names.

    Returns:
        A 3-tuple (edge_inputs, node_inputs, pathway_outputs): the edge and
        node score files under output/stats/score/, and the per-pathway AUC
        images under output/images/auc/.  The superseded single-score
        version (which built `pathway_inputs`) was removed.
    """
    edge_inputs = []
    node_inputs = []
    pathway_outputs = []
    for algorithm in algorithms:
        for pathway in pathways[algorithm]:
            edge_score_file = f"output/stats/score/edges/{algorithm}_{pathway}.txt"
            node_score_file = f"output/stats/score/nodes/{algorithm}_{pathway}.txt"

            edge_inputs.append(edge_score_file)
            node_inputs.append(node_score_file)
            pathway_outputs.append(f"output/images/auc/edges/{pathway}_pathway.png")
            pathway_outputs.append(f"output/images/auc/nodes/{pathway}_pathway.png")

    return edge_inputs, node_inputs, pathway_outputs

def edges_files(algorithms, pathways):
    """Return the per-(algorithm, pathway) network stats files.

    Parameters:
        algorithms: iterable of algorithm names.
        pathways: mapping of algorithm name -> list of pathway names.

    Returns:
        List of "output/stats/network/{algorithm}_{pathway}_stats.txt" paths.
        (The old "output/stats/edges/..." path from the superseded diff line
        was dropped; the rule pathway_stats writes to stats/network/.)
    """
    edge_inputs = []
    for algorithm in algorithms:
        for pathway in pathways[algorithm]:
            edge_file = f"output/stats/network/{algorithm}_{pathway}_stats.txt"
            edge_inputs.append(edge_file)
    return edge_inputs

rule all:
input:
generate_paths(algorithms, pathways) + score_files(algorithms, pathways)[1] + pathway_files(algorithms, pathways)[1],
"output/images/heatmap/heatmap.png"

generate_paths(algorithms, pathways),
score_files(algorithms, pathways)[2],
pathway_files(algorithms, pathways)[2],
"output/images/heatmap/heatmap_edge.png",
"output/images/heatmap/heatmap_node.png"

rule pathway_image:
input:
@@ -63,7 +77,7 @@ rule pathway_stats:
spras=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/spras.txt",
panther=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/panther.txt"
output:
stats="output/stats/edges/{algorithms}_{pathways}_stats.txt"
stats="output/stats/network/{algorithms}_{pathways}_stats.txt"
script:
"scripts/generate_stats.py"

@@ -72,30 +86,40 @@ rule generate_scores:
spras=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/spras.txt",
panther=lambda wildcards: f"networks/{wildcards.algorithms}/{wildcards.pathways}/panther.txt"
output:
score="output/stats/score/{algorithms}_{pathways}.txt",
edge_score="output/stats/score/edges/{algorithms}_{pathways}.txt",
node_score="output/stats/score/nodes/{algorithms}_{pathways}.txt",

script:
"scripts/generate_scores_file.py"

rule generate_algorithm_auc:
input:
scores=score_files(algorithms, pathways)[0]
edge_scores=score_files(algorithms, pathways)[0],
node_scores=score_files(algorithms, pathways)[1]

output:
auc_image="output/images/auc/{algorithm}_alg.png"
auc_edge_image="output/images/auc/edges/{algorithm}_alg.png",
auc_node_image="output/images/auc/nodes/{algorithm}_alg.png"
script:
"scripts/generate_auc_figures.py"

rule generate_pathway_auc:
input:
scores=pathway_files(algorithms, pathways)[0]
edge_scores=pathway_files(algorithms, pathways)[0],
node_scores=pathway_files(algorithms, pathways)[1]
output:
auc_image="output/images/auc/{pathway}_pathway.png"
auc_edge_image="output/images/auc/edges/{pathway}_pathway.png",
auc_node_image="output/images/auc/nodes/{pathway}_pathway.png"

script:
"scripts/generate_auc_figures.py"

rule generate_heatmap:
input:
scores=edges_files(algorithms, pathways)
output:
"output/images/heatmap/heatmap.png"
edge_heatmap="output/images/heatmap/heatmap_edge.png",
node_heatmap="output/images/heatmap/heatmap_node.png"

script:
"scripts/generate_heatmap.py"
2 changes: 1 addition & 1 deletion synthetic-benchmarking/benchmarking-pipeline/config.yaml
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ algorithm_pathways:
- egfr
- wnt
OmicsIntegrator1:
- egfr
# - egfr
- wnt
BTB:
- egfr
6 changes: 5 additions & 1 deletion synthetic-benchmarking/benchmarking-pipeline/environment.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: spras-benchmarking
name: benchmarking-pipeline
channels:
- bioconda
- conda-forge
@@ -129,6 +129,7 @@ dependencies:
- packaging=24.1=pyhd8ed1ab_0
- pandas=2.2.3=py312h98e817e_1
- pango=1.54.0=h115fe74_1
- patsy=0.5.6=pyhd8ed1ab_0
- pcre2=10.43=h0ad2156_0
- perl=5.32.1=7_h10d778d_perl5
- pillow=10.3.0=py312h0c923fa_0
@@ -160,6 +161,8 @@ dependencies:
- samtools=1.19.2=hd510865_1
- scikit-learn=1.5.2=py312h9d777eb_1
- scipy=1.14.1=py312h888eae2_1
- seaborn=0.13.2=hd8ed1ab_2
- seaborn-base=0.13.2=pyhd8ed1ab_2
- setuptools=75.3.0=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- smart_open=7.0.5=pyhd8ed1ab_1
@@ -169,6 +172,7 @@ dependencies:
- snakemake-interface-report-plugins=1.1.0=pyhdfd78af_0
- snakemake-interface-storage-plugins=3.3.0=pyhdfd78af_0
- snakemake-minimal=8.25.1=pyhdfd78af_0
- statsmodels=0.14.4=py312h3a11e2b_0
- tabulate=0.9.0=pyhd8ed1ab_1
- threadpoolctl=3.5.0=pyhc1e730c_0
- throttler=1.2.2=pyhd8ed1ab_0
Original file line number Diff line number Diff line change
@@ -11,3 +11,6 @@ E F 1 D
E G 1 D
H G 1 D
F G 1 D
G P 1 D
P N 1 D
N F 1 D
Original file line number Diff line number Diff line change
@@ -6,3 +6,6 @@ D C 1 D
C E 1 D
C F 1 D
F G 1 D
G H 1 D
H Z 1 D
Z A 1 D
Original file line number Diff line number Diff line change
@@ -4,6 +4,9 @@ A E 1 D
A F 1 D
B C 1 D
C E 1 D
C J 1 D
J M 1 D
M N 1 D
C F 1 D
D A 1 D
D E 1 D
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Node1 Node2 Rank Direction
A C 1 D
A Z 1 D
A D 1 D
B C 1 D
D C 1 D
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Node1 Node2 Rank Direction
A D 1 D
A K 1 D
A J 1 D
B D 1 D
B C 1 D
C D 1 D
Original file line number Diff line number Diff line change
@@ -5,4 +5,6 @@ B C 1 D
D C 1 D
C E 1 D
C F 1 D
C P 1 D
C K 1 D
F G 1 D
Original file line number Diff line number Diff line change
@@ -5,15 +5,17 @@
import networkx as nx
from sklearn.metrics import auc, precision_recall_curve, roc_curve

input_files = snakemake.input.scores
output_pr_path = snakemake.output[0]
edge_input_files = snakemake.input.edge_scores
node_input_files = snakemake.input.node_scores
output_edge_path = snakemake.output[0]
output_node_path = snakemake.output[1]

auc_type = output_pr_path.split("/")[-1].split("_")[-1].split(".")[0]
algorithm = output_pr_path.split("/")[-1].split("_")[0]
auc_type = output_edge_path.split("/")[-1].split("_")[-1].split(".")[0]
algorithm = output_edge_path.split("/")[-1].split("_")[0]

filtered_inputs = []
i = 0
for input in input_files:
for input in edge_input_files:
current_alg = ""
if auc_type == "alg":
current_alg = input.split("/")[-1].split("_")[0]
@@ -61,4 +63,59 @@
plt.legend()

plt.tight_layout()
plt.savefig(output_pr_path, format="png", dpi=300)
plt.savefig(output_edge_path, format="png", dpi=300)

auc_type = output_edge_path.split("/")[-1].split("_")[-1].split(".")[0]
algorithm = output_edge_path.split("/")[-1].split("_")[0]

filtered_inputs = []
i = 0
for input in node_input_files:
current_alg = ""
if auc_type == "alg":
current_alg = input.split("/")[-1].split("_")[0]
else:
current_alg = input.split("/")[-1].split("_")[-1].split(".")[0]
if current_alg == algorithm:
filtered_inputs.append(input)

plt.figure(figsize=(10, 5))
colors = ["b", "g", "r", "c", "m", "y", "k", "orange"]

for idx, file in enumerate(filtered_inputs):
data = []
y_true = []
y_score = []
with open(file, "r") as f:
next(f)
for line in f:
line = line.strip()
col = line.split("\t")
y_true.append(int(col[1]))
y_score.append(int(col[2]))

precision, recall, _ = precision_recall_curve(y_true, y_score)
fpr, tpr, _ = roc_curve(y_true, y_score)

plt.subplot(1, 2, 1)
plt.plot(
recall, precision, color=colors[idx % len(colors)], label=os.path.basename(file)
)

plt.subplot(1, 2, 2)
plt.plot(fpr, tpr, color=colors[idx % len(colors)], label=os.path.basename(file))

plt.subplot(1, 2, 1)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()

plt.subplot(1, 2, 2)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

plt.tight_layout()
plt.savefig(output_node_path, format="png", dpi=300)
Original file line number Diff line number Diff line change
@@ -6,7 +6,8 @@
import seaborn as sns

scores_path = snakemake.input.scores
output_path = snakemake.output[0]
output_1_path = snakemake.output[0]
output_2_path = snakemake.output[1]

data = []
algorithms = []
@@ -25,27 +26,49 @@
next(f)
for line in f:
cols = line.split("\t")
data.append((parts[0], parts[1], cols[4]))
data.append((parts[0], parts[1], cols[4], cols[5]))

jaccard_indices_list = []
jaccard_edges_indices_list = []
jaccard_nodes_indices_list = []

for algorithm in algorithms:
current = []
current_edge_list = []
current_node_list = []
for pathway in pathways:
appended = False
for entry in data:
if entry[0] == algorithm and entry[1] == pathway:
appended = True
current.append(float(entry[2]))
current_edge_list.append(float(entry[2]))
current_node_list.append(float(entry[3]))
if appended == False:
current.append(np.nan)
jaccard_indices_list.append(current)
current_edge_list.append(np.nan)
current_node_list.append(np.nan)

jaccard_indices = np.array(jaccard_indices_list, dtype=float)
jaccard_edges_indices_list.append(current_edge_list)
jaccard_nodes_indices_list.append(current_node_list)


jaccard_edges_indices = np.array(jaccard_edges_indices_list, dtype=float)
jaccard_nodes_indices = np.array(jaccard_nodes_indices_list, dtype=float)

plt.figure(figsize=(10, 8))
sns.heatmap(
jaccard_edges_indices,
annot=True,
cmap="viridis",
xticklabels=pathways,
yticklabels=algorithms,
)

plt.xlabel("Pathways")
plt.ylabel("Algorithms")
plt.title("Jaccard Index Edge Heatmap")
plt.savefig(output_1_path, format="png", dpi=300)

plt.figure(figsize=(10, 8))
sns.heatmap(
jaccard_indices,
jaccard_nodes_indices,
annot=True,
cmap="viridis",
xticklabels=pathways,
@@ -54,5 +77,5 @@

plt.xlabel("Pathways")
plt.ylabel("Algorithms")
plt.title("Jaccard Index Heatmap")
plt.savefig(output_path, format="png", dpi=300)
plt.title("Jaccard Index Node Heatmap")
plt.savefig(output_2_path, format="png", dpi=300)
Original file line number Diff line number Diff line change
@@ -5,7 +5,8 @@

spras_path = snakemake.input[0]
panther_path = snakemake.input[1]
output_path = snakemake.output[0]
output_1_path = snakemake.output[0]
output_2_path = snakemake.output[1]

spras_df = pd.read_csv(spras_path, sep="\t")
panther_df = pd.read_csv(panther_path, sep="\t")
@@ -15,14 +16,40 @@

all_edges = pd.concat([spras_df, panther_df]).drop_duplicates()
edge_list = set(zip(all_edges["Node1"], all_edges["Node2"]))
ground_truth = {edge: 1 for edge in zip(panther_df["Node1"], panther_df["Node2"])}
ground_truth_edge = {edge: 1 for edge in zip(panther_df["Node1"], panther_df["Node2"])}

y_true = []
y_scores = []
y_true_edge = []
y_scores_edge = []

f = open(output_path, "w+")

f = open(output_1_path, "w+")
f.write("Node1\tNode2\ty_true\ty_score\n")
for edge in edge_list:
y_true.append(ground_truth.get(edge, 0)) # 1 if edge in file 1, else 0
y_scores.append(1 if edge in zip(spras_df["Node1"], spras_df["Node2"]) else 0)
f.write(f"{edge[0]}\t{edge[1]}\t{y_true[-1]}\t{y_scores[-1]}\n")
y_true_edge.append(ground_truth_edge.get(edge, 0)) # 1 if edge in file 1, else 0
y_scores_edge.append(1 if edge in zip(spras_df["Node1"], spras_df["Node2"]) else 0)
f.write(f"{edge[0]}\t{edge[1]}\t{y_true_edge[-1]}\t{y_scores_edge[-1]}\n")


all_nodes = pd.concat([spras_df, panther_df]).drop_duplicates()
node_list = pd.unique(all_nodes[["Node1", "Node2"]].values.ravel())
spras_node_list = pd.unique(spras_df[["Node1", "Node2"]].values.ravel())
panther_node_list = pd.unique(panther_df[["Node1", "Node2"]].values.ravel())

print(spras_path)
print(f"spras nodes {spras_node_list}")
print(f"spras nodes {panther_node_list}")
print(f"all nodes {node_list}")

ground_truth_node = {node: 1 for node in panther_node_list}
print(ground_truth_node)

y_true_node = []
y_scores_node = []

g = open(output_2_path, "w+")
g.write("Node1\ty_true\ty_score\n")
for node in node_list:
y_true_node.append(ground_truth_node.get(node, 0)) # 1 if node in file 1, else 0
y_scores_node.append(1 if node in spras_node_list else 0)
g.write(f"{node}\t{y_true_node[-1]}\t{y_scores_node[-1]}\n")

Original file line number Diff line number Diff line change
@@ -13,20 +13,29 @@
G = nx.from_pandas_edgelist(spras_df, source="Node1", target="Node2")
H = nx.from_pandas_edgelist(panther_df, source="Node1", target="Node2")

G_set = set(G.edges())
H_set = set(H.edges())
union = G_set | H_set
intersection = G_set & H_set
jaccard_index = len(intersection) / len(union)
overlap_ratio_set1 = (len(intersection) / len(G_set)) * 100
overlap_ratio_set2 = (len(intersection) / len(H_set)) * 100
G_set_edges = set(G.edges())
H_set_edges = set(H.edges())
edge_union = G_set_edges | H_set_edges
edge_intersection = G_set_edges & H_set_edges
jaccard_edge_index = len(edge_intersection) / len(edge_union)
overlap_ratio_edge1 = (len(edge_intersection) / len(G_set_edges)) * 100
overlap_ratio_edge2 = (len(edge_intersection) / len(H_set_edges)) * 100

G_set_nodes = set(G.nodes())
H_set_nodes = set(H.nodes())
node_union = G_set_nodes | H_set_nodes
node_intersection = G_set_nodes & H_set_nodes
jaccard_node_index = len(node_intersection) / len(node_union)
overlap_ratio_node1 = (len(node_intersection) / len(G_set_nodes)) * 100
overlap_ratio_node2 = (len(node_intersection) / len(H_set_nodes)) * 100

columns = [
"spras_nodes",
"panther_nodes",
"spras_edges",
"panther_edges",
"jaccard_index",
"jaccard_edge_index",
"jaccard_node_index",
"panther_spras_edge_overlap",
"spras_panther_edge_overlap",
]
@@ -36,9 +45,10 @@
str(len(H.nodes())),
str(len(G.edges())),
str(len(H.edges())),
str(jaccard_index),
str(overlap_ratio_set1),
str(overlap_ratio_set2),
str(jaccard_edge_index),
str(jaccard_node_index),
str(overlap_ratio_edge1),
str(overlap_ratio_edge2),
]

f = open(output_path, "w+")