diff --git a/.gitignore b/.gitignore index 82f9275..0197504 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,8 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + + +.snakemake +output/ +.DS_Store \ No newline at end of file diff --git a/synthetic-benchmarking/README.md b/synthetic-benchmarking/README.md new file mode 100644 index 0000000..d3cdaad --- /dev/null +++ b/synthetic-benchmarking/README.md @@ -0,0 +1,24 @@ +# Synthetic-Benchmarking +This section of the benchmarking analysis applies synthetic pathway data from curated databases to SPRAS to see how the algorithms perform. + +## Setup + +Make sure you are in the `benchmarking-pipeline` directory and that you have installed [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html). + + +To set up the environment, run the following command: +``` +conda env create -f environment.yml +``` + +This pipeline is built as a Snakemake workflow. You can configure which pathways to include for each algorithm in the `config.yaml` file. To run the workflow, run the following command: + +``` +snakemake --cores 1 +``` + + +## Important Files/Directories +- `networks/` is a directory that contains all the methods and the pathway files. They all have the same structure: each algorithm and pathway combination must include a `spras.txt` file and the corresponding `panther.txt` file. +- `scripts/` is a directory that contains the helper Python scripts that generate the AUC figures, heatmaps, and stats. +- `Snakefile` contains all the rules and workflow requirements. 
# Snakemake workflow: compares SPRAS output networks with PANTHER reference
# pathways for every algorithm/pathway pair listed in config.yaml.
configfile: "config.yaml"

# algorithm name -> list of pathway names to benchmark for that algorithm.
pathways = config["algorithm_pathways"]
algorithms = pathways.keys()


def _pairs(algorithms, pathways):
    """Yield each configured (algorithm, pathway) combination in config order.

    A key whose YAML list is empty (for example every entry commented out)
    loads as ``None``; it is treated as having no pathways instead of raising.
    """
    for algorithm in algorithms:
        for pathway in pathways[algorithm] or []:
            yield algorithm, pathway


def _unique(items):
    """Return ``items`` as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _score_inputs(algorithms, pathways):
    """Return ``(edge_score_files, node_score_files)`` covering every pair."""
    pairs = list(_pairs(algorithms, pathways))
    edge_inputs = [f"output/stats/score/edges/{a}_{p}.txt" for a, p in pairs]
    node_inputs = [f"output/stats/score/nodes/{a}_{p}.txt" for a, p in pairs]
    return edge_inputs, node_inputs


def generate_paths(algorithms, pathways):
    """Return the per-pair targets: network images, stats file and score files."""
    paths = []
    for algorithm, pathway in _pairs(algorithms, pathways):
        paths += [
            f"output/images/network/{algorithm}_{pathway}_spras.png",
            f"output/images/network/{algorithm}_{pathway}_panther.png",
            f"output/stats/network/{algorithm}_{pathway}_stats.txt",
            f"output/stats/score/edges/{algorithm}_{pathway}.txt",
            f"output/stats/score/nodes/{algorithm}_{pathway}.txt",
        ]
    return paths


def score_files(algorithms, pathways):
    """Return ``(edge_inputs, node_inputs, outputs)`` for the per-algorithm AUC figures.

    ``outputs`` holds one edge image and one node image per algorithm; each
    image is listed once even when the algorithm has several pathways.
    """
    edge_inputs, node_inputs = _score_inputs(algorithms, pathways)
    outputs = []
    for algorithm, _ in _pairs(algorithms, pathways):
        outputs.append(f"output/images/auc/edges/{algorithm}_alg.png")
        outputs.append(f"output/images/auc/nodes/{algorithm}_alg.png")
    return edge_inputs, node_inputs, _unique(outputs)


def pathway_files(algorithms, pathways):
    """Return ``(edge_inputs, node_inputs, outputs)`` for the per-pathway AUC figures.

    ``outputs`` holds one edge image and one node image per pathway; each
    image is listed once even when several algorithms share the pathway.
    """
    edge_inputs, node_inputs = _score_inputs(algorithms, pathways)
    pathway_outputs = []
    for _, pathway in _pairs(algorithms, pathways):
        pathway_outputs.append(f"output/images/auc/edges/{pathway}_pathway.png")
        pathway_outputs.append(f"output/images/auc/nodes/{pathway}_pathway.png")
    return edge_inputs, node_inputs, _unique(pathway_outputs)


def edges_files(algorithms, pathways):
    """Return the stats file of every pair (the inputs of the heatmap rule)."""
    return [
        f"output/stats/network/{algorithm}_{pathway}_stats.txt"
        for algorithm, pathway in _pairs(algorithms, pathways)
    ]


# Computed once and shared by the rules below instead of re-running the helpers.
EDGE_SCORES, NODE_SCORES, ALGORITHM_AUC_IMAGES = score_files(algorithms, pathways)
PATHWAY_AUC_IMAGES = pathway_files(algorithms, pathways)[2]


# Default target: every image and statistics file of the benchmark.
rule all:
    input:
        generate_paths(algorithms, pathways),
        ALGORITHM_AUC_IMAGES,
        PATHWAY_AUC_IMAGES,
        "output/images/heatmap/heatmap_edge.png",
        "output/images/heatmap/heatmap_node.png"

# Draw the SPRAS and PANTHER networks of one algorithm/pathway pair.
rule pathway_image:
    input:
        spras="networks/{algorithms}/{pathways}/spras.txt",
        panther="networks/{algorithms}/{pathways}/panther.txt"
    output:
        spras_image="output/images/network/{algorithms}_{pathways}_spras.png",
        panther_image="output/images/network/{algorithms}_{pathways}_panther.png"
    script:
        "scripts/generate_network.py"

# Node/edge counts and Jaccard indices of one algorithm/pathway pair.
rule pathway_stats:
    input:
        spras="networks/{algorithms}/{pathways}/spras.txt",
        panther="networks/{algorithms}/{pathways}/panther.txt"
    output:
        stats="output/stats/network/{algorithms}_{pathways}_stats.txt"
    script:
        "scripts/generate_stats.py"

# Per-edge and per-node y_true / y_score tables of one algorithm/pathway pair.
rule generate_scores:
    input:
        spras="networks/{algorithms}/{pathways}/spras.txt",
        panther="networks/{algorithms}/{pathways}/panther.txt"
    output:
        edge_score="output/stats/score/edges/{algorithms}_{pathways}.txt",
        node_score="output/stats/score/nodes/{algorithms}_{pathways}.txt"
    script:
        "scripts/generate_scores_file.py"

# One PR/ROC figure per algorithm. All score files are passed in; the script
# keeps those whose name matches the {algorithm} wildcard.
rule generate_algorithm_auc:
    input:
        edge_scores=EDGE_SCORES,
        node_scores=NODE_SCORES
    output:
        auc_edge_image="output/images/auc/edges/{algorithm}_alg.png",
        auc_node_image="output/images/auc/nodes/{algorithm}_alg.png"
    script:
        "scripts/generate_auc_figures.py"

# One PR/ROC figure per pathway. All score files are passed in; the script
# keeps those whose name matches the {pathway} wildcard.
rule generate_pathway_auc:
    input:
        edge_scores=EDGE_SCORES,
        node_scores=NODE_SCORES
    output:
        auc_edge_image="output/images/auc/edges/{pathway}_pathway.png",
        auc_node_image="output/images/auc/nodes/{pathway}_pathway.png"
    script:
        "scripts/generate_auc_figures.py"

# Edge and node Jaccard heatmaps over all algorithm/pathway pairs.
rule generate_heatmap:
    input:
        scores=edges_files(algorithms, pathways)
    output:
        edge_heatmap="output/images/heatmap/heatmap_edge.png",
        node_heatmap="output/images/heatmap/heatmap_node.png"
    script:
        "scripts/generate_heatmap.py"
ca-certificates=2024.8.30=h8857fd0_0 + - cairo=1.18.0=h99e66fa_0 + - certifi=2024.8.30=pyhd8ed1ab_0 + - cffi=1.17.1=py312hf857d28_0 + - charset-normalizer=3.4.0=pyhd8ed1ab_0 + - coin-or-cbc=2.10.12=h26cd4a2_1 + - coin-or-cgl=0.60.7=ha3c4b8c_0 + - coin-or-clp=1.17.8=hf0ee74e_0 + - coin-or-osi=0.108.10=h13a241d_0 + - coin-or-utils=2.11.11=h86ddba1_0 + - coincbc=2.10.12=1_metapackage + - conda-inject=1.3.2=pyhd8ed1ab_0 + - configargparse=1.7=pyhd8ed1ab_0 + - connection_pool=0.0.3=pyhd3deb0d_0 + - contourpy=1.3.0=py312h2a50410_2 + - cycler=0.12.1=pyhd8ed1ab_0 + - datrie=0.8.2=py312hb553811_8 + - docutils=0.21.2=pyhd8ed1ab_0 + - dpath=2.2.0=pyha770c72_0 + - expat=2.6.3=hac325c4_0 + - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 + - font-ttf-inconsolata=3.000=h77eed37_0 + - font-ttf-source-code-pro=2.038=h77eed37_0 + - font-ttf-ubuntu=0.83=h77eed37_3 + - fontconfig=2.14.2=h5bb23bf_0 + - fonts-conda-ecosystem=1=0 + - fonts-conda-forge=1=0 + - fonttools=4.54.1=py312hbe3f5e4_1 + - freetype=2.12.1=h60636b9_2 + - fribidi=1.0.10=hbcb3906_0 + - gdk-pixbuf=2.42.12=ha587570_0 + - gettext=0.22.5=hdfe23c8_3 + - gettext-tools=0.22.5=hdfe23c8_3 + - giflib=5.2.2=h10d778d_0 + - gitdb=4.0.11=pyhd8ed1ab_0 + - gitpython=3.1.43=pyhd8ed1ab_0 + - graphite2=1.3.13=h73e2aa4_1003 + - graphviz=11.0.0=hc9017ca_0 + - gsl=2.7=h93259b0_0 + - gtk2=2.24.33=h8ca4665_4 + - gts=0.7.6=h53e17e3_4 + - h2=4.1.0=pyhd8ed1ab_0 + - harfbuzz=9.0.0=h053f038_0 + - hpack=4.0.0=pyh9f0ad1d_0 + - htslib=1.21=hec81eee_0 + - humanfriendly=10.0=pyhd8ed1ab_6 + - hyperframe=6.0.1=pyhd8ed1ab_0 + - icu=73.2=hf5e326d_0 + - idna=3.10=pyhd8ed1ab_0 + - immutables=0.21=py312hb553811_0 + - importlib_resources=6.4.5=pyhd8ed1ab_0 + - jinja2=3.1.4=pyhd8ed1ab_0 + - joblib=1.4.2=pyhd8ed1ab_0 + - jsonschema=4.23.0=pyhd8ed1ab_0 + - jsonschema-specifications=2024.10.1=pyhd8ed1ab_0 + - jupyter_core=5.7.2=pyh31011fe_1 + - kiwisolver=1.4.7=py312hc5c4d5f_0 + - krb5=1.21.3=h37d8d59_0 + - lcms2=2.16=ha2f27b4_0 + - lerc=4.0.0=hb486fe8_0 + - 
libasprintf=0.22.5=hdfe23c8_3 + - libasprintf-devel=0.22.5=hdfe23c8_3 + - libblas=3.9.0=25_osx64_openblas + - libbrotlicommon=1.1.0=h00291cd_2 + - libbrotlidec=1.1.0=h00291cd_2 + - libbrotlienc=1.1.0=h00291cd_2 + - libcblas=3.9.0=25_osx64_openblas + - libcurl=8.8.0=hf9fcc65_1 + - libcxx=19.1.3=hf95d169_0 + - libdeflate=1.20=h49d49c5_0 + - libedit=3.1.20191231=h0678c8f_2 + - libev=4.33=h10d778d_2 + - libexpat=2.6.3=hac325c4_0 + - libffi=3.4.2=h0d85af4_5 + - libgd=2.3.3=h0dceb68_9 + - libgettextpo=0.22.5=hdfe23c8_3 + - libgettextpo-devel=0.22.5=hdfe23c8_3 + - libgfortran=5.0.0=13_2_0_h97931a8_3 + - libgfortran5=13.2.0=h2873a65_3 + - libglib=2.80.2=h0f68cf7_0 + - libiconv=1.17=hd75f5a5_2 + - libintl=0.22.5=hdfe23c8_3 + - libintl-devel=0.22.5=hdfe23c8_3 + - libjpeg-turbo=3.0.0=h0dc2134_1 + - liblapack=3.9.0=25_osx64_openblas + - liblapacke=3.9.0=25_osx64_openblas + - libnghttp2=1.58.0=h64cf6d3_1 + - libopenblas=0.3.28=openmp_h1e3e198_0 + - libpng=1.6.43=h92b6c6a_0 + - librsvg=2.58.1=h368d7ee_0 + - libsqlite=3.46.0=h1b8f9f3_0 + - libssh2=1.11.0=hd019ec5_0 + - libtiff=4.6.0=h129831d_3 + - libwebp=1.4.0=hc207709_0 + - libwebp-base=1.4.0=h10d778d_0 + - libxcb=1.15=hb7f2c08_0 + - libxml2=2.12.7=h3e169fe_1 + - libzlib=1.2.13=h87427d6_6 + - llvm-openmp=19.1.3=hf78d878_0 + - markupsafe=3.0.2=py312hbe3f5e4_0 + - matplotlib=3.9.2=py312hb401068_1 + - matplotlib-base=3.9.2=py312h30cc4df_1 + - munkres=1.1.4=pyh9f0ad1d_0 + - nbformat=5.10.4=pyhd8ed1ab_0 + - ncurses=6.5=hf036a51_1 + - networkx=3.4.2=pyhd8ed1ab_1 + - numpy=2.1.3=py312hfc93d17_0 + - openjpeg=2.5.2=h7310d3a_0 + - openssl=3.3.2=hd23fc13_0 + - packaging=24.1=pyhd8ed1ab_0 + - pandas=2.2.3=py312h98e817e_1 + - pango=1.54.0=h115fe74_1 + - patsy=0.5.6=pyhd8ed1ab_0 + - pcre2=10.43=h0ad2156_0 + - perl=5.32.1=7_h10d778d_perl5 + - pillow=10.3.0=py312h0c923fa_0 + - pip=24.3.1=pyh8b19718_0 + - pixman=0.43.4=h73e2aa4_0 + - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 + - plac=1.4.3=pyhd8ed1ab_0 + - platformdirs=4.3.6=pyhd8ed1ab_0 + - 
psutil=6.1.0=py312h3d0f464_0 + - pthread-stubs=0.4=h00291cd_1002 + - pulp=2.8.0=py312hb401068_0 + - pycparser=2.22=pyhd8ed1ab_0 + - pygments=2.18.0=pyhd8ed1ab_0 + - pyparsing=3.2.0=pyhd8ed1ab_1 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.12.3=h1411813_0_cpython + - python-dateutil=2.9.0=pyhd8ed1ab_0 + - python-fastjsonschema=2.20.0=pyhd8ed1ab_0 + - python-tzdata=2024.2=pyhd8ed1ab_0 + - python_abi=3.12=5_cp312 + - pytz=2024.1=pyhd8ed1ab_0 + - pyyaml=6.0.2=py312hb553811_1 + - qhull=2020.2=h3c5361c_5 + - readline=8.2=h9e318b2_1 + - referencing=0.35.1=pyhd8ed1ab_0 + - requests=2.32.3=pyhd8ed1ab_0 + - reretry=0.11.8=pyhd8ed1ab_0 + - rpds-py=0.20.1=py312h0d0de52_0 + - samtools=1.19.2=hd510865_1 + - scikit-learn=1.5.2=py312h9d777eb_1 + - scipy=1.14.1=py312h888eae2_1 + - seaborn=0.13.2=hd8ed1ab_2 + - seaborn-base=0.13.2=pyhd8ed1ab_2 + - setuptools=75.3.0=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - smart_open=7.0.5=pyhd8ed1ab_1 + - smmap=5.0.0=pyhd8ed1ab_0 + - snakemake-interface-common=1.17.4=pyhdfd78af_0 + - snakemake-interface-executor-plugins=9.3.2=pyhdfd78af_0 + - snakemake-interface-report-plugins=1.1.0=pyhdfd78af_0 + - snakemake-interface-storage-plugins=3.3.0=pyhdfd78af_0 + - snakemake-minimal=8.25.1=pyhdfd78af_0 + - statsmodels=0.14.4=py312h3a11e2b_0 + - tabulate=0.9.0=pyhd8ed1ab_1 + - threadpoolctl=3.5.0=pyhc1e730c_0 + - throttler=1.2.2=pyhd8ed1ab_0 + - tk=8.6.13=h1abcd95_1 + - tornado=6.4.1=py312hb553811_1 + - traitlets=5.14.3=pyhd8ed1ab_0 + - typing_extensions=4.12.2=pyha770c72_0 + - tzdata=2024b=hc8b5060_0 + - unicodedata2=15.1.0=py312h3d0f464_1 + - urllib3=2.2.3=pyhd8ed1ab_0 + - wheel=0.44.0=pyhd8ed1ab_0 + - wrapt=1.16.0=py312hb553811_1 + - xorg-libxau=1.0.11=h00291cd_1 + - xorg-libxdmcp=1.1.5=h00291cd_0 + - xz=5.2.6=h775f41a_0 + - yaml=0.2.5=h0d85af4_2 + - yte=1.5.4=pyha770c72_0 + - zipp=3.20.2=pyhd8ed1ab_0 + - zlib=1.2.13=h87427d6_6 + - zstandard=0.23.0=py312h7122b0e_1 + - zstd=1.5.6=h915ae27_0 + - pip: + - pysam==0.22.0 +prefix: 
/opt/anaconda3/envs/snakemake-tutorial diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/egfr/panther.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/egfr/panther.txt new file mode 100644 index 0000000..5e3fc66 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/egfr/panther.txt @@ -0,0 +1,10 @@ +Node1 Node2 Rank Direction +A B 1 D +A C 1 D +B C 1 D +B D 1 D +C D 1 D +D F 1 D +D G 1 D +D E 1 D +C G 1 D \ No newline at end of file diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/egfr/spras.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/egfr/spras.txt new file mode 100644 index 0000000..87cb3dc --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/egfr/spras.txt @@ -0,0 +1,16 @@ +Node1 Node2 Rank Direction +A B 1 D +A C 1 D +B C 1 D +B D 1 D +C D 1 D +C E 1 D +C H 1 D +E H 1 D +E F 1 D +E G 1 D +H G 1 D +F G 1 D +G P 1 D +P N 1 D +N F 1 D diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/wnt/panther.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/wnt/panther.txt new file mode 100644 index 0000000..7780179 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/wnt/panther.txt @@ -0,0 +1,8 @@ +Node1 Node2 Rank Direction +A D 1 D +B C 1 D +D E 1 D +C E 1 D +C F 1 D +C B 1 D +C G 1 D diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/wnt/spras.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/wnt/spras.txt new file mode 100644 index 0000000..0863137 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/BTB/wnt/spras.txt @@ -0,0 +1,11 @@ +Node1 Node2 Rank Direction +A C 1 D +A D 1 D +B C 1 D +D C 1 D +C E 1 D +C F 1 D +F G 1 D +G H 1 D +H Z 1 D +Z A 1 D diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/egfr/panther.txt 
b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/egfr/panther.txt new file mode 100644 index 0000000..6bf24d0 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/egfr/panther.txt @@ -0,0 +1,10 @@ +Node1 Node2 Rank Direction +A B 1 D +A D 1 D +B D 1 D +D E 1 D +B E 1 D +D C 1 D +D F 1 D +D G 1 D +F G 1 D \ No newline at end of file diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/egfr/spras.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/egfr/spras.txt new file mode 100644 index 0000000..760d053 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/egfr/spras.txt @@ -0,0 +1,12 @@ +Node1 Node2 Rank Direction +A B 1 D +A E 1 D +A F 1 D +B C 1 D +C E 1 D +C J 1 D +J M 1 D +M N 1 D +C F 1 D +D A 1 D +D E 1 D \ No newline at end of file diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/wnt/panther.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/wnt/panther.txt new file mode 100644 index 0000000..7780179 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/wnt/panther.txt @@ -0,0 +1,8 @@ +Node1 Node2 Rank Direction +A D 1 D +B C 1 D +D E 1 D +C E 1 D +C F 1 D +C B 1 D +C G 1 D diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/wnt/spras.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/wnt/spras.txt new file mode 100644 index 0000000..e5af763 --- /dev/null +++ b/synthetic-benchmarking/benchmarking-pipeline/networks/OmicsIntegrator1/wnt/spras.txt @@ -0,0 +1,7 @@ +Node1 Node2 Rank Direction +A C 1 D +A Z 1 D +A D 1 D +B C 1 D +D C 1 D +C E 1 D diff --git a/synthetic-benchmarking/benchmarking-pipeline/networks/PathLinker/egfr/panther.txt b/synthetic-benchmarking/benchmarking-pipeline/networks/PathLinker/egfr/panther.txt new file mode 100644 index 
"""Plot precision-recall and ROC curves from benchmark score files.

Executed by Snakemake for the ``generate_algorithm_auc`` and
``generate_pathway_auc`` rules. Every score file of the workflow is passed in;
the output image name (``<algorithm>_alg.png`` or ``<pathway>_pathway.png``)
decides which of them are drawn.
"""
import csv
import os

# Line colours, cycled when a figure holds more curves than colours.
COLORS = ["b", "g", "r", "c", "m", "y", "k", "orange"]


def parse_target(output_path):
    """Return ``(auc_type, key)`` encoded in an output image name.

    ``.../PathLinker_alg.png`` gives ``("alg", "PathLinker")`` and
    ``.../egfr_pathway.png`` gives ``("pathway", "egfr")``. The name is split
    on ``_``, so algorithm and pathway names must not contain underscores.
    """
    parts = os.path.basename(output_path).split(".")[0].split("_")
    return parts[-1], parts[0]


def select_inputs(score_files, auc_type, key):
    """Return the score files that belong to ``key``, in their original order.

    Score files are named ``<algorithm>_<pathway>.txt``. For ``auc_type ==
    "alg"`` the algorithm part is compared with ``key``; for any other type the
    pathway part is compared.
    """
    selected = []
    for score_file in score_files:
        parts = os.path.basename(score_file).split(".")[0].split("_")
        owner = parts[0] if auc_type == "alg" else parts[-1]
        if owner == key:
            selected.append(score_file)
    return selected


def read_scores(score_file):
    """Return ``(y_true, y_score)`` read from a tab-separated score file.

    Columns are looked up by their ``y_true`` / ``y_score`` header names, so
    the same reader serves the edge files (two leading node columns) and the
    node files (one leading node column). Blank lines are skipped.
    """
    y_true = []
    y_score = []
    with open(score_file, "r", newline="") as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            y_true.append(int(row["y_true"]))
            y_score.append(float(row["y_score"]))
    return y_true, y_score


def plot_curves(score_files, output_path):
    """Draw one PR curve and one ROC curve per score file and save them as a PNG."""
    # Imported here so the parsing helpers above stay usable without the
    # plotting stack installed.
    from matplotlib import pyplot as plt
    from sklearn.metrics import precision_recall_curve, roc_curve

    fig, (pr_ax, roc_ax) = plt.subplots(1, 2, figsize=(10, 5))
    for idx, score_file in enumerate(score_files):
        y_true, y_score = read_scores(score_file)
        precision, recall, _ = precision_recall_curve(y_true, y_score)
        fpr, tpr, _ = roc_curve(y_true, y_score)

        color = COLORS[idx % len(COLORS)]
        label = os.path.basename(score_file)
        pr_ax.plot(recall, precision, color=color, label=label)
        roc_ax.plot(fpr, tpr, color=color, label=label)

    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title("Precision-Recall Curve")
    pr_ax.legend()

    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("ROC Curve")
    roc_ax.legend()

    fig.tight_layout()
    fig.savefig(output_path, format="png", dpi=300)
    # Release the figure; the script draws two and previously kept both open.
    plt.close(fig)


def main(smk):
    """Produce the edge and node AUC figures for the current Snakemake job."""
    edge_image = smk.output.auc_edge_image
    node_image = smk.output.auc_node_image
    # Both images share the same file name, so the target is parsed once.
    auc_type, key = parse_target(edge_image)

    plot_curves(select_inputs(smk.input.edge_scores, auc_type, key), edge_image)
    plot_curves(select_inputs(smk.input.node_scores, auc_type, key), node_image)


# Snakemake injects a global ``snakemake`` object when it runs this script;
# without it (e.g. on import) only the helpers are defined.
if "snakemake" in globals():
    main(snakemake)
"""Render the SPRAS and PANTHER edge lists of one algorithm/pathway pair as PNGs.

Executed by Snakemake for the ``pathway_image`` rule.
"""
from matplotlib import pyplot as plt
import networkx as nx
import pandas as pd

# Fixed seed for the spring layout so that re-running the workflow reproduces
# the same node positions (the layout starts from random positions otherwise).
LAYOUT_SEED = 0


def draw_network(edge_list_path, title, output_path, seed=LAYOUT_SEED):
    """Draw the graph stored in a tab-separated edge list and save it as a PNG.

    Args:
        edge_list_path: file with at least the columns ``Node1`` and ``Node2``.
        title: text placed above the drawing.
        output_path: destination of the PNG image.
        seed: random seed passed to ``networkx.spring_layout``.
    """
    edges = pd.read_csv(edge_list_path, sep="\t")
    # NOTE(review): the graph is built undirected, so the ``Direction`` column
    # of the edge list is not reflected in the drawing - confirm this is intended.
    graph = nx.from_pandas_edgelist(edges, source="Node1", target="Node2")

    plt.figure(figsize=(10, 8))
    nx.draw(
        graph,
        nx.spring_layout(graph, seed=seed),
        with_labels=True,
        node_color="lightblue",
        node_size=700,
        font_size=10,
        font_color="black",
        edge_color="gray",
    )
    plt.title(title)
    plt.savefig(output_path, format="png", dpi=300)
    plt.close()


def main(smk):
    """Draw both networks for the current Snakemake job."""
    draw_network(smk.input.spras, "SPRAS", smk.output.spras_image)
    draw_network(smk.input.panther, "PANTHER", smk.output.panther_image)


# Snakemake injects a global ``snakemake`` object when it runs this script.
if "snakemake" in globals():
    main(snakemake)
"""Summarise how a SPRAS network overlaps its PANTHER reference pathway.

Executed by Snakemake for the ``pathway_stats`` rule. Writes a two-line,
tab-separated file (header and one row of values); ``generate_heatmap.py``
reads the Jaccard indices from the fifth and sixth columns.
"""
import pandas as pd

# Output columns, in file order.
COLUMNS = [
    "spras_nodes",
    "panther_nodes",
    "spras_edges",
    "panther_edges",
    "jaccard_edge_index",
    "jaccard_node_index",
    "panther_spras_edge_overlap",
    "spras_panther_edge_overlap",
]


def nodes_and_edges(edge_df):
    """Return ``(nodes, edges)`` of the undirected graph described by ``edge_df``.

    Each edge is a ``frozenset`` of its endpoints, so ``A-B`` and ``B-A`` are
    the same edge regardless of the order in which a file lists them. Plain
    ``(u, v)`` tuples would make the intersection depend on that order.
    """
    # NOTE(review): the ``Direction`` column is ignored and edges are compared
    # as undirected, matching the undirected graphs this script used before -
    # confirm that directed comparison is not wanted.
    nodes = set(edge_df["Node1"]) | set(edge_df["Node2"])
    edges = {frozenset(pair) for pair in zip(edge_df["Node1"], edge_df["Node2"])}
    return nodes, edges


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float("nan")


def compute_stats(spras_df, panther_df):
    """Return the statistics of one SPRAS/PANTHER pair as a dict keyed by ``COLUMNS``.

    ``panther_spras_edge_overlap`` is the percentage of SPRAS edges that are
    also PANTHER edges; ``spras_panther_edge_overlap`` is the percentage of
    PANTHER edges that are also SPRAS edges. Ratios over an empty set are NaN.
    """
    spras_nodes, spras_edges = nodes_and_edges(spras_df)
    panther_nodes, panther_edges = nodes_and_edges(panther_df)
    shared_edges = spras_edges & panther_edges

    return {
        "spras_nodes": len(spras_nodes),
        "panther_nodes": len(panther_nodes),
        "spras_edges": len(spras_edges),
        "panther_edges": len(panther_edges),
        "jaccard_edge_index": _ratio(
            len(shared_edges), len(spras_edges | panther_edges)
        ),
        "jaccard_node_index": _ratio(
            len(spras_nodes & panther_nodes), len(spras_nodes | panther_nodes)
        ),
        "panther_spras_edge_overlap": _ratio(len(shared_edges), len(spras_edges)) * 100,
        "spras_panther_edge_overlap": _ratio(len(shared_edges), len(panther_edges)) * 100,
    }


def format_stats(stats):
    """Return the file content for ``stats``: a header line and a value line.

    Both lines keep the trailing tab of the established file format.
    """
    header = "\t".join(COLUMNS) + "\t"
    row = "\t".join(str(stats[column]) for column in COLUMNS) + "\t"
    return f"{header}\n{row}\n"


def main(smk):
    """Compute and write the statistics for the current Snakemake job."""
    spras_df = pd.read_csv(smk.input.spras, sep="\t")
    panther_df = pd.read_csv(smk.input.panther, sep="\t")
    with open(smk.output.stats, "w") as handle:
        handle.write(format_stats(compute_stats(spras_df, panther_df)))


# Snakemake injects a global ``snakemake`` object when it runs this script.
if "snakemake" in globals():
    main(snakemake)