diff --git a/docs/_static/config/beginner.yaml b/docs/_static/config/beginner.yaml new file mode 100644 index 000000000..202879534 --- /dev/null +++ b/docs/_static/config/beginner.yaml @@ -0,0 +1,56 @@ +hash_length: 7 +container_framework: docker +unpack_singularity: false +container_registry: + base_url: docker.io + owner: reedcompbio + +# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change +# which algorithms are run in a given experiment. +# +# algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple +# parameters are specified then the algorithm will be run as many times as needed to cover all parameter +# combinations. For instance if we have the following: +# - name: "myAlg" +# params: +# include: true +# a: [1,2] +# b: [0.5,0.75] +# +# then myAlg will be run on (a=1,b=0.5),(a=1,b=0.75),(a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be +# careful: too many parameters might make your runs take a long time. + +algorithms: + - name: "pathlinker" + params: + include: true + run1: + k: 1 + # run2: # uncomment for step 3.2 + # k: [10, 100] # uncomment for step 3.2 + +# Here we specify which pathways to run and other file location information. 
+# Assume that if a dataset label does not change, the lists of associated input files do not change +datasets: + - # Labels can only contain letters, numbers, or underscores + label: egfr + node_files: ["tps-egfr-prizes.txt"] # the input nodes + edge_files: ["phosphosite-irefindex13.0-uniprot.txt"] # the interactome + # Placeholder + other_files: [] + # Relative path from the spras repository root directory where these files live + data_dir: "input" + +reconstruction_settings: + + # Set where everything is saved + locations: + reconstruction_dir: "output/basic" + +analysis: + # Create one summary per pathway file and a single summary table for all pathways for each dataset + summary: + include: false # set to true for step 3.3 + # Create Cytoscape session file with all pathway graphs for each dataset + cytoscape: + include: false # set to true for step 3.3 diff --git a/docs/_static/config/intermediate.yaml b/docs/_static/config/intermediate.yaml new file mode 100644 index 000000000..78f5a7489 --- /dev/null +++ b/docs/_static/config/intermediate.yaml @@ -0,0 +1,132 @@ +hash_length: 7 +container_framework: docker +unpack_singularity: false +container_registry: + base_url: docker.io + owner: reedcompbio + +# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change +# which algorithms are run in a given experiment. +# +# algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple +# parameters are specified then the algorithm will be run as many times as needed to cover all parameter +# combinations. For instance if we have the following: +# - name: "myAlg" +# params: +# include: true +# a: [1,2] +# b: [0.5,0.75] +# +# then myAlg will be run on (a=1,b=0.5),(a=1,b=0.75),(a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be +# careful: too many parameters might make your runs take a long time. 
+ +algorithms: + - name: "pathlinker" + params: + include: true + run1: + k: 1 + run2: + k: [10, 100] + - name: omicsintegrator1 + params: + include: true + run1: + b: [0.55, 2, 10] + d: 10 + g: 1e-3 + r: 0.01 + w: 0.1 + mu: 0.008 + - name: omicsintegrator2 + params: + include: true + run1: + b: 4 + g: 0 + run2: + b: 2 + g: 3 + - name: meo + params: + include: true + run1: + local_search: ["Yes", "No"] + max_path_length: [2, 3] + rand_restarts: 10 + - name: allpairs + params: + include: true + - name: domino + params: + include: true + run1: + slice_threshold: 0.3 + module_threshold: 0.05 + - name: mincostflow + params: + include: true + run1: + capacity: 15 + flow: 80 + run2: + capacity: 1 + flow: 6 + run3: + capacity: 5 + flow: 60 + - name: "strwr" + params: + include: true + run1: + alpha: [0.85] + threshold: [100, 200] + - name: "rwr" + params: + include: true + run1: + alpha: [0.85] + threshold: [100, 200] + +# Here we specify which pathways to run and other file location information. +# Assume that if a dataset label does not change, the lists of associated input files do not change +datasets: # TODO update this based on the dataset that I set up + - # Labels can only contain letters, numbers, or underscores + label: egfr + node_files: ["tps-egfr-prizes.txt"] # the input nodes + edge_files: ["phosphosite-irefindex13.0-uniprot.txt"] # the interactome + # Placeholder + other_files: [] + # Relative path from the spras directory where these files live + data_dir: "input" + +reconstruction_settings: + + # Set where everything is saved + locations: + reconstruction_dir: "output/intermediate" + +analysis: + # Machine learning analysis (e.g. 
clustering) of the pathway output files for each dataset + ml: + # ml analysis per dataset + include: false # set to true for step 3 + # adds ml analysis per algorithm output + # only runs for algorithms with multiple parameter combinations chosen + aggregate_per_algorithm: false + # specify how many principal components to calculate + components: 2 + # boolean to show the labels on the pca graph + labels: true + # 'ward', 'complete', 'average', 'single' + # if linkage: ward, must use metric: euclidean + linkage: 'ward' + # 'euclidean', 'manhattan', 'cosine' + metric: 'euclidean' + # controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots. + # the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file. + # KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used + # to pick the 'best' parameter combination. + kde: false + # removes empty pathways from consideration in ml analysis (pca only) + remove_empty_pathways: false diff --git a/docs/_static/images/100_pathway.png b/docs/_static/images/100_pathway.png new file mode 100644 index 000000000..fdfab120f Binary files /dev/null and b/docs/_static/images/100_pathway.png differ diff --git a/docs/_static/images/10_pathway.png b/docs/_static/images/10_pathway.png new file mode 100644 index 000000000..6d701b695 Binary files /dev/null and b/docs/_static/images/10_pathway.png differ diff --git a/docs/_static/images/1_pathway.png b/docs/_static/images/1_pathway.png new file mode 100644 index 000000000..aa451895f Binary files /dev/null and b/docs/_static/images/1_pathway.png differ diff --git a/docs/_static/images/cytoscape-open-cys-file.png b/docs/_static/images/cytoscape-open-cys-file.png new file mode 100644 index 000000000..8cb194f54 Binary files /dev/null and b/docs/_static/images/cytoscape-open-cys-file.png differ diff --git a/docs/_static/images/cytoscape-opened.png 
b/docs/_static/images/cytoscape-opened.png new file mode 100644 index 000000000..3ff8b0ce9 Binary files /dev/null and b/docs/_static/images/cytoscape-opened.png differ diff --git a/docs/_static/images/cytoscape_upload_network.png b/docs/_static/images/cytoscape_upload_network.png new file mode 100644 index 000000000..eceea3e62 Binary files /dev/null and b/docs/_static/images/cytoscape_upload_network.png differ diff --git a/docs/_static/images/hac-horizontal.png b/docs/_static/images/hac-horizontal.png new file mode 100644 index 000000000..d67394daf Binary files /dev/null and b/docs/_static/images/hac-horizontal.png differ diff --git a/docs/_static/images/hac-vertical.png b/docs/_static/images/hac-vertical.png new file mode 100644 index 000000000..284b2492c Binary files /dev/null and b/docs/_static/images/hac-vertical.png differ diff --git a/docs/_static/images/jaccard-heatmap.png b/docs/_static/images/jaccard-heatmap.png new file mode 100644 index 000000000..f80e20a71 Binary files /dev/null and b/docs/_static/images/jaccard-heatmap.png differ diff --git a/docs/_static/images/pca.png b/docs/_static/images/pca.png new file mode 100644 index 000000000..7b78df704 Binary files /dev/null and b/docs/_static/images/pca.png differ diff --git a/docs/_static/images/summary-stats.png b/docs/_static/images/summary-stats.png new file mode 100644 index 000000000..17204a004 Binary files /dev/null and b/docs/_static/images/summary-stats.png differ diff --git a/docs/index.rst b/docs/index.rst index d12af3157..c9e5e72b6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,6 +56,15 @@ methods (PRMs) to omics data. contributing/index contributing/maintain +.. 
toctree:: + :maxdepth: 1 + :caption: Tutorials + + tutorial/introduction + tutorial/beginner + tutorial/intermediate + tutorial/advanced + Indices and tables ================== diff --git a/docs/tutorial/advanced.rst b/docs/tutorial/advanced.rst new file mode 100644 index 000000000..a306543dd --- /dev/null +++ b/docs/tutorial/advanced.rst @@ -0,0 +1,31 @@ +Advanced Capabilities and Features +====================================== + +More like these are all the things we can do with this, but will not be showing + +- mention parameter tuning +- say that parameters are not preset and need to be tuned for each dataset + +CHTC integration + +Anything not included in the config file + +1. Global Workflow Control + +Sets options that apply to the entire workflow. + +- Examples: the container framework (docker, singularity, dsub) and where to pull container images from + +running spras with multiple parameter combinations with multiple algorithms on multiple Datasets +- for the tutorial we are only doing one dataset + +4. Gold Standards + +Defines the input files SPRAS will use to evaluate output subnetworks + +A gold standard dataset is comprised of: + +- a label: defines the name of the gold standard dataset +- node_file or edge_file: a list of either node files or edge files. Only one or the other can exist in a single dataset. 
At the moment only one edge or one node file can exist in one dataset +- data_dir: the path to where the input gold standard files live +- dataset_labels: a list of dataset labels that link each gold standard to one or more datasets diff --git a/docs/tutorial/beginner.rst b/docs/tutorial/beginner.rst new file mode 100644 index 000000000..bf03bec69 --- /dev/null +++ b/docs/tutorial/beginner.rst @@ -0,0 +1,593 @@ +################################################## +Beginner Tutorial - Set up & Running One Algorithm +################################################## + +This tutorial provides a hands-on introduction to SPRAS. It is designed to show participants how to install the software, run example workflows, and use tools to interpret the results. + +You will learn how to: + +- Set up the SPRAS software environment +- Explore the folder structure and understand how inputs, configurations, and outputs are organized +- Configure and run a pathway reconstruction algorithm on a provided dataset +- Enable post-analysis steps to generate post analysis information (summary statistics and Cytoscape visualizations) + + +Step 0: Clone the SPRAS repository, set up the environment, and run Docker +========================================================================== + +0.1 Start Docker +---------------- + +Launch Docker Desktop and wait until it says "Docker is running". + +0.2 Clone the SPRAS repository +------------------------------- + +Visit the `SPRAS GitHub repository <https://github.com/Reed-CompBio/spras>`__ and clone it locally. + +0.3 Set up the SPRAS environment +------------------------------------- + +From the root directory of the SPRAS repository, create and activate the Conda environment and install the SPRAS python package: + +.. code:: bash + + conda env create -f environment.yml + conda activate spras + python -m pip install . 
+ +0.4 Test the installation +------------------------- + +Run the following command to confirm that SPRAS has been set up successfully from the command line: + +.. code:: bash + + python -c "import spras; print('SPRAS import successful')" + +Step 1: Explanation of configuration file +========================================= + +A configuration file specifies how a SPRAS workflow should run; think of it as the control center for the workflow. +It defines which algorithms to run, the parameters to use, the datasets and gold standards to include, the analyses to perform after reconstruction, and the container settings for execution. + +SPRAS uses Snakemake (a workflow manager) and containerized software (like Docker and Apptainer) to read the configuration file and execute a SPRAS workflow. + +Snakemake considers a task from the configuration file complete once the expected output files are present in the output directory. +As a result, rerunning the same configuration file may do nothing if those files already exist. +To continue or rerun SPRAS with the same configuration file, delete the output directory (or its contents) or modify the configuration file so Snakemake regenerates new results. + +For this part of the tutorial, we'll use a pre-defined configuration file. +Download it here: :download:`Beginner Config File <../_static/config/beginner.yaml>` + +Save the file into the config/ folder of your SPRAS installation. +After adding this file, SPRAS will use the configuration to set up and reference your directory structure, which will look like this: + +.. code-block:: text + + spras/ + ├── config/ + │ └── beginner.yaml + ├── input/ + │ ├── phosphosite-irefindex13.0-uniprot.txt # pre-defined in SPRAS already + │ └── tps-egfr-prizes.txt # pre-defined in SPRAS already + + +Here's an overview of the major sections when looking at a configuration file: + +Algorithms +----------- + + +.. 
code-block:: text + + algorithms: + - name: omicsintegrator1 + params: + include: true + run1: + b: 0.1 + d: 10 + g: 1e-3 + run2: + b: [0.55, 2, 10] + d: [10, 20] + g: 1e-3 + + +When defining an algorithm in the configuration file, its name must match one of the supported SPRAS algorithms (introduced in the intermediate tutorial / more information on the algorithms can be found under the Supported Algorithms section). +Each algorithm includes an include flag, which you set to true to have Snakemake run it, or false to disable it. + +Algorithm parameters can be organized into one or more run blocks (e.g., run1, run2, …), with each block containing key-value pairs. +When defining a parameter, it can be passed as a single value or passed by listing parameters within a list. +If multiple parameters are defined as lists within a run block, SPRAS generates all possible combinations (Cartesian product) of those list values together with any fixed single-value parameters in the same run block. +Each unique combination runs once per algorithm. +Invalid or missing parameter keys will cause SPRAS to fail. + +Datasets +-------- + +.. code-block:: text + + datasets: + - + label: egfr + node_files: ["prizes.txt", "sources-targets.txt"] + edge_files: ["interactome.txt"] + other_files: [] + data_dir: "input" + +In the configuration file, datasets are defined under the datasets section. +Each dataset you define will be run against all of the algorithms enabled in the configuration file. + +The dataset must include the following types of keys and files: + +- label: a name that uniquely identifies a dataset throughout the SPRAS workflow and outputs. 
+- node_files: Input files listing the “prizes” or important starting nodes ("sources" or "targets") for the algorithm +- edge_files: Input interactome or network file that defines the relationships between nodes +- other_files: This placeholder is not used +- data_dir: The file path of the directory where the input dataset files are located + +Reconstruction Settings +----------------------- + +.. code-block:: text + + reconstruction_settings: + locations: + reconstruction_dir: "output" + + +The reconstruction_settings section controls where outputs are stored. +Set reconstruction_dir to the directory path where you want results saved. SPRAS will automatically create this folder if it doesn't exist. +If you are running multiple configuration files, you can set unique paths to keep outputs organized and separate. + +Analysis +-------- + +.. code-block:: text + + analysis: + summary: + include: true + cytoscape: + include: true + ml: + include: true + + + +SPRAS includes multiple downstream analyses that can be toggled on or off directly in the configuration file. +When enabled, these analyses are performed per dataset and produce summaries or visualizations of the results from all enabled algorithms for that dataset. + +Step 2: Running SPRAS on a provided example dataset +==================================================== + +2.1 Running SPRAS with the Beginner Configuration +------------------------------------------------- +The beginner.yaml configuration file is set up to have SPRAS run a single algorithm with one parameter setting on one dataset. + +From the root directory spras/, run the command below from the command line: + +.. code:: bash + + snakemake --cores 1 --configfile config/beginner.yaml + +What Happens When You Run This Command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +SPRAS will execute quickly from your perspective; however, several automated steps (handled by Snakemake and Docker) occur behind the scenes. + +1. 
Snakemake starts the workflow + +Snakemake reads the options set in the beginner.yaml configuration file and determines which datasets, algorithms, and parameter combinations need to run and if any post-analysis steps were requested. + +2. Preparing the dataset + +SPRAS takes the interactome and node prize files specified in the configuration and bundles them into a Dataset object to be used for processing algorithm specific inputs. +This object is stored as a .pickle file (e.g. dataset-egfr-merged.pickle) so it can be reused for other algorithms without re-processing it. + +3. Creating algorithm specific inputs + +For each algorithm marked as include: true in the configuration, SPRAS generates input files tailored to that algorithm using the input standardized egfr dataset. +In this case, only PathLinker is enabled. +SPRAS creates the network.txt and nodetypes.txt files required by PathLinker in the prepared/egfr-pathlinker-inputs/ directory. + +4. Organizing results with parameter hashes + +Each dataset-algorithm-parameter combination is placed in its own folder named like egfr-pathlinker-params-D4TUKMX/. +D4TUKMX is a hash that uniquely identifies the specific parameter combination (k = 1 here). +A matching log file in logs/parameters-pathlinker-params-D4TUKMX.yaml records the exact parameter values. + +5. Running the algorithm + +SPRAS launches the PathLinker Docker image that it downloads from DockerHub, sending it the prepared files and parameter settings. +PathLinker runs and produces a raw pathway output file (raw-pathway.txt) that holds the subnetwork it found in its own native format. + +6. Standardizing the results + +SPRAS parses the raw PathLinker output into a standardized SPRAS format (pathway.txt). +This ensures that the outputs of all algorithms share a common format, because their native formats differ. + +7. Logging the Snakemake run + +Snakemake creates a dated log in .snakemake/log/. 
This log shows what rules ran and any errors that occurred during the SPRAS run. + +What Your Directory Structure Should Look Like After This Run: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... + ├── config/ + │ └── beginner.yaml + ├── input/ + │ ├── phosphosite-irefindex13.0-uniprot.txt + │ └── tps-egfr-prizes.txt + ├── output/ + │ └── basic/ + │ └── egfr-pathlinker-params-D4TUKMX/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── logs/ + │ └── dataset-egfr.yaml + │ └── parameters-pathlinker-params-D4TUKMX.yaml + │ └── prepared/ + │ └── egfr-pathlinker-inputs + │ └── network.txt + │ └── nodetypes.txt + │ └── dataset-egfr-merged.pickle + + +Step 2.2: Overview of the SPRAS Folder Structure +================================================= + +After running the SPRAS command, you'll see that the folder structure includes four main directories that organize everything needed to run workflows and store their results. + +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... + ├── config/ + │ └── ... other configs ... + ├── input/ + │ └── ... input files ... + ├── output/ + │ └── ... output files ... + +.snakemake/log/ +--------------- + +The .snakemake/log/ directory contains records of all Snakemake jobs that were executed for the SPRAS run, including any errors encountered during those runs. + +config/ +------- + +Holds configuration files (YAML) that define which algorithms to run, what datasets to use, and which analyses to perform. + +input/ +------ + +Contains the input data files, such as interactome edge files and input nodes. This is where you can place your own datasets when running custom experiments. + +output/ +------- + +Stores all results generated by SPRAS. Subfolders are created automatically for each run, and their structure can be controlled through the configuration file. 
+ +By default, the directories are named config/, input/, and output/. The config/, input/, and output/ folders can be placed anywhere and named anything within the SPRAS repository. The input/ and output/ locations can be updated in the configuration file, and the configuration file itself can be set by providing its path when running the SPRAS command. +SPRAS has additional files and directories to use during runs. However, for most users, and for the purposes of this tutorial, it isn't necessary to fully understand them. + + +2.4 Running SPRAS with More Parameter Combinations +--------------------------------------------------- + +In the beginner.yaml configuration file, uncomment the run2 section under pathlinker so it looks like: + +.. code-block:: text + + run2: + k: [10, 100] + +With this update, the beginner.yaml configuration file is set up to have SPRAS run a single algorithm with multiple parameter settings on one dataset. + +After saving the changes, rerun with: + +.. code:: bash + + snakemake --cores 1 --configfile config/beginner.yaml + +What Happens When You Run This Command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Snakemake loads the configuration file + +Snakemake reads beginner.yaml to determine which datasets, algorithms, parameters, and post-analyses to run. +It reuses cached results to skip completed steps, rerunning only those that are new or outdated. +Here, the dataset pickle, PathLinker inputs, and D4TUKMX parameter set are reused instead of rerun. + +2. Organizing outputs per parameter combination + +Each new dataset-algorithm-parameter combination gets its own folder (e.g., egfr-pathlinker-params-7S4SLU6/ and egfr-pathlinker-params-VQL7BDZ/). +The hashes 7S4SLU6 and VQL7BDZ uniquely identify the specific sets of parameters used. + +3. 
Reusing prepared inputs with additional parameter combinations + +Since PathLinker has already been run once, SPRAS uses the cached prepared inputs (network.txt, nodetypes.txt) rather than regenerating them. +For each new parameter combination, SPRAS executes PathLinker by launching its corresponding Docker image (once for each parameter configuration). +PathLinker then runs and produces a raw-pathway.txt file specific to each parameter hash. + +4. Parsing into standardized results + +SPRAS parses each new raw-pathway.txt file into a standardized SPRAS format (pathway.txt). + +What Your Directory Structure Should Look Like After This Run: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... + ├── config/ + │ └── beginner.yaml + ├── input/ + │ ├── phosphosite-irefindex13.0-uniprot.txt + │ └── tps-egfr-prizes.txt + ├── output/ + │ └── basic/ + │ └── egfr-pathlinker-params-7S4SLU6/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-D4TUKMX/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-VQL7BDZ/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── logs/ + │ └── dataset-egfr.yaml + │ └── parameters-pathlinker-params-7S4SLU6.yaml + │ └── parameters-pathlinker-params-D4TUKMX.yaml + │ └── parameters-pathlinker-params-VQL7BDZ.yaml + │ └── prepared/ + │ └── egfr-pathlinker-inputs + │ └── network.txt + │ └── nodetypes.txt + │ └── dataset-egfr-merged.pickle + + +2.5 Reviewing the pathway.txt Files +------------------------------------------- + +Each algorithm and parameter combination produces a corresponding pathway.txt file. +These files contain the reconstructed subnetworks and can be used at face value, or for further post analysis. + +1. Locate the files + +Navigate to the output directory spras/output/basic/. 
Inside, you will find subfolders corresponding to each dataset-algorithm-parameter combination. + +2. Open a pathway.txt file + +Each file lists the network edges that were reconstructed for that specific run. The format includes columns for the two interacting nodes, the rank, and the edge direction. + +For example, the file egfr-pathlinker-params-7S4SLU6/pathway.txt contains the following reconstructed subnetwork: + +.. code-block:: text + + Node1 Node2 Rank Direction + EGF_HUMAN EGFR_HUMAN 1 D + EGF_HUMAN S10A4_HUMAN 2 D + S10A4_HUMAN MYH9_HUMAN 2 D + K7PPA8_HUMAN MDM2_HUMAN 3 D + MDM2_HUMAN P53_HUMAN 3 D + S10A4_HUMAN K7PPA8_HUMAN 3 D + K7PPA8_HUMAN SIR1_HUMAN 4 D + MDM2_HUMAN MDM4_HUMAN 5 D + MDM4_HUMAN P53_HUMAN 5 D + CD2A2_HUMAN CDK4_HUMAN 6 D + CDK4_HUMAN RB_HUMAN 6 D + MDM2_HUMAN CD2A2_HUMAN 6 D + EP300_HUMAN P53_HUMAN 7 D + K7PPA8_HUMAN EP300_HUMAN 7 D + K7PPA8_HUMAN UBP7_HUMAN 8 D + UBP7_HUMAN P53_HUMAN 8 D + K7PPA8_HUMAN MDM4_HUMAN 9 D + MDM4_HUMAN MDM2_HUMAN 9 D + +The pathway.txt files serve as the foundation for further analysis, allowing you to explore and interpret the reconstructed networks in greater detail. +In this case you can visualize them in Cytoscape or compare their statistics to better understand these outputs. + + +Step 3: Running Post-Analyses within SPRAS +========================================== +To enable downstream analyses, update the analysis section in your configuration file by setting both summary and cytoscape to true. Your analysis section in the configuration file should look like this: + +.. code-block:: text + + analysis: + summary: + include: true + cytoscape: + include: true + +summary generates graph topological summary statistics for each algorithm's parameter combination output, generating a summary file for all reconstructed subnetworks for each dataset. 
+This post analysis will report these statistics for each pathway: + +- Number of nodes +- Number of edges +- Number of connected components +- Network density +- Maximum degree +- Median degree +- Maximum diameter +- Average path length + +cytoscape creates a Cytoscape session file (.cys) containing all reconstructed subnetworks for each dataset, making it easy to upload and visualize them directly in Cytoscape. + +With this update, the beginner.yaml configuration file is set up for SPRAS to run two post-analyses on the outputs generated by a single algorithm that was executed with multiple parameter settings on one dataset. + +After saving the changes, rerun with: + +.. code:: bash + + snakemake --cores 1 --configfile config/beginner.yaml + + +What Happens When You Run This Command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +1. Reusing cached results + +Snakemake reads the options set in beginner.yaml and checks for any requested post-analysis steps. +It reuses cached results; in this case, the pathway.txt files generated from the previously executed PathLinker parameter combinations for the egfr dataset. + +2. Running the summary analysis + +SPRAS aggregates the pathway.txt files from all selected parameter combinations into a single summary table. +The results are saved in egfr-pathway-summary.txt. + +3. Running the Cytoscape analysis + +All pathway.txt files from the chosen parameter combinations are collected and passed into the Cytoscape Docker image. +A Cytoscape session file is then generated, containing visualizations for each pathway and saved as egfr-cytoscape.cys. + +What Your Directory Structure Should Look Like After This Run: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... 
+ ├── config/ + │ └── beginner.yaml + ├── input/ + │ ├── phosphosite-irefindex13.0-uniprot.txt + │ └── tps-egfr-prizes.txt + ├── output/ + │ └── basic/ + │ └── egfr-pathlinker-params-7S4SLU6/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-D4TUKMX/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-VQL7BDZ/ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── logs/ + │ └── dataset-egfr.yaml + │ └── parameters-pathlinker-params-7S4SLU6.yaml + │ └── parameters-pathlinker-params-D4TUKMX.yaml + │ └── parameters-pathlinker-params-VQL7BDZ.yaml + │ └── prepared/ + │ └── egfr-pathlinker-inputs + │ └── network.txt + │ └── nodetypes.txt + │ └── dataset-egfr-merged.pickle + │ └── egfr-cytoscape.cys + │ └── egfr-pathway-summary.txt + +Step 3.1: Reviewing the Outputs +----------------------------------- +After completing the workflow, you will have several post analysis outputs that help you explore and interpret the results: + +1. egfr-cytoscape.cys: a Cytoscape session file containing visualizations of the reconstructed subnetworks. +2. egfr-pathway-summary.txt: a summary file with statistics describing each network. + +Reviewing Summary Files +^^^^^^^^^^^^^^^^^^^^^^^^ +1. Open the summary statistics file + +In your file explorer, go to spras/output/basic/egfr-pathway-summary.txt and open it locally. + +.. image:: ../_static/images/summary-stats.png + :alt: description of the image + :align: center + +.. raw:: html + +
+ + +This file summarizes the graph topological statistics for each output pathway.txt file for a given dataset, +along with the parameter combinations that produced them, allowing you to interpret and compare algorithm outputs side by side in a compact format. + +Reviewing Outputs in Cytoscape +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Open Cytoscape + +Launch the Cytoscape application on your computer. + +2. Load the Cytoscape session file + +Navigate to spras/output/basic/egfr-cytoscape.cys and open it in Cytoscape. + +.. image:: ../_static/images/cytoscape_upload_network.png + :alt: description of the image + :width: 500 + :align: center + +.. raw:: html + +
+ +.. image:: ../_static/images/cytoscape-open-cys-file.png + :alt: description of the image + :width: 500 + :align: center + + +.. raw:: html + +
+ +Once loaded, the session will display all reconstructed subnetworks for a given dataset, organized by algorithm and parameter combination. + +.. image:: ../_static/images/cytoscape-opened.png + :alt: description of the image + :width: 500 + :align: center + +You can view and interact with each reconstructed subnetwork. Compare how the different parameter settings influence the pathways generated. + +The small parameter value (k=1) produced a compact subnetwork: + +.. image:: ../_static/images/1_pathway.png + :alt: description of the image + :width: 400 + :align: center + +.. raw:: html + +
+ + +The moderate parameter value (k=10) expanded the subnetwork, introducing additional nodes and edges that may uncover new connections: + +.. image:: ../_static/images/10_pathway.png + :alt: description of the image + :width: 600 + :align: center + +.. raw:: html + +
+ +The large parameter value (k=100) generates a much denser subnetwork, capturing a broader range of edges but also could introduce connections that may be less meaningful: + +.. image:: ../_static/images/100_pathway.png + :alt: description of the image + :width: 600 + :align: center + +.. raw:: html + +
+ +The parameters used here help determine which edges and nodes are included; each setting produces a different subnetwork. +By examining the statistics (egfr-pathway-summary.txt) alongside the visualizations (Cytoscape), you can assess how parameter choices influence both the structure and interpretability of the outputs. \ No newline at end of file diff --git a/docs/tutorial/intermediate.rst b/docs/tutorial/intermediate.rst new file mode 100644 index 000000000..007adb919 --- /dev/null +++ b/docs/tutorial/intermediate.rst @@ -0,0 +1,716 @@ +########################################################## +Intermediate Tutorial - Custom Data & Multi-Algorithm Runs +########################################################## + +This tutorial builds on the introduction to SPRAS from the previous tutorial. +It guides participants through how to convert data into a format usable by pathway reconstruction algorithms, run multiple algorithms within a single workflow, and apply new tools to interpret and compare the resulting pathways. + +You will learn how to: + +- Prepare and format data for use with SPRAS +- Configure and run additional pathway reconstruction algorithms on a dataset +- Enable post-analysis steps to generate post analysis information + +Step 1: Transforming Data into SPRAS-Compatible Inputs +====================================================== + +1.1 Understanding the Data +------------------------------------------------------------------- + +We start with mass spectrometry data containing three biological replicates, each with two technical replicates (IMAC and IP). +ADD THAT WE COMBINE THESE TOGETHER +Each replicate measures peptide abundance across multiple time points (0 to 124 minutes). + +Show images and charts as to what is changing instead of giving the code + +The goal is to turn this experimental data into the format that SPRAS expects; +a list of proteins with associated prizes and a defined set of source and target proteins. 
+ + +1.2 Filtering and Normalizing the Replicates +------------------------------------------------------------------- + +When working with multiple replicates, we want to ensure that all of the measured peptides are present in all three replicates. +This guarantees consistent observation of the peptides across experiments. + +After removing the peptides that are not in all three replicates, each replicate needs to be renormalized to ensure it is internally consistent and comparable, reducing bias from replicate-specific intensity differences. + +1.3 Detecting Significant Changes using Tukey's HSD Test +-------------------------------------------------------------- + +After filtering and renormalizing, Tukey's Honest Significant Difference (HSD) test is performed for each peptide. + +Tukey's HSD evaluates the significance of differences in mean peptide intensities across all pairs of time points while correcting for multiple comparisons within each peptide's time course. + +For each peptide, Tukey's HSD reports a p-value for every pair of time points, representing how likely it is that the observed difference in abundance occurred by chance across the three biological replicates. +Lower p-values indicate stronger evidence that a peptide's abundance truly changes between those time points. + + +1.4 From p-values to Prizes for Pathway Reconstruction +-------------------------------------------------------- + +In SPRAS, prizes quantify how “interesting” a protein is to a given condition. +Peptides with low p-values reflect statistically significant changes and are therefore likely to represent biologically active or perturbed proteins of interest to use for pathway reconstruction. + +We transform the p-values into scores that capture statistically significant changes across replicates using the transformation -log10(p-value). +This produces higher scores for smaller p-values, highlighting peptides with stronger changes over time.
+ +To compute these scores, we identify the smallest p-value across all relevant time comparisons for each peptide. +The relevant comparisons include each time point versus the baseline (0 min) and each pair of consecutive time points. + +We then apply the -log10 transformation to the smallest p-value for each peptide to obtain a positive prize score, where smaller p-values yield higher scores. +This process generates a peptide-level prize table that quantifies how strongly each peptide responds over time. + +1.5 Aggregating Prizes at the Protein Level +-------------------------------------------- + +Multiple peptides can map to the same protein in this data, so for each protein we keep the maximum prize among all its peptides, representing the strongest observed response. + +We also convert the protein identifiers to UniProt Entry Names to ensure consistency across the other data sources that will be used, allowing all data components to align within the same naming space. + +1.6 From Prizes to Source and Targets / Actives +----------------------------------------------- + +.. TODO: add the EGFR pathway (cite it) + + +After assigning protein-level prizes, the next step is to define sources, targets, and actives for use in pathway reconstruction. + +We use prior biological knowledge to guide this. +For example, in the EGFR signaling pathway, EGF acts as the initiating signal and EGFR as its receptor. +We can set EGF as the source (with the highest prize score) and EGFR as a target (with the second-highest score). +All other pathway proteins are treated as targets (with the score set from the previous step), since they represent downstream components influenced by EGF-EGFR signaling. + +Finally, actives refer to nodes in a biological network that are significantly “on” or highly active under a given biological condition. +In this context, all proteins chosen can be considered active since they are active under the given biological condition.
+ + +1.7 Combining the data into SPRAS-standardized data +--------------------------------------------------- + + +1.8 Finding an Interactome to use +---------------------------------- + +Next, we need to define the interactome, the background protein-protein interaction (PPI) network used by pathway reconstruction algorithms to identify connections between sources and targets, prizes, and actives. + +Databases, such as STRING, contain interactomes that represent known interactions between proteins. + + +However, for this analysis, we use a human PPI network compiled from two sources: + +- iRefIndex (version 13.0), containing 159,095 undirected interactions, and +- PhosphoSitePlus, containing 4,080 directed kinase–substrate interactions. + +We merge the two sources by prioritizing directed edges wherever possible and otherwise keeping the undirected edges. +The final network contains 15,677 proteins, 157,984 undirected interactions, and 3,917 directed interactions, using UniProt Entry Names for the identifiers of the nodes. + +This interactome includes 653 of the 701 proteins with mass spectrometry-based prizes. + + +This data is already saved in SPRAS: + + +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... + ├── config/ + │ └── ... + ├── inputs/ + │ ├── THE DATA + │ └── THE NETWORK + ├── outputs/ + │ └── basic/ + │ └── ... output files ... + + + +Step 2: Adding multiple PRAs to the workflow +============================================= + +Now that we've prepared our input data, we can begin running multiple pathway reconstruction algorithms on it. + +For this part of the tutorial, we'll use a pre-defined configuration file that includes additional algorithms and post-analysis steps available in SPRAS. +Download it here: :download:`Intermediate Config File <../_static/config/intermediate.yaml>` + +Save the file into the config/ folder of your SPRAS installation.
+ +After adding this file, SPRAS will use the configuration to set up and reference your directory structure, which will look like this: + +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... + ├── config/ + │ ├── basic.yaml + │ └── intermediate.yaml + ├── inputs/ + │ ├── THE DATA + │ └── THE NETWORK + ├── outputs/ + │ └── basic/ + │ └── ... output files ... + + +2.1 Supported Algorithms in SPRAS +--------------------------------- + +SPRAS supports a wide range of algorithms, each designed around different biological assumptions and optimization strategies: + +- PathLinker +- Omics Integrator 1 +- Omics Integrator 2 +- MEO +- Minimum-Cost Flow +- All pairs shortest paths +- Domino +- Source-Targets Random Walk with Restarts +- Random Walk with Restarts +- BowTieBuilder (Not optimized for large datasets; slower on big networks) +- ResponseNet + +Wrapped Algorithms +^^^^^^^^^^^^^^^^^^^ +Each algorithm has been wrapped by SPRAS. +Wrapping an algorithm in SPRAS involves three main steps: + +1. Input generation: SPRAS creates and formats the input files required by the algorithm based on the provided dataset. +2. Execution: SPRAS runs the algorithm within its corresponding Docker container, which holds the algorithm code. This is called for each specified parameter combination in the configuration file. +3. Output standardization: The raw outputs are converted into a standardized SPRAS format. + +Inputs +^^^^^^^ +These pathway reconstruction algorithms differ in the input nodes they require and how they interpret those nodes to identify subnetworks. +Some use source and target nodes to connect predefined start and end points, others use prizes, which are scores assigned to nodes of interest, and some rely on active nodes that represent proteins or genes significantly “on” or perturbed under specific biological conditions.
+ +Along with differences in their input nodes, these algorithms also interpret the input interactome differently. +Some can handle directed graphs, others work only with undirected graphs, and a few support mixed directionality graphs. + +Parameters +^^^^^^^^^^ +Each algorithm also exposes its own set of parameters that control its optimization strategy. +Some algorithms have no adjustable parameters, while others include multiple tunable settings that influence how subnetworks are created. +These parameters vary widely between algorithms and reflect the unique optimization techniques each method employs under the hood. + +2.3 Running SPRAS with Multiple Algorithms +------------------------------------------ +The intermediate.yaml configuration file is set up to have SPRAS run multiple algorithms (all of the algorithms supported in SPRAS except BowTieBuilder) with multiple parameter settings (if available) on one dataset. + +From the root directory spras/, run the command below from the command line: + +.. code:: bash + + snakemake --cores 4 --configfile config/intermediate.yaml + + +What Happens When You Run This Command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +SPRAS will run more slowly than it did with the beginner.yaml configuration. +The same automated steps as in beginner.yaml (managed by Snakemake and Docker) run behind the scenes for intermediate.yaml; however, this configuration now runs multiple algorithms with different parameter combinations, which takes longer to complete. +Increasing the number of cores to 4 allows Snakemake to parallelize the work locally, speeding up execution when possible. + +1. Snakemake starts the workflow + +Snakemake reads the options set in the intermediate.yaml configuration file and determines which datasets, algorithms, and parameter combinations need to run. It also checks if any post-analysis steps were requested. + +2.
Preparing the dataset + +SPRAS takes the interactome and node prize files specified in the configuration and bundles them into a Dataset object to be used for processing algorithm-specific inputs. +This object is stored as a .pickle file so it can be reused for other algorithms without re-processing it. + +3. Creating algorithm-specific inputs + +For each algorithm marked as include: true in the configuration, SPRAS generates input files tailored to that algorithm. +In this case, every algorithm is enabled, so SPRAS creates the files required for each algorithm. + +4. Organizing results with parameter hashes + +A folder is created for each <dataset>-<algorithm>-params-<hash> combination. +A matching log file in logs/parameters-<algorithm>-params-<hash>.yaml records the exact parameter values used. + +5. Running the algorithm + +SPRAS executes each algorithm by launching its corresponding Docker image multiple times (once for each parameter configuration). +During each run, SPRAS provides the prepared input files and the corresponding parameter settings to the container. Each algorithm then runs independently within its Docker environment and produces a raw pathway output file (raw-pathway.txt), which contains the reconstructed subnetwork in the algorithm's native format. + +6. Standardizing the results + +SPRAS parses each raw output into a standardized SPRAS format (pathway.txt). +This ensures that all algorithm outputs share one standardized format, because their native formats differ. + +7. Logging the Snakemake run + +Snakemake creates a dated log in .snakemake/log/. This log shows what rules ran and any errors that occurred during the SPRAS run. + + +What Your Directory Structure Should Look Like After This Run: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ...
+ ├── config/ + │ └── basic.yaml + ├── inputs/ + │ ├── phosphosite-irefindex13.0-uniprot.txt + │ └── tps-egfr-prizes.txt + ├── outputs/ + │ └── basic/ + │ └── dataset-egfr-merged.pickle + │ └── egfr-meo-params-FJBHHNE + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-meo-params-GKEDDFZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-meo-params-JQ4DL7K + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-meo-params-OXXIFMZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-mincostflow-params-42UBTQI + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-mincostflow-params-4G2PQRB + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator1-params-FZI2OGW + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator1-params-GUMLBDZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator1-params-PCWFPQW + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator2-params-EHHWPMD + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator2-params-IV3IPCJ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-4YXABT7 + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-7S4SLU6 + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-D4TUKMX + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-VQL7BDZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-rwr-params-34NN6EK + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-rwr-params-GGZCZBU + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-strwr-params-34NN6EK + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-strwr-params-GGZCZBU + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── logs + │ └── datasets-egfr.yaml + │ └── parameters-allpairs-params-BEH6YB2.yaml + │ └── parameters-domino-params-V3X4RW7.yaml + │ └── parameters-meo-params-FJBHHNE.yaml + │ └── parameters-meo-params-GKEDDFZ.yaml + │ └── 
parameters-meo-params-JQ4DL7K.yaml + │ └── parameters-meo-params-OXXIFMZ.yaml + │ └── parameters-mincostflow-params-42UBTQI.yaml + │ └── parameters-mincostflow-params-4G2PQRB.yaml + │ └── parameters-mincostflow-params-GGT4CVE.yaml + │ └── parameters-omicsintegrator1-params-FZI2OGW.yaml + │ └── parameters-omicsintegrator1-params-GUMLBDZ.yaml + │ └── parameters-omicsintegrator1-params-PCWFPQW.yaml + │ └── parameters-omicsintegrator2-params-EHHWPMD.yaml + │ └── parameters-omicsintegrator2-params-IV3IPCJ.yaml + │ └── parameters-pathlinker-params-4YXABT7.yaml + │ └── parameters-pathlinker-params-7S4SLU6.yaml + │ └── parameters-pathlinker-params-D4TUKMX.yaml + │ └── parameters-pathlinker-params-VQL7BDZ.yaml + │ └── parameters-rwr-params-34NN6EK.yaml + │ └── parameters-rwr-params-GGZCZBU.yaml + │ └── parameters-strwr-params-34NN6EK.yaml + │ └── parameters-strwr-params-GGZCZBU.yaml + │ └── prepared + │ └── egfr-domino-inputs + │ ├── active_genes.txt + │ └── network.txt + │ └── egfr-meo-inputs + │ ├── edges.txt + │ ├── sources.txt + │ └── targets.txt + │ └── egfr-mincostflow-inputs + │ ├── edges.txt + │ ├── sources.txt + │ └── targets.txt + │ └── egfr-omicsintegrator1-inputs + │ ├── dummy_nodes.txt + │ ├── edges.txt + │ └── prizes.txt + │ └── egfr-omicsintegrator2-inputs + │ ├── edges.txt + │ └── prizes.txt + │ └── egfr-pathlinker-inputs + │ ├── network.txt + │ ── nodetypes.txt + │ └── egfr-rwr-inputs + │ ├── network.txt + │ └── nodes.txt + │ └── egfr-strwr-inputs + | ├── network.txt + | ├── sources.txt + | └── targets.txt + +2.4 Reviewing the pathway.txt Files +------------------------------------------- +After running the intermediate configuration file, the output/intermediate/ directory will contain many more subfolders and files. + +Just like in the beginner tutorial, each algorithm's results can be found in the spras/output/intermediate/ directory. +Within it, you'll see subfolders corresponding to each dataset-algorithm-parameter combination. 
+Each folder contains a pathway.txt file that contains the standardized reconstructed subnetwork for that specific run. + +For example, the file egfr-mincostflow-params-42UBTQI/pathway.txt contains the following reconstructed subnetwork: + +.. code-block:: text + + Node1 Node2 Rank Direction + CBL_HUMAN EGFR_HUMAN 1 U + EGFR_HUMAN EGF_HUMAN 1 U + EMD_HUMAN LMNA_HUMAN 1 U + FYN_HUMAN KS6A3_HUMAN 1 U + EGF_HUMAN HDAC6_HUMAN 1 U + HDAC6_HUMAN HS90A_HUMAN 1 U + KS6A3_HUMAN SRC_HUMAN 1 U + EGF_HUMAN LMNA_HUMAN 1 U + MYH9_HUMAN S10A4_HUMAN 1 U + EGF_HUMAN S10A4_HUMAN 1 U + EMD_HUMAN SRC_HUMAN 1 U + + +And the file egfr-omicsintegrator1-params-GUMLBDZ/pathway.txt contains the following reconstructed subnetwork: + +.. code-block:: text + + Node1 Node2 Rank Direction + CBLB_HUMAN EGFR_HUMAN 1 U + CBL_HUMAN CD2AP_HUMAN 1 U + CBL_HUMAN CRKL_HUMAN 1 U + CBL_HUMAN EGFR_HUMAN 1 U + CBL_HUMAN PLCG1_HUMAN 1 U + CDK1_HUMAN NPM_HUMAN 1 D + CHD4_HUMAN HDAC2_HUMAN 1 U + EGFR_HUMAN EGF_HUMAN 1 U + EGFR_HUMAN GRB2_HUMAN 1 U + EIF3B_HUMAN EIF3G_HUMAN 1 U + FAK1_HUMAN PAXI_HUMAN 1 U + GAB1_HUMAN PTN11_HUMAN 1 U + GRB2_HUMAN PTN11_HUMAN 1 U + GRB2_HUMAN SHC1_HUMAN 1 U + HDAC2_HUMAN SIN3A_HUMAN 1 U + HGS_HUMAN STAM2_HUMAN 1 U + KS6A1_HUMAN MK01_HUMAN 1 U + MK01_HUMAN ABI1_HUMAN 1 D + MK01_HUMAN ERF_HUMAN 1 D + MRE11_HUMAN RAD50_HUMAN 1 U + + +As you explore more of these files, you'll notice that the subnetworks vary widely across algorithms and parameter settings. +While you can still open and inspect these files manually, the number of outputs is much greater than in the beginner.yaml run, making manual inspection less practical. +The pathway.txt outputs serve as the foundation for further post-analysis, where you can systematically compare and interpret the reconstructed networks in greater detail. + +In the next steps, we'll use SPRAS's post analysis tools to further explore and analyze these outputs. 
+ +Step 3: Use ML Post-Analysis +============================= + +To enable downstream analyses, update the analysis section in your configuration file by setting summary, cytoscape, and ml to true. Your analysis section in the configuration file should look like this: + +.. code-block:: text + + analysis: + ml: + include: true + +In this part of the tutorial, we're also including the machine learning (ml) section to enable machine learning-based post-analysis built within SPRAS. + +The ml analysis will perform unsupervised analyses such as Principal Component Analysis (PCA), Hierarchical Agglomerative Clustering (HAC), ensembling, and Jaccard similarity comparisons of the pathways. +These analyses help uncover patterns and similarities between different algorithms run on a given dataset. + +- If aggregate_per_algorithm is set to true, it additionally groups outputs by algorithm within each dataset to uncover patterns and similarities for an algorithm. +- The ml section includes configurable parameters that let you adjust the behavior of the ml analyses performed. + +With these updates, SPRAS will run the full set of unsupervised machine learning analyses across all outputs for a given dataset. + +After saving the changes in the configuration file, rerun with: + +.. code:: bash + + snakemake --cores 4 --configfile config/intermediate.yaml + +What Happens When You Run This Command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +1. Reusing cached results + +Snakemake reads the options set in intermediate.yaml and checks for any requested post-analysis steps. +It reuses cached results; in this case, the pathway.txt files generated from the previously executed algorithms + parameter combinations on the egfr dataset. + +2. Running the ml analysis + +SPRAS aggregates all files generated for a dataset. +These groupings include all the reconstructed subnetworks produced across algorithms for a given dataset (and, if enabled, grouped outputs per algorithm for a given dataset).
+SPRAS then performs all machine learning analyses on each grouping and saves the results in the dataset-ml/ directory. + + +What Your Directory Structure Should Look Like After This Run: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: text + + spras/ + ├── .snakemake/ + │ └── log/ + │ └── ... snakemake log files ... + ├── config/ + │ └── basic.yaml + ├── inputs/ + │ ├── phosphosite-irefindex13.0-uniprot.txt + │ └── tps-egfr-prizes.txt + ├── outputs/ + │ └── basic/ + │ └── dataset-egfr-merged.pickle + │ └── egfr-meo-params-FJBHHNE + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-meo-params-GKEDDFZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-meo-params-JQ4DL7K + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-meo-params-OXXIFMZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-mincostflow-params-42UBTQI + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-mincostflow-params-4G2PQRB + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator1-params-FZI2OGW + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator1-params-GUMLBDZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator1-params-PCWFPQW + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator2-params-EHHWPMD + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-omicsintegrator2-params-IV3IPCJ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-4YXABT7 + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-7S4SLU6 + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-D4TUKMX + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-pathlinker-params-VQL7BDZ + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-rwr-params-34NN6EK + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-rwr-params-GGZCZBU + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-strwr-params-34NN6EK + │ └── pathway.txt + │ └──
raw-pathway.txt + │ └── egfr-strwr-params-GGZCZBU + │ └── pathway.txt + │ └── raw-pathway.txt + │ └── egfr-ml + │ └── ensemble-pathway.txt + │ └── hac-clusters-horizontal.txt + │ └── hac-clusters-vertical.txt + │ └── hac-horizontal.png + │ └── hac-vertical.png + │ └── jaccard-heatmap.png + │ └── jaccard-matrix.txt + │ └── pca-coordinates.txt + │ └── pca-variance.txt + │ └── pca.png + │ └── logs + │ └── datasets-egfr.yaml + │ └── parameters-allpairs-params-BEH6YB2.yaml + │ └── parameters-domino-params-V3X4RW7.yaml + │ └── parameters-meo-params-FJBHHNE.yaml + │ └── parameters-meo-params-GKEDDFZ.yaml + │ └── parameters-meo-params-JQ4DL7K.yaml + │ └── parameters-meo-params-OXXIFMZ.yaml + │ └── parameters-mincostflow-params-42UBTQI.yaml + │ └── parameters-mincostflow-params-4G2PQRB.yaml + │ └── parameters-mincostflow-params-GGT4CVE.yaml + │ └── parameters-omicsintegrator1-params-FZI2OGW.yaml + │ └── parameters-omicsintegrator1-params-GUMLBDZ.yaml + │ └── parameters-omicsintegrator1-params-PCWFPQW.yaml + │ └── parameters-omicsintegrator2-params-EHHWPMD.yaml + │ └── parameters-omicsintegrator2-params-IV3IPCJ.yaml + │ └── parameters-pathlinker-params-4YXABT7.yaml + │ └── parameters-pathlinker-params-7S4SLU6.yaml + │ └── parameters-pathlinker-params-D4TUKMX.yaml + │ └── parameters-pathlinker-params-VQL7BDZ.yaml + │ └── parameters-rwr-params-34NN6EK.yaml + │ └── parameters-rwr-params-GGZCZBU.yaml + │ └── parameters-strwr-params-34NN6EK.yaml + │ └── parameters-strwr-params-GGZCZBU.yaml + │ └── prepared + │ └── egfr-domino-inputs + │ ├── active_genes.txt + │ └── network.txt + │ └── egfr-meo-inputs + │ ├── edges.txt + │ ├── sources.txt + │ └── targets.txt + │ └── egfr-mincostflow-inputs + │ ├── edges.txt + │ ├── sources.txt + │ └── targets.txt + │ └── egfr-omicsintegrator1-inputs + │ ├── dummy_nodes.txt + │ ├── edges.txt + │ └── prizes.txt + │ └── egfr-omicsintegrator2-inputs + │ ├── edges.txt + │ └── prizes.txt + │ └── egfr-pathlinker-inputs + │ ├── network.txt + │ ── 
nodetypes.txt + │ └── egfr-rwr-inputs + │ ├── network.txt + │ └── nodes.txt + │ └── egfr-strwr-inputs + | ├── network.txt + | ├── sources.txt + | └── targets.txt + +Step 3.1: Reviewing the Outputs +-------------------------------- + +Ensembles +^^^^^^^^^ +After running multiple algorithms or parameter settings on the same dataset, SPRAS can ensemble the resulting pathways to identify consistent, high-confidence interactions. + +Each pathway output is represented as a binary edge list (1 = edge present, 0 = edge absent). +SPRAS calculates the mean of these binary values across all runs to determine the edge frequency (the proportion of times each edge appears across the outputs). +Edges that occur more often are considered more robust and can be used to build a consensus network. + + +.. code-block:: text + + Node1 Node2 Frequency Direction + EGF_HUMAN EGFR_HUMAN 0.42857142857142855 D + EGF_HUMAN S10A4_HUMAN 0.38095238095238093 D + S10A4_HUMAN MYH9_HUMAN 0.38095238095238093 D + K7PPA8_HUMAN MDM2_HUMAN 0.09523809523809523 D + MDM2_HUMAN P53_HUMAN 0.19047619047619047 D + S10A4_HUMAN K7PPA8_HUMAN 0.19047619047619047 D + K7PPA8_HUMAN SIR1_HUMAN 0.19047619047619047 D + MDM2_HUMAN MDM4_HUMAN 0.09523809523809523 D + MDM4_HUMAN P53_HUMAN 0.09523809523809523 D + CD2A2_HUMAN CDK4_HUMAN 0.09523809523809523 D + CDK4_HUMAN RB_HUMAN 0.09523809523809523 D + MDM2_HUMAN CD2A2_HUMAN 0.09523809523809523 D + EP300_HUMAN P53_HUMAN 0.2857142857142857 D + K7PPA8_HUMAN EP300_HUMAN 0.09523809523809523 D + ... + +High frequency edges indicate interactions consistently recovered by multiple algorithms, suggesting stronger biological relevance. +Low frequency edges may reflect noise or algorithm-specific connections. + +HAC +^^^ +SPRAS includes Hierarchical Agglomerative Clustering (HAC) to group similar pathways outputs based on shared edges. +This helps identify clusters of algorithms that produce comparable subnetworks and highlights distinct reconstruction behaviors. 
+ +In the plots below, each branch represents a cluster of related pathways. +Shorter distances between branches indicate greater similarity. + +.. image:: ../_static/images/hac-horizontal.png + :alt: Horizontal HAC plot of the pathway outputs + :width: 500 + :align: center + +.. raw:: html + +
+ +.. image:: ../_static/images/hac-vertical.png + :alt: Vertical HAC plot of the pathway outputs + :width: 300 + :align: center + +.. raw:: html + +
+ +HAC visualizations help compare which algorithms and parameter settings produce similar pathway structures. +Tight clusters indicate similar behavior, while isolated branches may reveal unique or outlier results. + +PCA +^^^ +SPRAS also includes Principal Component Analysis (PCA) to visualize variation across pathway outputs. +Each point represents a pathway, placed based on its overall network structure. +Pathways that cluster together in PCA space are more similar, while those farther apart differ in their reconstructed subnetworks. + +.. image:: ../_static/images/pca.png + :alt: PCA plot of the pathway outputs + :width: 500 + :align: center + +.. raw:: html + +
+ +PCA can help identify patterns such as clusters of similar algorithms, parameter sensitivities, or outlier outputs. + +Jaccard Similarity +^^^^^^^^^^^^^^^^^^ + +SPRAS computes pairwise Jaccard similarity between pathway outputs to measure how much overlap exists between their reconstructed subnetworks. +The Jaccard index is calculated from the binary edge representation of each pathway and reflects the proportion of shared edges between two pathways relative to their total combined edges. + +Higher similarity values indicate that pathways share many of the same interactions, while lower values suggest distinct or divergent reconstructions. + +.. image:: ../_static/images/jaccard-heatmap.png + :alt: Heatmap of pairwise Jaccard similarity between pathway outputs + :width: 500 + :align: center + +.. raw:: html + +
+ +The heatmap visualizes how similar the output pathways are between algorithms and parameter settings. \ No newline at end of file diff --git a/docs/tutorial/introduction.rst b/docs/tutorial/introduction.rst new file mode 100644 index 000000000..bf1c1bd8f --- /dev/null +++ b/docs/tutorial/introduction.rst @@ -0,0 +1,62 @@ +############## +SPRAS Tutorial +############## + +Purpose of this tutorial +======================== +This tutorial will introduce participants to SPRAS and demonstrate how it can be used to explore biological pathways from omics data. + +Together, we will cover: + +1. How to set up and run SPRAS +2. Running multiple algorithms with different parameters across one dataset +3. Using the post-analysis tools to evaluate and compare results +4. Building datasets for analysis +5. Other things you can do with SPRAS + +Prerequisites for this tutorial +=============================== +Required software: + +- `Conda `__ : for managing environments +- `Docker `__ : for containerized runs +- `Cytoscape `__: for visualizing networks (download locally, the web version will not suffice) +- `Git `__: for cloning the SPRAS repository +- A terminal or code editor (`VS Code `__ is recommended, but any terminal will work) + +Required knowledge: + +- Basic Python skills +- Basic biology concepts + +############### +SPRAS Overview +############### + +What is pathway reconstruction? +=============================== +Pathway reconstruction is a computational approach used in biology to propose candidate biological pathways (such as signaling pathways) from high-throughput experimental data. + +Curated pathway databases provide references to pathways, but they are often generalized and may not capture the context-specific details relevant to a particular disease or experimental condition.
+To address this, pathway reconstruction algorithms help map molecules of interest (such as proteins, genes, or metabolites identified in omics experiments) onto large-scale interaction networks, called interactomes (graphs of molecular interactions in a cell). +The result is a customized subnetwork (pathway) that reflects the biology of the specific experiment or condition. + +Why use pathway reconstruction? +=============================== +Pathway reconstruction algorithms allow researchers to systematically propose context-specific subnetworks without performing exhaustive experiments testing each individual interaction. +Different algorithms use distinct computational strategies and parameters, providing flexibility to highlight various aspects of the underlying biology and generate new, testable hypotheses. This lets researchers create and identify different subnetworks specific to their experimental conditions. + +What is SPRAS? +=============== +The Signaling Pathway Reconstruction Analysis Streamliner (SPRAS) is a computational framework that unifies, standardizes, and streamlines the use of diverse pathway reconstruction algorithms. + +SPRAS provides an abstraction layer for pathway reconstruction algorithms by organizing every step into a unified schema. It uses workflow management (Snakemake), containerization, and config-driven runs to build modular and interoperable pipelines that cover the entire process: + +1. Pre-processing of data +2. Algorithm execution +3. Post-processing of results +4. Downstream analysis and evaluation + +A key strength of SPRAS is automation. From user-provided input data and configurations, it can generate and execute complete workflows without requiring users to write complex scripts. This lowers the barrier to entry, allowing researchers to apply, evaluate, and compare multiple pathway reconstruction algorithms without deep computational expertise.
+ +SPRAS also supports scalable analyses, making it especially valuable for a large number of datasets and systematic investigations. In addition, it provides built-in evaluation and post-analysis tools that offer further insight into the algorithm outputs. \ No newline at end of file diff --git a/docs/tutorial/planning.txt b/docs/tutorial/planning.txt new file mode 100644 index 000000000..f45c57ed8 --- /dev/null +++ b/docs/tutorial/planning.txt @@ -0,0 +1,95 @@ +My current plan for my COMBINE 25 tutorial (that will then be used for the spras doc tutorials) +- I will be testing this on a user that has basic python knowledge but doesn't know spras + +*Tony will be giving a presentation on PRAs and SPRAS prior (hopefully) to this tutorial + +0) +- need to preinstall conda, docker, vscode(?) (or run on terminal), git, and cytoscape prior + - a few minimum dependencies like Docker and conda and git installed already, the rest are optional but recommended +- Basic Python knowledge (running scripts, installing packages) +- Some basic biology knowledge: what is a protein, a protein interaction etc +Overview of PRAs (Pathway Reconstruction Algorithms) in plain language +What SPRAS is and what problems it solves + +My plan is to create subfolders within the output directory so that results are separated by configuration or tutorial section (right now just "basic" and "medium"). +I rely on subfolders when I’m running many different SPRAS tests across datasets or testing different things with different configs, since it helps keep everything organized which is what will be used here. +In the basic case, we will be doing caching for the egfr dataset in its own subfolder. In the medium case, we will be building a new dataset and then running on that in its own subfolder.
+ + +1) basic +Goal: Get new users comfortable installing SPRAS +Goal: Run SPRAS with one algorithm on a small example dataset, then re-run it with three different parameter settings to see how network structure changes. + +Slides and/or Information to add to docs directly: +Installation & environment setup (Docker, Anaconda, and Cytoscape locally) +SPRAS directory structure for a user & configuration files +- config folder +- input folder +- output folder + - can control the creation of subfolders of outputs in the config file + - might need to be in a separate step/slide +Running SPRAS on a provided example dataset +- set up one algorithm and run +- then run one algorithm with 3 different preset parameter settings +Understanding the outputs +- show the output and structure +- show it visualized +Viewing logs and monitoring runs + +Things to make: +- need one small dataset to run on (either make a dummy one or just use egfr) + - Building around EGFR will be the least amount of work, and that pathway may be a good fit for the COMBINE audience +- basic-config + +2) medium +Goal: Run SPRAS with a couple more algorithms with many parameter settings on a smaller and larger example dataset to see how network structure changes. 
+Goal: Use more of the post analysis tools +- summary stats +- ML + +Slides and/or Information to add to docs directly: +For the larger dataset, teach a user how to turn data into the input structure we require +- Formatting prize/source/target files and interactome files & explaining biological meaning +Adding multiple PRAs to the workflow +- preset parameters +Use/Show summary stats and ML code +- will be useful for parameter tuning (the next tutorial) + + +Things to make: +- medium-config +- need raw data to make the larger example + - can use panther pathways + + +3) hard +Goal: Run SPRAS with all algorithms with a larger example dataset to create reproducible benchmarking experiments using gold standard data. 
+Goal: use evaluation code +Goal: learn how to parameter tune + +Slides and/or Information to add to docs directly: +Configure the config file with larger dataset +- set all Algorithms datasets, gold standards +Explain parameter tuning +Teach parameter tuning +- Define coarse parameter spaces and define heuristics +- Define fine parameter spaces after and heuristics +- repeat +- show how past post analysis tools help with finding parameters + +(maybe I show an example of this rather than doing it in a way the tutorial people do it) +(then people can copy and paste it in and experience it?) + +Explain parameter selection +Use the evaluation code and understanding the outputs +- Per-pathway precision and recall plots / pca chosen and pr curves for ensemble +- Where PCA-based chosen params land vs the full grid +Agreement/disagreement between algorithms (heatmaps) using ML + +Things to make: +- hard-config +- make a hard-config with tuned parameters to provide after people attempt tuning + + +Stuff I need to figure out: +# test these tutorials for Mac and Windows users \ No newline at end of file