diff --git a/.gitignore b/.gitignore index 0f90282a..e82a0fd2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ Output/ .ipynb_checkpoints* __pycache__* *.egg-info* +.eggs* dist/* # typical latex tmp files diff --git a/.travis.yml b/.travis.yml index f0c94de7..1de3453d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,18 +17,17 @@ install: - conda config --set always_yes yes --set changeps1 no - conda update -q conda - - conda config --add channels bioconda - - conda config --add channels conda-forge - - # build package with cond - - conda install conda-build - - conda build conda.recipe --output-folder=$HOME/build - - conda config --add channels "file://${HOME}/build" - - # test package + # install dependencies - source $HOME/miniconda/etc/profile.d/conda.sh - - conda create -q -n drop drop_travis + - conda create -q -n drop -c conda-forge -c bioconda python=$TRAVIS_PYTHON_VERSION r-base=4.0.2 - conda activate drop + - conda install -c conda-forge -c bioconda drop + - conda remove --force drop wbuild + - conda install -c conda-forge -c bioconda wbuild=1.7.1 + - pip install -r requirements_test.txt + + # install drop + - pip install . -vv script: - conda list @@ -37,10 +36,7 @@ script: - samtools --version - bcftools --version - drop --version + - wbuild --version - python --version - - - mkdir drop_demo - - cd drop_demo - - drop demo - - snakemake -n - - snakemake --jobs 2 --cores 2 + + - pytest -vv -s diff --git a/README.md b/README.md index 017d6fb3..b829b846 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Detection of RNA Outlier Pipeline [![Pipeline status](https://travis-ci.org/gagneurlab/drop.svg?branch=master)](https://travis-ci.org/gagneurlab/drop) -[![Version](https://img.shields.io/badge/Version-0.9.2-green.svg)](https://github.com/gagneurlab/drop/releases/tag/0.9.2) +[![Version](https://img.shields.io/github/v/release/gagneurlab/drop?include_prereleases)](https://github.com/gagneurlab/drop/releases) [![Version](https://readthedocs.org/projects/gagneurlab-drop/badge/?version=latest)](https://gagneurlab-drop.readthedocs.io/en/latest) The manuscript main file, supplementary figures and table can be found in the manuscript folder or in @@ -51,7 +51,8 @@ snakemake aberrantExpression -n ``` ## Datasets -The following publicly-available datasets of gene counts can be used as controls: +The following publicly-available datasets of gene counts can be used as controls. +Please cite as instructed for each dataset. * 119 non-strand specific fibroblasts: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3887451.svg)](https://doi.org/10.5281/zenodo.3887451) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml deleted file mode 100644 index 1538f925..00000000 --- a/conda.recipe/meta.yaml +++ /dev/null @@ -1,91 +0,0 @@ -package: - name: "drop_travis" - version: "0.9.2" - -source: - path: .. - -build: - number: 0 - noarch: python - entry_points: - - drop=drop.cli:main - script: "{{ PYTHON }} -m pip install . -vv" - -requirements: - host: - - python >=3.6 - - pip - - wbuild =1.7.0 - - run: - - python >=3.6 - - pandas - - Click >=7.0 - - click-log - - python-dateutil - - # snakemake/wbuild - - snakemake >=5.5.2 - - wbuild =1.7.0 - - pandoc - - graphviz - - # command line tools - - tabix - - samtools >=1.7 - - bcftools >=1.7 - - gatk4 >=4.0.4 - - star >=2.7 - - # R dependencies - - r-base>=4.0.0 - - r-rmarkdown - - r-knitr - - r-ggplot2 - - r-ggthemes - - r-cowplot - - r-data.table - - r-dplyr - - r-tidyr - - r-magrittr - - r-devtools - - r-tmae - - # bioconductor packages - - bioconductor-deseq2 - - bioconductor-GenomicScores - - bioconductor-outrider - - bioconductor-fraser - - bioconductor-variantannotation - - bioconductor-bsgenome.hsapiens.ucsc.hg19 - #- bioconductor-mafdb.gnomad.r2.1.hs37d5 - #- bioconductor-mafdb.gnomad.r2.1.grch38 - -test: - imports: - - drop - commands: - - drop --help - requires: - - pytest - -about: - home: https://github.com/gagneurlab/drop - license: MIT - license_family: OTHER - summary: Detection of RNA Outlier Pipeline - doc_url: https://gagneurlab-drop.readthedocs.io/en/latest/ - dev_url: https://github.com/gagneurlab/drop - -extra: - container: - # click requires a unicode locale when used with Python 3 - # extended-base generates en_US.UTF-8 locale and sets LC_ALL, LANG properly - extended-base: true - identifiers: - - https://dx.doi.org/10.21203/rs.2.19080/v1 - recipe-maintainers: - - c-mertes - - mumichae - diff --git a/docs/source/conf.py b/docs/source/conf.py index 827fd0ef..eedad33e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ author = 'Michaela Müller' # The full version, including alpha/beta/rc tags -release = '0.9.2' +release = '1.0.0' # -- General configuration --------------------------------------------------- diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 80742309..e166c73f 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -101,7 +101,7 @@ Alternatively, DROP can be installed without ``conda``. In this case the followi * `python `_ >= 3.6 and `pip `_ >= 19.1 - * `R `_ >= 3.6 and corresponding `bioconductor `_ version + * `R `_ >= 3.6, <=4.0.2 and corresponding `bioconductor `_ version * Commandline tools: diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst index 95713613..e0b10c19 100644 --- a/docs/source/prepare.rst +++ b/docs/source/prepare.rst @@ -39,15 +39,13 @@ Parameter Type Description =================== ========== ======================================================================================================================================= ====== projectTitle character Title of the project to be displayed on the rendered HTML output ``Project 1`` htmlOutputPath character Full path of the folder where the HTML files are rendered ``/data/project1/htmlOutput`` -indexWithFolderName boolean variable needed for wBuild, do not edit it ``true`` -fileRegex character variable needed for wBuild, do not edit it ``.*\.R`` +indexWithFolderName boolean If true, the basename of the project directory will be used as prefix for the index.html file ``true`` genomeAssembly character Either hg19 or hg38, depending on the genome assembly used for mapping ``/data/project1`` sampleAnnotation character Full path of the sample annotation table ``/data/project1/sample_annotation.tsv`` root character Full path of the folder where the subdirectories processed_data and processed_results will be created containing DROP's output files. ``/data/project1`` geneAnnotation dictionary A key-value list of the annotation name (key) and the full path to the GTF file (value). More than one annotation file can be provided. ``anno1: /path/to/gtf1.gtf`` ``anno2: /path/to/gtf2.gtf`` -scanBamParam character Either null or the path to an Rds file containing a scanBamParam object. Refer to the advanced options below. ``/path/to/scanBamParam.Rds`` tools dictionary A key-value list of different commands (key) and the command (value) to run them ``gatkCmd: gatk`` ``bcftoolsCmd: bcftools`` @@ -126,10 +124,19 @@ qcGroups list Same as “groups”, but for the VCF-BAM matc Creating the sample annotation table ------------------------------------ -For details on how to generate the sample annotation, please refer to the DROP manuscript. -Here we provide some examples on how to deal with certain situations. For simplicity, we -do not include the other compulsory columns ``PAIRED_END``, ``COUNT_MODE``, -``COUNT_OVERLAPS`` and ``STRAND``. +For a detailed explanation of the columns of the sample annotation, please refer to +the DROP manuscript. +Inside the sample annotation, each row corresponds to a unique pair of RNA and DNA +samples derived from the same individual. An RNA assay can belong to one or more DNA +assays, and vice-versa. If so, they must be specified in different rows. The required +columns are ``RNA_ID``, ``RNA_BAM_FILE`` and ``DROP_GROUP``, plus other module-specific +ones (see DROP manuscript). In case external counts are included, add a new row for each +sample from those files (or a subset if not all samples are needed). + +The sample annotation file should be saved in the tab-separated values (tsv) format. The +column order does not matter. Also, it does not matter where it is stored, as the path is +specified in the config file. Here we provide some examples on how to deal with certain +situations. For simplicity, we do not include all possible columns in the examples. Example of RNA replicates ++++++++++++++++++++++++++++++++++ @@ -144,22 +151,41 @@ S10R_M S10G MUSCLE /path/to/S10R_M.BAM /path/to/S10G.vcf.gz Example of DNA replicates ++++++++++++++++++++++++++++++++++ -====== ====== ========== =================== == -RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE -====== ====== ========== =================== == -S20R S20E WES /path/to/S20R.BAM /path/to/S20E.vcf.gz -S20R S20G WGS /path/to/S20R.BAM /path/to/S20G.vcf.gz -====== ====== ========== =================== == +====== ====== ========== ================= == +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE +====== ====== ========== ================= == +S20R S20E WES /path/to/S20R.BAM /path/to/S20E.vcf.gz +S20R S20G WGS /path/to/S20R.BAM /path/to/S20G.vcf.gz +====== ====== ========== ================= == Example of a multi-sample vcf file ++++++++++++++++++++++++++++++++++ -====== ====== ========== =================== == -RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE -====== ====== ========== =================== == -S10R S10G WGS /path/to/S10R.BAM /path/to/multi_sample.vcf.gz -S20R S20G WGS /path/to/S20R.BAM /path/to/multi_sample.vcf.gz -====== ====== ========== =================== == +====== ====== ========== ================= == +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE +====== ====== ========== ================= == +S10R S10G WGS /path/to/S10R.BAM /path/to/multi_sample.vcf.gz +S20R S20G WGS /path/to/S20R.BAM /path/to/multi_sample.vcf.gz +====== ====== ========== ================= == + +External count matrices ++++++++++++++++++++++++ + +In case counts from external matrices are to be integrated into the analysis, +the file must be specified in the GENE_COUNTS_FILE column. A new row must be +added for each sample from the count matrix that should be included in the +analysis. An RNA_BAM_FILE must not be specified. The DROP_GROUP of the local +and external samples that are to be analyzed together must be the same. +Similarly, the GENE_ANNOTATION of the external counts and the key of the `geneAnnotation` +parameter from the config file must match. + +====== ====== ========== ================= == +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE GENE_COUNTS_FILE +====== ====== ========== ================= == +S10R S10G BLOOD /path/to/S10R.BAM +EXT-1R BLOOD /path/to/externalCounts.tsv.gz +EXT-2R BLOOD /path/to/externalCounts.tsv.gz +====== ====== ========== ================= == Advanced options @@ -183,12 +209,4 @@ We recommend the search space to be at most N/3 for the aberrant expression, and N/6 for the aberrant splicing case. Nevertheless, the user can specify the denominator with the parameter ``maxTestedDimensionProportion``. -In order to influence which fields of the BAM files are imported, the user can -provide a ``scanBamParam`` object. This will affect how the files are counted in -the aberrant expression and splicing modules. Refer to the function's -`documentation `_ for details. - - - - diff --git a/drop/GeneticDiagnosis_Demo.R b/drop/GeneticDiagnosis_Demo.R deleted file mode 100644 index dee82bee..00000000 --- a/drop/GeneticDiagnosis_Demo.R +++ /dev/null @@ -1,26 +0,0 @@ -#'--- -#' title: Genetic Diagnosis - Pipeline steps in Demo Project -#' author: salazar -#' wb: -#' input: -#' - indexFile: '`sm parser.getProcDataDir() + "/indexNames.txt" `' -#' output: -#' html_document: -#' code_folding: hide -#' code_download: TRUE -#'--- - - -#' # Links for Pipeline Steps - -indexNames <- scan(snakemake@input$indexFile, character(), quote = "", sep='\n') -pipeline_names <- gsub("index.html", "", indexNames) -pipeline_names <- gsub("-", " ",pipeline_names) -pipeline_names <- toupper(gsub("_", " ",pipeline_names)) - -links <- paste('[', pipeline_names ,"](./", indexNames, '){target="_blank"}', sep = '') -links <- paste(links, sep = '\n') - - -#' `r links` - diff --git a/drop/__init__.py b/drop/__init__.py index 8540a900..5cfb47aa 100644 --- a/drop/__init__.py +++ b/drop/__init__.py @@ -1,21 +1,3 @@ -from .setupDrop import setupDrop as drop -from .configHelper import ConfigHelper as config -from .submodules import * - -def init(): - wbuild.cli.init() - # compy our template - -def update(): - wbuild.cli.update() - -if __name__ == '__main__': - import sys - import wbuild - - arg = sys.args[1] - if arg == 'init': - init() - elif arg == 'update': - update() - +from .setupDrop import * +from . import config +from . import utils diff --git a/drop/cli.py b/drop/cli.py index 4adca800..b726ceab 100644 --- a/drop/cli.py +++ b/drop/cli.py @@ -1,73 +1,106 @@ -import click import wbuild import drop import yaml -import pathlib -import shutil -import distutils.dir_util +from pathlib import Path +from shutil import copyfile +from distutils.dir_util import mkpath, copy_tree, remove_tree import subprocess +import click import click_log import logging + +wbuildPath = Path(wbuild.__file__).parent / ".wBuild" logger = logging.getLogger(__name__) click_log.basic_config(logger) @click.group() @click_log.simple_verbosity_option(logger) -@click.version_option('0.9.2',prog_name='drop') +@click.version_option('1.0.0',prog_name='drop') def main(): pass -def setup_paths(): - templatePath = pathlib.Path(drop.__file__).parent / "template" - modulePath = pathlib.Path(drop.__file__).parent / "modules" - wbuildPath = pathlib.Path(wbuild.__file__).parent / ".wBuild" - return templatePath, modulePath, wbuildPath - -def setFiles(): - templatePath, modulePath, wbuildPath = setup_paths() - distutils.dir_util.copy_tree(str(wbuildPath), "./.wBuild") - - shutil.copy(str(templatePath / "Snakefile"), ".") - - ### search for a file containing the word readme and .md - if not len(list(pathlib.Path(".").glob("readme*.md"))) > 0: - open("readme.md", "a").close() - if not pathlib.Path("config.yaml").is_file(): - shutil.copy(str(templatePath / 'config.yaml'), '.') - if not pathlib.Path("Scripts").is_dir(): - distutils.dir_util.copy_tree(str(templatePath / 'Scripts'), 'Scripts') - - if pathlib.Path(".drop").is_dir(): - distutils.dir_util.remove_tree(".drop") - print("overwriting module scripts") - distutils.dir_util.copy_tree(str(modulePath), ".drop/modules") + +def copyModuleCode(repoPaths, projectPaths): + repo_map = { + "aberrant-expression-pipeline": "AberrantExpression", + "aberrant-splicing-pipeline": "AberrantSplicing", + "mae-pipeline": "MonoallelicExpression" + } + + for repo, analysis_dir in repo_map.items(): + module_repo = repoPaths["modules"] / repo + module_project = projectPaths["Scripts"] / analysis_dir / "pipeline" + if module_project.is_dir(): + remove_tree(module_project) + print(f"overwriting pipeline scripts for {analysis_dir}") + copy_tree(str(module_repo), str(module_project)) + + +def removeFile(filePath, warn=True): + filePath = Path(filePath) + if filePath.is_file(): + if warn: + input(f"The file {str(filePath)} will be overwritten. Press Enter to continue...") + filePath.unlink() + + +def setFiles(warn=True): + repoPaths, projectPaths = drop.setupPaths(projectRoot=Path.cwd().resolve()) + + # create new directories + for path in projectPaths.values(): + if not path.is_dir(): + path.mkdir(parents=True) + logger.info(f"create {str(path)}") + + # hidden files + copy_tree(str(wbuildPath), str(projectPaths["projectDir"] / ".wBuild")) + copy_tree(str(repoPaths["modules"] / "helpers"), str(projectPaths["dropDir"] / "helpers")) + # TODO: put version info there + + # copy Scripts and pipelines + copyfile(repoPaths["template"] / "Snakefile", projectPaths["projectDir"] / "Snakefile") + copy_tree(str(repoPaths["Scripts"]), str(projectPaths["Scripts"])) + copyModuleCode(repoPaths, projectPaths) + + config_file = projectPaths["projectDir"] / "config.yaml" + if not config_file.is_file(): + copyfile(repoPaths["template"] / "config.yaml", config_file) + + # search for a file containing the word readme and .md + if not list(projectPaths["projectDir"].glob("readme*.md")): + copyfile(repoPaths["template"] / "readme.md", projectPaths["projectDir"] / "readme.md") @main.command() def init(): - if pathlib.Path(".drop").is_dir(): + if Path(".drop").is_dir(): print(".drop already exists, use drop update instead to update to a newer version") else: setFiles() logger.info("init...done") + @main.command() def update(): # TODO: check drop version first setFiles() logger.info("update...done") + @main.command() def demo(): - + # TODO: check if previous project gets overwritten setFiles() - + removeFile("config.yaml", warn=False) + logger.info("init...done") + # download data logger.info("download data") - download_script = str(pathlib.Path(drop.__file__).parent / "download_data.sh") + download_script = str(Path(drop.__file__).parent / "download_data.sh") response = subprocess.run(["bash", download_script], stderr=subprocess.STDOUT) response.check_returncode() - + # fix config file with open("config.yaml", "r") as f: config = yaml.load(f, Loader=yaml.Loader) @@ -76,7 +109,7 @@ def demo(): "sampleAnnotation": None, "v29": ["geneAnnotation"], "genome": ["mae"], "qcVcf": ["mae"]} - + for key, sub in path_keys.items(): # iterate to key and entry dict_ = config @@ -84,10 +117,10 @@ def demo(): for x in sub: dict_ = dict_[x] # set absolute path - dict_[key] = str(pathlib.Path(dict_[key]).resolve()) - + dict_[key] = str(Path(dict_[key]).resolve()) + with open("config.yaml", "w") as f: yaml.safe_dump(config.copy(), f, default_flow_style=False, sort_keys=False) - + logger.info("demo project created") diff --git a/drop/config/DropConfig.py b/drop/config/DropConfig.py new file mode 100644 index 00000000..42bafdf2 --- /dev/null +++ b/drop/config/DropConfig.py @@ -0,0 +1,123 @@ +from .SampleAnnotation import SampleAnnotation +from .Submodules import AE, AS, MAE +from .ExportCounts import ExportCounts +from drop import utils +from pathlib import Path +import wbuild + + +class DropConfig: + CONFIG_KEYS = [ + # wbuild keys + "projectTitle", "htmlOutputPath", "scriptsPath", "indexWithFolderName", "fileRegex", "readmePath", + # global parameters + "root", "sampleAnnotation", "geneAnnotation", "genomeAssembly", "exportCounts", "tools", + # modules + "aberrantExpression", "aberrantSplicing", "mae" + ] + + def __init__(self, wbuildConfig): + """ + Parse wbuild/snakemake config object for DROP-specific content + :param wbuildConfig: wBuild config object + """ + + self.wBuildConfig = wbuildConfig + self.config_dict = self.setDefaults(wbuildConfig.getConfig()) + + self.root = Path(self.get("root")) + self.processedDataDir = self.root / "processed_data" + self.processedResultsDir = self.root / "processed_results" + utils.createDir(self.root) + utils.createDir(self.processedDataDir) + utils.createDir(self.processedResultsDir) + + self.htmlOutputPath = Path(self.get("htmlOutputPath")) + self.readmePath = Path(self.get("readmePath")) + + self.geneAnnotation = self.get("geneAnnotation") + self.genomeAssembly = self.get("genomeAssembly") + self.sampleAnnotation = SampleAnnotation(self.get("sampleAnnotation"), self.root) + + # setup submodules + cfg = self.config_dict + sa = self.sampleAnnotation + pd = self.processedDataDir + pr = self.processedResultsDir + self.AE = AE(cfg["aberrantExpression"], sa, pd, pr) + self.AS = AS(cfg["aberrantSplicing"], sa, pd, pr) + self.MAE = MAE(cfg["mae"], sa, pd, pr) + + self.exportCounts = ExportCounts( + self.config_dict, self.processedResultsDir, + self.sampleAnnotation, self.getGeneAnnotations(), self.get("genomeAssembly"), + aberrantExpression=self.AE, aberrantSplicing=self.AS + ) + + # legacy + utils.setKey(self.config_dict, None, "aberrantExpression", self.AE.dict_) + utils.setKey(self.config_dict, None, "aberrantSplicing", self.AS.dict_) + utils.setKey(self.config_dict, None, "mae", self.MAE.dict_) + + def setDefaults(self, config_dict): + """ + Check mandatory keys and set defaults for any missing keys + :param config_dict: config dictionary + :return: config dictionary with defaults + """ + # check mandatory keys + config_dict = utils.checkKeys(config_dict, keys=["htmlOutputPath", "root", "sampleAnnotation"], + check_files=True) + config_dict["geneAnnotation"] = utils.checkKeys(config_dict["geneAnnotation"], keys=None, check_files=True) + + config_dict["wBuildPath"] = utils.getWBuildPath() + + setKey = utils.setKey + setKey(config_dict, None, "fileRegex", ".*\.(R|md)") + setKey(config_dict, None, "genomeAssembly", "hg19") + + # set submodule dictionaries + setKey(config_dict, None, "aberrantExpression", dict()) + setKey(config_dict, None, "aberrantSplicing", dict()) + setKey(config_dict, None, "mae", dict()) + setKey(config_dict, None, "exportCounts", dict()) + + # commandline tools + setKey(config_dict, None, "tools", dict()) + setKey(config_dict, ["tools"], "samtoolsCmd", "samtools") + setKey(config_dict, ["tools"], "bcftoolsCmd", "bcftools") + setKey(config_dict, ["tools"], "gatkCmd", "gatk") + + return config_dict + + def getRoot(self, str_=True): + return utils.returnPath(self.root, str_=str_) + + def getProcessedDataDir(self, str_=True): + return utils.returnPath(self.processedDataDir, str_=str_) + + def getProcessedResultsDir(self, str_=True): + return utils.returnPath(self.processedResultsDir, str_=str_) + + def getHtmlOutputPath(self, str_=True): + return utils.returnPath(self.htmlOutputPath, str_=str_) + + def getHtmlFromScript(self, path, str_=True): + path = Path(path).with_suffix(".html") + file_name = wbuild.utils.pathsepsToUnderscore(str(path), dotsToUnderscore=False) + html_output_path = self.getHtmlOutputPath(str_=False) + return utils.returnPath(html_output_path / file_name, str_=str_) + + def get(self, key): + if key not in self.CONFIG_KEYS: + raise KeyError(f"{key} not defined for Drop config") + return self.wBuildConfig.get(key) + + def getGeneAnnotations(self): + return self.geneAnnotation + + def getGeneVersions(self): + return self.geneAnnotation.keys() + + def getGeneAnnotationFile(self, annotation): + return self.geneAnnotation[annotation] diff --git a/drop/config/ExportCounts.py b/drop/config/ExportCounts.py new file mode 100644 index 00000000..3fe2d2b1 --- /dev/null +++ b/drop/config/ExportCounts.py @@ -0,0 +1,83 @@ +from snakemake.io import expand +from drop import utils + +class ExportCounts: + + COUNT_TYPE_MAP = { + "geneCounts": "aberrantExpression", + "splitCounts": "aberrantSplicing", + "spliceSiteOverlapCounts": "aberrantSplicing" + } + + def __init__(self, dict_, outputRoot, sampleAnnotation, geneAnnotations, genomeAssembly, + aberrantExpression, aberrantSplicing): + """ + :param dict_: config dictionary for count export + :param sampleAnnotation: parsed sample annotation + :param geneAnnotations: list of gene annotation names + :param aberrantExpression: AberrantExpression object + :param aberrantSplicing: AberrantSplicing object + """ + self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"] + self.config_dict = self.setDefaults(dict_, geneAnnotations) + self.outputRoot = outputRoot + self.sa = sampleAnnotation + self.genomeAssembly = genomeAssembly + self.modules = { + "aberrantExpression": aberrantExpression, + "aberrantSplicing": aberrantSplicing + } + self.pattern = self.outputRoot / "exported_counts" / "{dataset}--{genomeAssembly}--{annotation}" + + def setDefaults(self, config_dict, gene_annotations): + utils.setKey(config_dict, None, "geneAnnotations", gene_annotations) + utils.setKey(config_dict, None, "excludeGroups", list()) + + # check consistency of gene annotations + anno_incomp = set(config_dict["geneAnnotations"]) - set(gene_annotations) + if len(anno_incomp) > 0: + message = f"{anno_incomp} are not valid annotation version in 'geneAnnotation'" + message += "but required in 'exportCounts'.\n Please make sure they match." + raise ValueError(message) + + return config_dict + + def get(self, key): + if key not in self.CONFIG_KEYS: + raise KeyError(f"{key} not defined for count export") + return self.config_dict[key] + + def getFilePattern(self, str_=True): + return utils.returnPath(self.pattern, str_=str_) + + def getExportGroups(self, modules=None): + """ + Determine from which DROP groups counts should be exported + :param modules: 'aberrantExpression' for gene counts, 'aberrantSplicing' for splicing counts export + :return: DROP groups from which to export counts + """ + if modules is None: + modules = self.modules.keys() + elif isinstance(modules, str): + modules = [modules] + groups = [] # get all active groups + for module in modules: + groups.extend(self.modules[module].groups) + export_groups = set(groups) - set(self.get("excludeGroups")) + return export_groups + + def getExportCountFiles(self, prefix): + """ + Determine export count files. + :param prefix: name of file + :return: list of files to + """ + if prefix not in self.COUNT_TYPE_MAP.keys(): + raise ValueError(f"{prefix} not a valid file type for exported counts") + + datasets = self.getExportGroups([self.COUNT_TYPE_MAP[prefix]]) + file_pattern = str(self.pattern / f"{prefix}.tsv.gz") + count_files = expand(file_pattern, annotation=self.get("geneAnnotations"), + dataset=datasets, genomeAssembly=self.genomeAssembly) + return count_files + diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py new file mode 100644 index 00000000..0999e528 --- /dev/null +++ b/drop/config/SampleAnnotation.py @@ -0,0 +1,263 @@ +from drop import utils +import pandas as pd +from pathlib import Path +from snakemake.logging import logger +import warnings +warnings.filterwarnings("ignore", 'This pattern has match groups') + +class SampleAnnotation: + + SAMPLE_ANNOTATION_COLUMNS = [ + "RNA_ID", "RNA_BAM_FILE", "DNA_ID", "DNA_VCF_FILE", "DROP_GROUP", "GENE_COUNTS_FILE", "ANNOTATION", + "PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND" + ] + + def __init__(self, file, root): + """ + sa_file: sample annotation file location from config + root: output location for file mapping + """ + self.root = Path(root) + self.file = file + self.sa = self.parse() + self.idMapping = self.createIdMapping() + self.sampleFileMapping = self.createSampleFileMapping() + + self.rnaIDs = self.createGroupIds(file_type="RNA_BAM_FILE", sep=',') + self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',') + # external counts + self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',') + + def parse(self, sep='\t'): + """ + read and check sample annotation for missing columns + clean columns and set types + """ + sa = pd.read_csv(self.file, sep=sep) + missing_cols = [x for x in self.SAMPLE_ANNOTATION_COLUMNS if x not in sa.columns.values] + if len(missing_cols) > 0: + raise ValueError(f"Incorrect columns in sample annotation file. Missing:\n{missing_cols}") + + # remove unwanted characters + col = sa["DROP_GROUP"] + col = col.str.replace(" ", "").str.replace("(|)", "", regex=True) + sa["DROP_GROUP"] = col + + # set ID type as string + for ID_key in ["RNA_ID", "DNA_ID"]: + sa[ID_key] = sa[ID_key].apply( + lambda x: str(x) if not pd.isnull(x) else x + ) + return sa + + ##### Construction + + def createIdMapping(self): + """ + Get mapping of RNA and DNA IDs + """ + return self.sa[["RNA_ID", "DNA_ID"]].drop_duplicates().dropna() + + def createSampleFileMapping(self): + """ + create a sample file mapping with unique entries of existing files + columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ] + """ + + assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']} + assay_subsets = [] + for id_, file_types in assay_mapping.items(): + for file_type in file_types: + df = self.sa[[id_, file_type]].dropna().drop_duplicates().copy() + df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True) + df['ASSAY'] = id_ + df['FILE_TYPE'] = file_type + assay_subsets.append(df) + file_mapping = pd.concat(assay_subsets) + + # cleaning SAMPLE_FILE_MAPPING + file_mapping.dropna(inplace=True) + file_mapping.drop_duplicates(inplace=True) + + # check for missing files + existing = utils.checkFileExists(file_mapping["FILE_PATH"]) + if len(existing) == 0: + message = "File mapping is empty. " + message += "Please check that all files in your sample annotation exist." + raise FileNotFoundError(message) + elif len(existing) < file_mapping.shape[0]: + missing = set(file_mapping["FILE_PATH"]) - set(existing) + logger.info(f"WARNING: {len(missing)} files missing in samples annotation. Ignoring...") + logger.debug(f"Missing files: {missing}") + file_mapping = file_mapping[file_mapping["FILE_PATH"].isin(existing)] + + # write file mapping + file_mapping.to_csv(self.root / "file_mapping.csv", index=False) + return file_mapping + + def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','): + """ + Create a mapping of DROP groups to lists of sample IDs + """ + if not file_type: + file_type = "RNA_BAM_FILE" + # infer ID type from file type + assay_id = self.subsetFileMapping(file_type)["ASSAY"].iloc[0] + + # Subset sample annotation to only IDs of specified file_type + ids = self.getSampleIDs(file_type) + df = self.sa[self.sa[assay_id].isin(ids)] + # mapping of ID to group names + df = df[[assay_id, group_key]].drop_duplicates().copy() + + # get unique group names + groups = [] + for s in set(self.sa[group_key]): + groups.extend(s.split(sep)) + groups = set(groups) + + # collect IDs per group + grouped = {gr: df[df[group_key].str.contains(f'(^|{sep}){gr}({sep}|$)')][assay_id].tolist() + for gr in groups} + # remove groups labeled as None + grouped = {gr: list(set(ids)) for gr, ids in grouped.items() if gr is not None} + return grouped + + ### Subsetting + + def subsetSampleAnnotation(self, column, values, subset=None): + """ + subset by one or more values of different columns from sample file mapping + :param column: valid column in sample annotation + :param values: values of column to subset + :param subset: subset sample annotation + """ + sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS) + if subset is None: + subset = self.sa + else: + # check type for subset + if not isinstance(subset, pd.DataFrame): + raise TypeError(f"Is not pandas DataFrame\n {subset}") + if not sa_cols <= set(subset.columns): # check if mandatory cols not contained + raise ValueError(f"Subset columns not the same as {sa_cols}\ngot: {subset.columns}") + + # check if column is valid + if not column in sa_cols: + raise KeyError(f"Column '{column}' invalid for sample annotation") + return utils.subsetBy(subset, column, values) + + def subsetFileMapping(self, file_type=None, sample_id=None): + """ + subset by one or more values of different columns from sample file mapping + file_type: file type/types, corresponding to 'FILE_TYPE' column + sample_id: sample ID/IDs + """ + subset = self.sampleFileMapping + subset = utils.subsetBy(subset, "FILE_TYPE", file_type) + subset = utils.subsetBy(subset, "ID", sample_id) + return subset + + def subsetGroups(self, subset_groups, assay="RNA", warn=30, error=10): + """ + Subset DROP group to sample IDs mapping by list of groups (`subset_groups`). + Give warning or error if subsetting results in too few sample IDs per group. + warn : number of samples threshold at which to warn about too few samples + error: number of samples threshold at which to give error + """ + ids_by_group = self.getGroupedIDs(assay) + + if subset_groups is None: + subset = ids_by_group + else: + subset_groups = [subset_groups] if subset_groups.__class__ == str else subset_groups + subset = {gr: ids for gr, ids in + ids_by_group.items() if gr in subset_groups} + + for group in subset_groups: + if len(subset[group]) < error: + message = f'Too few IDs in DROP_GROUP {group}' + message += f', please ensure that it has at least {error} IDs' + message += f', groups: {subset[group]}' + raise ValueError(message) + elif len(subset[group]) < warn: + logger.info(f'WARNING: Less than {warn} IDs in DROP_GROUP {group}') + + return subset + + ### Getters + + def getFilePath(self, sample_id, file_type, single_file=True): + """ + Get path to input data file by sample ID + """ + path = self.subsetFileMapping(file_type, sample_id)["FILE_PATH"] + path = path.tolist() + if single_file: + if len(path) > 1: + message = "Trying to return more than 1 path for 1 sample ID and file type" + raise ValueError(message) + path = path[0] + return path + + def getFilePaths(self, file_type, group=None): + """ + Get all file paths of a file type + file_type: 'RNA_BAM_FILE' or 'DNA_VCF_FILE' + group: name of DROP_GROUP + """ + if group is None: + sampleIDs = self.getSampleIDs(file_type) + else: + sampleIDs = self.getGroupedIDs(file_type)[group] + return self.getFilePath(sampleIDs, file_type, single_file=False) + + def getImportCountFiles(self, annotation, group, file_type="GENE_COUNTS_FILE", + annotation_key="ANNOTATION", group_key="DROP_GROUP"): + """ + :param annotation: annotation name as specified in config and ANNOTATION column + :param group: a group of the DROP_GROUP column + :return: set of unique external count file names + """ + subset = self.subsetSampleAnnotation(annotation_key, annotation) + subset = self.subsetSampleAnnotation(group_key, group, subset) + return set(subset[file_type].tolist()) + + def getRow(self, column, value): + sa = self.sa + if column not in sa.columns: + raise KeyError(f"column {column} not in sample annotation") + row = sa[sa[column] == value] + if row.shape[0] != 1: + raise ValueError(f"sa[sa[{column}] == {value}] should have 1 row") + return row + + ### DROP Groups ### + + def getGroupedIDs(self, assays): + """ + Get group to IDs mapping + :param assays: list of or single assay the IDs should be from. Can be file_type or 'RNA'/'DNA' + """ + assays = [assays] if isinstance(assays, str) else assays + groupedIDs = dict() + for assay in assays: + if "RNA" in assay: + groupedIDs.update(self.rnaIDs) + elif "DNA" in assay: + groupedIDs.update(self.dnaIDs) + elif "GENE_COUNT" in assay: + groupedIDs.update(self.extGeneCountIDs) + else: + raise ValueError(f"'{assay}' is not a valid assay name") + return groupedIDs + + def getGroups(self, assay="RNA"): + return self.getGroupedIDs(assay).keys() + + def getIDsByGroup(self, group, assay="RNA"): + return self.getGroupedIDs(assay)[group] + + def getSampleIDs(self, file_type): + ids = self.subsetFileMapping(file_type)["ID"] + return list(ids) diff --git a/drop/config/Submodules.py b/drop/config/Submodules.py new file mode 100644 index 00000000..cffc5120 --- /dev/null +++ b/drop/config/Submodules.py @@ -0,0 +1,159 @@ +from pathlib import Path +from snakemake.logging import logger +from snakemake.io import expand +from drop import utils + + +class Submodule: + + def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsDir): + self.CONFIG_KEYS = [] + self.name = "Submodule" + self.processedDataDir = processedDataDir + self.processedResultsDir = processedResultsDir + self.sa = sampleAnnotation + self.dict_ = self.setDefaultKeys(config) + self.groups = self.dict_["groups"] + + def setDefaultKeys(self, dict_): + dict_ = {} if dict_ is None else dict_ + return dict_ + + def get(self, key): + if key not in self.CONFIG_KEYS: + raise KeyError(f"{key} not defined for {self.name} config") + return self.dict_[key] + + def getWorkdir(self, str_=True): + return utils.returnPath(Path("Scripts") / self.name / "pipeline", str_) + + +class AE(Submodule): + + def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsDir): + super().__init__(config, sampleAnnotation, processedDataDir, processedResultsDir) + self.CONFIG_KEYS = [ + "groups", "fpkmCutoff", "implementation", "padjCutoff", "zScoreCutoff", + "maxTestedDimensionProportion" + ] + self.name = "AberrantExpression" + self.rnaIDs = self.sa.subsetGroups(self.groups, assay="RNA") + self.extRnaIDs = self.sa.subsetGroups(self.groups, assay="GENE_COUNTS", warn=0, error=0) + + def setDefaultKeys(self, dict_): + super().setDefaultKeys(dict_) + setKey = utils.setKey + setKey(dict_, None, "groups", self.sa.getGroups(assay="RNA")) + setKey(dict_, None, "fpkmCutoff", 1) + setKey(dict_, None, "implementation", "autoencoder") + setKey(dict_, None, "padjCutoff", .05) + setKey(dict_, None, "zScoreCutoff", 0) + setKey(dict_, None, "maxTestedDimensionProportion", 3) + return dict_ + + def getCountFiles(self, annotation, group): + """ + Get all count files from DROP (counted from BAM file) and external count matrices + :param annotation: annotation name from wildcard + :param group: DROP group name from wildcard + :return: list of files + """ + bam_IDs = self.sa.getIDsByGroup(group, assay="RNA") + file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds" + count_files = expand(str(file_stump), sampleID=bam_IDs) + extCountFiles = self.sa.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE") + count_files.extend(extCountFiles) + return count_files + + def getCountParams(self, rnaID): + sa_row = self.sa.getRow("RNA_ID", rnaID) + count_params = sa_row[["STRAND", "COUNT_MODE", "PAIRED_END", "COUNT_OVERLAPS"]] + return count_params.iloc[0].to_dict() + + +class AS(Submodule): + + def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsDir): + super().__init__(config, sampleAnnotation, processedDataDir, processedResultsDir) + self.CONFIG_KEYS = [ + "groups", "recount", "longRead", "filter", "minExpressionInOneSample", "minDeltaPsi", + "implementation", "padjCutoff", "zScoreCutoff", "deltaPsiCutoff", "maxTestedDimensionProportion" + ] + self.name = "AberrantSplicing" + self.rnaIDs = self.sa.subsetGroups(self.groups, assay="RNA") + + def setDefaultKeys(self, dict_): + super().setDefaultKeys(dict_) + setKey = utils.setKey + setKey(dict_, None, "groups", self.sa.getGroups(assay="RNA")) + setKey(dict_, None, "recount", False) + setKey(dict_, None, "longRead", False) + setKey(dict_, None, "keepNonStandardChrs", False) + setKey(dict_, None, "filter", True) + setKey(dict_, None, "minExpressionInOneSample", 20) + setKey(dict_, None, "minDeltaPsi", 0) + setKey(dict_, None, "implementation", "PCA") + setKey(dict_, None, "padjCutoff", 0.05) + setKey(dict_, None, "zScoreCutoff", 0.05) + setKey(dict_, None, "deltaPsiCutoff", 0.05) + setKey(dict_, None, "maxTestedDimensionProportion", 6) + return dict_ + + +class MAE(Submodule): + + def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsDir): + super().__init__(config, sampleAnnotation, processedDataDir, processedResultsDir) + self.CONFIG_KEYS = [ + "groups", "genome", "qcVcf", "qcGroups", "gatkIgnoreHeaderCheck", "padjCutoff", + "allelicRatioCutoff", "maxAF", "gnomAD" + ] + self.name = "MonoallelicExpression" + self.qcGroups = self.dict_["qcGroups"] + self.maeIDs = self.createMaeIDS(id_sep='--') + + def setDefaultKeys(self, dict_): + super().setDefaultKeys(dict_) + setKey = utils.setKey + dict_ = utils.checkKeys(dict_, keys=["genome", "qcVcf"], check_files=True) + groups = setKey(dict_, None, "groups", self.sa.getGroups(assay="DNA")) + setKey(dict_, None, "qcGroups", groups) + setKey(dict_, None, "gatkIgnoreHeaderCheck", True) + setKey(dict_, None, "padjCutoff", .05) + setKey(dict_, None, "allelicRatioCutoff", 0.8) + setKey(dict_, None, "maxAF", .001) + setKey(dict_, None, "addAF", False) + setKey(dict_, None, "maxVarFreqCohort", 0.04) + setKey(dict_, None, "gnomAD", False) + return dict_ + + def createMaeIDS(self, id_sep='--'): + """ + Create MAE IDs from sample annotation + :param id_sep: separator + :return: {drop group name : list of MAE IDs per group} + """ + grouped_rna_ids = self.sa.subsetGroups(self.groups, assay="RNA", warn=1, error=1) + id_map = self.sa.idMapping + mae_ids = {} + for gr, rna_ids in grouped_rna_ids.items(): + subset = id_map[id_map["RNA_ID"].isin(rna_ids)] + dna_rna_pairs = zip(subset["DNA_ID"], subset["RNA_ID"]) + mae_ids[gr] = list(map(id_sep.join, dna_rna_pairs)) + return mae_ids + + def getMaeByGroup(self, group): + if not isinstance(group, str): + group = list(group)[0] + return self.maeIDs[group] + + def getMaeAll(self): + """ + Get a list of all MAE IDs from the groups specified in the config. + Useful for collecting all MAE IDs ungrouped. + """ + all_ids = [] + groups = [self.groups] if isinstance(self.groups, str) else self.groups + for group in groups: + all_ids.extend(self.getMaeByGroup(group)) + return all_ids diff --git a/drop/config/__init__.py b/drop/config/__init__.py new file mode 100644 index 00000000..2e4c5a54 --- /dev/null +++ b/drop/config/__init__.py @@ -0,0 +1,3 @@ +from .DropConfig import DropConfig +from .SampleAnnotation import SampleAnnotation +from .Submodules import AE, AS, MAE diff --git a/drop/configHelper.py b/drop/configHelper.py deleted file mode 100644 index a162265c..00000000 --- a/drop/configHelper.py +++ /dev/null @@ -1,412 +0,0 @@ -from drop import submodules -import pandas as pd -import wbuild -import pathlib -from snakemake.logging import logger -from snakemake.io import expand -import warnings -warnings.filterwarnings("ignore", 'This pattern has match groups') - -SAMPLE_ANNOTATION_COLUMNS = ["RNA_ID", "RNA_BAM_FILE", "DNA_ID", "DNA_VCF_FILE", "DROP_GROUP", -"PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND"] -VERBOSE = True - -class ConfigHelper: - - def __init__(self, config, method=None): - - self.method = method - if config is None: - wconf = wbuild.utils.Config() - config = wconf.conf_dict - - config = self.checkConfig(config) - self.config = self.setDefaults(config) - self.createDirs() - - def parse(self): - """ - parse sample annotation, create sample-file mapping and extract IDs for each submodule - """ - # sample annotation - self.sample_annotation = self.getSampleAnnotation(self.config["sampleAnnotation"]) - self.sample_file_mapping = self.createSampleFileMapping(self.sample_annotation) - - self.all_rna_ids = self.createGroupIds(group_key="DROP_GROUP", file_type="RNA_BAM_FILE", sep=',') - - if self.method == "AE" or self.method is None: - groups = self.setKey(self.config, ["aberrantExpression"], "groups", self.all_rna_ids.keys()) - self.outrider_ids = self.subsetGroups(self.all_rna_ids, groups) - self.config["outrider_ids"] = self.outrider_ids - - if self.method == "AS" or self.method is None: - groups = self.setKey(self.config, ["aberrantSplicing"], "groups", self.all_rna_ids.keys()) - self.fraser_ids = self.subsetGroups(self.all_rna_ids, groups) - self.config["fraser_ids"] = self.fraser_ids - - if self.method == "MAE" or self.method is None: - # get MAE groups or set default - groups = self.setKey(self.config, ["mae"], "groups", self.all_rna_ids.keys()) - # get QC groups as subset of MAE groups - self.setKey(self.config, ["mae"], "qcGroups", groups) - - self.mae_ids = self.createMaeIDS(self.all_rna_ids, groups, id_sep='--') - self.config["mae_ids"] = self.mae_ids - - return self.config - - - #### FILE CHECKING AND DEFAULT SETTINGS - - def createDirs(self): - - def createIfMissing(directory): - directory = pathlib.Path(directory) - if not directory.exists(): - logger.debug(f"creating {directory}") - directory.mkdir(parents=True) - - createIfMissing(self.getProcDataDir()) - createIfMissing(self.getProcResultsDir()) - - def checkConfig(self, config): - - def check_keys(dict_, keys): - keys = dict_.keys() if keys is None else keys - for key in keys: - try: - dict_[key] - except: - raise KeyError(f"{key} is mandatory but missing") - # get real path - if isinstance(dict_[key], str): - filename = dict_[key] - dict_[key] = str(pathlib.Path(filename).expanduser()) - - FILE_KEYS = ["htmlOutputPath", "root", "geneAnnotation", "sampleAnnotation", "mae"] - check_keys(config, keys=FILE_KEYS) - check_keys(config["geneAnnotation"], keys=None) - check_keys(config["mae"], keys=["genome", "qcVcf"]) - - return config - - def setDefaults(self, config, method=None): - """ - set defaults for config keys - """ - - config["indexWithFolderName"] = True - config["fileRegex"] = ".*\.R" - - setKey = self.setKey - setKey(config, None, "projectTitle", "DROP: Detection of RNA Outlier Pipeline") - - setKey(config, None, "genomeAssembly", "hg19") - setKey(config, None, "scanBamParam", "null") - - # export settings - setKey(config, None, "exportCounts", dict(), verbose=VERBOSE) - gene_annotations = list(config["geneAnnotation"].keys()) - setKey(config, ["exportCounts"], "geneAnnotations", gene_annotations, verbose=VERBOSE) - setKey(config, ["exportCounts"], "excludeGroups", list(), verbose=VERBOSE) - - # check consistency of gene annotations - anno_incomp = set(config["exportCounts"]["geneAnnotations"]) - set(gene_annotations) - if len(anno_incomp) > 0: - message = f"{anno_incomp} are not valid annotation version in 'geneAnnotation'" - message += "but required in 'exportCounts'.\n Please make sure they match." - raise ValueError(message) - - if self.method is None: - tmp_dir = submodules.getTmpDir() - else: - tmp_dir = submodules.getMethodPath(self.method, type_='tmp_dir') - setKey(config, None, "tmpdir", tmp_dir) - - # aberrant expression - if self.method == "AE" or self.method is None: - setKey(config, None, "aberrantExpression", dict(), verbose=VERBOSE) - setKey(config, ["aberrantExpression"], "fpkmCutoff", 1, verbose=VERBOSE) - setKey(config, ["aberrantExpression"], "implementation", "autoencoder", verbose=VERBOSE) - setKey(config, ["aberrantExpression"], "groups", None, verbose=VERBOSE) - setKey(config, ["aberrantExpression"], "padjCutoff", .05, verbose=VERBOSE) - setKey(config, ["aberrantExpression"], "zScoreCutoff", 0, verbose=VERBOSE) - setKey(config, ["aberrantExpression"], "maxTestedDimensionProportion", 3, verbose=VERBOSE) - - # aberrant splicing - if self.method == "AS" or self.method is None: - setKey(config, None, "aberrantSplicing", dict(), verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "groups", None, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "recount", False, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "longRead", False, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "filter", True, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "minExpressionInOneSample", 20, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "minDeltaPsi", 0, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "implementation", "PCA", verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "padjCutoff", 0.05, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "zScoreCutoff", 0.05, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "deltaPsiCutoff", 0.05, verbose=VERBOSE) - setKey(config, ["aberrantSplicing"], "maxTestedDimensionProportion", 6, verbose=VERBOSE) - - # monoallelic expression - if self.method == "MAE" or self.method is None: - setKey(config, None, "mae", dict(), verbose=VERBOSE) - setKey(config, ["mae"], "gatkIgnoreHeaderCheck", True, verbose=VERBOSE) - setKey(config, ["mae"], "padjCutoff", .05, verbose=VERBOSE) - setKey(config, ["mae"], "allelicRatioCutoff", 0.8, verbose=VERBOSE) - setKey(config, ["mae"], "addAF", False, verbose=VERBOSE) - setKey(config, ["mae"], "maxAF", .001, verbose=VERBOSE) - setKey(config, ["mae"], "maxVarFreqCohort", .05, verbose=VERBOSE) - setKey(config, ["mae"], "groups", None, verbose=VERBOSE) - setKey(config, ["mae"], "qcGroups", None, verbose=VERBOSE) - - # commandline tools - setKey(config, None, "tools", dict(), verbose=VERBOSE) - setKey(config, ["tools"], "samtoolsCmd", "samtools", verbose=VERBOSE) - setKey(config, ["tools"], "bcftoolsCmd", "bcftools", verbose=VERBOSE) - setKey(config, ["tools"], "gatkCmd", "gatk", verbose=VERBOSE) - - return config - - def setKey(self, dict_, sub, key, default, verbose=False): - if sub is not None: - if not isinstance(sub, list): - raise TypeError(f"{sub} is not of type list") - for x in sub: - dict_ = dict_[x] - if key not in dict_ or dict_[key] is None: - logger.debug(f'{key} not in config{sub}, using default') - dict_[key] = default - return dict_[key] - - - #### FILE PARSING AND ID EXTRACTION - - def getSampleAnnotation(self, filename, sep='\t'): - """ - read and check sample annotation for missing columns - """ - sample_annotation = pd.read_csv(filename, sep=sep) - missing_cols = [x for x in SAMPLE_ANNOTATION_COLUMNS if x not in sample_annotation.columns.values] - if len(missing_cols) > 0: - raise ValueError(f"Incorrect columns in sample annotation, missing:\n {missing_cols}\n") - - # remove unwanted characters - col = sample_annotation["DROP_GROUP"] - col = col.str.replace(" ", "").str.replace("(|)", "", regex=True) - sample_annotation["DROP_GROUP"] = col - - # set ID type as string - for ID_key in ["RNA_ID", "DNA_ID"]: - sample_annotation[ID_key] = sample_annotation[ID_key].apply( - lambda x: str(x) if not pd.isnull(x) else x - ) - - return sample_annotation - - def createSampleFileMapping(self, sample_annotation): - """ - create a sample file mapping with unique entries of existing files - columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ] - """ - - assay_mapping = {'RNA_ID':'RNA_BAM_FILE', 'DNA_ID':'DNA_VCF_FILE'} - - assay_subsets = [] - for id_, file_type in assay_mapping.items(): - df = sample_annotation[[id_, file_type]].drop_duplicates().copy() - df.rename(columns={id_:'ID', file_type:'FILE_PATH'}, inplace=True) - df['ASSAY'] = id_ - df['FILE_TYPE'] = file_type - assay_subsets.append(df) - - file_mapping = pd.concat(assay_subsets) - - # cleaning SAMPLE_FILE_MAPPING - file_mapping.dropna(inplace=True) - existent = [pathlib.Path(x).exists() for x in file_mapping["FILE_PATH"]] - if sum(existent) < file_mapping.shape[0]: - logger.info("WARNING: there are files in the sample annotation that do not exist") - file_mapping = file_mapping[existent].drop_duplicates() - if file_mapping.shape[0] == 0: - raise ValueError("No files exist in sample annotation. Please check your sample annotation.") - - file_mapping.to_csv(self.getProcDataDir() + "/file_mapping.csv", index=False) - - return file_mapping - - def createGroupIds(self, group_key="DROP_GROUP", file_type="RNA_BAM_FILE", sep=','): - """ - Create a full and filtered list of RNA assay IDs subsetted by specified DROP groups - """ - sa = self.sample_annotation - sf = self.sample_file_mapping - - assay_id = sf[sf["FILE_TYPE"] == file_type]["ASSAY"].iloc[0] - - # Get unique groups - ids = self.getSampleIDs(file_type) - df = sa[sa[assay_id].isin(ids)] - df = df[[assay_id, group_key]].drop_duplicates().copy() - - # get unique group names - groups = [] - for s in set(sa[group_key]): - groups.extend(s.split(sep)) - groups = set(groups) - - # collect IDs per group - grouped = {gr : df[df[group_key].str.contains(f'(^|{sep}){gr}({sep}|$)')][assay_id].tolist() - for gr in groups} - # remove groups labeled as None - grouped = {gr : list(set(ids)) for gr, ids in grouped.items() if gr is not None} - return grouped - - - def subsetGroups(self, ids_by_group, subset_groups, warn=30, error=10): - """ - warn : number of samples threshold at which to warn about too few samples - error: number of samples threshold at which to give error - """ - if subset_groups is None: - subset = ids_by_group - else: - subset_groups = [subset_groups] if subset_groups.__class__ == str else subset_groups - subset = {gr:ids for gr, ids in ids_by_group.items() if gr in subset_groups} - - for group in subset_groups: - if len(subset[group]) < error: - message = f'Too few IDs in DROP_GROUP {group}' - message += f', please ensure that it has at least {error} IDs' - message += f', groups: {subset[group]}' - raise ValueError(message) - elif len(subset[group]) < warn: - logger.info(f'WARNING: Less than {warn} IDs in DROP_GROUP {group}') - - return subset - - def createMaeIDS(self, ids_by_group, subset_groups, id_sep='--'): - """ - create MAE IDs from smaple annotation - """ - rna_id_by_group = self.subsetGroups(ids_by_group, subset_groups, warn=1, error=1) - all_mae_files = self.sample_annotation[["RNA_ID", "DNA_ID"]].drop_duplicates().dropna() - - # subset by group - mae_ids = {} - for gr, rna_ids in rna_id_by_group.items(): - mae_subset = all_mae_files [all_mae_files ["RNA_ID"].isin(rna_ids)] - vcf_rna_pairs = zip(mae_subset["DNA_ID"], mae_subset["RNA_ID"]) - mae_ids[gr] = list(map(id_sep.join, vcf_rna_pairs)) - - return mae_ids - - - #### GETTERS #### - - def getProcDataDir(self): - return self.config["root"] + "/processed_data" - - def getProcResultsDir(self): - return self.config["root"] + "/processed_results" - - def getSampleIDs(self, file_type): - fm = self.sample_file_mapping - return list(fm[fm["FILE_TYPE"] == file_type]["ID"]) - - - def checkFileExists(self, sampleID, file_type, verbose=True): - # note: we already checked for non-existing files in the init, so we - # only need to check whether the ID is in the sample_file_mapping here - x = self.sample_file_mapping.query("(FILE_TYPE == @file_type) & (ID == @sampleID)")["FILE"] - exists = (len(x) != 0) - if (not exists) and verbose: - logger.debug(f"FILE NOT FOUND FOR sampleID: {sampleID} and file type {file_type}") - return exists - - def getFilePath(self, sampleId, file_type): - """ - Function for getting the file path given the sampleId and file type - sampleId: ID of sample - file_type: e.g. "RNA_BAM_FILE", "DNA_VCF_FILE" - """ - fm = self.sample_file_mapping - if isinstance(file_type, str): - path = fm.query("FILE_TYPE == @file_type") - else: - path = fm[fm["FILE_TYPE"].isin(file_type)] - path = path.query("ID == @sampleId")["FILE_PATH"] - return path.iloc[0] - - def getFilePaths(self, file_type, group=None, ids_by_group=None): - """ - file_type: e.g. "RNA_BAM_FILE", "DNA_VCF_FILE" - group: name of DROP_GROUP - ids_by_group: dictionary of IDs by group names - """ - - # subset by group if group is specified - if group is None or ids_by_group is None: - sampleIDs = self.sample_file_mapping.query("FILE_TYPE == @file_type")["ID"] - else: - sampleIDs = ids_by_group[group] - - files = [] # collect file names - for sampleID in sampleIDs: - files.append(self.getFilePath(sampleID, file_type)) - return files - - def getRNAByGroup(self, group): - if not isinstance(group, str): - group = list(group)[0] - return self.all_rna_ids[group] - - def getMaeByGroup(self, group): - if self.method != 'MAE': - self.method = 'MAE' - self.parse() - if not isinstance(group, str): - group = list(group)[0] - return self.mae_ids[group] - - def getMaeAll(self): - """ - Get a list of all MAE IDs from the groups specified in the config. - Useful for collecting all MAE IDs ungrouped. - """ - if self.method != 'MAE': - self.method = 'MAE' - self.parse() - all_ids = [] - groups = self.config["mae"]["groups"] - if groups.__class__ == str: - groups = [groups] - for group in groups: - all_ids.extend(self.mae_ids[group]) - return all_ids - - def getGeneAnnotationFile(self, annotation): - return self.config["geneAnnotation"][annotation] - - def getExportGroups(self, modules=["aberrantExpression", "aberrantSplicing"]): - groups = [] # get all active groups - for module in modules: - groups.extend(self.config[module]["groups"]) - exclude = self.config["exportCounts"]["excludeGroups"] - return set(groups) - set(exclude) - - def getExportCountFiles(self, prefix): - - count_type_map = {"geneCounts":"aberrantExpression", - "splitCounts":"aberrantSplicing", - "spliceSiteOverlapCounts":"aberrantSplicing"} - if prefix not in count_type_map.keys(): - raise ValueError(f"{prefix} not a valid file type for exported counts") - - datasets = self.getExportGroups([count_type_map[prefix]]) - annotations = self.config["exportCounts"]["geneAnnotations"] - genomeAssembly = self.config["genomeAssembly"] - - pattern = self.getProcResultsDir() + f"/exported_counts/{{dataset}}--{{genomeAssembly}}--{{annotation}}/{prefix}.tsv.gz" - return expand(pattern, annotation=annotations, dataset=datasets, genomeAssembly=genomeAssembly) - diff --git a/drop/download_data.sh b/drop/download_data.sh index c265486c..9107ce43 100644 --- a/drop/download_data.sh +++ b/drop/download_data.sh @@ -3,15 +3,22 @@ set -e # get data resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource.tar.gz" -wget -Nc $resource_url -tar -zxvf resource.tar.gz -rm -rf Data -mv resource Data +tmpdir="$(dirname "$(tempfile)")" +wget -nc -P $tmpdir $resource_url +mkdir -p Data +if [ -z "$(ls Data)" ]; then + tar -zxvf "$tmpdir/resource.tar.gz" -C . + rm -rf Data + mv resource Data +else + echo "Data directory not empty, is not updated" +fi # prepare data -cd Data +cd ./Data +echo "cp config_relative_wb1.8.yaml ../config.yaml" +cp config_relative_wb1.8.yaml ../config.yaml python fix_sample_anno.py -gunzip chr21.fa.gz -# copy config -cp config_relative.yaml ../config.yaml +# unzip fasta +if [ ! -f "chr21.fa" ]; then gunzip chr21.fa.gz; fi diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Datasets.R b/drop/modules/aberrant-expression-pipeline/Counting/Datasets.R similarity index 68% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/Datasets.R rename to drop/modules/aberrant-expression-pipeline/Counting/Datasets.R index ad1f167a..06824fce 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Datasets.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/Datasets.R @@ -2,21 +2,19 @@ #' title: Counts Overview #' author: mumichae, salazar #' wb: -#' params: -#' - ids: '`sm parser.outrider_ids`' -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "Count_Overview.Rds")`' #' input: #' - summaries: '`sm expand(config["htmlOutputPath"] + #' "/AberrantExpression/Counting/{annotation}/Summary_{dataset}.html", -#' annotation=list(config["geneAnnotation"].keys()), dataset=parser.outrider_ids)`' +#' annotation=cfg.getGeneVersions(), dataset=cfg.AE.groups)`' #' output: #' html_document: #' code_folding: hide #' code_download: TRUE #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "counting_overview.snakemake") ) -# snakemake <- readRDS(".drop/tmp/AE/counting_overview.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) # Obtain the annotations and datasets gene_annotation_names <- names(snakemake@config$geneAnnotation) diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R similarity index 94% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/Summary.R rename to drop/modules/aberrant-expression-pipeline/Counting/Summary.R index 0a1bf17e..832ccd08 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Summary.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R @@ -2,12 +2,12 @@ #' title: "Counts Summary: `r gsub('_', ' ', snakemake@wildcards$dataset)`" #' author: #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "count_summary.Rds")`' #' input: -#' - ods: '`sm parser.getProcResultsDir() + +#' - ods: '`sm cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/ods_unfitted.Rds"`' -#' - bam_cov: '`sm parser.getProcDataDir() + +#' - bam_cov: '`sm cfg.getProcessedDataDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/bam_coverage.tsv"`' #' output: #' - wBhtml: '`sm config["htmlOutputPath"] + @@ -19,8 +19,7 @@ #' code_download: TRUE #'--- -saveRDS(snakemake, file.path(snakemake@config$tmpdir, "AE/counting_summary.snakemake") ) -#snakemake <- readRDS(".drop/tmp/AE/counting_summary.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(OUTRIDER) diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/countReads.R b/drop/modules/aberrant-expression-pipeline/Counting/countReads.R similarity index 72% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/countReads.R rename to drop/modules/aberrant-expression-pipeline/Counting/countReads.R index df2b6d17..bcbe4fef 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/countReads.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/countReads.R @@ -2,20 +2,20 @@ #' title: Count reads #' author: Michaela Mueller #' wb: +#' log: +#' snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "counts" / "{sampleID}.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' - COUNT_PARAMS: '`sm lambda w: cfg.AE.getCountParams(w.sampleID)`' #' input: -#' - sample_bam: '`sm lambda wildcards: parser.getFilePath(wildcards.sampleID, file_type="RNA_BAM_FILE") `' -#' - count_ranges: '`sm parser.getProcDataDir() + "/aberrant_expression/{annotation}/count_ranges.Rds" `' +#' - sample_bam: '`sm lambda w: sa.getFilePath(w.sampleID, file_type="RNA_BAM_FILE") `' +#' - count_ranges: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/count_ranges.Rds" `' #' output: -#' - counts: '`sm parser.getProcDataDir() + "/aberrant_expression/{annotation}/counts/{sampleID,[^/]+}.Rds"`' +#' - counts: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/counts/{sampleID,[^/]+}.Rds"`' #' type: script #' threads: 1 #'--- - -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "counts.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/counts.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(data.table) @@ -26,13 +26,12 @@ suppressPackageStartupMessages({ # Get strand specific information from sample annotation sampleID <- snakemake@wildcards$sampleID -sample_anno <- fread(snakemake@config$sampleAnnotation) -sample_anno <- sample_anno[RNA_ID == sampleID][1,] -strand <- tolower(sample_anno$STRAND) -count_mode <- sample_anno$COUNT_MODE -paired_end <- as.logical(sample_anno$PAIRED_END) -overlap <- as.logical(sample_anno$COUNT_OVERLAPS) +count_params <- snakemake@params$COUNT_PARAMS +strand <- tolower(count_params$STRAND) +count_mode <- count_params$COUNT_MODE +paired_end <- as.logical(count_params$PAIRED_END) +overlap <- as.logical(count_params$COUNT_OVERLAPS) inter_feature <- ! overlap # inter_feature = FALSE does not allow overlaps # infer preprocessing and strand info @@ -80,6 +79,7 @@ se <- summarizeOverlaps( , preprocess.reads = preprocess_reads , BPPARAM = MulticoreParam(snakemake@threads) ) +colnames(se) <- sampleID saveRDS(se, snakemake@output$counts) message("done") diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/exportCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/exportCounts.R similarity index 58% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/exportCounts.R rename to drop/modules/aberrant-expression-pipeline/Counting/exportCounts.R index c9422662..5fd5f5d5 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/exportCounts.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/exportCounts.R @@ -2,19 +2,17 @@ #' title: Export counts in tsv format #' author: Michaela Mueller, vyepez #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "export_{genomeAssembly}.Rds")`' #' input: -#' - counts: '`sm parser.getProcDataDir() + +#' - counts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/total_counts.Rds"`' #' output: -#' - export: '`sm parser.getProcResultsDir() + "/exported_counts/{dataset}--{genomeAssembly}--{annotation}/" -#' + "geneCounts.tsv.gz"`' +#' - export: '`sm cfg.exportCounts.getFilePattern(str_=False) / "geneCounts.tsv.gz"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "merge_counts.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/merge_counts.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(data.table) diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/filterCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R similarity index 72% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/filterCounts.R rename to drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R index 86a23a7a..2f337e2c 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/filterCounts.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R @@ -2,21 +2,20 @@ #' title: Filter Counts for OUTRIDER #' author: Michaela Mueller #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "filter.Rds")`' #' input: -#' - counts: '`sm parser.getProcDataDir() + +#' - counts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/total_counts.Rds"`' -#' - txdb: '`sm parser.getProcDataDir() + +#' - txdb: '`sm cfg.getProcessedDataDir() + #' "/aberrant_expression/{annotation}/txdb.db"`' #' output: -#' - ods: '`sm parser.getProcResultsDir() + +#' - ods: '`sm cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/ods_unfitted.Rds"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "filter_counts.snakemake") ) -# snakemake <- readRDS(".drop/tmp/AE/filter_counts.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(data.table) @@ -26,7 +25,6 @@ suppressPackageStartupMessages({ }) counts <- readRDS(snakemake@input$counts) -colData(counts)$sampleID <- colData(counts)$RNA_ID ods <- OutriderDataSet(counts) txdb <- loadDb(snakemake@input$txdb) diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/mergeCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R similarity index 54% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/mergeCounts.R rename to drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R index a07d20e2..1e7fe81f 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/mergeCounts.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R @@ -2,26 +2,20 @@ #' title: Merge the counts for all samples #' author: Michaela Müller #' wb: -#' py: -#' - | -#' def getCountFiles(annotation, dataset): -#' ids = parser.outrider_ids[dataset] -#' file_stump = parser.getProcDataDir() + f"/aberrant_expression/{annotation}/counts/" -#' return expand(file_stump + "{sampleID}.Rds", sampleID=ids) +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "merge.Rds")`' #' params: -#' - ids: '`sm parser.outrider_ids`' -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' - exCountIDs: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay=["GENE_COUNT","RNA"])`' #' input: -#' - counts: '`sm lambda wildcards: getCountFiles(wildcards.annotation, wildcards.dataset)`' +#' - counts: '`sm lambda w: cfg.AE.getCountFiles(w.annotation, w.dataset)`' #' output: -#' - counts: '`sm parser.getProcDataDir() + +#' - counts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/total_counts.Rds"`' #' threads: 30 #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "merge_counts.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/merge_counts.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(data.table) @@ -33,21 +27,26 @@ suppressPackageStartupMessages({ register(MulticoreParam(snakemake@threads)) # Read counts -counts_list <- bplapply(snakemake@input$counts, readRDS) -names <- snakemake@params$ids[[snakemake@wildcards$dataset]] -names(counts_list) <- names - +counts_list <- bplapply(snakemake@input$counts, function(f){ + if(grepl('Rds$', f)) + assay(readRDS(f)) + else + as.matrix(fread(f), rownames = "geneID") +}) message(paste("read", length(counts_list), 'files')) +# check rownames and proceed only if they are the same +row_names_objects <- lapply(counts_list, rownames) +if( length(unique(row_names_objects)) > 1 ){ + stop('The rows (genes) of the count matrices to be merged are not the same.') +} + # merge counts -## more efficient merging matrices instead of complete SE -merged_assays <- do.call(cbind, lapply(counts_list, assay, withDimnames=FALSE)) +merged_assays <- do.call(cbind, counts_list) total_counts <- SummarizedExperiment(assays=list(counts=merged_assays)) # total_counts <- do.call(cbind, counts_list) -colnames(total_counts) <- names(counts_list) rownames(total_counts) <- rownames(counts_list[[1]]) -rowRanges(total_counts) <- rowRanges(counts_list[[1]]) # Add sample annotation data (colData) sample_anno <- fread(snakemake@config$sampleAnnotation) diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/preprocessGeneAnnotation.R b/drop/modules/aberrant-expression-pipeline/Counting/preprocessGeneAnnotation.R similarity index 69% rename from drop/modules/aberrant-expression-pipeline/Scripts/Counting/preprocessGeneAnnotation.R rename to drop/modules/aberrant-expression-pipeline/Counting/preprocessGeneAnnotation.R index 58b82f71..dd8c9b6b 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/preprocessGeneAnnotation.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/preprocessGeneAnnotation.R @@ -2,20 +2,19 @@ #' title: Preprocess Gene Annotations #' author: mumichae #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "preprocess.Rds")`' #' input: -#' - gtf: '`sm lambda wildcards: parser.getGeneAnnotationFile(wildcards.annotation) `' +#' - gtf: '`sm lambda wildcards: cfg.getGeneAnnotationFile(wildcards.annotation) `' #' output: -#' - txdb: '`sm parser.getProcDataDir() + "/aberrant_expression/{annotation}/txdb.db"`' -#' - count_ranges: '`sm parser.getProcDataDir() + +#' - txdb: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/txdb.db"`' +#' - count_ranges: '`sm cfg.getProcessedDataDir() + #' "/aberrant_expression/{annotation}/count_ranges.Rds" `' -#' - gene_name_mapping: '`sm parser.getProcDataDir() + "/aberrant_expression/{annotation}/gene_name_mapping_{annotation}.tsv"`' +#' - gene_name_mapping: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/gene_name_mapping_{annotation}.tsv"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "annotation.snakemake") ) -# snakemake <- readRDS(".drop/tmp/AE/annotation.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(GenomicFeatures) diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/Datasets.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Datasets.R similarity index 70% rename from drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/Datasets.R rename to drop/modules/aberrant-expression-pipeline/OUTRIDER/Datasets.R index 0684ed3b..dbb9448e 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/Datasets.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Datasets.R @@ -2,20 +2,19 @@ #' title: Results Overview #' author: mumichae #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "OUTRIDER_Overview.Rds")`' #' input: #' - summaries: '`sm expand(config["htmlOutputPath"] + #' "/AberrantExpression/Outrider/{annotation}/Summary_{dataset}.html", -#' annotation=config["geneAnnotation"].keys() , dataset=config["aberrantExpression"]["groups"])`' +#' annotation=cfg.getGeneVersions(), dataset=cfg.AE.groups)`' #' output: #' html_document: #' code_folding: hide #' code_download: TRUE #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "outrider_overview.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/outrider_overview.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) # Obtain the annotations and datasets datasets <- snakemake@config$aberrantExpression$groups diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/Summary.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R similarity index 89% rename from drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/Summary.R rename to drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R index 5f1652fb..4e370f0c 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/Summary.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R @@ -2,12 +2,15 @@ #' title: "OUTRIDER Summary: `r gsub('_', ' ', snakemake@wildcards$dataset)`" #' author: mumichae, vyepez #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "OUTRIDER_summary.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' - padjCutoff: '`sm cfg.AE.get("padjCutoff")`' +#' - zScoreCutoff: '`sm cfg.AE.get("zScoreCutoff")`' #' input: -#' - ods: '`sm parser.getProcResultsDir() + +#' - ods: '`sm cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/ods.Rds"`' -#' - results: '`sm parser.getProcResultsDir() + +#' - results: '`sm cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/OUTRIDER_results.tsv"`' #' output: #' - wBhtml: '`sm config["htmlOutputPath"] + @@ -20,8 +23,7 @@ #'--- #+ echo=F -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "outrider_summary.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/outrider_summary.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(OUTRIDER) @@ -33,8 +35,6 @@ suppressPackageStartupMessages({ library(ggthemes) }) -ae_params <- snakemake@config$aberrantExpression - # used for most plots dataset_title <- paste("Dataset:", snakemake@wildcards$dataset) @@ -55,8 +55,8 @@ plotEncDimSearch(ods) + #' ### Aberrantly expressed genes per sample plotAberrantPerSample(ods, main = dataset_title, - padjCutoff = ae_params$padjCutoff, - zScoreCutoff = ae_params$zScoreCutoff) + padjCutoff = snakemake@params$padjCutoff, + zScoreCutoff = snakemake@params$zScoreCutoff) #' ### Batch correction diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/results.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R similarity index 60% rename from drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/results.R rename to drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R index f4c0b970..01c1aca6 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/results.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R @@ -2,19 +2,23 @@ #' title: OUTRIDER Results #' author: mumichae #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "OUTRIDER_results.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' - padjCutoff: '`sm cfg.AE.get("padjCutoff")`' +#' - zScoreCutoff: '`sm cfg.AE.get("zScoreCutoff")`' #' input: -#' - ods: '`sm parser.getProcResultsDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/ods.Rds"`' -#' - gene_name_mapping: '`sm parser.getProcDataDir() + "/aberrant_expression/{annotation}/gene_name_mapping_{annotation}.tsv"`' +#' - add_HPO_cols: '`sm str(projectDir / ".drop" / "helpers" / "add_HPO_cols.R")`' +#' - ods: '`sm cfg.getProcessedResultsDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/ods.Rds"`' +#' - gene_name_mapping: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/gene_name_mapping_{annotation}.tsv"`' #' output: -#' - results: '`sm parser.getProcResultsDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/OUTRIDER_results.tsv"`' -#' - results_all: '`sm parser.getProcResultsDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/OUTRIDER_results_all.Rds"`' +#' - results: '`sm cfg.getProcessedResultsDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/OUTRIDER_results.tsv"`' +#' - results_all: '`sm cfg.getProcessedResultsDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/OUTRIDER_results_all.Rds"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "outrider_results.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/outrider_results.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@input$add_HPO_cols) suppressPackageStartupMessages({ library(dplyr) @@ -24,10 +28,6 @@ suppressPackageStartupMessages({ library(OUTRIDER) }) -source("../helpers/add_HPO_cols.R") - -ae_params <- snakemake@config$aberrantExpression - ods <- readRDS(snakemake@input$ods) res <- results(ods, all = TRUE) @@ -38,8 +38,8 @@ res[, foldChange := round(2^l2fc, 2)] saveRDS(res, snakemake@output$results_all) # Subset to significant results -res <- res[padjust <= ae_params$padjCutoff & - abs(zScore) > ae_params$zScoreCutoff] +res <- res[padjust <= snakemake@params$padjCutoff & + abs(zScore) > snakemake@params$zScoreCutoff] gene_annot_dt <- fread(snakemake@input$gene_name_mapping) if(!is.null(gene_annot_dt$gene_name)){ diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/runOutrider.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/runOutrider.R similarity index 83% rename from drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/runOutrider.R rename to drop/modules/aberrant-expression-pipeline/OUTRIDER/runOutrider.R index d97c9cc1..0c32dc3d 100644 --- a/drop/modules/aberrant-expression-pipeline/Scripts/OUTRIDER/runOutrider.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/runOutrider.R @@ -2,18 +2,21 @@ #' title: Filter Counts for OUTRIDER #' author: Michaela Mueller #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "{annotation}" / "{dataset}" / "runOUTRIDER.Rds")`' #' input: -#' - ods: '`sm parser.getProcResultsDir() + +#' - ods: '`sm cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/ods_unfitted.Rds"`' #' output: -#' - ods: '`sm parser.getProcResultsDir() + +#' - ods: '`sm cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/ods.Rds"`' #' type: script #' threads: 30 #'--- +#+ echo=F +saveRDS(snakemake, snakemake@log$snakemake) + suppressPackageStartupMessages({ library(OUTRIDER) library(SummarizedExperiment) @@ -23,9 +26,6 @@ suppressPackageStartupMessages({ library(magrittr) }) -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "outrider.snakemake")) -# snakemake <- readRDS(".drop/tmp/AE/outrider.snakemake") - ods <- readRDS(snakemake@input$ods) implementation <- snakemake@config$aberrantExpression$implementation mp <- snakemake@config$aberrantExpression$maxTestedDimensionProportion diff --git a/drop/modules/aberrant-expression-pipeline/Snakefile b/drop/modules/aberrant-expression-pipeline/Snakefile index e88085f3..1fca7d22 100644 --- a/drop/modules/aberrant-expression-pipeline/Snakefile +++ b/drop/modules/aberrant-expression-pipeline/Snakefile @@ -1,41 +1,26 @@ -### SNAKEFILE ABERRANT EXPRESSION -import os -import drop -import pathlib -import pandas as pd +AE_WORKDIR = cfg.AE.getWorkdir(str_=False) -METHOD = 'AE' -SCRIPT_ROOT = drop.getMethodPath(METHOD, type_='workdir', str_=False) -CONF_FILE = drop.getConfFile() +AE_index_name = "aberrant-expression-pipeline" +AE_index_input, AE_index_output, AE_graph_file, _ = createIndexRule( + scriptsPath=str(AE_WORKDIR), + index_name=AE_index_name +) -parser = drop.config(config, METHOD) -config = parser.parse() -include: config['wBuildPath'] + "/wBuild.snakefile" - -rule all: - input: - rules.Index.output, config["htmlOutputPath"] + "/aberrant_expression_readme.html", - expand( - config["htmlOutputPath"] + "/Scripts_Counting_Datasets.html", - annotation=list(config["geneAnnotation"].keys()) - ), - expand( - parser.getProcResultsDir() + "/aberrant_expression/{annotation}" + - "/outrider/{dataset}/OUTRIDER_results.tsv", - annotation=list(config["geneAnnotation"].keys()), - dataset=parser.outrider_ids - ) - output: touch(drop.getMethodPath(METHOD, type_='final_file')) +rule aberrantExpression: + input: AE_index_input, AE_graph_file + output: AE_index_output + run: ci(str(AE_WORKDIR), AE_index_name) +rule aberrantExpression_dependency: + output: AE_graph_file + shell: "snakemake --rulegraph {AE_index_output} | dot -Tsvg -Grankdir=TB > {output}" -rule bam_stats: +rule aberrantExpression_bamStats: input: - bam = lambda wildcards: parser.getFilePath(wildcards.sampleID, - file_type="RNA_BAM_FILE"), - ucsc2ncbi = SCRIPT_ROOT / "resource" / "chr_UCSC_NCBI.txt" + bam = lambda wildcards: sa.getFilePath(wildcards.sampleID, "RNA_BAM_FILE"), + ucsc2ncbi =AE_WORKDIR / "resource" / "chr_UCSC_NCBI.txt" output: - parser.getProcDataDir() + - "/aberrant_expression/{annotation}/coverage/{sampleID}.tsv" + cfg.processedDataDir / "aberrant_expression" / "{annotation}" / "coverage" / "{sampleID}.tsv" params: samtools = config["tools"]["samtoolsCmd"] shell: @@ -55,39 +40,19 @@ rule bam_stats: echo -e "{wildcards.sampleID}\t${{count}}" > {output} """ -rule merge_bam_stats: +rule aberrantExpression_mergeBamStats: input: - lambda wildcards: expand( - parser.getProcDataDir() + + lambda w: expand(cfg.getProcessedDataDir() + "/aberrant_expression/{{annotation}}/coverage/{sampleID}.tsv", - sampleID=parser.outrider_ids[wildcards.dataset]) + sampleID=sa.getIDsByGroup(w.dataset), assay="RNA") output: - parser.getProcDataDir() + "/aberrant_expression/{annotation}/" + - "outrider/{dataset}/bam_coverage.tsv" + cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/outrider/{dataset}/bam_coverage.tsv" params: - ids = lambda wildcards: parser.outrider_ids[wildcards.dataset] + exIDs = lambda w: sa.getIDsByGroup(w.dataset, assay="GENE_COUNT") run: with open(output[0], "w") as bam_stats: bam_stats.write("sampleID\trecord_count\n") for f in input: bam_stats.write(open(f, "r").read()) - -rulegraph_filename = f'{config["htmlOutputPath"]}/{METHOD}_rulegraph' - -rule produce_rulegraph: - input: - expand(rulegraph_filename + ".{fmt}", fmt=["svg", "png"]) - -rule create_graph: - output: - svg = f"{rulegraph_filename}.svg", - png = f"{rulegraph_filename}.png" - shell: - """ - snakemake --configfile {CONF_FILE} --rulegraph | dot -Tsvg > {output.svg} - snakemake --configfile {CONF_FILE} --rulegraph | dot -Tpng > {output.png} - """ -rule unlock: - output: touch(drop.getMethodPath(METHOD, type_="unlock")) - shell: "snakemake --unlock --configfile {CONF_FILE}" - + for eid in params.exIDs: + bam_stats.write(f"{eid}\tNA\n") diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R new file mode 100644 index 00000000..72e17db9 --- /dev/null +++ b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R @@ -0,0 +1,58 @@ +#'--- +#' title: Create datasets from annotation file +#' author: Christian Mertes, mumichae +#' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "00_defineDataset.Rds")`' +#' params: +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - ids: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="RNA")`' +#' - fileMappingFile: '`sm cfg.getRoot() + "/file_mapping.csv"`' +#' input: +#' - sampleAnnoFile: '`sm config["sampleAnnotation"]`' +#' output: +#' - colData: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/annotations/{dataset}.tsv"`' +#' - wBhtml: '`sm config["htmlOutputPath"] + +#' "/AberrantSplicing/annotations/{dataset}.html"`' +#' type: noindex +#' output: +#' html_document: +#' code_folding: hide +#' code_download: TRUE +#'--- + +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) + +#+ input +outFile <- snakemake@output$colData +annoFile <- snakemake@input$sampleAnnoFile +fileMapFile <- snakemake@params$fileMapping + +#+ dataset name + +name <- snakemake@wildcards$dataset +anno <- fread(annoFile) +mapping <- fread(fileMapFile) + +subset_ids <- snakemake@params$ids +annoSub <- anno[RNA_ID %in% subset_ids] +colData <- merge( + annoSub[,.(sampleID = RNA_ID, STRAND, PAIRED_END)], + mapping[FILE_TYPE == "RNA_BAM_FILE", .(sampleID=ID, bamFile=FILE_PATH)]) + +#' +#' ## Dataset: `r name` +#' +#+ echo=FALSE +finalTable <- colData + +#' +#' ## Final sample table `r name` +#' +#+ savetable +DT::datatable(finalTable, options=list(scrollX=TRUE)) + +dim(finalTable) +write_tsv(finalTable, file=outFile) diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_0_countRNA_init.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R similarity index 70% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_0_countRNA_init.R rename to drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R index 60096465..c54948ee 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_0_countRNA_init.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R @@ -2,23 +2,24 @@ #' title: Initialize Counting #' author: Luise Schuller #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_0_init.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: -#' - colData: '`sm parser.getProcDataDir() + +#' - colData: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/annotations/{dataset}.tsv"`' #' output: -#' - fdsobj: '`sm parser.getProcDataDir() + +#' - fdsobj: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-object.RDS"`' -#' - done_fds: '`sm parser.getProcDataDir() + +#' - done_fds: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done" `' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_01_0.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_01_0.snakemake") -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset colDataFile <- snakemake@input$colData diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_1_countRNA_splitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R similarity index 74% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_1_countRNA_splitReads_samplewise.R rename to drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R index f3286d7d..aa27e6c9 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_1_countRNA_splitReads_samplewise.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R @@ -2,23 +2,25 @@ #' title: Count Split Reads #' author: Luise Schuller #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "splitReads" / "{sample_id}.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: -#' - done_fds: '`sm parser.getProcDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done" `' +#' - done_fds: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done"`' #' output: -#' - done_sample_splitCounts: '`sm parser.getProcDataDir() + +#' - done_sample_splitCounts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}" #' +"/sample_tmp/splitCounts/sample_{sample_id}.done"`' #' threads: 3 #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_01_1.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_01_1.snakemake") -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) +library(BSgenome.Hsapiens.UCSC.hg19) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir @@ -36,10 +38,8 @@ genome <- NULL if(strandSpecific(fds) == 0){ if(snakemake@config$genomeAssembly == 'hg19'){ - library(BSgenome.Hsapiens.UCSC.hg19) genome <- BSgenome.Hsapiens.UCSC.hg19 } else if(snakemake@config$genomeAssembly == 'hg38'){ - library(BSgenome.Hsapiens.UCSC.hg38) genome <- BSgenome.Hsapiens.UCSC.hg38 } } diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R similarity index 75% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_2_countRNA_splitReads_merge.R rename to drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R index af61612a..e28ff0dd 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_2_countRNA_splitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R @@ -5,30 +5,31 @@ #' py: #' - | #' def getSplitCountFiles(dataset): -#' ids = parser.fraser_ids[dataset] -#' file_stump = parser.getProcDataDir() + f"/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/splitCounts/" +#' ids = sa.getIDsByGroup(dataset, assay="RNA") +#' file_stump = cfg.getProcessedDataDir() + f"/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/splitCounts/" #' return expand(file_stump + "sample_{sample_id}.done", sample_id=ids) +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_2_splitReadsMerge.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' threads: 20 #' input: -#' - sample_counts: '`sm lambda wildcards: getSplitCountFiles(wildcards.dataset)`' +#' - sample_counts: '`sm lambda w: getSplitCountFiles(w.dataset)`' #' output: -#' - countsJ: '`sm parser.getProcDataDir() + +#' - countsJ: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' -#' - gRangesSplitCounts: '`sm parser.getProcDataDir() + +#' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' -#' - gRangesNonSplitCounts: '`sm parser.getProcDataDir() + +#' - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`' -#' - spliceSites: '`sm parser.getProcDataDir() + +#' - spliceSites: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_01_2.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_01_2.snakemake") -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_3_countRNA_nonSplitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R similarity index 76% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_3_countRNA_nonSplitReads_samplewise.R rename to drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R index 80562f81..c5d9e6a7 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_3_countRNA_nonSplitReads_samplewise.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R @@ -2,22 +2,23 @@ #' title: Nonsplit Counts #' author: Luise Schuller #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "nonsplitReads" / "{sample_id}.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: -#' - spliceSites: '`sm parser.getProcDataDir() + +#' - spliceSites: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' #' output: -#' - done_sample_nonSplitCounts : '`sm parser.getProcDataDir() + +#' - done_sample_nonSplitCounts : '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' #' threads: 3 #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_01_3.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_01_3.snakemake") -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset colDataFile <- snakemake@input$colData diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_4_countRNA_nonSplitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R similarity index 71% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_4_countRNA_nonSplitReads_merge.R rename to drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R index 82bcbe87..8a895790 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_4_countRNA_nonSplitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R @@ -5,27 +5,27 @@ #' py: #' - | #' def getNonSplitCountFiles(dataset): -#' ids = parser.fraser_ids[dataset] -#' file_stump = parser.getProcDataDir() + f"/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/nonSplitCounts/" +#' ids = sa.getIDsByGroup(dataset, assay="RNA") +#' file_stump = cfg.getProcessedDataDir() + f"/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/nonSplitCounts/" #' return expand(file_stump + "sample_{sample_id}.done", sample_id=ids) +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_4_nonSplitReadsMerge.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' threads: 20 #' input: -#' - sample_counts: '`sm lambda wildcards: getNonSplitCountFiles(wildcards.dataset)`' -#' - gRangesNonSplitCounts: '`sm parser.getProcDataDir() + +#' - sample_counts: '`sm lambda w: getNonSplitCountFiles(w.dataset)`' +#' - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`' #' output: -#' - countsSS: '`sm parser.getProcDataDir() + +#' - countsSS: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_01_4.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_01_4.snakemake") - -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_5_countRNA_collect.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R similarity index 74% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_5_countRNA_collect.R rename to drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R index 3b2a78f5..da3d38b6 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/01_5_countRNA_collect.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R @@ -2,28 +2,28 @@ #' title: Collect all counts to FRASER Object #' author: Luise Schuller #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_5_collect.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: -#' - countsJ: '`sm parser.getProcDataDir() + +#' - countsJ: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' -#' - countsSS: '`sm parser.getProcDataDir() + +#' - countsSS: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`' -#' - gRangesSplitCounts: '`sm parser.getProcDataDir() + +#' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' -#' - spliceSites: '`sm parser.getProcDataDir() + +#' - spliceSites: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' #' output: -#' - counting_done: '`sm parser.getProcDataDir() + +#' - counting_done: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_01_5.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_01_5.snakemake") - -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/02_psi_value_calculation_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R similarity index 65% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/02_psi_value_calculation_FraseR.R rename to drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R index 42e9f8bd..5c0a0f1c 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/02_psi_value_calculation_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R @@ -2,22 +2,23 @@ #' title: Calculate PSI values #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "02_PSIcalc.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 30 #' input: -#' - counting_done: '`sm parser.getProcDataDir() + +#' - counting_done: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `' #' output: -#' - psiSS: '`sm parser.getProcDataDir() + +#' - psiSS: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/psiSite.h5"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_02.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_02.snakemake") -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R similarity index 76% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/03_filter_expression_FraseR.R rename to drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R index 1eefbead..86b6eab0 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/03_filter_expression_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R @@ -2,25 +2,26 @@ #' title: Filter and clean dataset #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "03_filter.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' input: -#' - psiSS: '`sm parser.getProcDataDir()+ +#' - psiSS: '`sm cfg.getProcessedDataDir()+ #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/psiSite.h5"`' #' output: -#' - fds: '`sm parser.getProcDataDir() + +#' - fds: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`' -#' - done: '`sm parser.getProcDataDir() + +#' - done: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' #' threads: 3 #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_03.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_03.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) -source("Scripts/_helpers/config.R") opts_chunk$set(fig.width=12, fig.height=8) # input diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/DatasetsF.R b/drop/modules/aberrant-splicing-pipeline/Counting/DatasetsF.R similarity index 66% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/DatasetsF.R rename to drop/modules/aberrant-splicing-pipeline/Counting/DatasetsF.R index e94814fa..6c297f3d 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/DatasetsF.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/DatasetsF.R @@ -1,18 +1,17 @@ #'--- #' title: FRASER counting analysis over all datasets #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "CountingOverview.Rds")`' #' input: #' - counting_summary: '`sm expand(config["htmlOutputPath"] + #' "/AberrantSplicing/{dataset}_countSummary.html", -#' dataset=config["aberrantSplicing"]["groups"])`' +#' dataset=cfg.AS.groups)`' #' output: #' html_document #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_cs.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_cs.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) datasets <- snakemake@config$aberrantSplicing$groups @@ -24,4 +23,4 @@ devNull <- sapply(datasets, function(name){ "
", "Count Summary", "
", "

" )) -}) \ No newline at end of file +}) diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R similarity index 77% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/Summary.R rename to drop/modules/aberrant-splicing-pipeline/Counting/Summary.R index 277aef84..b57fdb7f 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/Summary.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R @@ -2,10 +2,13 @@ #' title: "Count Summary: `r gsub('_', ' ', snakemake@wildcards$dataset)`" #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "CountSummary.Rds")`' #' params: -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' input: -#' - filter: '`sm parser.getProcDataDir() + +#' - filter: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' #' output: #' - wBhtml: '`sm config["htmlOutputPath"] + @@ -13,8 +16,8 @@ #' type: noindex #'--- -#+ echo=FALSE -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) suppressPackageStartupMessages({ library(cowplot) diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/exportCounts.R b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R similarity index 56% rename from drop/modules/aberrant-splicing-pipeline/Scripts/Counting/exportCounts.R rename to drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R index e5e72616..b59bd5fa 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/exportCounts.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R @@ -1,28 +1,27 @@ #'--- #' title: Collect all counts from FRASER Object -#' author: Michaela Mueller, vyepez +#' author: mumichae, vyepez #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "{genomeAssembly}--{annotation}_export.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: -#' - gRangesSplitCounts: '`sm parser.getProcDataDir() + +#' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' -#' - spliceSites: '`sm parser.getProcDataDir() + +#' - spliceSites: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' -#' - counting_done: '`sm parser.getProcDataDir() + +#' - counting_done: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `' #' output: -#' - split_counts: '`sm parser.getProcResultsDir() + "/exported_counts/{dataset}--{genomeAssembly}--{annotation}/" -#' + "splitCounts.tsv.gz"`' -#' - nonsplit_counts: '`sm parser.getProcResultsDir() + "/exported_counts/{dataset}--{genomeAssembly}--{annotation}/" -#' + "spliceSiteOverlapCounts.tsv.gz"`' +#' - split_counts: '`sm cfg.exportCounts.getFilePattern(str_=False) / "splitCounts.tsv.gz"`' +#' - nonsplit_counts: '`sm cfg.exportCounts.getFilePattern(str_=False) / "spliceSiteOverlapCounts.tsv.gz"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "export.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/export.snakemake") -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir @@ -35,14 +34,14 @@ spliceSiteCoords <- readRDS(snakemake@input$spliceSites) # obtain the split counts splitCounts <- counts(fds, type="j") -gr_dt <- as.data.table(splitCounts_gRanges)[, c(1:3,5)] +gr_dt <- as.data.table(splitCounts_gRanges) splitCounts <- cbind(gr_dt, as.matrix(splitCounts)) fwrite(splitCounts, file = snakemake@output$split_counts, quote = FALSE, row.names = FALSE, sep = '\t', compress = 'gzip') # obtain the non split counts nonSplitCounts <- counts(fds, type="ss") -grns_dt <- as.data.table(spliceSiteCoords)[, c(1:3,5)] +grns_dt <- as.data.table(spliceSiteCoords) nonSplitCounts <- cbind(grns_dt, as.matrix(nonSplitCounts)) fwrite(nonSplitCounts, file = snakemake@output$nonsplit_counts, quote = FALSE, row.names = FALSE, sep = '\t', compress = 'gzip') diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/04_fit_hyperparameters_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R similarity index 77% rename from drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/04_fit_hyperparameters_FraseR.R rename to drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R index ec741046..d7512cf4 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/04_fit_hyperparameters_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R @@ -2,23 +2,23 @@ #' title: Hyper parameter optimization #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "04_hyper.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 12 #' input: -#' - filter: '`sm parser.getProcDataDir() + +#' - filter: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' #' output: -#' - hyper: '`sm parser.getProcDataDir() + +#' - hyper: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_04.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_04.snakemake") - -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) #+ input dataset <- snakemake@wildcards$dataset diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/05_fit_autoencoder_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R similarity index 70% rename from drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/05_fit_autoencoder_FraseR.R rename to drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R index 65e1ebc9..1405d5f7 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/05_fit_autoencoder_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R @@ -2,23 +2,23 @@ #' title: Fitting the autoencoder #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "05_fit.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 20 #' input: -#' - hyper: '`sm parser.getProcDataDir() + +#' - hyper: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `' #' output: -#' - fdsout: '`sm parser.getProcDataDir() + +#' - fdsout: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/predictedMeans_psiSite.h5"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_05.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_05.snakemake") - -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/06_calculation_stats_AE_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R similarity index 70% rename from drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/06_calculation_stats_AE_FraseR.R rename to drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R index e0185034..42385417 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/06_calculation_stats_AE_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R @@ -2,25 +2,25 @@ #' title: Calculate P values #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "06_stats.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 20 #' input: -#' - fdsin: '`sm parser.getProcDataDir() + +#' - fdsin: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/" + #' "predictedMeans_psiSite.h5"`' #' output: -#' - fdsout: '`sm parser.getProcDataDir() + +#' - fdsout: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/" + #' "padjBetaBinomial_psiSite.h5"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_06.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_06.snakemake") - -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset fdsFile <- snakemake@input$fdsin diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R similarity index 75% rename from drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/07_extract_results_FraseR.R rename to drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R index a044a324..7a4229b9 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/07_extract_results_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R @@ -2,27 +2,31 @@ #' title: Results of FRASER analysis #' author: Christian Mertes #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "07_results.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' +#' - padjCutoff: '`sm cfg.AS.get("padjCutoff")`' +#' - zScoreCutoff: '`sm cfg.AS.get("zScoreCutoff")`' +#' - deltaPsiCutoff: '`sm cfg.AS.get("deltaPsiCutoff")`' #' threads: 10 #' input: -#' - fdsin: '`sm parser.getProcDataDir() + +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - add_HPO_cols: '`sm str(projectDir / ".drop" / "helpers" / "add_HPO_cols.R")`' +#' - fdsin: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/" + #' "padjBetaBinomial_psiSite.h5"`' #' output: -#' - resultTableJunc: '`sm parser.getProcDataDir() + +#' - resultTableJunc: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/results/{dataset}_results_per_junction.tsv"`' -#' - resultTableGene: '`sm parser.getProcDataDir() + +#' - resultTableGene: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/results/{dataset}_results.tsv"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_07.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_07.snakemake") - -source("Scripts/_helpers/config.R") -source("../helpers/add_HPO_cols.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@input$setup, echo=FALSE) +source(snakemake@input$add_HPO_cols) opts_chunk$set(fig.width=12, fig.height=8) @@ -34,8 +38,6 @@ register(MulticoreParam(snakemake@threads)) # Limit number of threads for DelayedArray operations setAutoBPPARAM(MulticoreParam(snakemake@threads)) -params <- snakemake@config$aberrantSplicing - # Load data and annotate ranges with gene names fds <- loadFraserDataSet(dir=workingDir, name=dataset) GRCh <- ifelse(snakemake@config$genomeAssembly == 'hg19', 37, @@ -46,9 +48,9 @@ colData(fds)$sampleID <- as.character(colData(fds)$sampleID) # Extract results per junction res_junc <- results(fds, - padjCutoff=params$padjCutoff, - zScoreCutoff=params$zScoreCutoff, - deltaPsiCutoff=params$deltaPsiCutoff, + padjCutoff=snakemake@params$padjCutoff, + zScoreCutoff=snakemake@params$zScoreCutoff, + deltaPsiCutoff=snakemake@params$deltaPsiCutoff, additionalColumns=c("other_hgnc_symbol")) res_junc_dt <- as.data.table(res_junc) print('Results per junction extracted') diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/Datasets.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Datasets.R similarity index 66% rename from drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/Datasets.R rename to drop/modules/aberrant-splicing-pipeline/FRASER/Datasets.R index 3492716d..9ba6057d 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/Datasets.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Datasets.R @@ -1,18 +1,16 @@ #'--- #' title: Full FRASER analysis over all datasets #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "FRASER_datasets.Rds")`' #' input: #' - fraser_summary: '`sm expand(config["htmlOutputPath"] + -#' "/AberrantSplicing/{dataset}_summary.html", -#' dataset=config["aberrantSplicing"]["groups"])`' +#' "/AberrantSplicing/{dataset}_summary.html", dataset=cfg.AS.groups)`' #' output: #' html_document #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_99.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_99.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) datasets <- snakemake@config$aberrantSplicing$groups diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R similarity index 88% rename from drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/Summary.R rename to drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R index 07b7e7ee..ade2a80d 100644 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/FRASER/Summary.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R @@ -2,13 +2,16 @@ #' title: "FRASER Summary: `r gsub('_', ' ', snakemake@wildcards$dataset)`" #' author: mumichae, vyepez, ischeller #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "FRASER_summary.Rds")`' #' params: -#' - workingDir: '`sm parser.getProcDataDir() + "/aberrant_splicing/datasets/"`' +#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' input: -#' - fdsin: '`sm parser.getProcDataDir() + +#' - fdsin: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/" + #' "padjBetaBinomial_psiSite.h5"`' -#' - results: '`sm parser.getProcDataDir() + +#' - results: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/results/{dataset}_results.tsv"`' #' output: #' - wBhtml: '`sm config["htmlOutputPath"] + @@ -16,8 +19,8 @@ #' type: noindex #'--- -#+ load config and setup, echo=FALSE -source("Scripts/_helpers/config.R") +saveRDS(snakemake, snakemake@log$snakemake) +source(snakemake@params$setup, echo=FALSE) suppressPackageStartupMessages({ library(cowplot) diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/00_define_datasets_from_anno.R b/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/00_define_datasets_from_anno.R deleted file mode 100644 index 882b0d87..00000000 --- a/drop/modules/aberrant-splicing-pipeline/Scripts/Counting/00_define_datasets_from_anno.R +++ /dev/null @@ -1,41 +0,0 @@ -#'--- -#' title: Create datasets from annotation file -#' author: Christian Mertes -#' wb: -#' params: -#' - ids: '`sm parser.fraser_ids`' -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - fileMappingFile: '`sm parser.getProcDataDir() + "/file_mapping.csv"`' -#' input: -#' - sampleAnnoFile: '`sm config["sampleAnnotation"]`' -#' output: -#' - colData: '`sm parser.getProcDataDir() + -#' "/aberrant_splicing/annotations/{dataset}.tsv"`' -#' type: script -#'--- - -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "FRASER_00.snakemake")) -# snakemake <- readRDS(".drop/tmp/AS/FRASER_00.snakemake") - -#+ load main config, echo=FALSE -source("Scripts/_helpers/config.R", echo=FALSE) - -#+ input -outFile <- snakemake@output$colData -annoFile <- snakemake@input$sampleAnnoFile -fileMapFile <- snakemake@params$fileMapping - -#+ dataset name - -name <- snakemake@wildcards$dataset -anno <- fread(annoFile) -mapping <- fread(fileMapFile) - -subset_ids <- snakemake@params$ids[[name]] -annoSub <- anno[RNA_ID %in% subset_ids] -colData <- merge( - annoSub[,.(sampleID = RNA_ID, STRAND, PAIRED_END)], - mapping[FILE_TYPE == "RNA_BAM_FILE", .(sampleID=ID, bamFile=FILE_PATH)]) -colData <- unique(colData) # needed in case of DNA replicates - -write_tsv(colData, file=outFile) diff --git a/drop/modules/aberrant-splicing-pipeline/Snakefile b/drop/modules/aberrant-splicing-pipeline/Snakefile index 33a40eb5..60d6829b 100644 --- a/drop/modules/aberrant-splicing-pipeline/Snakefile +++ b/drop/modules/aberrant-splicing-pipeline/Snakefile @@ -1,38 +1,16 @@ -### SNAKEFILE ABERRANT SPLICING - -import os -import drop - -METHOD = 'AS' -SCRIPT_ROOT = os.getcwd() -CONF_FILE = drop.getConfFile() - -parser = drop.config(config, METHOD) -config = parser.parse() -include: config['wBuildPath'] + "/wBuild.snakefile" - - -rule all: - input: rules.Index.output, config["htmlOutputPath"] + "/aberrant_splicing_readme.html" - output: touch(drop.getMethodPath(METHOD, type_='final_file')) - -rulegraph_filename = f'{config["htmlOutputPath"]}/{METHOD}_rulegraph' - -rule produce_rulegraph: - input: - expand(rulegraph_filename + ".{fmt}", fmt=["svg", "png"]) - -rule create_graph: - output: - svg = f"{rulegraph_filename}.svg", - png = f"{rulegraph_filename}.png" - shell: - """ - snakemake --configfile {CONF_FILE} --rulegraph | dot -Tsvg > {output.svg} - snakemake --configfile {CONF_FILE} --rulegraph | dot -Tpng > {output.png} - """ - -rule unlock: - output: touch(drop.getMethodPath(METHOD, type_="unlock")) - shell: "snakemake --unlock --configfile {CONF_FILE}" - +AS_WORKDIR = cfg.AS.getWorkdir(str_=False) + +AS_index_name = "aberrant-splicing-pipeline" +AS_index_input, AS_index_output, AS_graph_file, _ = createIndexRule( + scriptsPath=str(AS_WORKDIR), + index_name=AS_index_name +) + +rule aberrantSplicing: + input: AS_index_input, AS_graph_file + output: AS_index_output + run: ci(str(AS_WORKDIR), AS_index_name) + +rule aberrantSplicing_dependency: + output: AS_graph_file + shell: "snakemake --rulegraph {AS_index_output} | dot -Tsvg -Grankdir=TB > {output}" \ No newline at end of file diff --git a/drop/modules/aberrant-splicing-pipeline/Scripts/_helpers/config.R b/drop/modules/aberrant-splicing-pipeline/config.R similarity index 100% rename from drop/modules/aberrant-splicing-pipeline/Scripts/_helpers/config.R rename to drop/modules/aberrant-splicing-pipeline/config.R diff --git a/drop/modules/mae-pipeline/Scripts/MAE/ASEReadCounter.sh b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh similarity index 100% rename from drop/modules/mae-pipeline/Scripts/MAE/ASEReadCounter.sh rename to drop/modules/mae-pipeline/MAE/ASEReadCounter.sh diff --git a/drop/modules/mae-pipeline/Scripts/MAE/Datasets.R b/drop/modules/mae-pipeline/MAE/Datasets.R similarity index 62% rename from drop/modules/mae-pipeline/Scripts/MAE/Datasets.R rename to drop/modules/mae-pipeline/MAE/Datasets.R index e237f91f..4d47edf4 100644 --- a/drop/modules/mae-pipeline/Scripts/MAE/Datasets.R +++ b/drop/modules/mae-pipeline/MAE/Datasets.R @@ -2,20 +2,17 @@ #' title: MAE analysis over all datasets #' author: #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "overview.Rds")`' #' input: -#' - html: '`sm expand(config["htmlOutputPath"] + +#' - html: '`sm expand(config["htmlOutputPath"] + #' "/MAE/{dataset}--{annotation}_results.html", -#' annotation=list(config["geneAnnotation"].keys()), -#' dataset=config["mae"]["groups"])`' +#' annotation=cfg.getGeneVersions(), dataset=cfg.MAE.groups)`' #' output: #' html_document #'--- - -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "overview.snakemake") ) -# snakemake <- readRDS(".drop/tmp/MAE/overview.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) # Obtain the annotations and datasets datasets <- snakemake@config$mae$groups diff --git a/drop/modules/mae-pipeline/Scripts/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R similarity index 79% rename from drop/modules/mae-pipeline/Scripts/MAE/Results.R rename to drop/modules/mae-pipeline/MAE/Results.R index caa8b312..cdec9d7d 100644 --- a/drop/modules/mae-pipeline/Scripts/MAE/Results.R +++ b/drop/modules/mae-pipeline/MAE/Results.R @@ -2,16 +2,20 @@ #' title: MAE Results table #' author: vyepez #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "{dataset}" / "{annotation}_results.Rds")`' +#' params: +#' - allelicRatioCutoff: '`sm cfg.MAE.get("allelicRatioCutoff")`' +#' - padjCutoff: '`sm cfg.MAE.get("padjCutoff")`' #' input: -#' - mae_res: '`sm lambda wildcards: expand(parser.getProcResultsDir() + -#' "/mae/samples/{id}_res.Rds", -#' id = parser.getMaeByGroup({wildcards.dataset}))`' -#' - gene_name_mapping: '`sm parser.getProcDataDir() + +#' - mae_res: '`sm lambda w: expand(cfg.getProcessedResultsDir() + +#' "/mae/samples/{id}_res.Rds", id=cfg.MAE.getMaeByGroup({w.dataset}))`' +#' - gene_name_mapping: '`sm cfg.getProcessedDataDir() + #' "/mae/gene_name_mapping_{annotation}.tsv"`' #' output: -#' - res_all: '`sm parser.getProcResultsDir() + +#' - res_all: '`sm cfg.getProcessedResultsDir() + #' "/mae/{dataset}/MAE_results_all_{annotation}.tsv.gz"`' -#' - res_signif: '`sm parser.getProcResultsDir() + +#' - res_signif: '`sm cfg.getProcessedResultsDir() + #' "/mae/{dataset}/MAE_results_{annotation}.tsv"`' #' - wBhtml: '`sm config["htmlOutputPath"] + #' "/MAE/{dataset}--{annotation}_results.html"`' @@ -19,8 +23,7 @@ #'--- #+ echo=F -saveRDS(snakemake, file.path(snakemake@config$tmpdir, "/MAE/mae_res_all.snakemake")) -# snakemake <- readRDS(".drop/tmp/MAE/mae_res_all.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(data.table) @@ -31,14 +34,12 @@ suppressPackageStartupMessages({ library(R.utils) }) -params <- snakemake@config$mae - # Read all MAE results files rmae <- lapply(snakemake@input$mae_res, function(m){ rt <- readRDS(m) return(rt) }) %>% rbindlist() -rmae[, aux := paste(contig, position, sep = "-")] + # Add gene names gene_annot_dt <- fread(snakemake@input$gene_name_mapping) @@ -61,24 +62,16 @@ res_annot <- rbind(res_annot[gene_type == 'protein_coding'], res_annot[gene_type != 'protein_coding']) # Write all the other genes in another column +res_annot[, aux := paste(contig, position, sep = "-")] rvar <- unique(res_annot[, .(aux, gene_name)]) rvar[, N := 1:.N, by = aux] r_other <- rvar[N > 1, .(other_names = paste(gene_name, collapse = ',')), by = aux] -res <- merge(res_annot, r_other, by = 'aux', sort = FALSE, all.x = TRUE) -res <- res[, .SD[1], by = .(aux, ID)] -res <- rbind(res, rmae[! aux %in% res$aux], fill = TRUE) - -# Change rare if variant frequency is higher than threshold -res[, Nvar := .N, by = aux] -varsCohort <- params$maxVarFreqCohort * length(snakemake@input$mae_res) -res[Nvar > varsCohort, rare := FALSE] - -res[, aux := NULL] +res <- merge(res_annot, r_other, by = 'aux', sort = FALSE, all.x = TRUE) +res[, c('aux') := NULL] # Bring gene_name column front res <- cbind(res[, .(gene_name)], res[, -"gene_name"]) -setorder(res, contig, position) #' #' Number of samples: `r uniqueN(res$ID)` @@ -86,8 +79,8 @@ setorder(res, contig, position) #' Number of genes: `r uniqueN(res$gene_name)` # Subset for significant events -allelicRatioCutoff <- params$allelicRatioCutoff -res[, MAE := padj <= params$padjCutoff & +allelicRatioCutoff <- snakemake@params$allelicRatioCutoff +res[, MAE := padj <= snakemake@params$padjCutoff & (altRatio >= allelicRatioCutoff | altRatio <= (1-allelicRatioCutoff))] res[, MAE_ALT := MAE == TRUE & altRatio >= allelicRatioCutoff] diff --git a/drop/modules/mae-pipeline/Scripts/MAE/deseq_mae.R b/drop/modules/mae-pipeline/MAE/deseq_mae.R similarity index 75% rename from drop/modules/mae-pipeline/Scripts/MAE/deseq_mae.R rename to drop/modules/mae-pipeline/MAE/deseq_mae.R index 8a4e8a12..dc4a1f20 100644 --- a/drop/modules/mae-pipeline/Scripts/MAE/deseq_mae.R +++ b/drop/modules/mae-pipeline/MAE/deseq_mae.R @@ -1,18 +1,17 @@ #'--- #' title: Get MAE results -#' author: vyepez +#' author: vyepez, mumichae #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "deseq" / "{vcf}--{rna}.Rds")`' #' input: -#' - mae_counts: '`sm parser.getProcDataDir() + "/mae/allelic_counts/{vcf}--{rna}.csv.gz" `' +#' - mae_counts: '`sm cfg.getProcessedDataDir() + "/mae/allelic_counts/{vcf}--{rna}.csv.gz" `' #' output: -#' - mae_res: '`sm parser.getProcResultsDir() + "/mae/samples/{vcf}--{rna}_res.Rds"`' +#' - mae_res: '`sm cfg.getProcessedResultsDir() + "/mae/samples/{vcf}--{rna}_res.Rds"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir,'deseq_mae.snakemake')) -# snakemake <- readRDS('.drop/tmp/MAE/deseq_mae.snakemake') +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(stringr) diff --git a/drop/modules/mae-pipeline/Scripts/MAE/filterSNVs.sh b/drop/modules/mae-pipeline/MAE/filterSNVs.sh similarity index 100% rename from drop/modules/mae-pipeline/Scripts/MAE/filterSNVs.sh rename to drop/modules/mae-pipeline/MAE/filterSNVs.sh diff --git a/drop/modules/mae-pipeline/Scripts/MAE/gene_name_mapping.R b/drop/modules/mae-pipeline/MAE/gene_name_mapping.R similarity index 63% rename from drop/modules/mae-pipeline/Scripts/MAE/gene_name_mapping.R rename to drop/modules/mae-pipeline/MAE/gene_name_mapping.R index 450f9469..d6556eec 100644 --- a/drop/modules/mae-pipeline/Scripts/MAE/gene_name_mapping.R +++ b/drop/modules/mae-pipeline/MAE/gene_name_mapping.R @@ -2,17 +2,16 @@ #' title: Create GeneID-GeneName mapping #' author: mumichae #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "{annotation}.Rds")`' #' input: -#' - gtf: '`sm lambda wildcards: parser.getGeneAnnotationFile(wildcards.annotation) `' +#' - gtf: '`sm lambda w: cfg.getGeneAnnotationFile(w.annotation) `' #' output: -#' - gene_name_mapping: '`sm parser.getProcDataDir() + "/mae/gene_name_mapping_{annotation}.tsv"`' +#' - gene_name_mapping: '`sm cfg.getProcessedDataDir() + "/mae/gene_name_mapping_{annotation}.tsv"`' #' type: script #'--- -saveRDS(snakemake, paste0(snakemake@config$tmpdir, "/MAE/gene_map.snakemake")) -# snakemake <- readRDS(paste0(snakemake@config$tmpdir, "/MAE/gene_map.snakemake")) +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(rtracklayer) diff --git a/drop/modules/mae-pipeline/Scripts/QC/ASEReadCounter.sh b/drop/modules/mae-pipeline/QC/ASEReadCounter.sh similarity index 100% rename from drop/modules/mae-pipeline/Scripts/QC/ASEReadCounter.sh rename to drop/modules/mae-pipeline/QC/ASEReadCounter.sh diff --git a/drop/modules/mae-pipeline/Scripts/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R similarity index 91% rename from drop/modules/mae-pipeline/Scripts/QC/DNA_RNA_matrix_plot.R rename to drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R index 3129b613..df039701 100644 --- a/drop/modules/mae-pipeline/Scripts/QC/DNA_RNA_matrix_plot.R +++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R @@ -2,8 +2,10 @@ #' title: DNA-RNA matching matrix #' author: vyepez #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "{dataset}" / "QC_matrix_plot.Rds")`' #' input: -#' - mat_qc: '`sm parser.getProcResultsDir() + +#' - mat_qc: '`sm cfg.getProcessedResultsDir() + #' "/mae/{dataset}/dna_rna_qc_matrix.Rds"`' #' output: #' - wBhtml: '`sm config["htmlOutputPath"] + "/QC/{dataset}.html"`' @@ -11,8 +13,7 @@ #'--- #+echo=F -saveRDS(snakemake, paste0(snakemake@config$tmpdir, "/MAE/qc_hist.snakemake")) -# snakemake <- readRDS(".drop/tmp/MAE/qc_hist.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(reshape2) diff --git a/drop/modules/mae-pipeline/Scripts/QC/Datasets.R b/drop/modules/mae-pipeline/QC/Datasets.R similarity index 65% rename from drop/modules/mae-pipeline/Scripts/QC/Datasets.R rename to drop/modules/mae-pipeline/QC/Datasets.R index e7931cff..3dbf03dd 100644 --- a/drop/modules/mae-pipeline/Scripts/QC/Datasets.R +++ b/drop/modules/mae-pipeline/QC/Datasets.R @@ -2,17 +2,16 @@ #' title: VCF-BAM Matching Analysis over All Datasets #' author: #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "QC_overview.Rds")`' #' input: #' - html: '`sm expand(config["htmlOutputPath"] + "/QC/{dataset}.html", -#' dataset=config["mae"]["qcGroups"])`' +#' dataset=cfg.MAE.qcGroups)`' #' output: #' html_document #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "overview_qc.snakemake") ) -# snakemake <- readRDS(".drop/tmp/MAE/overview_qc.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) # Obtain the datasets datasets <- snakemake@config$mae$qcGroups diff --git a/drop/modules/mae-pipeline/Scripts/QC/create_matrix_dna_rna_cor.R b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R similarity index 80% rename from drop/modules/mae-pipeline/Scripts/QC/create_matrix_dna_rna_cor.R rename to drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R index 08c1a6d9..b4362f15 100644 --- a/drop/modules/mae-pipeline/Scripts/QC/create_matrix_dna_rna_cor.R +++ b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R @@ -2,23 +2,21 @@ #' title: Create QC matrix #' author: vyepez #' wb: -#' py: +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "{dataset}" / "QC_matrix.Rds")`' #' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' -#' - qcIdsRNA: '`sm parser.all_rna_ids`' +#' - rnaIds: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="RNA")`' #' input: -#' - mae_res: '`sm lambda wildcards: expand(parser.getProcDataDir() + -#' "/mae/RNA_GT/{rna}.Rds", -#' rna=parser.getRNAByGroup({wildcards.dataset}))`' +#' - mae_res: '`sm lambda w: expand(cfg.getProcessedDataDir() + +#' "/mae/RNA_GT/{rna}.Rds", rna=sa.getIDsByGroup(w.dataset, assay="RNA"))`' #' output: -#' - mat_qc: '`sm parser.getProcResultsDir() + +#' - mat_qc: '`sm cfg.getProcessedResultsDir() + #' "/mae/{dataset}/dna_rna_qc_matrix.Rds"`' #' threads: 20 #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "qc_matrix.snakemake")) -# snakemake <- readRDS(".drop/tmp/MAE/qc_matrix.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(VariantAnnotation) @@ -36,7 +34,7 @@ gr_test <- readVcf(snakemake@config$mae$qcVcf) %>% granges() mcols(gr_test)$GT <- "0/0" # Obtain the rna and vcf files -rna_samples <- snakemake@params$qcIdsRNA[[snakemake@wildcards$dataset]] +rna_samples <- snakemake@params$rnaIds mae_res <- snakemake@input$mae_res vcf_cols <- sa[RNA_ID %in% rna_samples, .(DNA_ID, DNA_VCF_FILE)] %>% unique %>% na.omit() diff --git a/drop/modules/mae-pipeline/Scripts/QC/deseq_qc.R b/drop/modules/mae-pipeline/QC/deseq_qc.R similarity index 67% rename from drop/modules/mae-pipeline/Scripts/QC/deseq_qc.R rename to drop/modules/mae-pipeline/QC/deseq_qc.R index d557a3a2..c161c58a 100644 --- a/drop/modules/mae-pipeline/Scripts/QC/deseq_qc.R +++ b/drop/modules/mae-pipeline/QC/deseq_qc.R @@ -2,17 +2,16 @@ #' title: MAE test on qc variants #' author: vyepez #' wb: -#' params: -#' - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`' +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "deseq" / "qc_{rna}.Rds")`' #' input: -#' - qc_counts: '`sm parser.getProcDataDir() + "/mae/allelic_counts/qc_{rna}.csv.gz" `' +#' - qc_counts: '`sm cfg.getProcessedDataDir() + "/mae/allelic_counts/qc_{rna}.csv.gz" `' #' output: -#' - mae_res: '`sm parser.getProcDataDir() + "/mae/RNA_GT/{rna}.Rds"`' +#' - mae_res: '`sm cfg.getProcessedDataDir() + "/mae/RNA_GT/{rna}.Rds"`' #' type: script #'--- -saveRDS(snakemake, file.path(snakemake@params$tmpdir,'deseq_qc.snakemake')) -# snakemake <- readRDS('.drop/tmp/MAE/deseq_qc.snakemake') +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(GenomicRanges) diff --git a/drop/modules/mae-pipeline/Snakefile b/drop/modules/mae-pipeline/Snakefile index ccc4a821..67f8d30d 100644 --- a/drop/modules/mae-pipeline/Snakefile +++ b/drop/modules/mae-pipeline/Snakefile @@ -1,15 +1,20 @@ -### SNAKEFILE MONOALLELIC EXPRESSION -import os -import drop -import pathlib +MAE_WORKDIR = cfg.MAE.getWorkdir(str_=False) -METHOD = 'MAE' -SCRIPT_ROOT = drop.getMethodPath(METHOD, type_='workdir', str_=False) -CONF_FILE = drop.getConfFile() +MAE_index_name = "mae-pipeline" +MAE_index_input, MAE_index_output, MAE_graph_file, _ = createIndexRule( + scriptsPath=str(MAE_WORKDIR), + index_name=MAE_index_name +) + +rule mae: + input: MAE_index_input, MAE_graph_file + output: MAE_index_output + run: ci(str(MAE_WORKDIR), MAE_index_name) + +rule mae_dependency: + output: MAE_graph_file + shell: "snakemake --rulegraph {MAE_index_output} | dot -Tsvg -Grankdir=TB > {output}" -parser = drop.config(config, METHOD) -config = parser.parse() -include: config['wBuildPath'] + "/wBuild.snakefile" ###### FUNCTIONS ###### def fasta_dict(fasta_file): @@ -19,39 +24,28 @@ def getVcf(rna_id, vcf_id="qc"): if vcf_id == "qc": return config["mae"]["qcVcf"] else: - return parser.getProcDataDir() + f"/mae/snvs/{vcf_id}--{rna_id}.vcf.gz" + return cfg.getProcessedDataDir() + f"/mae/snvs/{vcf_id}--{rna_id}.vcf.gz" def getQC(format): if format == "UCSC": return config["mae"]["qcVcf"] elif format == "NCBI": - return parser.getProcDataDir() + "/mae/qc_vcf_ncbi.vcf.gz" + return cfg.processedDataDir / "mae" / "qc_vcf_ncbi.vcf.gz" else: raise ValueError(f"getQC: {format} is an invalid chromosome format") -def getChrMap(SCRIPT_ROOT, conversion): +def getChrMap(WORKDIR , conversion): if conversion == 'ncbi2ucsc': - return SCRIPT_ROOT/"resource"/"chr_NCBI_UCSC.txt" + return WORKDIR /"resource"/"chr_NCBI_UCSC.txt" elif conversion == 'ucsc2ncbi': - return SCRIPT_ROOT/"resource"/"chr_UCSC_NCBI.txt" + return WORKDIR /"resource"/"chr_UCSC_NCBI.txt" else: raise ValueError(f"getChrMap: {conversion} is an invalid conversion option") -def getScript(type, name): - return SCRIPT_ROOT/"Scripts"/type/name ###### -rule all: - input: - rules.Index.output, - config["htmlOutputPath"] + "/mae_readme.html", - rules.Scripts_MAE_Datasets_R.output, - rules.Scripts_QC_Datasets_R.output - output: touch(drop.getMethodPath(METHOD, type_='final_file')) - rule sampleQC: - input: rules.Scripts_QC_Datasets_R.output - output: touch(drop.getTmpDir() + "/sampleQC.done") + input: cfg.getHtmlFromScript(MAE_WORKDIR / "QC" / "Datasets.R") rule create_dict: input: config['mae']['genome'] @@ -59,18 +53,16 @@ rule create_dict: shell: "gatk CreateSequenceDictionary --REFERENCE {input[0]}" ## MAE -rule create_SNVs: +rule mae_createSNVs: input: - ncbi2ucsc = getChrMap(SCRIPT_ROOT, "ncbi2ucsc"), - ucsc2ncbi = getChrMap(SCRIPT_ROOT, "ucsc2ncbi"), - vcf_file = lambda wildcards: parser.getFilePath(sampleId=wildcards.vcf, - file_type='DNA_VCF_FILE'), - bam_file = lambda wildcards: parser.getFilePath(sampleId=wildcards.rna, - file_type='RNA_BAM_FILE'), - script = getScript("MAE", "filterSNVs.sh") + ncbi2ucsc = getChrMap(MAE_WORKDIR, "ncbi2ucsc"), + ucsc2ncbi = getChrMap(MAE_WORKDIR, "ucsc2ncbi"), + vcf_file = lambda w: sa.getFilePath(w.vcf, 'DNA_VCF_FILE'), + bam_file = lambda w: sa.getFilePath(w.rna, 'RNA_BAM_FILE'), + script =MAE_WORKDIR / "MAE" / "filterSNVs.sh" output: - snvs_filename=parser.getProcDataDir() + "/mae/snvs/{vcf}--{rna}.vcf.gz", - snvs_index=parser.getProcDataDir() + "/mae/snvs/{vcf}--{rna}.vcf.gz.tbi" + snvs_filename = cfg.processedDataDir / "mae" / "snvs" / "{vcf}--{rna}.vcf.gz", + snvs_index = cfg.processedDataDir / "mae" / "snvs" / "{vcf}--{rna}.vcf.gz.tbi" shell: """ {input.script} {input.ncbi2ucsc} {input.ucsc2ncbi} {input.vcf_file} \ @@ -78,18 +70,17 @@ rule create_SNVs: {config[tools][bcftoolsCmd]} {config[tools][samtoolsCmd]} """ -rule allelic_counts: +rule mae_allelicCounts: input: - ncbi2ucsc = getChrMap(SCRIPT_ROOT, "ncbi2ucsc"), - ucsc2ncbi = getChrMap(SCRIPT_ROOT, "ucsc2ncbi"), - vcf_file = lambda wildcards: getVcf(wildcards.rna, wildcards.vcf), - bam_file = lambda wildcards: parser.getFilePath(sampleId=wildcards.rna, - file_type='RNA_BAM_FILE'), - fasta = config['mae']['genome'], - dict = fasta_dict(config['mae']['genome']), - script = getScript("MAE", "ASEReadCounter.sh") + ncbi2ucsc = getChrMap(MAE_WORKDIR, "ncbi2ucsc"), + ucsc2ncbi = getChrMap(MAE_WORKDIR, "ucsc2ncbi"), + vcf_file = lambda w: getVcf(w.rna, w.vcf), + bam_file = lambda w: sa.getFilePath(w.rna, 'RNA_BAM_FILE'), + fasta = config['mae']['genome'], + dict = fasta_dict(config['mae']['genome']), + script =MAE_WORKDIR / "MAE" / "ASEReadCounter.sh" output: - counted = parser.getProcDataDir() + "/mae/allelic_counts/{vcf}--{rna}.csv.gz" + counted = cfg.processedDataDir / "mae" / "allelic_counts" / "{vcf}--{rna}.csv.gz" shell: """ {input.script} {input.ncbi2ucsc} {input.ucsc2ncbi} \ @@ -98,10 +89,10 @@ rule allelic_counts: {config[tools][bcftoolsCmd]} """ ## QC -rule renameChrQC: +rule mae_renameChrQC: input: - ucsc2ncbi = getChrMap(SCRIPT_ROOT, "ucsc2ncbi"), - ncbi_vcf = getQC(format="UCSC") + ucsc2ncbi = getChrMap(MAE_WORKDIR, "ucsc2ncbi"), + ncbi_vcf = getQC(format="UCSC") output: ncbi_vcf = getQC(format="NCBI") shell: @@ -113,20 +104,19 @@ rule renameChrQC: $bcftools index -t {output.ncbi_vcf} """ -rule allelic_counts_qc: +rule mae_allelicCountsQC: input: - ncbi2ucsc = getChrMap(SCRIPT_ROOT, "ncbi2ucsc"), - ucsc2ncbi = getChrMap(SCRIPT_ROOT, "ucsc2ncbi"), - vcf_file_ucsc = getQC(format="UCSC"), - vcf_file_ncbi = getQC(format="NCBI"), - bam_file = lambda wildcards: parser.getFilePath(sampleId=wildcards.rna, - file_type='RNA_BAM_FILE'), - fasta = config['mae']['genome'], - dict = fasta_dict(config['mae']['genome']), - script_qc = getScript("QC", "ASEReadCounter.sh"), - script_mae = getScript("MAE", "ASEReadCounter.sh") + ncbi2ucsc = getChrMap(MAE_WORKDIR, "ncbi2ucsc"), + ucsc2ncbi = getChrMap(MAE_WORKDIR, "ucsc2ncbi"), + vcf_file_ucsc = getQC(format="UCSC"), + vcf_file_ncbi = getQC(format="NCBI"), + bam_file = lambda w: sa.getFilePath(w.rna, 'RNA_BAM_FILE'), + fasta = config['mae']['genome'], + dict = fasta_dict(config['mae']['genome']), + script_qc =MAE_WORKDIR / "QC" / "ASEReadCounter.sh", + script_mae =MAE_WORKDIR / "MAE" / "ASEReadCounter.sh" output: - counted = parser.getProcDataDir() + "/mae/allelic_counts/qc_{rna}.csv.gz" + counted = cfg.processedDataDir / "mae" / "allelic_counts" / "qc_{rna}.csv.gz" shell: """ {input.script_qc} {input.ncbi2ucsc} {input.ucsc2ncbi} \ @@ -135,24 +125,3 @@ rule allelic_counts_qc: {output.counted} {config[tools][bcftoolsCmd]} \ {config[tools][samtoolsCmd]} {input.script_mae} """ - -#### -rulegraph_filename = f'{config["htmlOutputPath"]}/{METHOD}_rulegraph' -rule produce_rulegraph: - input: - expand(rulegraph_filename + ".{fmt}", fmt=["svg", "png"]) - -rule create_graph: - output: - svg = f"{rulegraph_filename}.svg", - png = f"{rulegraph_filename}.png" - shell: - """ - snakemake --configfile {CONF_FILE} --rulegraph | dot -Tsvg > {output.svg} - snakemake --configfile {CONF_FILE} --rulegraph | dot -Tpng > {output.png} - """ - -rule unlock: - output: touch(drop.getMethodPath(METHOD, type_="unlock")) - shell: "snakemake --unlock --configfile {CONF_FILE}" - diff --git a/drop/setupDrop.py b/drop/setupDrop.py index 423e57d5..04407c6c 100644 --- a/drop/setupDrop.py +++ b/drop/setupDrop.py @@ -1,29 +1,32 @@ import drop import subprocess -import pathlib -import wbuild -import re +from pathlib import Path from snakemake.logging import logger -def setupDrop(config): - installRPackages() - - config['wBuildPath'] = str(pathlib.Path(wbuild.__file__).parent) - parser = drop.config(config) +def setupPaths(projectRoot): + # repository paths + repoRoot = Path(drop.__file__).parent + repoPaths = { + "template": repoRoot / "template", + "Scripts": repoRoot / "template" / "Scripts", + "modules": repoRoot / "modules" + } - tmp_dir, config_file, final_files = drop.setupTempFiles(parser.config) - parser.setKey(parser.config, sub=None, key="tmpdir", default=tmp_dir) - parser.setKey(parser.config, sub=None, key="configFile", default=config_file) - parser.setKey(parser.config, sub=None, key="finalFiles", default=final_files) - - return parser, parser.parse() + # project paths + projectPaths = { + "projectDir": projectRoot, + "Scripts": projectRoot / "Scripts", + "dropDir": projectRoot / ".drop", + "tmpDir": projectRoot / ".drop" / "tmp" + } + + return repoPaths, projectPaths def installRPackages(): logger.info("check for missing R packages") - script = pathlib.Path(drop.__file__).parent / "installRPackages.R" - requirements = pathlib.Path(drop.__file__).parent / 'requirementsR.txt' + script = Path(drop.__file__).parent / "installRPackages.R" + requirements = Path(drop.__file__).parent / 'requirementsR.txt' response = subprocess.run(["Rscript", script, requirements], stderr=subprocess.STDOUT) response.check_returncode() - diff --git a/drop/submodules.py b/drop/submodules.py deleted file mode 100644 index c29518ed..00000000 --- a/drop/submodules.py +++ /dev/null @@ -1,78 +0,0 @@ -import pathlib -import yaml -from snakemake.logging import logger - -METHODS = {'AE': 'aberrant-expression-pipeline', - 'AS': 'aberrant-splicing-pipeline', - 'MAE': 'mae-pipeline'} -ROOT = pathlib.Path.cwd() / '.drop' -MODULES = ROOT / 'modules' -TMP_DIR = ROOT /'tmp' - -def getTmpDir(str_=True): - p = TMP_DIR - if str_: - p = str(p) - return p - -def getConfFile(str_=True): - p = TMP_DIR / 'config.yaml' - if str_: - p = str(p) - return p - -def setupTempFiles(config): - # create temporary directory - if not TMP_DIR.exists(): - logger.info(f"create temporary files directory {TMP_DIR}") - TMP_DIR.mkdir(parents=True) - - # save config file - CONF_FILE = getConfFile(config) - with open(CONF_FILE, 'w') as f: - yaml.safe_dump(config.copy(), f) - - done_files = {} - for method in METHODS.keys(): - - # final rule output file - done_file = getMethodPath(method, type_='final_file', str_=False) - done_files[method] = str(done_file) - - # create module tmp Dir if missing - tmp_dir = getMethodPath(method, type_='tmp_dir', str_=False) - if not tmp_dir.exists(): - tmp_dir.mkdir(parents=True) - - return TMP_DIR, CONF_FILE, done_files - -def getMethodPath(method, type_, str_=True): - """ - type_: name of link flag for snakemake subworkflow - workdir: directory of the submodule - snakefile: path to Snakefile - config_file: path to config file copy - final_file: path to empty file used as last output of workflow - unlock: path to empty file for unlocking subworkflow - """ - if method not in METHODS.keys(): - raise ValueError(f'{method} is not a valid method. Must be one of {METHODS.keys()}') - - if type_ == 'workdir': - p = MODULES / METHODS[method] - elif type_ == 'snakefile': - p = MODULES / METHODS[method] / 'Snakefile' - elif type_ == 'tmp_dir': - p = TMP_DIR / method - elif type_ == 'final_file': - p = TMP_DIR / method / 'done' - elif type_ == 'unlock': - p = TMP_DIR / method / 'unlock' - else: - raise ValueError(f'invalid type_: "{type_}"') - - if str_: - p = str(p) - return p - - diff --git a/drop/template/Scripts/AberrantExpressionAnalysis/Overview.R b/drop/template/Scripts/AberrantExpression/Overview.R similarity index 73% rename from drop/template/Scripts/AberrantExpressionAnalysis/Overview.R rename to drop/template/Scripts/AberrantExpression/Overview.R index b195c0e3..85daf601 100644 --- a/drop/template/Scripts/AberrantExpressionAnalysis/Overview.R +++ b/drop/template/Scripts/AberrantExpression/Overview.R @@ -2,24 +2,20 @@ #' title: Aberrant Expression #' author: #' wb: -#' py: -#' - | -#' annotations = list(config["geneAnnotation"].keys()) -#' datasets = config["aberrantExpression"]["groups"] +#' log: +#' - snakemake: '`sm str(tmp_dir / "AE" / "Overview.Rds")`' #' params: -#' - annotations: '`sm annotations`' -#' - datasets: '`sm datasets`' -#' - tmpdir: '`sm drop.getTmpDir()`' +#' - annotations: '`sm cfg.getGeneVersions()`' +#' - datasets: '`sm cfg.AE.groups`' #' - htmlDir: '`sm config["htmlOutputPath"] + "/AberrantExpression"`' -#' - odsFiles: '`sm expand(parser.getProcResultsDir() + +#' input: +#' - odsFiles: '`sm expand(cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/{dataset}/ods.Rds", -#' annotation=annotations, dataset=datasets)`' -#' - resultTables: '`sm expand(parser.getProcResultsDir() + +#' annotation=cfg.getGeneVersions(), dataset=cfg.AE.groups)`' +#' - resultTables: '`sm expand(cfg.getProcessedResultsDir() + #' "/aberrant_expression/{annotation}/outrider/" + #' "{dataset}/OUTRIDER_results.tsv", -#' annotation=annotations, dataset=datasets)`' -#' input: -#' - AE: '`sm drop.getTmpDir() + "/AE.done"`' +#' annotation=cfg.getGeneVersions(), dataset=cfg.AE.groups)`' #' output: #' html_document: #' code_folding: show @@ -27,9 +23,7 @@ #'--- #+ echo=F -saveRDS(snakemake, file.path(snakemake@params$tmpdir, - "AberrantExpression_OUTRIDER.snakemake")) -# snakemake <- readRDS(".drop/tmp/AberrantExpression_OUTRIDER.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(OUTRIDER) @@ -82,17 +76,17 @@ outrider_links <- sapply(annotations, get_html_path, #' ## Files #' ### OUTRIDER datasets (ods) #' -#' `r paste('* ', snakemake@params$odsFiles, collapse = '\n')` +#' `r paste('* ', snakemake@input$odsFiles, collapse = '\n')` #' #' ### Results tables #' -#' `r paste('* ', snakemake@params$resultTables, collapse = '\n')` +#' `r paste('* ', snakemake@input$resultTables, collapse = '\n')` #' #' ## Analyze Individual Results # Read the first ods object and results table -ods <- readRDS(snakemake@params$odsFiles[[1]]) -res <- fread(snakemake@params$resultTables[[1]]) +ods <- readRDS(snakemake@input$odsFiles[[1]]) +res <- fread(snakemake@input$resultTables[[1]]) #' Display the results table of the first dataset #+ echo=FALSE diff --git a/drop/template/Scripts/AberrantSplicingAnalysis/Overview.R b/drop/template/Scripts/AberrantSplicing/Overview.R similarity index 73% rename from drop/template/Scripts/AberrantSplicingAnalysis/Overview.R rename to drop/template/Scripts/AberrantSplicing/Overview.R index 65b93549..36cfaf6b 100644 --- a/drop/template/Scripts/AberrantSplicingAnalysis/Overview.R +++ b/drop/template/Scripts/AberrantSplicing/Overview.R @@ -2,19 +2,15 @@ #' title: Aberrant Splicing #' author: #' wb: -#' py: -#' - | -#' datasets = config['aberrantSplicing']['groups'] -#' params: -#' - tmpdir: '`sm drop.getTmpDir()`' -#' - fds_files: '`sm expand(parser.getProcDataDir() + +#' log: +#' - snakemake: '`sm str(tmp_dir / "AS" / "Overview.Rds")`' +#' input: +#' - fds_files: '`sm expand(cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/" + -#' "fds-object.RDS", dataset=datasets)`' -#' - result_tables: '`sm expand(parser.getProcDataDir() + +#' "fds-object.RDS", dataset=cfg.AS.groups)`' +#' - result_tables: '`sm expand(cfg.getProcessedDataDir() + #' "/aberrant_splicing/results/{dataset}_results_per_junction.tsv", -#' dataset=datasets)`' -#' input: -#' - AS: '`sm drop.getTmpDir() + "/AS.done"`' +#' dataset=cfg.AS.groups)`' #' output: #' html_document: #' code_folding: show @@ -22,9 +18,7 @@ #'--- #+ echo=F -saveRDS(snakemake, file.path(snakemake@params$tmpdir, - "AberrantSplicing_FRASER.snakemake")) -# snakemake <- readRDS(".drop/tmp/AberrantSplicing_FRASER.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(FRASER) @@ -71,16 +65,16 @@ fraser_links <- get_html_path(datasets = datasets, #' ## Files #' ### FRASER datasets (fds) -#' `r paste('* ', snakemake@params$fds_files, collapse = '\n')` +#' `r paste('* ', snakemake@input$fds_files, collapse = '\n')` #' #' ### Results tables -#' `r paste('* ', snakemake@params$result_tables, collapse = '\n')` +#' `r paste('* ', snakemake@input$result_tables, collapse = '\n')` #' #' ## Analyze individual results # Read the first fds object and results table -fds <- loadFraserDataSet(file = snakemake@params$fds_files[[1]]) -res <- fread(snakemake@params$result_tables[[1]]) +fds <- loadFraserDataSet(file = snakemake@input$fds_files[[1]]) +res <- fread(snakemake@input$result_tables[[1]]) #' Display the results table of the first dataset #+ echo=FALSE diff --git a/drop/template/Scripts/MAEAnalysis/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R similarity index 53% rename from drop/template/Scripts/MAEAnalysis/Overview.R rename to drop/template/Scripts/MonoallelicExpression/Overview.R index d1e51bb9..53047afc 100644 --- a/drop/template/Scripts/MAEAnalysis/Overview.R +++ b/drop/template/Scripts/MonoallelicExpression/Overview.R @@ -2,24 +2,20 @@ #' title: Monoallelic Expression #' author: #' wb: -#' params: -#' - tmpdir: '`sm drop.getTmpDir()`' -#' - mae_ids: '`sm parser.getMaeAll()`' -#' - allelic_counts: '`sm expand(parser.getProcDataDir() + +#' log: +#' - snakemake: '`sm str(tmp_dir / "MAE" / "Overview.Rds")`' +#' input: +#' - allelic_counts: '`sm expand(cfg.getProcessedDataDir() + #' "/mae/allelic_counts/{mae_id}.csv.gz", -#' mae_id=parser.getMaeAll())`' -#' - results_obj: '`sm expand(parser.getProcResultsDir() + +#' mae_id=cfg.MAE.getMaeAll())`' +#' - results_obj: '`sm expand(cfg.getProcessedResultsDir() + #' "/mae/samples/{mae_id}_res.Rds", -#' mae_id=parser.getMaeAll())`' -#' - results_tables: '`sm expand(parser.getProcResultsDir() + +#' mae_id=cfg.MAE.getMaeAll())`' +#' - results_tables: '`sm expand(cfg.getProcessedResultsDir() + #' "/mae/{dataset}/MAE_results_{annotation}.tsv", -#' dataset=config["mae"]["groups"], -#' annotation=list(config["geneAnnotation"].keys()))`' -#' - html: '`sm config["htmlOutputPath"] + "/Scripts_MAE_Results_Overview.html"`' -#' - qc_matrix: '`sm expand(parser.getProcResultsDir() + "/mae/{qc_group}/" + -#' "dna_rna_qc_matrix.Rds", qc_group=config["mae"]["qcGroups"])`' -#' input: -#' - MAE: '`sm drop.getTmpDir() + "/MAE.done"`' +#' dataset=cfg.MAE.groups, annotation=cfg.getGeneVersions())`' +#' - qc_matrix: '`sm expand(cfg.getProcessedResultsDir() + "/mae/{qc_group}/" + +#' "dna_rna_qc_matrix.Rds", qc_group=cfg.MAE.qcGroups)`' #' output: #' html_document: #' code_folding: hide @@ -27,8 +23,7 @@ #'--- #+ echo=F -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "MAE_analysis.snakemake")) -# snakemake <- readRDS(".drop/tmp/MAE_analysis.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(tMAE) @@ -43,7 +38,7 @@ suppressPackageStartupMessages({ #' Located in `r file.path(snakemake@config$root, 'processed_results/mae/samples/')` #' #' ### Aggregated results tables of each group -#' `r paste('* ', snakemake@params$results_tables, collapse = '\n')` +#' `r paste('* ', snakemake@input$results_tables, collapse = '\n')` #' #' ### MAE Pipeline Output #' [MAE Pipeline Output](`r "./Scripts_MAE_Datasets.html"`) @@ -51,7 +46,7 @@ suppressPackageStartupMessages({ #' ## Analyze Individual Results # Read the first results table -res_sample <- readRDS(snakemake@params$results_obj[[1]]) +res_sample <- readRDS(snakemake@input$results_obj[[1]]) #+echo=F if(is.null(res_sample$rare)){ @@ -73,5 +68,5 @@ g2 #' [QC Overview](`r "./Scripts_QC_Datasets.html"`) #' #' ### DNA-RNA matrix: -#' `r paste('* ', snakemake@params$qc_matrix, collapse='\n')` +#' `r paste('* ', snakemake@input$qc_matrix, collapse='\n')` diff --git a/drop/template/Scripts/Pipeline/SampleAnnotation.R b/drop/template/Scripts/Pipeline/SampleAnnotation.R index 4e0271b8..4fcd72b2 100644 --- a/drop/template/Scripts/Pipeline/SampleAnnotation.R +++ b/drop/template/Scripts/Pipeline/SampleAnnotation.R @@ -2,15 +2,16 @@ #' title: Sample Annotation Overview #' author: #' wb: +#' log: +#' - snakemake: '`sm str(tmp_dir / "SampleAnnotation.Rds")`' #' params: -#' - tmpdir: '`sm drop.getTmpDir()`' -#' - export_dir: '`sm parser.getProcResultsDir() + "/exported_counts"`' -#' - groups: '`sm parser.getExportGroups()`' +#' - export_dir: '`sm cfg.getProcessedResultsDir() + "/exported_counts"`' +#' - groups: '`sm cfg.exportCounts.getExportGroups()`' #' input: #' - sampleAnnotation: '`sm config["sampleAnnotation"]`' #' output: -#' - export: '`sm touch(parser.getProcResultsDir() + "/exported_counts/sample_anno.done")`' -#' - done: '`sm touch(parser.getProcDataDir() + "/sample_anno/sample_anno.done")`' +#' - export: '`sm touch(cfg.getProcessedResultsDir() + "/exported_counts/sample_anno.done")`' +#' - done: '`sm touch(cfg.getProcessedDataDir() + "/sample_anno/sample_anno.done")`' #' output: #' html_document: #' code_folding: hide @@ -18,8 +19,7 @@ #'--- #+echo=F -saveRDS(snakemake, file.path(snakemake@params$tmpdir, "sample_anno_overview.snakemake")) -# snakemake <- readRDS(".drop/tmp/sample_anno_overview.snakemake") +saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ library(data.table) diff --git a/drop/template/Snakefile b/drop/template/Snakefile index abd1038a..b119ed9f 100644 --- a/drop/template/Snakefile +++ b/drop/template/Snakefile @@ -1,100 +1,53 @@ -import os -import re +from pathlib import Path import drop +import wbuild +from wbuild.createIndex import createIndexRule, ci +drop.installRPackages() configfile: "config.yaml" -parser, config = drop.drop(config) -include: config["wBuildPath"] + "/wBuild.snakefile" +projectDir = Path.cwd().resolve() +_, projectPaths = drop.setupPaths(projectDir) +tmp_dir = projectPaths["tmpDir"] -# aberrant expression -subworkflow AE: - workdir: drop.getMethodPath("AE", "workdir") - snakefile: drop.getMethodPath("AE", "snakefile") - configfile: drop.getConfFile() +cfg = drop.config.DropConfig(wbuild.utils.Config()) +sa = cfg.sampleAnnotation +config = cfg.config_dict # legacy -# aberrant splicing -subworkflow AS: - workdir: drop.getMethodPath("AS", "workdir") - snakefile: drop.getMethodPath("AS", "snakefile") - configfile: drop.getConfFile() - -# monoallelic expression -subworkflow MAE: - workdir: drop.getMethodPath("MAE", "workdir") - snakefile: drop.getMethodPath("MAE", "snakefile") - configfile: drop.getConfFile() +include: drop.utils.getWBuildSnakefile() +include: cfg.AE.getWorkdir() + "/Snakefile" +include: cfg.AS.getWorkdir() + "/Snakefile" +include: cfg.MAE.getWorkdir() + "/Snakefile" rule all: input: - AE(config["finalFiles"]["AE"]), - AS(config["finalFiles"]["AS"]), - MAE(config["finalFiles"]["MAE"]), + rules.aberrantExpression.output, + rules.aberrantSplicing.output, + rules.mae.output, rules.Index.output, - config["htmlOutputPath"] + "/readme.html", - config["tmpdir"] + "/rulegraphs.done" + dep_graph = cfg.get("htmlOutputPath") + "/dependency.done" + shell: + """ + rm {input.dep_graph} + """ rule sampleAnnotation: - input: parser.getProcDataDir() + "/sample_anno/sample_anno.done" - -rule getIndexNames: - input: - AE(config["finalFiles"]["AE"]), - AS(config["finalFiles"]["AS"]), - MAE(config["finalFiles"]["MAE"]) - output: - indexFile = config["htmlOutputPath"] + "/indexNames.txt" - run: - indexList = [x for x in os.listdir(config["htmlOutputPath"]) if re.search("_index.html$",x)] - with open(output.indexFile, 'w') as file_handler: - for item in indexList: - file_handler.write(f"{item}\n") - -rule aberrantExpression: - input: AE(config["finalFiles"]["AE"]) - output: touch(drop.getTmpDir() + "/AE.done") - -rule aberrantSplicing: - input: - AS(config["finalFiles"]["AS"]) - output: touch(drop.getTmpDir() + "/AS.done") - -rule mae: - input: MAE(config["finalFiles"]["MAE"]) - output: touch(drop.getTmpDir() + "/MAE.done") - -rule sampleQC: - input: MAE(drop.getTmpDir() + "/sampleQC.done") + input: cfg.getProcessedDataDir() + "/sample_anno/sample_anno.done" rule exportCounts: input: - AE(parser.getExportCountFiles("geneCounts")), - AS(parser.getExportCountFiles("splitCounts")), - AS(parser.getExportCountFiles("spliceSiteOverlapCounts")), - parser.getProcResultsDir() + "/exported_counts/sample_anno.done" + cfg.exportCounts.getExportCountFiles("geneCounts"), + cfg.exportCounts.getExportCountFiles("splitCounts"), + cfg.exportCounts.getExportCountFiles("spliceSiteOverlapCounts"), + cfg.getProcessedResultsDir() + "/exported_counts/sample_anno.done" rule dependencyGraph: input: - MAE(config["htmlOutputPath"] + "/MAE_rulegraph.svg"), - AS(config["htmlOutputPath"] + "/AS_rulegraph.svg"), - AE(config["htmlOutputPath"] + "/AE_rulegraph.svg"), - output: - touch(config["tmpdir"] + "/rulegraphs.done") + rules.aberrantExpression_dependency.output, + rules.aberrantSplicing_dependency.output, + rules.mae_dependency.output + output: touch(cfg.get("htmlOutputPath") + "/dependency.done") + priority: 1 rule publish_local: shell: "rsync -Ort {config[htmlOutputPath]} {config[webDir]}" - -rule unlock: - input: - AE(drop.getMethodPath("AE", "unlock")), - AS(drop.getMethodPath("AS", "unlock")), - MAE(drop.getMethodPath("MAE", "unlock")) - params: - finalFiles = " ".join(config["finalFiles"].values()) - shell: - """ - snakemake --unlock - rm {input} - rm -f {params.finalFiles} - """ - diff --git a/drop/template/config.yaml b/drop/template/config.yaml index c3ea271b..3191d096 100644 --- a/drop/template/config.yaml +++ b/drop/template/config.yaml @@ -1,10 +1,7 @@ -projectTitle: Detection of RNA Outlier Pipeline +projectTitle: "DROP: Detection of RNA Outliers Pipeline" root: # root directory of all intermediate output htmlOutputPath: # path for HTML rendered reports - -# settings for wBuild - do not change -indexWithFolderName: true -fileRegex: .*\.R +indexWithFolderName: true # whether the root base name should be part of the index name sampleAnnotation: # path to sample annotation (see documenation on how to create it) @@ -14,7 +11,6 @@ geneAnnotation: v29: /path/to/gencode29.gtf.gz # example genomeAssembly: hg19 # either hg19 or hg38 -scanBamParam: null # or a path to an Rds file containing a scanBamParam object exportCounts: # specify which gene annotations to include and which # groups to exclude when exporting counts diff --git a/drop/utils.py b/drop/utils.py new file mode 100644 index 00000000..bc84ef38 --- /dev/null +++ b/drop/utils.py @@ -0,0 +1,81 @@ +from pathlib import Path +from snakemake.logging import logger +import wbuild + + +def returnPath(path, str_=True): + return str(path) if str_ else Path(path) + + +def checkFileExists(files): + """ + :param files: single filename or iterable of filenames + :return: list of existing files + """ + files = [files] if isinstance(files, str) else files + return [f for f in files if Path(f).exists()] + + +def createDir(directory): + directory = Path(directory) + if not directory.exists(): + logger.debug(f"creating {directory}") + directory.mkdir(parents=True) + + +def checkKeys(dict_, keys, check_files=False): + """ + :param dict_: config dictionary + :param keys: keys that are expected to be in dict_ + :param file_values: if True, do file checks on values + :return: dictionary with absolute file paths + """ + keys = dict_.keys() if keys is None else keys + for key in keys: + if key not in dict_.keys(): + raise KeyError(f"{key} is mandatory but missing") + + if check_files: + filename = Path(dict_[key]) + if not filename.exists(): + raise FileExistsError(filename) + # get real path + dict_[key] = str(filename.expanduser()) + return dict_ + + +def setKey(dict_, sub, key, default): + if sub is not None: + if not isinstance(sub, list): + raise TypeError(f"{sub} is not of type list") + for x in sub: + dict_ = dict_[x] + if key not in dict_ or dict_[key] is None: + logger.debug(f'{key} not in config{sub}, using default') + dict_[key] = default + return dict_[key] + + +def getWBuildPath(str_=True): + return returnPath(Path(wbuild.__file__).parent, str_=str_) + + +def getWBuildSnakefile(str_=True): + wb_path = getWBuildPath(str_=False) + return returnPath(wb_path / "wBuild.snakefile", str_=str_) + + +def subsetBy(df, column, values): + """ + Subset by one or more values of different columns from data frame + :param df: data frame + :param column: column to subset by + :param values: values to subset by + :return: df subset by values and column + """ + if values is None: + return df + elif isinstance(values, str): + return df[df[column] == values] + else: + return df[df[column].isin(values)] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..cccae8f6 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +log_cli = True +log_cli_level = INFO \ No newline at end of file diff --git a/requirements_test.txt b/requirements_test.txt new file mode 100644 index 00000000..5f9e97c0 --- /dev/null +++ b/requirements_test.txt @@ -0,0 +1,4 @@ +pytest-runner +pytest +pytest-testdirectory +pytest-icdiff diff --git a/setup.py b/setup.py index bf9aba9f..7eb31647 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 @@ import setuptools import os -import glob with open("README.md", "r") as fh: long_description = fh.read() requirements = [ - 'wbuild==1.7.0', + #'wbuild @ git+https://github.com/gagneurlab/wBuild.git', + 'wbuild>=1.7.1', 'python-dateutil', 'pandoc', 'graphviz', @@ -23,18 +23,18 @@ setuptools.setup( name="drop", - version="0.9.2", + version="1.0.0", author="Michaela Müller, Daniela Andrade Salazar, Vicente Yepez", author_email="mumichae@in.tum.de", description="Detection of RNA Outlier Pipeline", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/gagneurlab/drop", - packages=setuptools.find_packages(include=["drop"]), + packages=setuptools.find_packages(include=["drop", "drop.*"]), entry_points={'console_scripts': ['drop=drop.cli:main']}, package_data={'drop': extra_files}, include_package_data=True, - install_requires=requirements + install_requires=requirements, ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/common.py b/tests/common.py new file mode 100644 index 00000000..13ce5218 --- /dev/null +++ b/tests/common.py @@ -0,0 +1,29 @@ +import pytest +import subprocess +import logging + +CORES = "2" +LOGGER = logging.getLogger(__name__) + + +def run(cmd, dir_path, report_stdout=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs): + shell = isinstance(cmd, str) + response = subprocess.run( + cmd, cwd=dir_path, shell=shell, + stdout=stdout, stderr=stderr, + universal_newlines=True, + **kwargs + ) + try: + response.check_returncode() + except subprocess.CalledProcessError: + if report_stdout: + LOGGER.error("Standard out:") + LOGGER.error(response.stdout) + LOGGER.error(response.stderr) + raise + return response + + +def runR(r_cmd, dir_path, report_stdout=False): + return run(f"Rscript -e '{r_cmd}'", dir_path, report_stdout=report_stdout) diff --git a/tests/config/__init__.py b/tests/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/config/test_AE.py b/tests/config/test_AE.py new file mode 100644 index 00000000..46ed010f --- /dev/null +++ b/tests/config/test_AE.py @@ -0,0 +1,43 @@ +class Test_AE_Config: + + def test_config(self, dropConfig): + assert dropConfig.AE.getWorkdir() == "Scripts/AberrantExpression/pipeline" + dict_ = { + 'groups': ['outrider'], + 'fpkmCutoff': 1, + 'implementation': 'autoencoder', + 'padjCutoff': 1, + 'zScoreCutoff': 0, + 'maxTestedDimensionProportion': 3 + } + assert dict_.items() <= dropConfig.AE.dict_.items() + + def test_getCountsFiles(self, demo_dir, dropConfig): + counts_dir = f"{demo_dir}/Output/processed_data/aberrant_expression/v29/counts/" + counts_files = [ + 'HG00103.4.M_120208_3.Rds', + 'HG00096.1.M_111124_6.Rds', + 'HG00106.4.M_120208_5.Rds', + 'HG00116.2.M_120131_1.Rds', + 'HG00111.2.M_111215_4.Rds', + 'HG00149.1.M_111124_6.Rds', + 'HG00176.4.M_120208_2.Rds', + 'HG00150.4.M_120208_7.Rds', + 'HG00126.1.M_111124_8.Rds', + 'HG00132.2.M_111215_4.Rds', + 'external_geneCounts.tsv.gz' + ] + counts_files_true = [f"{counts_dir}/{f}" for f in counts_files].sort() + counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="outrider").sort() + assert counts_files_true == counts_files_test + + def test_getCountParams(self, dropConfig): + params = { + 'STRAND': 'no', + 'COUNT_MODE': 'IntersectionStrict', + 'PAIRED_END': True, + 'COUNT_OVERLAPS': True + } + assert params == dropConfig.AE.getCountParams("HG00103.4.M_120208_3") + + diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py new file mode 100644 index 00000000..62a12c2a --- /dev/null +++ b/tests/config/test_AS.py @@ -0,0 +1,19 @@ +class Test_AS_Config: + + def test_config(self, dropConfig): + assert dropConfig.AS.getWorkdir() == "Scripts/AberrantSplicing/pipeline" + dict_ = { + 'groups': ['fraser'], + 'recount': True, + 'longRead': False, + 'keepNonStandardChrs': True, + 'filter': False, + 'minExpressionInOneSample': 20, + 'minDeltaPsi': 0.05, + 'implementation': 'PCA', + 'padjCutoff': 1, + 'zScoreCutoff': 0, + 'deltaPsiCutoff': 0.05, + 'maxTestedDimensionProportion': 6 + } + assert dict_.items() <= dropConfig.AS.dict_.items() diff --git a/tests/config/test_MAE.py b/tests/config/test_MAE.py new file mode 100644 index 00000000..2bf0fa64 --- /dev/null +++ b/tests/config/test_MAE.py @@ -0,0 +1,23 @@ +class Test_MAE_Config: + + def test_config(self, demo_dir, dropConfig): + assert dropConfig.MAE.getWorkdir() == "Scripts/MonoallelicExpression/pipeline" + dict_ = { + 'genome': f'{demo_dir}/Data/chr21.fa', + 'qcVcf': f'{demo_dir}/Data/qc_vcf_1000G.vcf.gz', + 'groups': ['mae'], + 'qcGroups': ['mae'], + 'gatkIgnoreHeaderCheck': True, + 'padjCutoff': 0.05, + 'allelicRatioCutoff': 0.8, + 'maxAF': 0.001, + 'addAF': False, + 'maxVarFreqCohort': 0.04, + 'gnomAD': False + } + assert dict_.items() <= dropConfig.MAE.dict_.items() + + def test_createMaeIDS(self, dropConfig): + mae_ids_real = {'mae': ['HG00096--HG00096.1.M_111124_6', 'HG00103--HG00103.4.M_120208_3'].sort()} + mae_ids_test = {k: l.sort() for k, l in dropConfig.MAE.createMaeIDS().items()} + assert mae_ids_real == mae_ids_test diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py new file mode 100644 index 00000000..71a9d1da --- /dev/null +++ b/tests/config/test_SampleAnnotation.py @@ -0,0 +1,44 @@ +import pytest + +class Test_SampleAnnotation: + + @pytest.fixture(scope="class") + def sampleAnnotation(self, dropConfig): + return dropConfig.sampleAnnotation + + def test_columns(self, sampleAnnotation): + parsed_cols = set(list(sampleAnnotation.sa)) + def_cols = set(sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS) + assert def_cols <= parsed_cols + + def test_mapping(self, sampleAnnotation): + # ID mappings/groups + assert sampleAnnotation.idMapping.shape == (20, 2) + assert sampleAnnotation.sampleFileMapping.shape == (32, 4) + assert {k: len(v) for k, v in sampleAnnotation.rnaIDs.items()} == {'mae': 2, 'outrider': 10, 'fraser': 10} + assert {k: len(v) for k, v in sampleAnnotation.dnaIDs.items()} == {'mae': 2, 'outrider': 10, 'fraser': 10} + + @pytest.mark.parametrize( + "sample_id,file_type,file_name", + [ + ("HG00096.1.M_111124_6", "RNA_BAM_FILE", "Data/rna_bam/HG00096.1.M_111124_6_chr21.bam"), + ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_geneCounts.tsv.gz"), + ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz") + ] + ) + def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_name): + true_path = f"{demo_dir}/{file_name}" + test_path = sampleAnnotation.getFilePath(sample_id, file_type) + assert true_path == test_path + + @pytest.mark.parametrize( + "annotation,group,files", + [ + ("v29", "outrider", {'Data/external_geneCounts.tsv.gz'}) + ] + ) + def test_import(self, demo_dir, sampleAnnotation, annotation, group, files): + true_import_files = {f"{demo_dir}/{file}" for file in files} + test_import_files = sampleAnnotation.getImportCountFiles(annotation, group) + assert true_import_files == test_import_files + diff --git a/tests/config/test_config.py b/tests/config/test_config.py new file mode 100644 index 00000000..df5671ed --- /dev/null +++ b/tests/config/test_config.py @@ -0,0 +1,40 @@ +import pytest + + +def test_DropConfigKeys(dropConfig): + for key in dropConfig.CONFIG_KEYS: + assert dropConfig.get(key) == dropConfig.config_dict[key] + + +def test_DropConfigPaths(demo_dir, dropConfig): + assert dropConfig.getRoot() == f"{demo_dir}/Output" + assert dropConfig.getProcessedDataDir() == f"{demo_dir}/Output/processed_data" + assert dropConfig.getProcessedResultsDir() == f"{demo_dir}/Output/processed_results" + gene_anno = {'v29': f'{demo_dir}/Data/gencode_annotation_trunc.gtf'} + assert gene_anno == dropConfig.getGeneAnnotations() + + +@pytest.mark.parametrize( + "modules,groups", + [ + ("aberrantExpression", {"outrider"}), + ("aberrantSplicing", {"fraser"}), + (["aberrantExpression", "aberrantSplicing"], {"outrider", "fraser"}) + ] + ) +def test_cfgExportGroups(dropConfig, modules, groups): + assert groups == dropConfig.exportCounts.getExportGroups(modules) + + +@pytest.mark.parametrize( + "prefix,files", + [ + ("geneCounts", ['Output/processed_results/exported_counts/outrider--hg19--v29/geneCounts.tsv.gz']), + ("splitCounts", ['Output/processed_results/exported_counts/fraser--hg19--v29/splitCounts.tsv.gz']), + ("spliceSiteOverlapCounts", ['Output/processed_results/exported_counts/fraser--hg19--v29/spliceSiteOverlapCounts.tsv.gz']) + ] + ) +def test_cfgExportCountFiles(demo_dir, dropConfig, prefix, files): + true_path = [f"{demo_dir}/{file}" for file in files] + test_path = dropConfig.exportCounts.getExportCountFiles(prefix) + assert true_path == test_path diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..2458d6c9 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,31 @@ +import os +import wbuild +import drop +import sys +from unittest.mock import patch +from .common import * + + +@pytest.yield_fixture(scope="session", autouse=True) +def demo_dir(tmpdir_factory): + """ + Directory containing files downloaded from Gagneurlab ready to run the demo pipeline + :param testdirectory: inherits from pytest-testdirectory + :return: demo directory + """ + run_dir = tmpdir_factory.mktemp("demo_dir") + LOGGER.info(f"\n create demo dir: {run_dir}") + run(["drop", "demo"], run_dir, stdout=None) + yield run_dir + LOGGER.info("\n remove demo directory") + run_dir.remove() + + +@pytest.fixture(scope="session", autouse=True) +def dropConfig(demo_dir): + orig_path = os.getcwd() + os.chdir(demo_dir) + with patch.object(sys, 'argv', ["snakemake", "-n"]): + cfg = drop.config.DropConfig(wbuild.utils.Config()) + os.chdir(orig_path) + return cfg diff --git a/tests/pipeline/__init__.py b/tests/pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/pipeline/test_AE.py b/tests/pipeline/test_AE.py new file mode 100644 index 00000000..9f8c9da5 --- /dev/null +++ b/tests/pipeline/test_AE.py @@ -0,0 +1,39 @@ +from tests.common import * + + +class Test_AE_Pipeline: + + @pytest.fixture(scope="class") + def pipeline_run(self, demo_dir): + LOGGER.info("run aberrant expression pipeline...") + pipeline_run = run(["snakemake", "aberrantExpression", "-j", CORES], demo_dir) + assert "Finished job 0." in pipeline_run.stderr + return pipeline_run + + @pytest.mark.usefixtures("pipeline_run") + def test_counts(self, demo_dir): + cnt_file = "Output/processed_data/aberrant_expression/v29/outrider/outrider/total_counts.Rds" + r_cmd = """ + counts <- readRDS("{}") + print(counts) + """.format(cnt_file) + r = runR(r_cmd, demo_dir) + assert "dim: 805 12" in r.stdout + assert "rownames(805): ENSG00000141956.13_3 ENSG00000141959.16_1 ..." in r.stdout + + @pytest.mark.usefixtures("pipeline_run") + def test_results(self, demo_dir): + output_dir = "Output/processed_results/aberrant_expression/v29/outrider/outrider" + r_cmd = """ + # ods object + ods <- readRDS(file.path("{}", "ods.Rds")) + print(ods) + + # results table + res <- readRDS(file.path("{}", "OUTRIDER_results_all.Rds")) + print(paste(c("res:", dim(res)), collapse=" ")) + """.format(output_dir, output_dir) + r = runR(r_cmd, demo_dir) + assert "class: OutriderDataSet" in r.stdout + assert "dim: 441 12" in r.stdout + assert "res: 5292 15" in r.stdout \ No newline at end of file diff --git a/tests/pipeline/test_AS.py b/tests/pipeline/test_AS.py new file mode 100644 index 00000000..f26dcea4 --- /dev/null +++ b/tests/pipeline/test_AS.py @@ -0,0 +1,32 @@ +from tests.common import * + + +class Test_AS_Pipeline: + + @pytest.fixture(scope="class") + def pipeline_run(self, demo_dir): + LOGGER.info("run aberrant splicing pipeline") + pipeline_run = run(["snakemake", "aberrantSplicing", "-j", CORES], demo_dir) + assert "Finished job 0." in pipeline_run.stderr + return pipeline_run + + @pytest.mark.usefixtures("pipeline_run") + def test_counts(self, demo_dir): + cnt_file = "Output/processed_data/aberrant_splicing/datasets/savedObjects/raw-fraser/fds-object.RDS" + r_cmd = """ + library(FRASER) + fds <- loadFraserDataSet(file="{}") + print(fds) + """.format(cnt_file) + r = runR(r_cmd, demo_dir) + assert "Number of samples: 10" in r.stdout + assert "Number of junctions: 81" in r.stdout + assert "Number of splice sites: 9" in r.stdout + + @pytest.mark.usefixtures("pipeline_run") + def test_results(self, demo_dir): + results_dir = "Output/processed_data/aberrant_splicing/results" + r = run(f"wc -l {results_dir}/fraser_results_per_junction.tsv", demo_dir) + assert "88 " in r.stdout + r = run(f"wc -l {results_dir}/fraser_results.tsv", demo_dir) + assert "1 " in r.stdout diff --git a/tests/pipeline/test_MAE.py b/tests/pipeline/test_MAE.py new file mode 100644 index 00000000..c420fbfe --- /dev/null +++ b/tests/pipeline/test_MAE.py @@ -0,0 +1,33 @@ +from tests.common import * + + +class Test_MAE_Pipeline: + + @pytest.fixture(scope="class") + def pipeline_run(self, demo_dir): + LOGGER.info("run MAE pipeline") + pipeline_run = run(["snakemake", "mae", "-j", CORES], demo_dir) + assert "Finished job 0." in pipeline_run.stderr + return pipeline_run + + @pytest.mark.usefixtures("pipeline_run") + def test_counts(self, demo_dir): + cnt_file = "Output/processed_data/mae/allelic_counts/HG00103--HG00103.4.M_120208_3.csv.gz" + r_cmd = """ + library(data.table) + cnts <- fread("{}") + print(nrow(cnts)) + """.format(cnt_file) + r = runR(r_cmd, demo_dir) + assert "[1] 235" in r.stdout + + @pytest.mark.usefixtures("pipeline_run") + def test_results(self, demo_dir): + results_file = "Output/processed_results/mae/mae/MAE_results_all_v29.tsv.gz" + r_cmd = """ + library(data.table) + res <- fread("{}") + print(nrow(res)) + """.format(results_file) + r = runR(r_cmd, demo_dir) + assert "[1] 335" in r.stdout diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py new file mode 100644 index 00000000..438d2174 --- /dev/null +++ b/tests/pipeline/test_pipeline.py @@ -0,0 +1,30 @@ +from tests.common import * + + +def test_dryrun(demo_dir): + r = run(["snakemake", "-n"], dir_path=demo_dir) + message = "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution." + assert message in r.stdout + + +def test_dependencyGraph(demo_dir): + r = run(["snakemake", "dependencyGraph", "-Fj"], dir_path=demo_dir) + assert "aberrantExpression_dependency" in r.stderr + assert "aberrantSplicing_dependency" in r.stderr + assert "mae_dependency" in r.stderr + assert "Finished job 0." in r.stderr + # clean HTML output + r = run(["snakemake", "clean", "-j"], dir_path=demo_dir) + assert "clean" in r.stderr + assert "Finished job 0." in r.stderr + + +def test_export(demo_dir): + r = run(["snakemake", "exportCounts", "-j", CORES], demo_dir) + assert "Finished job 0." in r.stderr + + +@pytest.mark.trylast +def test_all(demo_dir): + r = run(["snakemake", "-j", CORES], demo_dir) + assert "Finished job 0." in r.stderr diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100755 index 00000000..b00636a8 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,22 @@ +from click.testing import CliRunner +from drop import cli +from .common import * + + +def test_Help_isShown(): + runner = CliRunner() + result = runner.invoke(cli.main) + assert result.exit_code == 0 + help_result = runner.invoke(cli.main, ['--help']) + assert help_result.exit_code == 0 + assert '--help' in help_result.output + assert 'Show this message and exit.' in help_result.output + + +def test_drop_init(tmpdir): + init_dir = tmpdir.mkdir("init") + r = run(["drop", "init"], dir_path=init_dir) + assert 'init...done\n' in r.stderr + + +# TOOD: test update