remove submodule structure (#101)

* test full end2end pipeline
* removed git submodule structure
* version bump

Co-authored-by: Michaela Müller <[email protected]>
gagneurlab · Jul 21, 2020 · d254aca · d254aca
1 parent fd32ed3
commit d254aca
Show file tree

Hide file tree

Showing 61 changed files with 2,963 additions and 65 deletions.
diff --git a/.gitmodules b/.gitmodules
diff --git a/.travis.yml b/.travis.yml
@@ -38,3 +38,9 @@ script:
   - bcftools --version
   - drop --version
   - python --version
+
+  - mkdir drop_demo
+  - cd drop_demo
+  - drop demo
+  - snakemake -n
+  - snakemake --jobs 2 --cores 2
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Detection of RNA Outlier Pipeline
 [![Pipeline status](https://travis-ci.org/gagneurlab/drop.svg?branch=master)](https://travis-ci.org/gagneurlab/drop)
-[![Version](https://img.shields.io/badge/Version-0.9.0-green.svg)](https://github.com/gagneurlab/drop/master)
+[![Version](https://img.shields.io/badge/Version-0.9.1-green.svg)](https://github.com/gagneurlab/drop/master)
 [![Version](https://readthedocs.org/projects/gagneurlab-drop/badge/?version=latest)](https://gagneurlab-drop.readthedocs.io/en/latest)
 
 The manuscript main file, supplementary figures and table can be found in the manuscript folder or in 

diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "drop" %}
-{% set version = "0.9.0" %}
+{% set version = "0.9.1" %}
 
 package:
   name: "{{ name|lower }}"
@@ -10,55 +10,36 @@ source:
 
 build:
   number: 0
-  script: 
-    - "{{ PYTHON }} -m pip install -vv git+https://github.com/gagneurlab/wbuild.git#egg=wbuild"
-    - "${R} -e 'BiocManager::install(\"mumichae/tMAE\", dependencies=FALSE, update=FALSE, ask=FALSE)'"
-    - "{{ PYTHON }} -m pip install . -vv"
-  rpaths:
-    - lib/R/lib/
-    - lib/
+  noarch: python
+  entry_points:
+    - drop=drop.cli:main
+  script: "{{ PYTHON }} -m pip install . -vv"
 
 requirements:
-  build:
-    - {{ compiler('c') }}
-
   host:
-    - python
+    - python >=3.6
     - pip
-
-    # wbuild requirements
-    - pyyaml>=4.2b1
-    - pytest-runner
-
-    # tMAE requirements
-    - r-base>=4.0.0
-    - r-devtools
-    - r-biocmanager
-    - r-data.table
-    - r-ggplot2
-    - r-dplyr
-    - bioconductor-DESeq2
-    - bioconductor-GenomicScores 
+    - wbuild >=1.7.0
 
   run:
-    - python
+    - python >=3.6
     - pandas
-    - Click>=7.0
+    - Click >=7.0
     - click-log
     - python-dateutil
 
     # snakemake/wbuild
-    - snakemake=>5.5.2
+    - snakemake >=5.5.2
+    - wbuild >=1.7.0
     - pandoc
     - graphviz
-    - pyyaml>=4.2b1
-
+
     # command line tools
     - tabix
-    - samtools>=1.7
-    - bcftools>=1.7
-    - gatk4>=4.0.4
-    - star>=2.7
+    - samtools >=1.7
+    - bcftools >=1.7
+    - gatk4 >=4.0.4
+    - star >=2.7
 
     # R dependencies
     - r-base>=4.0.0
@@ -72,35 +53,42 @@ requirements:
     - r-tidyr
     - r-magrittr
     - r-devtools
-
+    - r-tmae
+
     # bioconductor packages
     - bioconductor-deseq2
     - bioconductor-GenomicScores
     - bioconductor-outrider
     - bioconductor-fraser
     - bioconductor-variantannotation
     - bioconductor-bsgenome.hsapiens.ucsc.hg19
-    #- bioconductor-mafdb.gnomad.r2.1.hs37d5
-    #- bioconductor-mafdb.gnomad.r2.1.grch38
+      #- bioconductor-mafdb.gnomad.r2.1.hs37d5
+      #- bioconductor-mafdb.gnomad.r2.1.grch38
 
 test:
   imports:
     - drop
-    - wbuild
   commands:
-    - ${R} -e "library(tMAE)"
-    - wbuild --version
-    - drop --version
+    - drop --help
+  requires:
+    - pytest
 
 about:
-  home: "https://gagneurlab-drop.readthedocs.io"
+  home: https://github.com/gagneurlab/drop
   license: MIT
-  license_family: MIT
-  license_file: ../LICENSE
-  summary: "Detection of RNA Outliers Pipeline"
-  doc_url: 
-  dev_url: 
+  license_family: OTHER
+  summary: Detection of RNA Outlier Pipeline
+  doc_url: https://gagneurlab-drop.readthedocs.io/en/latest/
+  dev_url: https://github.com/gagneurlab/drop
 
 extra:
+  container:
+    # click requires a unicode locale when used with Python 3
+    # extended-base generates en_US.UTF-8 locale and sets LC_ALL, LANG properly
+    extended-base: true
+  identifiers:
+    - https://dx.doi.org/10.21203/rs.2.19080/v1
   recipe-maintainers:
+    - c-mertes
     - mumichae
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -19,11 +19,11 @@
 # -- Project information -----------------------------------------------------
 
 project = 'DROP'
-copyright = '2019, Michaela Müller'
+copyright = '2020, Michaela Müller'
 author = 'Michaela Müller'
 
 # The full version, including alpha/beta/rc tags
-release = '0.9.0'
+release = '0.9.1'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/drop/cli.py b/drop/cli.py
@@ -13,7 +13,7 @@
 
 @click.group()
 @click_log.simple_verbosity_option(logger)
-@click.version_option('0.9.0',prog_name='drop')
+@click.version_option('0.9.1',prog_name='drop')
 def main():
     pass
 

diff --git a/drop/modules/aberrant-expression-pipeline b/drop/modules/aberrant-expression-pipeline
diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Datasets.R b/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Datasets.R
@@ -0,0 +1,35 @@
+#'---
+#' title: Counts Overview
+#' author:  mumichae, salazar
+#' wb:
+#'  params:
+#'   - ids: '`sm parser.outrider_ids`'
+#'   - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`'
+#'  input: 
+#'   - summaries: '`sm expand(config["htmlOutputPath"] + 
+#'                "/AberrantExpression/Counting/{annotation}/Summary_{dataset}.html",
+#'                annotation=list(config["geneAnnotation"].keys()), dataset=parser.outrider_ids)`'
+#' output:
+#'   html_document:
+#'    code_folding: hide
+#'    code_download: TRUE
+#'---
+
+# Keep a copy of the snakemake object so the script can be re-run interactively
+saveRDS(snakemake, file.path(snakemake@params$tmpdir, "counting_overview.snakemake") )
+# snakemake <- readRDS(".drop/tmp/AE/counting_overview.snakemake")
+
+# Annotation names and dataset groups are read from the workflow config
+gene_annotation_names <- names(snakemake@config$geneAnnotation)
+datasets <- snakemake@config$aberrantExpression$groups
+
+#+ echo=FALSE, results="asis"
+# Print one HTML section per dataset/annotation pair, each linking to the
+# matching count summary page. Plain loops are used because only the printed
+# output matters; nothing is collected.
+for (name in datasets) {
+  for (version in gene_annotation_names) {
+    cat(paste0(
+      "<h1>Dataset: ", name, "</h1>",
+      "<p>",
+      "</br>", "<a href='AberrantExpression/Counting/", version, "/Summary_", name, ".html'   >Count Summary</a>",
+      "</br>", "</p>"
+    ))
+  }
+}
diff --git a/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Scripts/Counting/Summary.R
@@ -0,0 +1,160 @@
+#'---
+#' title: "Counts Summary: `r gsub('_', ' ', snakemake@wildcards$dataset)`"
+#' author: 
+#' wb:
+#'  params:
+#'    - tmpdir: '`sm drop.getMethodPath(METHOD, "tmp_dir")`'
+#'  input: 
+#'    - ods: '`sm parser.getProcResultsDir() +
+#'            "/aberrant_expression/{annotation}/outrider/{dataset}/ods_unfitted.Rds"`'
+#'    - bam_cov: '`sm parser.getProcDataDir() +
+#'                "/aberrant_expression/{annotation}/outrider/{dataset}/bam_coverage.tsv"`'
+#'  output:
+#'   - wBhtml: '`sm config["htmlOutputPath"] +
+#'              "/AberrantExpression/Counting/{annotation}/Summary_{dataset}.html"`'
+#'  type: noindex
+#' output:
+#'  html_document:
+#'   code_folding: hide
+#'   code_download: TRUE
+#'---
+
+# Dump the snakemake object into the tmp dir declared under `wb: params` in the
+# header, instead of rebuilding the path from the global config
+# NOTE(review): assumes params$tmpdir already points at the AE tmp folder, as
+# the readRDS path below suggests - confirm
+saveRDS(snakemake, file.path(snakemake@params$tmpdir, "counting_summary.snakemake") )
+#snakemake <- readRDS(".drop/tmp/AE/counting_summary.snakemake")
+
+suppressPackageStartupMessages({
+  library(OUTRIDER)
+  library(SummarizedExperiment)
+  library(GenomicAlignments)
+  library(ggplot2)
+  library(ggthemes)
+  library(cowplot)
+  library(data.table)
+  library(tidyr)
+})
+
+# Load the dataset object produced upstream and pull out its count matrix
+ods <- readRDS(snakemake@input$ods)
+# non-normalized counts; FALSE is spelled out because `F` can be reassigned
+cnts_mtx <- counts(ods, normalized = FALSE)
+
+#' Number of samples: `r ncol(ods)`
+#' 
+#' # Count Quality Control
+#' 
+#' Compare number of records vs. read counts
+#' 
+# Per-sample coverage table: BAM record counts joined with the summed counts
+bam_coverage <- fread(snakemake@input$bam_cov)
+bam_coverage[, sampleID := as.character(sampleID)]
+coverage_dt <- merge(bam_coverage,
+                   data.table(sampleID = colnames(ods),
+                              read_count = colSums(cnts_mtx)),
+                   by = "sampleID", sort = FALSE)
+# rank samples by total read count
+setorder(coverage_dt, read_count)
+coverage_dt[, count_rank := .I]
+# rank samples by the ratio of counted reads to BAM records
+coverage_dt[, counted_frac := read_count/record_count]
+setorder(coverage_dt, counted_frac)
+coverage_dt[, frac_rank := .I]
+
+# size factors
+# The table has been re-sorted above, so its rows no longer follow the column
+# order of `ods`. Look the size factors up by sampleID rather than assigning
+# them positionally, which would attach them to the wrong samples.
+ods <- estimateSizeFactors(ods)
+coverage_dt[, size_factors := sizeFactors(ods)[match(sampleID, colnames(ods))]]
+setorder(coverage_dt, size_factors)
+coverage_dt[, sf_rank := .I]
+
+# Read counts per sample, plotted against the read-count rank
+p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count)) +
+  geom_point() +
+  ylim(0, NA) +
+  labs(
+    title = "Obtained Read Counts",
+    x = "Sample Rank",
+    y = "Reads Counted"
+  ) +
+  theme_cowplot() +
+  background_grid()
+
+# Ratio of counted reads to BAM records per sample, plotted against its rank
+p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac)) +
+  geom_point() +
+  ylim(0, NA) +
+  labs(
+    title = "Obtained Read Count Ratio",
+    x = "Sample Rank",
+    y = "Percent Reads Counted"
+  ) +
+  theme_cowplot() +
+  background_grid()
+
+#+ QC, fig.height=6, fig.width=12
+plot_grid(p_depth, p_frac)
+
+# Size factor per sample, plotted against the size-factor rank
+p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors)) +
+  geom_point() +
+  ylim(c(0,NA)) +
+  theme_cowplot() +
+  background_grid() +
+  labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors')
+
+# Size factor against total read count. The x aesthetic is the `read_count`
+# column (not `counted_frac`), so title and axis are labelled as read counts.
+p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors)) +
+  geom_point() +
+  ylim(c(0,NA)) +
+  theme_cowplot() +
+  background_grid() +
+  labs(title = 'Size Factors vs. Read Counts',
+       x = 'Read Counts', y = 'Size Factors')
+
+#+ sizeFactors, fig.height=6, fig.width=12
+plot_grid(p_sf, p_sf_cov)
+
+#' # Filtering
+# Build one count matrix per filter and collect the per-gene mean counts of
+# each into a long table for plotting.
+quant <- .95
+# per-gene count quantile, computed once and shared by both thresholds
+count_quant <- rowQuantiles(cnts_mtx, probs = quant)
+# drop = FALSE keeps every subset a matrix even if a filter retains one gene,
+# so rownames() and rowMeans() below keep working
+filter_mtx <- list(
+  all = cnts_mtx,
+  passed_FPKM = cnts_mtx[rowData(ods)$passedFilter, , drop = FALSE],
+  min_1 = cnts_mtx[count_quant > 1, , drop = FALSE],
+  min_10 = cnts_mtx[count_quant > 10, , drop = FALSE]
+)
+filter_dt <- lapply(names(filter_mtx), function(filter_name) {
+  mtx <- filter_mtx[[filter_name]]
+  # NOTE: despite its name, `median_counts` holds the row means
+  data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name)
+}) %>% rbindlist
+filter_dt[, filter := factor(filter, levels = c('all', 'passed_FPKM', 'min_1', 'min_10'))]
+
+# Bin width shared by the histogram and the rescaled density curve
+binwidth <- .2
+
+# Histogram of per-gene mean counts on a log10 axis, one facet per filter
+p_hist <- ggplot(filter_dt, aes(x = median_counts, fill = filter)) +
+  geom_histogram(binwidth = binwidth) +
+  scale_x_log10() +
+  facet_wrap(. ~ filter) +
+  labs(
+    x = "Mean counts per gene",
+    y = "Frequency",
+    title = "Mean Count Distribution"
+  ) +
+  guides(col = guide_legend(title = NULL)) +
+  scale_fill_brewer(palette = "Paired") +
+  theme_cowplot() +
+  theme(legend.position = "none")
+
+# Same distribution as overlaid density curves, scaled by the bin width
+p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) +
+  geom_density(aes(y = binwidth * ..count..), size = 1.2) +
+  scale_x_log10() +
+  labs(
+    x = "Mean counts per gene",
+    y = "Frequency"
+  ) +
+  guides(col = guide_legend(title = NULL)) +
+  scale_color_brewer(palette = "Paired") +
+  theme_cowplot() +
+  theme(
+    legend.position = "top",
+    legend.justification = "center",
+    legend.background = element_rect(color = NA)
+  )
+
+#+ meanCounts, fig.height=6, fig.width=12
+plot_grid(p_hist, p_dens)
+
+#+ expressedGenes, fig.height=6, fig.width=8
+plotExpressedGenes(ods) +
+  theme_cowplot() +
+  background_grid(major = "y")
+
+# Keep only the expressed-gene statistics columns of the sample annotation
+expressed_genes <- as.data.table(colData(ods))[
+  , .(expressedGenes, unionExpressedGenes, intersectionExpressedGenes,
+      passedFilterGenes, expressedGenesRank)
+]
+
+#+echo=F
+rank_1 <- expressed_genes[expressedGenesRank == 1]
+#' **Rank 1:**
+#' `r as.character(rank_1$expressedGenes)` expressed genes
+#+echo=F
+rank_n <- expressed_genes[expressedGenesRank == nrow(expressed_genes)]
+#' **Rank `r rank_n$expressedGenesRank`:**  
+#' `r as.character(rank_n$expressedGenes)` expressed genes  
+#' `r as.character(rank_n$unionExpressedGenes)` expressed genes (union)  
+#' `r as.character(rank_n$intersectionExpressedGenes)` expressed genes (intersection)  
+#' `r as.character(rank_n$passedFilterGenes)` genes passed the filter