diff --git a/.dockstore.yml b/.dockstore.yml
index 8da326a7..f4e23abd 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -18,3 +18,6 @@ workflows:
   - name: MalariaVector_StatisticalPhasing
     subclass: WDL
     primaryDescriptorPath: /pipelines/phasing-vector/gcp/StatisticalPhasing.wdl
+  - name: MalariaVector_CNV
+    subclass: WDL
+    primaryDescriptorPath: /pipelines/copy-number-variation-vector/gcp/CNV.wdl
diff --git a/dockerfiles/CNV/Dockerfile b/dockerfiles/CNV/Dockerfile
new file mode 100644
index 00000000..5d824d52
--- /dev/null
+++ b/dockerfiles/CNV/Dockerfile
@@ -0,0 +1,34 @@
+FROM --platform=linux/amd64 continuumio/miniconda3
+
+# Update the package lists and install necessary packages
+RUN apt-get update && \
+    apt-get install -y curl bzip2 ca-certificates gnupg && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+#TODO: Determine if R is really required here
+# # Add the R repository and install the latest R
+# RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu hirsute-cran40/" >> /etc/apt/sources.list && \
+#     apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
+#     apt-get update && \
+#     apt-get install -y r-base && \
+#     apt-get clean && \
+#     rm -rf /var/lib/apt/lists/*
+
+# Copy the environment file to the container
+COPY requirements_conda.yml requirements_conda.yml
+
+# Create a conda environment and activate it
+RUN mkdir -p /cnv/output && \
+    mkdir -p /cnv/input && \
+    conda env create -f requirements_conda.yml && \
+    echo "conda activate $(head -1 /opt/conda/envs/*/etc/environment.yml | cut -d'/' -f6)" >> ~/.bashrc
+
+# Set the working directory
+WORKDIR /cnv
+
+# Copy the source code to the container
+COPY . .
+
+# Set the default command to run when the container starts
+# CMD ["python", ""]
\ No newline at end of file
diff --git a/dockerfiles/CNV/R3.6.1/Dockerfile b/dockerfiles/CNV/R3.6.1/Dockerfile
new file mode 100644
index 00000000..d78c86af
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Dockerfile
@@ -0,0 +1,41 @@
+## Platform tag ensures that images built on ARM-based machines (ex. M-series macs) are portable
+FROM --platform="linux/amd64" ubuntu:18.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set parameters
+ARG R_VERSION="3.6.1"
+
+WORKDIR /usr/local/
+
+# Update and install dependencies and wget
+RUN sed -Ei 's/^# deb-src /deb-src /' /etc/apt/sources.list && apt-get update \
+    && apt-get install -y \
+    build-essential \
+    wget
+
+# Install R and additional dependencies
+RUN apt-get build-dep -y r-base \
+    && wget https://cloud.r-project.org/src/base/R-3/R-${R_VERSION}.tar.gz \
+    && tar xzf R-${R_VERSION}.tar.gz \
+    && cd "R-${R_VERSION}" \
+    && ./configure --prefix=/opt/R/${R_VERSION}/ --enable-R-shlib --with-blas --with-lapack \
+    && make \
+    && make install
+
+RUN rm -r R-${R_VERSION} \
+    && rm R-${R_VERSION}.tar.gz
+
+# Run an R script to install several libraries
+COPY installRDeps.R /usr/local/
+RUN /opt/R/${R_VERSION}/bin/Rscript /usr/local/installRDeps.R \
+    && rm installRDeps.R
+
+# Add R installation directory to path
+ENV PATH="/opt/R/${R_VERSION}/bin/:$PATH"
+
+# Copy source code
+COPY . .
+
+
+
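The R3.6.1 image above runs installRDeps.R, which this diff references but does not include. A minimal sketch of what such a script might contain, with the package list inferred from the library() calls in the Rscripts below (stringr, stringi and RColorBrewer from CRAN; Biostrings from Bioconductor; parallel ships with base R) — the actual file may differ:

# Hypothetical sketch of installRDeps.R -- the real file is not part of this diff;
# the package list is inferred from the library() calls in the Rscripts below.
options(repos = c(CRAN = 'https://cloud.r-project.org'))
install.packages(c('stringr', 'stringi', 'RColorBrewer', 'BiocManager'))
# Biostrings is a Bioconductor package, so it is installed via BiocManager
BiocManager::install('Biostrings', update = FALSE, ask = FALSE)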
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/CNV_analysis.r b/dockerfiles/CNV/R3.6.1/Rscripts/CNV_analysis.r
new file mode 100644
index 00000000..6e0017b9
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/CNV_analysis.r
@@ -0,0 +1,367 @@
+# Load a library that will allow padding 0s in front of a number
+library(stringr)
+library(parallel)
+
+# This is the minimum number of consecutive duplicated windows required to call a duplication
+min.dup.length <- 5
+# This is the threshold variance in coverage above which we remove a sample
+threshold.variance <- 0.2
+# This is the coverage window that was used to calculate the coverage for the samples we are loading.
+# This parameter should not be changed unless using input data calculated with a different window size.
+coverage.window <- 300
+# Set the minimum proportion of windows in a gene that need to have some coverage data before making a duplication call.
+min.win.cov <- 0.5
+# Set the likelihood ratio threshold at which we will accept a CNV
+lik.thresh <- 10000
+
+arg.values <- commandArgs(trailingOnly=T)
+cat('Arguments:', arg.values, '\n', sep = '\n')
+
+# Get the chromosome
+acceptable.chromosomes <- c('2L', '2R', '3L', '3R', 'X')
+chrom <- arg.values[1]
+if (!(chrom %in% acceptable.chromosomes)){
+    stop(paste('Chromosome should be one of: ', paste(acceptable.chromosomes, collapse = ', ')))
+}
+
+# Get the names of the samples to analyse
+sample.list <- arg.values[2]
+sample.names <- setNames(nm = read.table(sample.list, stringsAsFactors = F)[,1])
+
+# Load the coverage variance data
+cov.var.file <- arg.values[3]
+cov.var <- read.table(cov.var.file, header = T, sep = '\t', row.names = 1)
+# Identify samples with high coverage variance for exclusion
+high.var.samples <- intersect(sample.names, rownames(cov.var)[cov.var$autosomes > threshold.variance])
+good.var.samples <- setdiff(sample.names, rownames(cov.var)[cov.var$autosomes > threshold.variance])
+
+# Load up the information on gene and exon positions.
+gene.pos.file <- arg.values[4]
+gene.positions <- read.table(gene.pos.file)
+gene.positions <- gene.positions[gene.positions$Chrom == chrom, ]
+
+# Get the detox genes file
+detox.genes.file <- arg.values[5]
+detox.genes <- read.table(detox.genes.file, stringsAsFactors = F)[,1]
+
+# Get the working directory
+working.directory <- arg.values[6]
+
+# Get the number of cores
+n.cores <- arg.values[7]
+
+# Get the output folder
+output.folder <- arg.values[8]
+dir.create(output.folder, showWarnings = FALSE)
+
+# If this is the sex chromosome, we also need the metadata to determine sex.
+# These are the minimum copy number states required to consider a duplication (2 = "normal" diploid state).
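+# (The indexing below maps each sample's sex to its threshold: (meta$sex_call == 'M')
+# evaluates to FALSE/TRUE, so adding 1 selects element 1 of c(3, 2) for females
+# (threshold 3, diploid X) or element 2 for males (threshold 2, since males are
+# hemizygous on X).)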
+{if (chrom == 'X'){
+    metafile <- arg.values[9]
+    meta <- read.table(metafile, sep = '\t', header = T, row.names = 1, quote = '"', comment.char = '')[sample.names, ]
+    threshold.copy.number <- c(3, 2)[(meta$sex_call == 'M') + 1]
+}
+else {
+    threshold.copy.number <- 3
+}}
+
+
+# Function to identify runs of increased copy number state
+find.full.dup.sequences <- function(cnv, threshold, n){
+    # If cnv is a dataframe, extract the cnv vector
+    if (is.data.frame(cnv))
+        cnv.vector <- cnv$CNV
+    else
+        cnv.vector <- cnv
+    current.run <- 0
+    output <- matrix(0,0,2)
+    for (i in 1:length(cnv.vector)){
+        if (cnv.vector[i] >= threshold) {
+            if (current.run == 0)
+                start.pos <- i
+            current.run <- current.run + 1
+        }
+        else{
+            if (current.run >= n)
+                output <- rbind(output, c(start.pos, i-1))
+            current.run <- 0
+        }
+    }
+    if (current.run >= n)
+        output <- rbind(output, c(start.pos, i-1))
+    return(output)
+}
+
+
+# Function to calculate the likelihood ratio of a duplication vs no duplication for a given CNV
+# call
+lik.ratio <- function(hmm, sample.variance, base.variance = 0.01, trans.prob = 0.00001, ploidy = 2){
+    # Get the probability of not changing state and the ratio of the two probabilities
+    non.trans.prob <- 1 - 12*trans.prob
+    # Get the number of transitions in the cnv states
+    transitions <- sum((hmm$CNV[-1] - hmm$CNV[-nrow(hmm)]) != 0)
+    # get the ratio of transition probabilities
+    trans.ratio <- (trans.prob^transitions)/(non.trans.prob^transitions)
+    # get the likelihood of the null model.
+    lik.null <- dnorm(hmm$Normalised_coverage, ploidy, sqrt(base.variance + (sample.variance/2)*ploidy))
+    # get the likelihood of the duplication model.
+    lik.dup <- dnorm(hmm$Normalised_coverage, hmm$CNV, sqrt(base.variance + (sample.variance/2)*hmm$CNV))
+    # If the likelihood of the duplicated model at any given window is 0, then coverage must have been so high
+    # that even CNV state 12 could not capture it, so we report a very large likelihood ratio
+    if (any(lik.dup == 0))
+        return(Inf)
+    # Otherwise, get the product of the ratios multiplied by the transition probability
+    else
+        return(trans.ratio * prod(lik.dup/lik.null))
+}
+
+# Function to identify the genetic features encompassed in a CNV
+find.features.encompassed <- function(pos.table, genepos.table, filtered.windows, cov.win){
+    output.table <- data.frame(matrix(0,0,3, dimnames = list(character(),c('start', 'end', 'Features'))))
+    if (nrow(pos.table) == 0){
+        return(output.table)
+    }
+    for (i in 1:nrow(pos.table)){
+        # Find the genes whose post-filtering windows are all included in this duplication
+        gene.req <- genepos.table[,'start'] >= pos.table[i,1] & genepos.table[,'end'] <= pos.table[i,2]
+        # Genes that do not have enough covered windows to be considered will produce NAs in the above line. Turn
+        # these to FALSE
+        gene.req[is.na(gene.req)] <- F
+        # The regions where both of these requirements are met represent a gene encompassed in the duplication
+        genes.encompassed <- genepos.table[gene.req,]
+        # If there is at least one overlap, add the pos.table region to the output table and record the name of the
+        # genepos.table regions that overlapped it
+        if (nrow(genes.encompassed)){
+            region.names <- paste(rownames(genes.encompassed), collapse = ";")
+            new.row <- data.frame(start = pos.table[i,1], end = pos.table[i,2], Features = region.names, stringsAsFactors = F)[1,]
+            if (!is.null(rownames(pos.table)))
+                output.table[rownames(pos.table)[i],] <- new.row
+            else
+                output.table <- rbind(output.table, new.row)
+        }
+    }
+    # Now convert the indices to genomic positions in the output table
+    output.table[,1] <- filtered.windows[output.table[,1]]
+    output.table[,2] <- filtered.windows[output.table[,2]] + cov.win
+    return(output.table)
+}
+
+# Function that will go through all of the duplications in a list and merge them if they have similar start
+# and end positions. Let's write it such that the function can be run on either indices or positions, making
+# the permissiveness around the start and end points an argument, so that appropriate values can be given
+# depending on whether indices or positions are used.
+merge.duplications <- function(dup.list, leeway){
+    # Turn the matrices that make up the input list into dataframes
+    dup.list.out <- lapply(dup.list, function(x) data.frame(start = x[,1], end = x[,2], CNV_code = 0))
+    # Here is the code we will assign the next CNV group we find
+    current.CNV.code = 1
+    # Now go through every dup, looking for matching ranges
+    for (i in 1:length(dup.list.out)){
+        cat('Analysing sample ', names(dup.list.out)[i], '.\n', sep = '')
+        for (k in 1:nrow(dup.list.out[[i]])){
+            this.start <- dup.list.out[[i]][k,1]
+            this.end <- dup.list.out[[i]][k,2]
+            # Go through every previous sample, finding matching duplications. We record matching duplications
+            # by the sample and row in which they are found, and their current CNV_code.
+            matching.dups <- matrix(0,0,3)
+            for (j in (1:i)[-i]){
+                # find any dups that match the start and end points of this one, including the leeway
+                match.start <- (dup.list.out[[j]][,1] >= this.start - leeway) & (dup.list.out[[j]][,1] <= this.start + leeway)
+                match.end <- (dup.list.out[[j]][,2] >= this.end - leeway) & (dup.list.out[[j]][,2] <= this.end + leeway)
+                matching.dup.index <- which(match.start & match.end)
+                # We should never find more than one matching dup in a given sample. Check that this is true
+                if (length(matching.dup.index) > 1)
+                    stop(paste('Fail. Found more than one matching duplication for i = ', i, ', k = ', k, ', j = ', j, '.'))
+                else if (length(matching.dup.index) == 1)
+                    matching.dups <- rbind(matching.dups, c(j, matching.dup.index, dup.list.out[[j]][matching.dup.index, 3]))
+            }
+            # Now that we have identified all of the dups that match this one, we need to merge them all together.
+            # If there are no matching dups, then give the current CNV the next available code
+            if (nrow(matching.dups) == 0){
+                dup.list.out[[i]][k,3] <- current.CNV.code
+                current.CNV.code <- current.CNV.code + 1
+            }
+            else {
+                # If there is only one matching dup, just give the same code to the current dup
+                matching.dup.codes <- sort(unique(matching.dups[,3]))
+                if (length(matching.dup.codes) == 1)
+                    dup.list.out[[i]][k,3] <- matching.dups[1,3]
+                # Otherwise, find the smallest of the matching codes and give all matching dups that code. We
+                # also need to find all the dups that have the matching code (not just the ones that directly
+                # matched this dup) and change the code for those dups.
+                else {
+                    smallest.code <- matching.dup.codes[1]
+                    cat('\tMerging CNV codes ', paste(matching.dup.codes, collapse = ', '), '.\n', sep = '')
+                    # Give this code to the current dup
+                    dup.list.out[[i]][k,3] <- smallest.code
+                    # For each other dup code, find all the samples that have that code and change it to the
+                    # smallest code
+                    for (j in (1:i)[-i]){
+                        dup.list.out[[j]][dup.list.out[[j]][,3] %in% matching.dup.codes[-1], 3] <- smallest.code
+                    }
+                }
+            }
+        }
+    }
+    dup.list.out
+}
+
+# Function to load up each file and store the results in a list
+load.hmm.file <- function(sample.name){
+    all.files <- list.files()
+    # Find the file corresponding to this sample
+    this.file <- grep(sample.name, all.files, value = T)
+    if (length(this.file) == 0)
+        stop(paste('No matching files were found for sample', sample.name))
+    if (length(this.file) > 1)
+        stop(paste('More than one matching file was found for sample', sample.name))
+    # Load the file
+    cat('\t', this.file, '\n', sep = '')
+    read.table(this.file, header = T)
+}
+
+# Function to filter samples by likelihood ratio
+filter.by.likelihood.ratio <- function(cnv.table, hmm.table, sample.name, likelihood.threshold){
+    filtered.table <- matrix(0,0,2)
+    keeprow <- function(x)
+        lik.ratio(hmm.table[x[1]:x[2], ], cov.var[sample.name, 'autosomes']) > likelihood.threshold
+    filtered.table <- cnv.table[apply(cnv.table, 1, keeprow), ]
+    filtered.table
+}
+
+# Function to load the data for a sample and detect raw CNVs within it
+process.sample <- function(sample.name, threshold, n, likelihood.threshold){
+    these.hmm <- load.hmm.file(sample.name)
+    duplications.indices <- find.full.dup.sequences(these.hmm, threshold, n)
+    goodlr.duplications.indices <- filter.by.likelihood.ratio(duplications.indices, these.hmm, sample.name, likelihood.threshold)
+    goodlr.duplications.indices
+}
+
+# Set the working directory
+setwd(working.directory)
+
+cat('Processing HMM files\n')
+# We'll use the first sample to get the list of positions that remained after window filtering
+hmm.positions <- load.hmm.file(sample.names[1])$Position
+# Now load up all the samples and look for CNVs
+goodlr.duplications.list.indices <- mcmapply(process.sample, sample.names, threshold = threshold.copy.number, n = min.dup.length, likelihood.threshold = lik.thresh, mc.cores = n.cores, SIMPLIFY = F)
+# Because I am not 100% sure that mcmapply will output the data in the right order, the next line ensures the order
+# is correct
+goodlr.duplications.list.indices <- goodlr.duplications.list.indices[sample.names]
+
+## Now create a single table listing all of these raw CNVs in all samples, including the high variance samples
+num.raw.cnvs <- sapply(goodlr.duplications.list.indices, nrow)
+full.raw.cnv.indices <- do.call(rbind, goodlr.duplications.list.indices)
+full.raw.cnv.table <- data.frame(Sample = rep(names(num.raw.cnvs), num.raw.cnvs),
+                                 Chrom = rep(chrom, sum(num.raw.cnvs)),
+                                 # Get the CNV coordinates as positions rather than indices
+                                 matrix(hmm.positions[full.raw.cnv.indices] + rep(c(0,coverage.window), each = nrow(full.raw.cnv.indices)), ncol = 2)
+                                )
+colnames(full.raw.cnv.table)[3:4] <- c('Start', 'Stop')
+write.table(full.raw.cnv.table, paste(output.folder, '/full_raw_CNV_table_', chrom, '.csv', sep = ''), sep = '\t', row.names = F, quote = F)
+
+# Now remove the samples with high variance
+goodvar.goodlr.duplications.list.indices <- goodlr.duplications.list.indices[good.var.samples]
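+# (Note on the merge step below: merge.duplications is called with leeway = 1
+# because these CNVs are still expressed as window indices, so duplications whose
+# start and end indices each differ by at most one window fall into the same
+# CNV cluster.)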
+
+# Merge CNV clusters, but only use the samples with good variance values for this
+cat('Merging CNV clusters.\n')
+clustered.list <- merge.duplications(goodvar.goodlr.duplications.list.indices, 1)
+save.image(paste('temp.Rdata', sep = ''))
+
+# Order these data by CNV cluster, instead of by sample
+combined.clustered.list <- do.call(rbind, clustered.list)
+all.CNVs <- unique(combined.clustered.list[,3])
+# The names of the all.CNVs vector should be the names that we will give to the CNVs. We pad them with at
+# least 4 characters, or more if required.
+names(all.CNVs) <- paste('CNV_', chrom, str_pad(all.CNVs, max(c(4, max(nchar(all.CNVs)))), pad = '0'), sep = '')
+# List the data by CNV, instead of by sample.
+get.cnv.cluster <- function(cnv, cnv.table){
+    this.CNV.table <- subset(cnv.table, CNV_code == cnv)
+    # Unlike phase2, some CNV codes end up existing twice in the same sample (when the range in start and
+    # end positions ends up large enough). So the sample names can no longer be the row names of the CNV
+    # table
+    this.CNV.table$sample.name <- sub('\\..+', '', rownames(this.CNV.table))
+    this.CNV.table[,c('sample.name', 'start', 'end')]
+}
+duplications.by.cluster <- lapply(all.CNVs, get.cnv.cluster, combined.clustered.list)
+rm(all.CNVs)
+rm(combined.clustered.list)
+
+# Create a table showing the start and end points of these CNVs. For each start point, we take the median of
+# the start points of all the occurrences of that CNV. We also want a measure of the uncertainty of the values
+# of those medians, which we calculate as the range which each of the start and end points take.
+summarise.CNVs <- function(CNV.cluster){
+    start.point = ceiling(median(CNV.cluster$start))
+    end.point = floor(median(CNV.cluster$end))
+    start.05.quantile = floor(quantile(CNV.cluster$start, 0.05))
+    start.95.quantile = ceiling(quantile(CNV.cluster$start, 0.95))
+    end.05.quantile = floor(quantile(CNV.cluster$end, 0.05))
+    end.95.quantile = ceiling(quantile(CNV.cluster$end, 0.95))
+    num_samples = length(unique(CNV.cluster$sample.name))
+    setNames(c(start.point, end.point, start.05.quantile, start.95.quantile, end.05.quantile, end.95.quantile, num_samples),
+             c('start', 'end', 'start.05.quantile', 'start.95.quantile', 'end.05.quantile', 'end.95.quantile', 'num_samples'))
+}
+CNV.table <- data.frame(t(sapply(duplications.by.cluster, summarise.CNVs)))
+
+# For each duplications table, identify any genes that are encompassed by any duplications, with a minimum
+# number of windows providing useable coverage data.
+# First, convert the positions of the gene table into indices of windows that passed filtering.
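+# (The helper below returns four values per gene: the number of post-filtering
+# windows with coverage data, the number of windows the gene would contain had
+# none been filtered, and the first and last window indices; the indices are NA,
+# marking the gene ineligible, when fewer than min.win.cov of its potential
+# windows survived filtering.)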
+determine.eligible.gene.positions <- function(this.gene.coords, positions, cov.win, min.cov){
+    this.gene.start <- this.gene.coords[1]
+    this.gene.end <- this.gene.coords[2]
+    # Get the windows included in this gene
+    this.gene.covered.windows <- which((positions >= this.gene.start) & (positions <= this.gene.end - cov.win))
+    # Get the number of windows that would be included in this gene if none had been filtered
+    this.gene.full.windows <- floor(this.gene.end/cov.win) - ceiling(this.gene.start/cov.win)
+    # If there are no potential windows fully encompassed in the gene, declare the gene ineligible
+    if (this.gene.full.windows <= 0)
+        return(c(length(this.gene.covered.windows),
+                 0,
+                 NA,
+                 NA))
+    else if (length(this.gene.covered.windows) >= (min.cov * this.gene.full.windows))
+        return(c(length(this.gene.covered.windows),
+                 this.gene.full.windows,
+                 min(this.gene.covered.windows),
+                 max(this.gene.covered.windows)))
+    else
+        return(c(length(this.gene.covered.windows),
+                 this.gene.full.windows,
+                 NA,
+                 NA))
+}
+eligible.gene.position.indices <- cbind(gene.positions[, 'Chrom', drop = F], data.frame(t(matrix(apply(gene.positions[, c('start', 'end')], 1, determine.eligible.gene.positions, hmm.positions, coverage.window, min.win.cov), nrow = 4, dimnames = list(c('covered.windows', 'full.windows', 'start', 'end'))))))
+
+# Now look for encompassed genes
+cat('Finding genes encompassed within CNVs\n')
+# Note that the eligible.gene.position.indices table records the indices of the windows that are fully contained
+# inside each gene. Thus, implicitly, we are only requiring that these windows are in a cnv in order to call a
+# gene as duplicated. If the windows either side (overlapping the ends of the gene) are not in the cnv, this will
+# not prevent the duplication call.
+CNV.encompass.table <- find.features.encompassed(CNV.table, genepos.table = eligible.gene.position.indices, filtered.windows = hmm.positions, cov.win = coverage.window)
+# Create a list that captures the genes present in each CNV in a more effective way.
+CNV.encompass.list <- strsplit(CNV.encompass.table$Features, ';')
+names(CNV.encompass.list) <- rownames(CNV.encompass.table)
+CNV.encompass.table$has.detox <- sapply(CNV.encompass.list, function(x) any(x %in% detox.genes))
+# Create the full.CNV.table. Num_samples is not necessarily gambiae + arabiensis because there is one intermediate
+# sample.
+full.CNV.table <- data.frame(Chrom = chrom,
+                             Start = hmm.positions[CNV.table$start],
+                             End = hmm.positions[CNV.table$end] + coverage.window,
+                             Start_90_quantile_range = hmm.positions[CNV.table$start.95.quantile] - hmm.positions[CNV.table$start.05.quantile],
+                             End_90_quantile_range = hmm.positions[CNV.table$end.95.quantile] - hmm.positions[CNV.table$end.05.quantile],
+                             Num_samples = CNV.table$num_samples,
+                             Genes = CNV.encompass.table[rownames(CNV.table), 'Features'],
+                             Has_detox = CNV.encompass.table[rownames(CNV.table), 'has.detox'],
+                             row.names = rownames(CNV.table),
+                             stringsAsFactors = F)
+full.CNV.table$Genes[is.na(full.CNV.table$Genes)] <- ''
+full.CNV.table$Has_detox[is.na(full.CNV.table$Has_detox)] <- F
+for (s in good.var.samples)
+    full.CNV.table[[s]] <- sapply(duplications.by.cluster, function(x) s %in% x$sample.name)
+rm(CNV.encompass.table)
+
+write.table(full.CNV.table, paste(output.folder, '/full_coverage_CNV_table_', chrom, '.csv', sep = ''), sep = '\t', col.names = NA, quote = F)
+
+save.image(paste(output.folder, '/CNV_analysis_', chrom, '.Rdata', sep = ''))
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/find_failed_samples.r b/dockerfiles/CNV/R3.6.1/Rscripts/find_failed_samples.r
new file mode 100644
index 00000000..69e44f2f
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/find_failed_samples.r
@@ -0,0 +1,42 @@
+arg.values <- commandArgs(trailingOnly=T)
+cat('Arguments:', arg.values, '\n', sep = '\n')
+
+study.id <- arg.values[1]
+bampaths.table.filename <- arg.values[2]
+expected.coverage.files <- as.numeric(arg.values[3])
+expected.diagnostic.reads.files <- as.numeric(arg.values[4])
+output.filename <- arg.values[5]
+
+working.folder <- paste('~/personal', study.id, sep = '/')
+
+bampaths.table <- read.table(bampaths.table.filename, sep = '\t', header = F, row.names = 1)
+expected.samples <- setNames(nm = rownames(bampaths.table))
+
+coverage.folder <- paste(working.folder, 'coverage', sep = '/')
+all.coverage.files <- list.files(coverage.folder, recursive = T)
+csv.coverage.files <- all.coverage.files[grepl('.csv$', all.coverage.files)]
+matching.coverage.files <- sapply(expected.samples, function(x) sum(grepl(x, csv.coverage.files)))
+missing.coverage.samples <- matching.coverage.files[matching.coverage.files < expected.coverage.files]
+
+cat('\nThe following samples did not have the full complement of coverage output files (',
+    expected.coverage.files, ' files were expected per sample, the number observed for ',
+    'each failing sample is shown below).\n\n', sep = '')
+print(missing.coverage.samples)
+
+diagnostic.reads.folder <- paste(working.folder, 'diagnostic_reads', sep = '/')
+all.diagnostic.reads.files <- list.files(diagnostic.reads.folder, recursive = T)
+csv.diagnostic.reads.files <- all.diagnostic.reads.files[grepl('.csv$', all.diagnostic.reads.files)]
+matching.diagnostic.reads.files <- sapply(expected.samples, function(x) sum(grepl(x, csv.diagnostic.reads.files)))
+missing.diagnostic.reads.samples <- matching.diagnostic.reads.files[matching.diagnostic.reads.files < expected.diagnostic.reads.files]
+
+cat('\n\nThe following samples did not have the full complement of diagnostic read output files (',
+    expected.diagnostic.reads.files, ' files were expected per sample, the number observed for ',
+    'each failing sample is shown below).\n\n', sep = '')
+print(missing.diagnostic.reads.samples)
+
+all.missing.samples <- sort(unique(c(names(missing.coverage.samples), names(missing.diagnostic.reads.samples))))
+cat('\n\nOverall,', length(all.missing.samples), 'samples were missing at least one output file.\n')
+
+missing.samples.bampaths <- bampaths.table[all.missing.samples, , drop = F]
+write.table(missing.samples.bampaths, output.filename, sep = '\t', col.names = F, row.names = T, quote = F)
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/modal_CNVs.sh b/dockerfiles/CNV/R3.6.1/Rscripts/modal_CNVs.sh
new file mode 100755
index 00000000..d679c7cc
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/modal_CNVs.sh
@@ -0,0 +1,21 @@
+coveragefolder=$1
+manifest=$2
+output_name=$3
+coverage_variance_file=$4
+metadata=$5
+outputfolder=$6
+
+gene_coordinates_file=~/personal/phase3_data/tables/gene_regions.csv
+scriptsfolder=~/scripts/CNV_scripts/scripts
+
+R --version
+
+R --slave -f $scriptsfolder/modal_cnv.r --args $manifest \
+                                               $gene_coordinates_file \
+                                               $metadata \
+                                               $output_name \
+                                               $coverage_variance_file \
+                                               $coveragefolder \
+                                               $outputfolder \
+                                               > $outputfolder/modal_CNV_${output_name}.log 2>&1
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/modal_CNVs_vobs.sh b/dockerfiles/CNV/R3.6.1/Rscripts/modal_CNVs_vobs.sh
new file mode 100755
index 00000000..4c06c48b
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/modal_CNVs_vobs.sh
@@ -0,0 +1,21 @@
+coveragefolder=$1
+manifest=$2
+output_name=$3
+coverage_variance_file=$4
+metadata=$5
+outputfolder=$6
+gene_coordinates_file=$7
+
+scriptsfolder=~/scripts/CNV_scripts/scripts
+
+R-3.6.1 --version
+
+R-3.6.1 --slave -f $scriptsfolder/modal_cnv.r --args $manifest \
+                                                     $gene_coordinates_file \
+                                                     $metadata \
+                                                     $output_name \
+                                                     $coverage_variance_file \
+                                                     $coveragefolder \
+                                                     $outputfolder \
+                                                     > $outputfolder/modal_CNV_${output_name}.log 2>&1
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/modal_cnv.r b/dockerfiles/CNV/R3.6.1/Rscripts/modal_cnv.r
new file mode 100644
index 00000000..3359c3ba
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/modal_cnv.r
@@ -0,0 +1,84 @@
+
+arg.values <- commandArgs(trailingOnly=T)
+cat('Arguments:', arg.values, '\n', sep = '\n')
+
+chroms = c('2L', '2R', '3L', '3R', 'X')
+
+threshold.variance <- 0.2
+
+# Get the sample manifest
+sample.list <- arg.values[1]
+sample.names <- setNames(nm = read.table(sample.list, stringsAsFactors = F)[[1]])
+
+# Get the table of gene coordinates
+gene.regions.file <- arg.values[2]
+all.gene.coordinates <- read.table(gene.regions.file, sep = '\t', header = T, row.names = 1)
+gene.coordinates.list <- lapply(split(all.gene.coordinates, all.gene.coordinates$Chrom)[chroms], function(x) x[, c('start', 'end')])
+
+# Get the metadata
+meta.file <- arg.values[3]
+meta <- read.table(meta.file, sep = '\t', header = T, row.names = 1, quote = '"', comment.char = '', colClasses = c(sex_call = 'factor'))[sample.names, ]
+expected.copy.number.on.X <- c(2, 1)[(meta$sex_call == 'M') + 1]
+
+# Get the species name (only used to name the output file)
+species <- arg.values[4]
+
+# Get the coverage variance data
+cov.var.file <- arg.values[5]
+cov.var <- read.table(cov.var.file, header = T, sep = '\t', row.names = 1)
+high.var.samples <- intersect(sample.names, rownames(cov.var)[cov.var$autosomes > threshold.variance])
+
+# Get the folder in which the HMM output is stored
+coverage.folder <- arg.values[6]
+
+# Get the output folder
+output.folder <- arg.values[7]
+dir.create(output.folder, showWarnings = FALSE)
+
+# Function to load up each file and store the results in a list
+load.hmm.files <- function(sample.names, chrom){
+    folder <- paste(coverage.folder, chrom, 'HMM_output', sep = '/')
+    all.files <- setNames(paste(folder, '/', sample.names, '_', chrom, '_HMM.csv', sep = ''), sample.names)
+    output.table <- as.data.frame(lapply(all.files, function(fn) read.table(fn, header = T, row.names = 1)[, 'CNV', drop = F]))
+    colnames(output.table) <- sample.names
+    output.table$Position <- as.numeric(rownames(output.table))
+    output.table
+}
+
+# Functions that will calculate the mode of the HMM coverage in a given region
+calculate.mode <- function(copy.number.vector){
+    # We calculate the mode. Where there is more than one mode, we take the smallest one
+    un <- unique(copy.number.vector)
+    tab <- tabulate(match(copy.number.vector, un))
+    copy.number.mode <- min(un[tab == max(tab)])
+    copy.number.mode
+}
+
+get.gene.mode <- function(target.region, hmm.data, window.size = 300){
+    target.region <- as.numeric(target.region)
+    hmm.in.region <- subset(hmm.data, Position >= (target.region[1] - window.size) & Position < target.region[2])[, sample.names, drop = F]
+    gene.mode <- apply(hmm.in.region, 2, calculate.mode)
+    gene.mode
+}
+
+modal.copy.number.change <- function(chrom, expected.copy.number){
+    cat(chrom, '\n')
+    cat('\tLoading hmm output files.\n')
+    full.table <- load.hmm.files(sample.names, chrom)
+    cat('\tCalculating modal copy number change.\n')
+    copy.number.change <- apply(gene.coordinates.list[[chrom]], 1, get.gene.mode, hmm.data = full.table) - expected.copy.number
+    if (!is.matrix(copy.number.change))
+        copy.number.change <- matrix(copy.number.change, nrow = 1, dimnames = list(sample.names, names(copy.number.change)))
+    copy.number.change
+}
+
+modal.copy.number <- data.frame(do.call(cbind, mapply(modal.copy.number.change, chroms, list(2, 2, 2, 2, expected.copy.number.on.X))))
+
+modal.copy.number <- cbind(meta[rownames(modal.copy.number), 'sex_call', drop = F],
+                           data.frame(high_variance = rownames(modal.copy.number) %in% high.var.samples),
+                           modal.copy.number[, order(colnames(modal.copy.number))])
+
+# Write that table to file
+output.fn = paste(output.folder, '/modal_copy_number_', species, '.csv', sep = '')
+write.table(modal.copy.number, file = output.fn, sep = '\t', col.names = NA, quote = F)
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/plotting_functions.r b/dockerfiles/CNV/R3.6.1/Rscripts/plotting_functions.r
new file mode 100644
index 00000000..dff0b8dc
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/plotting_functions.r
@@ -0,0 +1,407 @@
+library(RColorBrewer)
+
+# Create a vector of colours to use for plotting.
+# The last two CNVs are named Dup1a and Dup1b, for the Cyp6 cluster
+n.colours <- 30
+duplication.colours <- c(colorRampPalette(brewer.pal(12, 'Paired'))(n.colours), 'grey30', 'grey70')
+#duplication.colours <- c(rgb(0.6509804, 0.8078431, 0.8901961), rgb(0.1215686, 0.4705882, 0.7058824), rgb(0.6980392, 0.8745098, 0.5411765), rgb(0.2, 0.627451, 0.172549), rgb(0.9843137, 0.6039216, 0.6), rgb(0.8901961, 0.1019608, 0.1098039), rgb(0.9921569, 0.7490196, 0.4352941), rgb(1, 0.4980392, 0), rgb(0.7921569, 0.6980392, 0.8392157), rgb(0.4156863, 0.2392157, 0.6039216), rgb(1,0,1,0.7), rgb(0.6941176, 0.3490196, 0.1568627), rgb(0.5,0.5,0,0.8), rgb(0,0.5,0.5,0.8), rgb(0.5, 0, 0.5, 0.8), 'yellow', 'lightblue', 'pink', 'orange', 'lightgreen', 'grey30', 'grey70')
+names(duplication.colours) <- paste('Dup', c(as.character(1:n.colours), '1a', '1b'), sep = '')
+gene.colours <- c('black', 'purple', 'orange', 'magenta', 'brown', 'green', 'violet', 'red', 'blue')
+
+# Create a function that can be used to plot discordant read pairs
+add.diagnostics <- function(coordinates, this.col = 'red', yrange = c(0,1)){
+    coordinates <- as.matrix(coordinates)
+    n <- nrow(coordinates)
+    co <- ncol(coordinates)
+    jitter.values <- seq(yrange[1], yrange[2], length.out = n)
+    points(coordinates, matrix(jitter.values, nrow = n, ncol = co), col = this.col)
+    if (co == 2)
+        segments(coordinates[,1], jitter.values, coordinates[,2], jitter.values, col = this.col)
+}
+
+plot.ace1 <- function(this.sample, list.of.dups = NULL, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$ace1[1], end.pos = plotting.ranges$ace1[2]){
+    start.index <- which(compact.hmm$ace1[[this.sample]]$Position >= start.pos)[1]
+    end.index <- tail(which(compact.hmm$ace1[[this.sample]]$Position <= end.pos), 1)
+    plot(compact.hmm$ace1[[this.sample]]$Position[start.index : end.index], compact.hmm$ace1[[this.sample]]$Normalised_coverage[start.index : end.index], main = this.sample, ylim = c(0,12))
+    # We highlight along the x axis where the duplications and deletions are as predicted by the FA and FM reads
+    if (!is.null(list.of.dups)){
+        if (list.of.dups['Dup1'])
+            rect(known.cnvs$ace1$Dup1$BP$pos[1], -0.3, known.cnvs$ace1$Dup1$BP$pos[2], 0, col = rgb(1, 0, 0, 0.8), border = rgb(1, 0, 0, 0.8))
+        if (list.of.dups['Dup2'])
+            rect(3448100, -0.3, 3518750, 0, col = rgb(0, 1, 0, 0.8), border = rgb(0, 1, 0, 0.8))
+        if (list.of.dups['Del1'])
+            rect(3502000, -0.3, 3598900, 0, col = rgb(0, 0, 1, 0.8), border = rgb(0, 0, 1, 0.8))
+        if (list.of.dups['Del2'])
+            rect(3539450, -0.3, 3573600, 0, col = rgb(1, 0, 1, 0.8), border = rgb(1, 0, 1, 0.8))
+        if (list.of.dups['Del3'])
+            rect(3536000, -0.3, 3618850, 0, col = rgb(1, 1, 0, 0.8), border = rgb(1, 1, 0, 0.8))
+        if (list.of.dups['Del4'])
+            rect(3512500, -0.3, 3616000, 0, col = rgb(0, 1, 1, 0.8), border = rgb(0, 1, 1, 0.8))
+    }
+    discordant.colours <- c(FA = 'blue', SS = 'cyan', FM = 'green', XC = 'red')
+    y.ranges <- list(FA = c(0,3), SS = c(3,6), FM = c(6,9), XC = c(9,12))
+    for (d in diagnostics){
+        if (d == 'BP'){
+            add.diagnostics(diagnostic.reads$ace1[[this.sample]][[d]]$CEP$Position, this.col = 'brown', yrange = c(0,12))
+            add.diagnostics(diagnostic.reads$ace1[[this.sample]][[d]]$CSP$Position, this.col = 'pink', yrange = c(0,12))
+        }
+        else if (d == 'XC')
+            add.diagnostics(diagnostic.reads$ace1[[this.sample]][[d]]$Position, this.col = discordant.colours[d], yrange = y.ranges[[d]])
+        else
+            add.diagnostics(diagnostic.reads$ace1[[this.sample]][[d]], this.col = discordant.colours[d], yrange = y.ranges[[d]])
+    }
+    these.cnv.states <- compact.hmm$ace1[[this.sample]]$CNV[start.index : end.index]
+    lines(compact.hmm$ace1[[this.sample]]$Position[start.index : end.index], these.cnv.states, col = 2, lwd = 2)
+    abline(v = gene.coords$ace1[, c('start', 'end')])
+    text(apply(gene.coords$ace1[, c('start', 'end')], 1, mean), 11, rownames(gene.coords$ace1), srt=90, adj = 0)
+}
+
+plot.all.ace1 <- function(list.of.samples, matrix.of.read.dups = read.based.cnvs$ace1, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$ace1[1], end.pos = plotting.ranges$ace1[2]){
+    x11()
+    x.midpoint <- mean(c(start.pos, end.pos))
+    i <- 1
+    while(1){
+        if (i < 1)
+            i <- length(list.of.samples)
+        this.sample <- list.of.samples[i]
+        if (is.null(matrix.of.read.dups))
+            plot.ace1(this.sample, NULL, diagnostics, start.pos, end.pos)
+        else
+            plot.ace1(this.sample, matrix.of.read.dups[this.sample,], diagnostics, start.pos, end.pos)
+        x <- locator(1)$x
+        if (x <= x.midpoint)
+            i <- i-1
+        else
+            i <- i+1
+    }
+}
+
+plot.cyp6 <- function(this.sample, list.of.dups = NULL, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$cyp6[1], end.pos = plotting.ranges$cyp6[2]){
+    start.index <- which(compact.hmm$cyp6[[this.sample]]$Position >= start.pos)[1]
+    end.index <- tail(which(compact.hmm$cyp6[[this.sample]]$Position <= end.pos), 1)
+    plot(compact.hmm$cyp6[[this.sample]]$Position[start.index : end.index], compact.hmm$cyp6[[this.sample]]$Normalised_coverage[start.index : end.index], main = this.sample, ylim = c(0,12))
+    # We highlight along the x axis where the duplications are as predicted by the FA and SS reads
+    if (!is.null(list.of.dups)){
+        Dup.order <- paste('Dup', c(as.character(c(30:20, 19, 14, 15, 13:11, 18, 10:7, 17, 6:2)), '1a', '1b'), sep = '')
+        list.of.dups <- list.of.dups[Dup.order]
+        for (d in names(list.of.dups)[list.of.dups]){
+            if (d == 'Dup15'){
+                rect(known.cnvs$cyp6[[d]]$BP$pos[1], -0.6, 28555300, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(28555300, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup23'){
+                rect(28450000, -0.6, known.cnvs$cyp6[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp6[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup27'){
+                rect(28496700, -0.6, 28499200, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(28499200, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else{
+                rect(known.cnvs$cyp6[[d]]$BP$pos[1], -0.6, known.cnvs$cyp6[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp6[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+        }
+    }
+    discordant.colours <- c(FA = 'blue', SS = 'cyan', FM = 'green', XC = 'red')
+    y.ranges <- list(FA = c(0,3), SS = c(3,6), FM = c(6,9), XC = c(9,12))
+    for (d in diagnostics){
+        if (d == 'BP'){
+            add.diagnostics(diagnostic.reads$cyp6[[this.sample]][[d]]$CEP$Position, this.col = 'brown', yrange = c(0,12))
+            add.diagnostics(diagnostic.reads$cyp6[[this.sample]][[d]]$CSP$Position, this.col = 'pink', yrange = c(0,12))
+        }
+        else if (d == 'XC')
+            add.diagnostics(diagnostic.reads$cyp6[[this.sample]][[d]]$Position, this.col = discordant.colours[d], yrange = y.ranges[[d]])
+        else
+            add.diagnostics(diagnostic.reads$cyp6[[this.sample]][[d]], this.col = discordant.colours[d], yrange = y.ranges[[d]])
+    }
+    these.cnv.states <- compact.hmm$cyp6[[this.sample]]$CNV[start.index : end.index]
+    lines(compact.hmm$cyp6[[this.sample]]$Position[start.index : end.index], these.cnv.states, col = 2, lwd = 2)
+    # Add the genes to the plot
+    these.gene.colours <- gene.colours[1:nrow(gene.coords$cyp6)]
+    abline(v = unlist(gene.coords$cyp6[, c('start', 'end')]), col = rep(these.gene.colours, 2))
+    text(apply(gene.coords$cyp6[, c('start', 'end')], 1, mean), 9, rownames(gene.coords$cyp6), srt=90, adj = 0, col = these.gene.colours)
+}
+
+plot.all.cyp6 <- function(list.of.samples, matrix.of.read.dups = read.based.cnvs$cyp6, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$cyp6[1], end.pos = plotting.ranges$cyp6[2]){
+    x11()
+    x.midpoint <- mean(c(start.pos, end.pos))
+    i <- 1
+    while(1){
+        if (i < 1)
+            i <- length(list.of.samples)
+        this.sample <- list.of.samples[i]
+        if (is.null(matrix.of.read.dups))
+            plot.cyp6(this.sample, NULL, diagnostics, start.pos, end.pos)
+        else
+            plot.cyp6(this.sample, matrix.of.read.dups[this.sample,], diagnostics, start.pos, end.pos)
+        x <- locator(1)$x
+        if (x <= x.midpoint)
+            i <- i-1
+        else
+            i <- i+1
+    }
+}
+
+plot.cyp6mz <- function(this.sample, list.of.dups = NULL, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$cyp6mz[1], end.pos = plotting.ranges$cyp6mz[2]){
+    start.index <- which(compact.hmm$cyp6mz[[this.sample]]$Position >= start.pos)[1]
+    end.index <- tail(which(compact.hmm$cyp6mz[[this.sample]]$Position <= end.pos), 1)
+    plot(compact.hmm$cyp6mz[[this.sample]]$Position[start.index : end.index], compact.hmm$cyp6mz[[this.sample]]$Normalised_coverage[start.index : end.index], main = this.sample, ylim = c(0,12))
+    # We highlight along the x axis where the duplications are as predicted by the FA and SS reads
+    list.of.dups <- list.of.dups[-1]
+    if (!is.null(list.of.dups)){
+        these.duplication.colours <- setNames(duplication.colours[1:length(list.of.dups)], names(list.of.dups))
+        for (d in names(list.of.dups)[list.of.dups]){
+            if (d == 'Dupz2'){
+                rect(6975100, -0.6, known.cnvs$cyp6mz$Dupz2$BP$pos[2], 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(known.cnvs$cyp6mz$Dupz2$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dupz3'){
+                rect(known.cnvs$cyp6mz$Dupz3$BP$pos[1], -0.6, 6978100, 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(6978100, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dupz5'){
+                rect(6969800, -0.6, 6976000, 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(6976000, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dupm1'){
+                rect(known.cnvs$cyp6mz$Dupm1$BP$pos[1], -0.6, 6930500, 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(6930500, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dupm2'){
+                rect(6933100, -0.6, 6935200, 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(6935200, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dupm3'){
+                rect(6929600, -0.6, 6932800, 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(6932800, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dupmz1'){
+                rect(6904000, -0.6, 6982700, 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(6982700, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else{
+                rect(known.cnvs$cyp6mz[[d]]$BP$pos[1], -0.6, known.cnvs$cyp6mz[[d]]$BP$pos[2], 0, col = these.duplication.colours[d], border = these.duplication.colours[d])
+                text(known.cnvs$cyp6mz[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+        }
+    }
+    discordant.colours <- c(FA = 'blue', SS = 'cyan', FM = 'green', XC = 'red')
+    y.ranges <- list(FA = c(0,3), SS = c(3,6), FM = c(6,9), XC = c(9,12))
+    for (d in diagnostics){
+        if (d == 'BP'){
+            add.diagnostics(diagnostic.reads$cyp6mz[[this.sample]][[d]]$CEP$Position, this.col = 'brown', yrange = c(0,12))
+            add.diagnostics(diagnostic.reads$cyp6mz[[this.sample]][[d]]$CSP$Position, this.col = 'pink', yrange = c(0,12))
+        }
+        else if (d == 'XC')
+            add.diagnostics(diagnostic.reads$cyp6mz[[this.sample]][[d]]$Position, this.col = discordant.colours[d], yrange = y.ranges[[d]])
+        else
+            add.diagnostics(diagnostic.reads$cyp6mz[[this.sample]][[d]], this.col = discordant.colours[d], yrange = y.ranges[[d]])
+    }
+    these.cnv.states <- compact.hmm$cyp6mz[[this.sample]]$CNV[start.index : end.index]
+    lines(compact.hmm$cyp6mz[[this.sample]]$Position[start.index : end.index], these.cnv.states, col = 2, lwd = 2)
+    # Add the genes to the plot
+    these.gene.colours <- gene.colours[1:nrow(gene.coords$cyp6mz)]
+    abline(v = unlist(gene.coords$cyp6mz[, c('start', 'end')]), col = rep(these.gene.colours, 2))
+    text(apply(gene.coords$cyp6mz[, c('start', 'end')], 1, mean), 9, rownames(gene.coords$cyp6mz), srt=90, adj = 0, col = these.gene.colours)
+}
+
+plot.all.cyp6mz <- function(list.of.samples, matrix.of.read.dups = read.based.cnvs$cyp6mz, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$cyp6mz[1], end.pos = plotting.ranges$cyp6mz[2]){
+    x11()
+    x.midpoint <- mean(c(start.pos, end.pos))
+    i <- 1
+    while(1){
+        if (i < 1)
+            i <- length(list.of.samples)
+        this.sample <- list.of.samples[i]
+        if (is.null(matrix.of.read.dups))
+            plot.cyp6mz(this.sample, NULL, diagnostics, start.pos, end.pos)
+        else
+            plot.cyp6mz(this.sample, matrix.of.read.dups[this.sample,], diagnostics, start.pos, end.pos)
+        x <- locator(1)$x
+        if (x <= x.midpoint)
+            i <- i-1
+        else
+            i <- i+1
+    }
+}
+
+plot.gste <- function(this.sample, list.of.dups = NULL, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$gste[1], end.pos = plotting.ranges$gste[2]){
+    start.index <- which(compact.hmm$gste[[this.sample]]$Position >= start.pos)[1]
+    end.index <- tail(which(compact.hmm$gste[[this.sample]]$Position <= end.pos), 1)
+    plot(compact.hmm$gste[[this.sample]]$Position[start.index : end.index], compact.hmm$gste[[this.sample]]$Normalised_coverage[start.index : end.index], main = this.sample, ylim = c(0,12))
+    # We highlight along the x axis where the duplications are as predicted by the FA and SS reads
+    if (!is.null(list.of.dups)){
+        # The order in which we plot Dups is important for clarity (we want smaller ones on top of larger ones)
+        list.of.dups <- rev(list.of.dups[-1])
+        for (d in names(list.of.dups)[list.of.dups]){
+            if (d == 'Dup13'){
+                rect(known.cnvs$gste[[d]]$BP$pos[1], -0.6, 28599287, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(28599287, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else{
+                rect(known.cnvs$gste[[d]]$BP$pos[1], -0.6, known.cnvs$gste[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$gste[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+        }
+    }
+    discordant.colours <- c(FA = 'blue', SS = 'cyan', FM = 'green', XC = 'red')
+    y.ranges <- list(FA = c(0,3), SS = c(3,6), FM = c(6,9), XC = c(9,12))
+    for (d in diagnostics){
+        if (d == 'BP'){
+            add.diagnostics(diagnostic.reads$gste[[this.sample]][[d]]$CEP$Position, this.col = 'brown', yrange = c(0,12))
+            add.diagnostics(diagnostic.reads$gste[[this.sample]][[d]]$CSP$Position, this.col = 'pink', yrange = c(0,12))
+        }
+        else if (d == 'XC')
+            add.diagnostics(diagnostic.reads$gste[[this.sample]][[d]]$Position, this.col = discordant.colours[d], yrange = y.ranges[[d]])
+        else
+            add.diagnostics(diagnostic.reads$gste[[this.sample]][[d]], this.col = discordant.colours[d], yrange = y.ranges[[d]])
+    }
+    these.cnv.states <- compact.hmm$gste[[this.sample]]$CNV[start.index : end.index]
+    lines(compact.hmm$gste[[this.sample]]$Position[start.index : end.index], these.cnv.states, col = 2, lwd = 2)
+    # Add the genes to the plot
+    these.gene.colours <- gene.colours[1:nrow(gene.coords$gste)]
+    abline(v = unlist(gene.coords$gste[, c('start', 'end')]), col = rep(these.gene.colours, 2))
+    text(apply(gene.coords$gste[, c('start', 'end')], 1, mean), 9, rownames(gene.coords$gste), srt=90, adj = 0, col = these.gene.colours)
+}
+
+plot.all.gste <- function(list.of.samples, matrix.of.read.dups = read.based.cnvs$gste, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$gste[1], end.pos = plotting.ranges$gste[2]){
+    x11()
+    x.midpoint <- mean(c(start.pos, end.pos))
+    i <- 1
+    while(1){
+        if (i < 1)
+            i <- length(list.of.samples)
+        this.sample <- list.of.samples[i]
+        if (is.null(matrix.of.read.dups))
+            plot.gste(this.sample, NULL, diagnostics, start.pos, end.pos)
+        else
+            plot.gste(this.sample, matrix.of.read.dups[this.sample,], diagnostics, start.pos, end.pos)
+        x <- locator(1)$x
+        if (x <= x.midpoint)
+            i <- i-1
+        else
+            i <- i+1
+    }
+}
+
+plot.cyp9k1 <- function(this.sample, list.of.dups = NULL, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$cyp9k1[1], end.pos = plotting.ranges$cyp9k1[2]){
+    start.index <- which(compact.hmm$cyp9k1[[this.sample]]$Position >= start.pos)[1]
+    end.index <- tail(which(compact.hmm$cyp9k1[[this.sample]]$Position <= end.pos), 1)
+    plot(compact.hmm$cyp9k1[[this.sample]]$Position[start.index : end.index], compact.hmm$cyp9k1[[this.sample]]$Normalised_coverage[start.index : end.index], main = this.sample, ylim = c(0,12))
+    # We highlight along the x axis where the duplications are as predicted by the FA and SS reads
+    if (!is.null(list.of.dups)){
+        # The order in which we plot Dups is important for clarity (we want smaller ones on top of larger ones)
+        Dup.order <- paste('Dup', as.character(c(28:1, 17)), sep = '')
+        list.of.dups <- list.of.dups[Dup.order]
+        for (d in names(list.of.dups)[list.of.dups]){
+            if (d == 'Dup3'){
+                rect(15240464, -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup7'){
+                rect(15238000, -0.6, 15246300, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(15246300, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup14'){
+                rect(15233807, -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup15'){
+                rect(15233807, -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup16'){
+                rect(15222810, -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup18'){
+                rect(15236100, -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup24'){
+                rect(15238550, -0.6, 15255100, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(15255100, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup25'){
+                rect(15223500, -0.6, 15246650, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(15246650, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup26'){
+                rect(15222700, -0.6, 15248050, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(15248050, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup27'){
+                rect(known.cnvs$cyp9k1[[d]]$BP$pos[1], -0.6, 15248650, 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(15248650, -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else if (d == 'Dup28'){
+                rect(15235000, -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+            else{
+                rect(known.cnvs$cyp9k1[[d]]$BP$pos[1], -0.6, known.cnvs$cyp9k1[[d]]$BP$pos[2], 0, col = duplication.colours[d], border = duplication.colours[d])
+                text(known.cnvs$cyp9k1[[d]]$BP$pos[2], -0.4, d, cex = 1.2, adj = c(1,0))
+            }
+        }
+    }
+    discordant.colours <- c(FA = 'blue', SS = 'cyan', FM = 'green', XC = 'red')
+    y.ranges <- list(FA = c(0,3), SS = c(3,6), FM = c(6,9), XC = c(9,12))
+    for (d in diagnostics){
+        if (d == 'BP'){
+            add.diagnostics(diagnostic.reads$cyp9k1[[this.sample]][[d]]$CEP$Position, this.col = 'brown', yrange = c(0,12))
+            add.diagnostics(diagnostic.reads$cyp9k1[[this.sample]][[d]]$CSP$Position, this.col = 'pink', yrange = c(0,12))
+        }
+        else if (d == 'XC')
+            add.diagnostics(diagnostic.reads$cyp9k1[[this.sample]][[d]]$Position, this.col = discordant.colours[d], yrange = y.ranges[[d]])
+        else
+            add.diagnostics(diagnostic.reads$cyp9k1[[this.sample]][[d]], this.col = discordant.colours[d], yrange = y.ranges[[d]])
+    }
+    these.cnv.states <- compact.hmm$cyp9k1[[this.sample]]$CNV[start.index : end.index]
+    lines(compact.hmm$cyp9k1[[this.sample]]$Position[start.index : end.index], these.cnv.states, col = 2, lwd = 2)
+    # Add the genes to the plot
+    abline(v = c(gene.coords$cyp9k1[, c('start', 'end')]))
+    text(apply(gene.coords$cyp9k1[, c('start', 'end')], 1, mean), 11, rownames(gene.coords$cyp9k1), srt=90, adj = 0)
+}
+
+plot.all.cyp9k1 <- function(list.of.samples, matrix.of.read.dups = read.based.cnvs$cyp9k1, diagnostics = c('FA','SS','FM','XC','BP'), start.pos = plotting.ranges$cyp9k1[1], end.pos = plotting.ranges$cyp9k1[2]){
+    x11()
+    x.midpoint <- mean(c(start.pos, end.pos))
+    i <- 1
+    while(1){
+        if (i < 1)
+            i <- length(list.of.samples)
+        this.sample <- list.of.samples[i]
+        if (is.null(matrix.of.read.dups))
+            plot.cyp9k1(this.sample, NULL, diagnostics, start.pos, end.pos)
+        else
+            plot.cyp9k1(this.sample, matrix.of.read.dups[this.sample,], diagnostics, start.pos, end.pos)
+        x <- locator(1)$x
+        if (x <= x.midpoint)
+            i <- i-1
+        else
+            i <- i+1
+    }
+}
+
+simple.plot <- function(sample.names, gene.cluster, smoothing = 5, maxy = 12, ...){
+    these.coords <- plotting.ranges[[gene.cluster]]
+    plot(c(min(these.coords), max(these.coords)), c(0,maxy), type = 'n', bty = 'n', xaxt = 'n', xlab = '', ylab = 'Normalised coverage')
+    axis(1, lwd = 0, mgp = c(0,0.25,2))
+    mtext(paste('Position on chromosome', unique(gene.coords[[gene.cluster]]$Chrom)), 1, 1.75)
+    these.gene.colours <- gene.colours[1:nrow(gene.coords[[gene.cluster]])]
+    abline(v = unlist(gene.coords[[gene.cluster]][, c('start', 'end')]), col = rep(these.gene.colours, 2))
+    text(apply(gene.coords[[gene.cluster]][, c('start', 'end')], 1, mean), 9, rownames(gene.coords[[gene.cluster]]), srt=90, adj = 0, col = these.gene.colours)
+    smoothed.line <- function(s){
+        these.counts <- subset(compact.hmm[[gene.cluster]][[s]], Position >= these.coords[1] & Position <= these.coords[2])
+        # If needed, create a table of smoothed data here
+        if (smoothing > 1){
+            if (smoothing > nrow(these.counts))
+                stop('Fail. Smoothing window size is larger than the number of points to smooth over.')
+            these.counts <- t(sapply(1:(nrow(these.counts) - smoothing + 1), function(j) apply(these.counts[j:(j + smoothing - 1), c('Position', 'Normalised_coverage')], 2, mean)))
+        }
+        lines(these.counts[, 'Position'], these.counts[, 'Normalised_coverage'])
+    }
+    empty <- sapply(sample.names, smoothed.line)
+}
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis.r b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis.r
new file mode 100644
index 00000000..adcc93f0
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis.r
@@ -0,0 +1,638 @@
+library(stringr)
+library(stringi)
+library(Biostrings)
+library(parallel)
+
+arg.values <- commandArgs(trailingOnly=T)
+cat('Arguments:', arg.values, '\n', sep = '\n')
+
+threshold.variance <- 0.2
+
+sample.list <- arg.values[1]
+sample.names <- setNames(nm = read.table(sample.list, stringsAsFactors = F)[[1]])
+
+gene.regions.file <- arg.values[2]
+all.gene.coordinates <- read.table(gene.regions.file, sep = '\t', header = T, row.names = 1)
+
+meta.file <- arg.values[3]
+meta <- read.table(meta.file, sep = '\t', header = T, row.names = 1, quote = '"', comment.char = '')[sample.names, ]
+expected.copy.number.on.X <- c(2, 1)[(meta$sex_call == 'M') + 1]
+
+# Get the species calls
+species.calls.file <- arg.values[4]
+species.calls <- read.table(species.calls.file, sep = '\t', header = T, row.names = 1)
+
+cov.var.file <- arg.values[5]
+cov.var <- read.table(cov.var.file, header = T, sep = '\t', row.names = 1)
+high.var.samples <- intersect(sample.names, rownames(cov.var)[cov.var$autosomes > threshold.variance])
+
+coverage.folder <- arg.values[6]
+diagnostic.reads.folder <- arg.values[7]
+plotting.functions.file <- arg.values[8]
+num.cores <- as.numeric(arg.values[9])
+
+# Function to load up each file and store the results in a list
+load.hmm.file <- function(sample.name, folder = '.'){
+    all.files <- list.files(folder)
+    # Find the file corresponding to this sample
+    if (length(grep(sample.name, all.files, value = T)) == 0)
+        stop(paste('No matching files were found for sample', sample.name))
+    this.file <- paste(folder, grep(sample.name, all.files, value = T), sep = '/')
+    if (length(this.file) > 1)
+        stop(paste('More than one matching file was found for sample', sample.name))
+    # Load the file
+    cat('\t', this.file, '\n', sep = '')
+    read.table(this.file, header = T)
+}
+
+# Function that will take a table of HMM output and shrink it to contain only a desired region
+shrink.compact.hmm <- function(region.coords, input.hmm){
+    subset(input.hmm, (Position >= region.coords[1]) & (Position <= region.coords[2]))
+}
+
+# In order to process each hmm file only once, our function for multi-threading needs to output all
+# objects needed from each hmm file
+process_2R_regions <- function(sample.name){
+    hmm.table <- load.hmm.file(sample.name, paste(coverage.folder, '2R/HMM_output', sep = '/'))
+    output.list <- list(ace1 = shrink.compact.hmm(region.coords$ace1, hmm.table),
+                        cyp6 = shrink.compact.hmm(region.coords$cyp6, hmm.table)
+                       )
+    return(output.list)
+}
+
+process_3R_regions <- function(sample.name){
+    hmm.table <- load.hmm.file(sample.name, paste(coverage.folder, '3R/HMM_output', sep = '/'))
+    output.list <- list(cyp6mz = shrink.compact.hmm(region.coords$cyp6mz, hmm.table),
+                        gste = shrink.compact.hmm(region.coords$gste, hmm.table)
+                       )
+    return(output.list)
+}
+
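+# (Like the 2R and 3R helpers above, the helper below loads one sample's HMM
+# table and cuts it down to the target region; for example,
+# process_2R_regions(sample.names[1])$ace1 would give the Ace1-region HMM table
+# for the first sample. The helpers are applied across samples with mclapply
+# further down.)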
+process_X_regions <- function(sample.name){
+    hmm.table <- load.hmm.file(sample.name, paste(coverage.folder, 'X/HMM_output', sep = '/'))
+    return(shrink.compact.hmm(region.coords$cyp9k1, hmm.table))
+}
+
+# Function that will calculate the mode of the HMM coverage in a given region
+get.gene.mode <- function(hmm.data, target.region, window.size = 300){
+    target.region <- as.numeric(target.region)
+    hmm.in.region <- subset(hmm.data, Position >= (target.region[1] - window.size) & Position < target.region[2])$CNV
+    # We calculate the mode. Where there is more than one mode, we take the smallest one
+    un <- unique(hmm.in.region)
+    tab <- tabulate(match(hmm.in.region, un))
+    hmm.mode <- min(un[tab == max(tab)])
+    hmm.mode
+}
+
+# Get the Agap numbers of the genes of interest, grouped by the cluster to which they belong.
+focal.genes <- list(ace1 = c(Ace1 = 'AGAP001356'),
+                    cyp6 = c(Cyp6aa1 = 'AGAP002862',
+                             Cyp6aa2 = 'AGAP013128',
+                             Coeae6o = 'AGAP002863',
+                             AGAP002864 = 'AGAP002864',
+                             Cyp6p1 = 'AGAP002868',
+                             Cyp6p2 = 'AGAP002869',
+                             Cyp6p3 = 'AGAP002865',
+                             Cyp6p4 = 'AGAP002867',
+                             Cyp6p5 = 'AGAP002866',
+                             Cyp6ad1 = 'AGAP002870'),
+                    cyp6mz = c(Cyp6m2 = 'AGAP008212',
+                               Cyp6m3 = 'AGAP008213',
+                               Cyp6m4 = 'AGAP008214',
+                               Cyp6z1 = 'AGAP008219',
+                               Cyp6z2 = 'AGAP008218',
+                               Cyp6z3 = 'AGAP008217'),
+                    gste = c(Gste1 = 'AGAP009195',
+                             Gste2 = 'AGAP009194',
+                             Gste3 = 'AGAP009197',
+                             Gste4 = 'AGAP009193',
+                             Gste5 = 'AGAP009192',
+                             Gste6 = 'AGAP009191',
+                             Gste7 = 'AGAP009196',
+                             Gstu4 = 'AGAP009190'),
+                    cyp9k1 = c(Cyp9k1 = 'AGAP000818')
+)
+
+# Get the genetic coordinates of those genes.
+gene.coords <- lapply(focal.genes, function(x) {X = all.gene.coordinates[x, ]; rownames(X) = names(x); X})
+
+# Define the coordinates of the genetic regions that we will use for each cluster
+region.coords = list(ace1 = c(3425000, 3650000),
+                     cyp6 = c(28460000, 28800000),
+                     cyp6mz = c(6900000, 7000000),
+                     gste = c(28570000, 28620000),
+                     cyp9k1 = c(15220000, 15255000))
+
+plotting.ranges <- list()
+compact.hmm <- list()
+cov.cnv.samples <- list()
+# Create an empty dataframe where each row is a sample
+hmm.cnv.table <- data.frame(row.names = sample.names)
+# We will include a column giving the sex of the sample, since this is relevant to the Cyp9k1 copy number
+hmm.cnv.table$sex <- meta[rownames(hmm.cnv.table), 'sex_call']
+# And a column showing whether the sample has high coverage variance, since these will be dubious
+hmm.cnv.table$high.var <- rownames(hmm.cnv.table) %in% high.var.samples
+
+# Load up the 2R data and get the tables for the Ace1 and Cyp6 regions
+cat('Creating shrunk table for Ace1 and Cyp6 regions.\n')
+temp.2R <- mclapply(sample.names, process_2R_regions, mc.cores = num.cores)
+# Because the above line uses parallel processing, I am not 100% confident that it will correctly preserve
+# sample order, so in the next couple of lines we use lapply on the sample.names vector to make sure the order
+# is correct.
+compact.hmm$ace1 <- lapply(sample.names, function(s) temp.2R[[s]]$ace1)
+compact.hmm$cyp6 <- lapply(sample.names, function(s) temp.2R[[s]]$cyp6)
+rm(temp.2R)
+# Get the ranges within this region that we wish to use for plotting
+plotting.ranges$ace1 <- region.coords$ace1
+plotting.ranges$cyp6 <- c(region.coords$cyp6[1], 28570000)
+# Record the CNVs that were discovered based on coverage in these regions
+hmm.cnv.table[['Ace1']] <- unlist(lapply(compact.hmm$ace1, get.gene.mode, target.region = gene.coords$ace1['Ace1', 2:3])) - 2
+cov.cnv.samples$ace1 <- rownames(subset(hmm.cnv.table, Ace1 > 0))
+cyp6aap.genes <- c('Cyp6aa1', 'Cyp6aa2', 'Cyp6p1', 'Cyp6p2', 'Cyp6p3', 'Cyp6p4', 'Cyp6p5')
+for (g in cyp6aap.genes)
+    hmm.cnv.table[[g]] <- unlist(lapply(compact.hmm$cyp6, get.gene.mode, target.region = gene.coords$cyp6[g, 2:3])) - 2
+hmm.cnv.table[["Max_Cyp6aap"]] <- apply(hmm.cnv.table[, cyp6aap.genes], 1, max)
+cov.cnv.samples$cyp6 <- rownames(subset(hmm.cnv.table, Max_Cyp6aap > 0))
+
+# Load up the 3R data and get the tables for the Cyp6m-z and Gste regions
+cat('Creating shrunk table for Cyp6M2-Z1 and Gste regions.\n')
+temp.3R <- mclapply(sample.names, process_3R_regions, mc.cores = num.cores)
+compact.hmm$cyp6mz <- lapply(sample.names, function(s) temp.3R[[s]]$cyp6mz)
+compact.hmm$gste <- lapply(sample.names, function(s) temp.3R[[s]]$gste)
+rm(temp.3R)
+plotting.ranges$cyp6mz <- region.coords$cyp6mz
+plotting.ranges$gste <- region.coords$gste
+for (g in names(focal.genes$cyp6mz))
+    hmm.cnv.table[[g]] <- unlist(lapply(compact.hmm$cyp6mz, get.gene.mode, target.region = gene.coords$cyp6mz[g, 2:3])) - 2
+hmm.cnv.table[["Max_Cyp6mz"]] <- apply(hmm.cnv.table[, names(focal.genes$cyp6mz)], 1, max)
+cov.cnv.samples$cyp6mz <- rownames(subset(hmm.cnv.table, Max_Cyp6mz > 0))
+for (g in names(focal.genes$gste))
+    hmm.cnv.table[[g]] <- unlist(lapply(compact.hmm$gste, get.gene.mode, target.region = gene.coords$gste[g, 2:3])) - 2
+hmm.cnv.table[["Max_Gstue"]] <- apply(hmm.cnv.table[, names(focal.genes$gste)], 1, max)
+cov.cnv.samples$gste <- rownames(subset(hmm.cnv.table, Max_Gstue > 0))
+
+# Load up the X data and get the tables for Cyp9k1
+cat('Creating shrunk table for Cyp9k1 region.\n')
+temp.X <- mclapply(sample.names, process_X_regions, mc.cores = num.cores)
+compact.hmm$cyp9k1 <- temp.X[sample.names]
+rm(temp.X)
+plotting.ranges$cyp9k1 <- region.coords$cyp9k1
+hmm.cnv.table[['Cyp9k1']] <- unlist(lapply(compact.hmm$cyp9k1, get.gene.mode, target.region = gene.coords$cyp9k1['Cyp9k1', 2:3])) - expected.copy.number.on.X
+cov.cnv.samples$cyp9k1 <- rownames(subset(hmm.cnv.table, Cyp9k1 > 0))
+
+# Get a named vector of the types of discordant reads that will be used
+discordant.read.types <- c(FA = 'FA', SS = 'SS', FM = 'FM', XC = 'crosschrom')
+
+# Set the diagnostic reads that will be used to detect each CNV
+known.cnvs <- list(ace1 = list(Dup1 = list(FA = matrix(c(3436850,3639550,3437150,3639850), 2, 2),
+                                           #BP = data.frame(pos = c(3436926, 3639836), seq = c('GCGAA', 'GGAAT')))
+                                           BP = data.frame(pos = c(3436926, 3639836), seq = c('GCGAA', 'TTGTT'))),
+                               Dup2 = list(FA = matrix(c(3447950,3518600,3448250,3518900), 2, 2)),
+                               Del1 = list(FM = matrix(c(3501850,3598750,3502150,3599050), 2, 2)),
+                               Del2 = list(FM = matrix(c(3539300,3573450,3539600,3573750), 2, 2)),
+                               Del3 = list(FM = matrix(c(3535850,3618700,3536150,3619000), 2, 2)),
+                               Del4 = list(FM = matrix(c(3512200,3615990,3512500,3616290), 2, 2))),
+                   cyp6 = list(Dup1 = list(FA = matrix(c(28480150, 28483200, 28480450, 28483550), 2, 2)),
+                               Dup1a = list(BP =
data.frame(pos = c(28480189, 28483475), + seq = c('CGTAG', 'AATTG'))), + Dup1b = list(BP = data.frame(pos = c(28480193, 28483675), + seq = c('CTGCT', 'CCTTC'))), + Dup2 = list(FA = matrix(c(28493450, 28497000, 28493750, 28497300), 2, 2), + BP = data.frame(pos = c(28493547, 28497279), + seq = c('GCCGC','TTTAA'))), + Dup3 = list(FA = matrix(c(28479350, 28483100, 28479650, 28483400), 2, 2), + BP = data.frame(pos = c(28479407, 28483372), + seq = c('GCTTA', 'CAAAG'))), + Dup4 = list(FA = matrix(c(28478850, 28482750, 28479150, 28483050), 2, 2), + BP = data.frame(pos = c(28478925, 28483069), + seq = c('TACTT', 'CATGT'))), + Dup5 = list(FA = matrix(c(28480300, 28484200, 28480600, 28484500), 2, 2), + BP = data.frame(pos = c(28480372, 28484518), + seq = c('AAGAG', 'ACAAA'))), + Dup6 = list(FA = matrix(c(28478150, 28483850, 28478450, 28484150), 2, 2), + BP = data.frame(pos = c(28478272, 28484157), + seq = c('ATCAC', 'CTAGA'))), + Dup7 = list(SS = matrix(c(28478000, 28486000, 28478300, 28486300), 2, 2), + BP = data.frame(pos = c(28478057, 28486036), + seq = c('AGAGC','TTTTT'))), + Dup8 = list(FA = matrix(c(28475900, 28484700, 28476200, 28485000), 2, 2), + BP = data.frame(pos = c(28475996, 28485005), + seq = c('AGCGA', 'CAAAT'))), + Dup9 = list(FA = matrix(c(28479100, 28491200, 28479400, 28491500), 2, 2), + BP = data.frame(pos = c(28479181, 28491431), + seq = c('TGTTC', 'TGTGG'))), + Dup10 = list(FA = matrix(c(28477800, 28490850, 28478100, 28491150), 2, 2), + BP = data.frame(pos = c(28477889, 28491215), + # It turns out that Dup10 is not quite as simple as made out in the Supp Mat for the Genome Research paper. + # There is actually some kind of insertion / mutation happening around the breakpoint, and the aligners used + # for this experiment and in Ag1000G deal with this differently (so we need to check the phase3 data and + # onwards to see what happens there since they used a different aligner to phase 2). We therefore need to change + # the sequence slightly here. 
+ seq = c('TGTAG','AACTT'))), + #seq = c('TGTAG','ACTCT'))), + Dup11 = list(FA = matrix(c(28487450, 28517800, 28487750, 28518100), 2, 2), + BP = data.frame(pos = c(28487546, 28518123), + seq = c('AACAC', 'TTATC'))), + Dup12 = list(FA = matrix(c(28474450, 28519650, 28474750, 28519950), 2, 2), + BP = data.frame(pos = c(28474576, 28520016), + seq = c('CCGAC', 'ACGGT'))), + Dup13 = list(FA = matrix(c(28472650, 28522350, 28472950, 28522650), 2, 2), + BP = data.frame(pos = c(28472728, 28522671), + seq = c('ACCGC', 'AGCTG'))), + Dup14 = list(FA = matrix(c(28473800, 28563200, 28474100, 28563500), 2, 2), + BP = data.frame(pos = c(28473874, 28563596), + seq = c('CCCAC', 'AGTTG'))), + Dup15 = list(FA = matrix(c(28465600, 55958800, 28465900, 55959100), 2, 2), + BP = data.frame(pos = c(28465673, NA), + seq = c('CAGCC', NA))), + Dup16 = list(FA = matrix(c(28480500, 28483300, 28480800, 28483600), 2, 2), + BP = data.frame(pos = c(28480547, 28483236), + seq = c('CCATT', 'TTAGT'))), + Dup17 = list(FA = matrix(c(28477500, 28484900, 28477800, 28485200), 2, 2), + BP = data.frame(pos = c(28477540, 28485380), + seq = c('TGCTG', 'ATCGG'))), + Dup18 = list(FA = matrix(c(28479500, 28494200, 28479800, 28494500), 2, 2), + BP = data.frame(pos = c(28479548, 28494597), + seq = c('AGTCG', 'TTGTC'))), + Dup19 = list(FA = matrix(c(28475480, 28556250, 28475780, 28556550), 2, 2), + BP = data.frame(pos = c(28475490, 28556726), + seq = c('AATAG', 'TGTGT'))), + Dup20 = list(FA = matrix(c(28473590, 28794750, 28473890, 28795050), 2, 2), + BP = data.frame(pos = c(28473600, 28795255), + seq = c('ATACT', 'CAAAA'))), + Dup21 = list(FA = matrix(c(28475100, 28483000, 28475400, 28483300), 2, 2), + BP = data.frame(pos = c(28475128, 28483473), + seq = c('AGCCG', 'TGCAC'))), + Dup22 = list(FA = matrix(c(28477200, 28484000, 28477500, 28484300), 2, 2), + BP = data.frame(pos = c(28477220, 28484338), + seq = c('GTGGA', 'CGAGT'))), + Dup23 = list(FA = matrix(c(28497300, 28371800, 28497600, 28372100), 2, 2), + BP = data.frame(pos = c(NA, 28497740), + seq = c(NA, 'TTGGC'))), + Dup24 = list(SS = matrix(c(28479500, 28483000, 28479800, 28483300), 2, 2), + BP = data.frame(pos = c(28480585, 28483442), + seq = c('AAACA','TTAAC'))), + # For 25 and 26, the FA reads would overlap, so we just use the BP reads. 
+ Dup25 = list(BP = data.frame(pos = c(28480335, 28483384), + seq = c('GGCGT', 'CATAT'))), + Dup26 = list(BP = data.frame(pos = c(28480166, 28483253), + seq = c('AACGT', 'TGTGT'))), + Dup27 = list(FA = matrix(c(28496700, 28498900, 28497000, 28499200), 2, 2)), + Dup28 = list(FA = matrix(c(28477700, 28496600, 28478000, 28496900), 2, 2), + BP = data.frame(pos = c(28477710, 28496953), + seq = c('CTGTA', 'ATTCT'))), + Dup29 = list(FA = matrix(c(28494000, 28496000, 28494300, 28496300), 2, 2), + BP = data.frame(pos = c(28494017, 28496505), + seq = c('TGGAA', 'TTTGC'))), + Dup30 = list(FA = matrix(c(28478900, 28484700, 28479200, 28485000), 2, 2), + BP = data.frame(pos = c(28478987, 28485033), + seq = c('AACAG', 'ACGTT')))), + cyp6mz = list(Dupm1 = list(BP = data.frame(pos = c(6927942, NA), + seq = c('ATTAT', NA))), + Dupm2 = list(FA = matrix(c(6933100, 6934900, 6933400, 6935200), 2, 2)), + Dupm3 = list(FA = matrix(c(6929600, 6932500, 6929900, 6932800), 2, 2)), + Dupm4 = list(FA = matrix(c(6929900, 6936600, 6930200, 6936900), 2, 2), + BP = data.frame(pos = c(6929933, 6936902), + seq = c('TTAAA', 'TGTCG'))), + Dupm5 = list(FA = matrix(c(6933800, 6938300, 6934100, 6938600), 2, 2), + BP = data.frame(pos = c(6933972, 6938659), + seq = c('AAACC', 'GTCGG'))), + Dupz1 = list(FA = matrix(c(6968950, 6979300, 6969250, 6979600), 2, 2), + BP = data.frame(pos = c(6968962, 6979681), + seq = c('ACGCT', 'AGGTT'))), + Dupz2 = list(FA = matrix(c(6975100, 6977100, 6975400, 6977400), 2, 2), + BP = data.frame(pos = c(NA, 6977514), # clipped reads align at 6975066 + seq = c(NA, 'TAAGA'))), + Dupz3 = list(FA = matrix(c(6971450, 6977800, 6971750, 6978100), 2, 2), + BP = data.frame(pos = c(6971484, NA), # clipped reads align at 6978084 + seq = c('GCAAA', NA))), + Dupz4 = list(FA = matrix(c(6972700, 6977350, 6973000, 6977650), 2, 2), + BP = data.frame(pos = c(6972775, 6977699), + seq = c('GAATG', 'GTCCA'))), + Dupz5 = list(FA = matrix(c(6969800, 6975700, 6970100, 6976000), 2, 2)), + Dupmz1 = list(FA = matrix(c(6982700, 6879900, 6983000, 6880200), 2, 2))), + gste = list(Dup1 = list(FA = matrix(c(6968950, 6979300, 6969250, 6979600), 2, 2), + BP = data.frame(pos = c(28596818, 28598850), + seq = c('TTTTG', 'CGTTT'))), + # The following definition includes some false positives. 
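+                               # (Dup2 is therefore scored via BP.weak, for which count.reads.per.dup takes the
+                               # minimum of the start- and end-breakpoint read counts rather than their sum,
+                               # minimising those false positives.)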
+ Dup2 = list(BP.weak = data.frame(pos = c(28596390, 28598923), + seq = c('GGGGG', 'TTCCC'))), + Dup3 = list(FA = matrix(c(28590500, 28592950, 28590800, 28593250), 2, 2), + BP = data.frame(pos = c(28590597, 28593254), + seq = c('TCAAA', 'AGGGC'))), + Dup4 = list(FA = matrix(c(28595050, 28598750, 28595350, 28599050), 2, 2), + BP = data.frame(pos = c(28595162, 28599081), + seq = c('TTCTA', 'AGAAC'))), + Dup5 = list(BP = data.frame(pos = c(28593122, 28598971), + seq = c('GTCAT', 'ATTTA'))), + Dup6 = list(FA = matrix(c(28596250, 28601900, 28596550, 28602200), 2, 2), + BP = data.frame(pos = c(28596241, 28602177), + seq = c('ACAAC', 'GAAGC'))), + # For the XC reads, the first two rows are the CNV start point, and the second two rows are the end point + Dup7 = list(XC = data.frame(c(28597400, 3696450, 28603950, 26597300), c(28597700, 3696750, 28604250, 26597600), c('3R', '2L', '3R', 'UNKN')), + BP = data.frame(pos = c(28597504, 28604250), + seq = c('GTCCA', 'GCTGT'))), + Dup8 = list(BP = data.frame(pos = c(28594797, 28602349), + seq = c('GTCCC', 'CAGGG'))), + Dup9 = list(FA = matrix(c(28591050, 28600850, 28591350, 28601150), 2, 2), + BP = data.frame(pos = c(28591140, 28601188), + seq = c('AGAAG', 'GATGA'))), + Dup10 = list(FA = matrix(c(28593550, 28603350, 28593850, 28603650), 2, 2), + BP = data.frame(pos = c(28593642, 28603786), + seq = c('TCGCT', 'AAGAC'))), + Dup11 = list(XC = data.frame(c(28581250, 29210650, 28604650, 29210650), c(28581550, 29210950, 28604950, 29210950), c('3R', 'UNKN', '3R', 'UNKN')), + BP = data.frame(pos = c(28581256, 28604994), + seq = c('CCATT', 'GGTAA'))), + Dup12 = list(FA = matrix(c(28597000, 28599950, 28597300, 28600250), 2, 2), + BP = data.frame(pos = c(28597030, 28600292), + seq = c('TACTG', 'CATCT'))), + Dup13 = list(FA = matrix(c(28597000, 28598900, 28597300, 28599200), 2, 2), + BP = data.frame(pos = c(28597181, NA), # clipped sequences align at 28599287 + seq = c('TACTC', NA))), + Dup14 = list(FA = matrix(c(28599800, 28607200, 28600100, 28607500), 2, 2), + BP = data.frame(pos = c(28599926, 28607500), + seq = c('CGACG', 'ATGCA'))), + Dup15 = list(FA = matrix(c(28596200, 28598500, 28596500, 28598800), 2, 2), + BP = data.frame(pos = c(28596252, 28598948), + seq = c('TTGGA', 'TTGAC'))), + Dup16 = list(FA = matrix(c(28597300, 28603200, 28597600, 28603500), 2, 2), + BP = data.frame(pos = c(28597383, 28603517), + seq = c('ACATT', 'ATTAC')))), + cyp9k1 = list(Dup1 = list(FA = matrix(c(15242500, 15244500, 15242800, 15244800), 2, 2), + BP = data.frame(pos = c(15242505,15244812), + seq = c('GTTTG', 'CATAT'))), + Dup2 = list(FA = matrix(c(15238300, 15240800, 15238600, 15241100), 2, 2), + BP = data.frame(pos = c(15238400, 15241082), + seq = c('CCGGC',' CGGTA'))), + Dup3 = list(FA = matrix(c(15240300, 15243450, 15240600, 15243750), 2, 2), + BP = data.frame(pos = c(NA, 15243860), + seq = c(NA, 'TGAAC'))), + Dup4 = list(FA = matrix(c(15240600, 15244200, 15240900, 15244500), 2, 2), + BP = data.frame(pos = c(15240608, 15244503), + seq = c('ATAAA', 'ACTGG'))), + Dup5 = list(FA = matrix(c(15238800, 15243850, 15239100, 15244150), 2, 2), + BP = data.frame(pos = c(15238911, 15244175), + seq = c('CACGT', 'AGTAA'))), + Dup6 = list(FA = matrix(c(15236400, 15243250, 15236700, 15243550), 2, 2), + BP = data.frame(pos = c(15236449, 15243646), + seq = c('TTTTT', 'GTTTT'))), + Dup7 = list(SS = matrix(c(15245400, 15246900, 15245700, 15247200), 2, 2), + BP = data.frame(pos = c(15245768, 15247258), + seq = c('TTTGT', 'TCTAA'))), + Dup8 = list(FA = matrix(c(15239200, 15247250, 15239500, 15247550), 
2, 2), + BP = data.frame(pos = c(15239276, 15247645), + seq = c('AACAT', 'TTGCT'))), + Dup9 = list(FA = matrix(c(15239100, 15248900, 15239400, 15249200), 2, 2), + BP = data.frame(pos = c(15239184, 15249314), + seq = c('GCACA', 'AGTAC'))), + Dup10 = list(FA = matrix(c(15234900, 15244750, 15235200, 15245050), 2, 2), + BP = data.frame(pos = c(15234989, 15245128), + seq = c('GCACC', 'CTGAA'))), + Dup11 = list(FA = matrix(c(15236900, 15246800, 15237200, 15247100), 2, 2), + BP = data.frame(pos = c(15236922, 15247159), + seq = c('CATTA', 'TATCT'))), + Dup12 = list(FA = matrix(c(15234400, 15244350, 15234700, 15244650), 2, 2), + BP = data.frame(pos = c(15234434, 15244702), + seq = c('AACAG', 'TACTA'))), + Dup13 = list(FA = matrix(c(15240100, 15250250, 15240400, 15250550), 2, 2), + BP = data.frame(pos = c(15240067, 15250575), + seq = c('CCTAA', 'GTGTA'))), + # Dup 14 seems to have a different endpoint to Dup15, but the same insertion point way upstream + Dup14 = list(FA = matrix(c(15244200, 9676400, 15244500, 9676700), 2, 2), + # Because Dup14 has the same start pos as Dup15, we don't use the start breakpoint as a diagnostic + #BP = data.frame(pos = c(15233807, 15244936), + # seq = c('GGGTT', 'CCCAA'))), + BP = data.frame(pos = c(NA, 15244936), + seq = c(NA, 'CCCAA'))), + Dup15 = list(FA = matrix(c(15246250, 9676400, 15246550, 9676700), 2, 2), + # Because Dup14 has the same start pos as Dup15, we don't use the start breakpoint as a diagnostic + #BP = data.frame(pos = c(15233807, 15246640), + # seq = c('GGGTT', 'CCCAA'))), + BP = data.frame(pos = c(NA, 15246640), + seq = c(NA, 'CCCAA'))), + Dup16 = list(FA = matrix(c(15222700, 15244300, 15223000, 15244600), 2, 2), + BP = data.frame(pos = c(NA, 15244755), + seq = c(NA, 'AAGTA'))), + Dup17 = list(FA = matrix(c(15237150, 15243650, 15237450, 15243950), 2, 2), + BP = data.frame(pos = c(15237138, 15243975), + seq = c('TTGCT', 'TTTCG'))), + Dup18 = list(FA = matrix(c(15236100, 15243500, 15236400, 15243800), 2, 2), + BP = data.frame(pos = c(NA, 15243915), # clipped sequence aligns to 15236175 + seq = c(NA, 'CGGCG'))), + Dup19 = list(FA = matrix(c(15238800, 15251100, 15239100, 15251400), 2, 2), + BP = data.frame(pos = c(15238878, 15251503), + seq = c('TAAAT', 'GTTAC'))), + Dup20 = list(FA = matrix(c(15237350, 15243100, 15237650, 15243400), 2, 2), + BP = data.frame(pos = c(15237397, 15243514), + seq = c('ATGTT', 'TTACG'))), + Dup21 = list(FA = matrix(c(15237450, 15250300, 15237750, 15250600), 2, 2), + BP = data.frame(pos = c(15237482, 15250699), + seq = c('CTCTG', 'TTCTC'))), + Dup22 = list(FA = matrix(c(15240650, 15250300, 15240950, 15250600), 2, 2), + BP = data.frame(pos = c(15240680, 15250670), + seq = c('TTCCA', 'ATTCT'))), + Dup23 = list(FA = matrix(c(15241800, 15248100, 15242100, 15248400), 2, 2), + BP = data.frame(pos = c(15241929, 15248352), + seq = c('AACAA', 'CACGT'))), + Dup24 = list(FA = matrix(c(15238550, 15254800, 15238850, 15255100), 2, 2)), + Dup25 = list(FA = matrix(c(15223500, 15246350, 15223800, 15246650), 2, 2)), + Dup26 = list(FA = matrix(c(15222700, 15247750, 15223000, 15248050), 2, 2)), + Dup27 = list(FA = matrix(c(15237400, 15248350, 15237700, 15248650), 2, 2), + BP = data.frame(pos = c(15237566, NA), + seq = c('AATGT', NA))), + Dup28 = list(BP = data.frame(pos = c(NA, 15246640), + seq = c(NA, 'TCGAG')))) +) + +# Write a function to load the results of discordant read analysis from file for a given sample +get.discordant.reads <- function(this.sample, target.folder, target.region.coords){ + this.file <- grep(paste(this.sample, '.*csv$', 
sep = ''), list.files(target.folder), value = T) + if (length(this.file) != 1) + stop('There should be one and only one file in the folder that matches the sample name.') + pos.table <- read.table(paste(target.folder, this.file, sep='/'), header = T, sep='\t', colClasses = c('character', 'integer', 'character')) + split.pos.table <- split(pos.table, pos.table$Type) + get.discordant.read.type <- function(read.type){ + full.read.type <- paste(read.type, 'mapq >= 10') + if (!(full.read.type %in% names(split.pos.table))){ + return(data.frame(Position = integer(), Mate.position = integer())) + } + allpos <- split.pos.table[[full.read.type]][, c('Position', 'Mate.position')] + if (read.type %in% c('FA', 'SS', 'FM')){ + allpos$Mate.position <- as.integer(allpos$Mate.position) + which.in.region <- ((allpos$Position >= target.region.coords[1]) & (allpos$Position <= target.region.coords[2])) | ((allpos$Mate.position >= target.region.coords[1]) & (allpos$Mate.position <= target.region.coords[2])) + } + else{ + which.in.region <- (allpos$Position >= target.region.coords[1]) & (allpos$Position <= target.region.coords[2]) + } + allpos <- allpos[which.in.region, ] + allpos[order(allpos$Position),] + } + lapply(discordant.read.types, get.discordant.read.type) +} + +# Write a function to load the results of breakpoint read analysis from file for a given sample +get.breakpoint.reads <- function(this.sample, target.folder, target.region.coords){ + this.file <- grep(paste(this.sample, '.*csv$', sep = ''), list.files(target.folder), value = T) + if (length(this.file) != 1) + stop('There should be one and only one file in the folder that matches the sample name.') + pos.table <- read.table(paste(target.folder, this.file, sep='/'), header = T, sep='\t', colClasses = c('character', 'integer', 'character')) + which.in.region <- (pos.table$Position >= target.region.coords[1]) & (pos.table$Position <= target.region.coords[2]) + pos.table <- pos.table[which.in.region, ] + clipping.start.point <- pos.table[pos.table$Type == 'soft clipping start point mapq >= 10', c('Position', 'Clipped_sequence')] + clipping.end.point <- pos.table[pos.table$Type == 'soft clipping end point mapq >= 10', c('Position', 'Clipped_sequence')] + # + list(CSP = clipping.start.point[order(clipping.start.point$Position), ], + CEP = clipping.end.point[order(clipping.end.point$Position), ]) +} + +get.diagnostic.reads <- function(this.sample, disc.target.folder, bp.target.folder, target.region.coords){ + cat('\tLoading diagnostic reads for ', this.sample, '.\n', sep = '') + output <- get.discordant.reads(this.sample, disc.target.folder, target.region.coords) + output[['BP']] <- get.breakpoint.reads (this.sample, bp.target.folder, target.region.coords) + output +} + +# Write a function to load the results of discordant and breakpoint reads from file for a vector of samples +get.diagnostic.reads.allsamples <- function(sample.names, disc.target.folder, bp.target.folder, target.region.coords){ + cat('Loading discordant reads from ', disc.target.folder, ' and breakpoint reads from ', bp.target.folder, '.\n', sep = '') + lapply(sample.names, get.diagnostic.reads, disc.target.folder, bp.target.folder, target.region.coords) +} + +# Function to count the diagnostic reads for a given CNV. +# di is a table of observed discordant and breakpoint reads for a given sample. 
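+# Dup.diagnostics is the known.cnvs entry for one CNV: at most one of the FA / SS / FM / XC
+# coordinate windows for discordant read pairs, plus optionally BP (breakpoint positions paired
+# with the first five expected clipped bases). The returned vector counts the diagnostic reads
+# observed for each read class.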
+count.reads.per.dup <- function(Dup.diagnostics, di){ + # There shouldn't be more than one of FA, SS, FM and XC + if (sum(names(Dup.diagnostics) %in% c('FA', 'SS', 'FM', 'XC')) > 1) + stop('There should not be more than one type of diagnostic read for a given Duplication.') + num.reads <- setNames(numeric(7), c('FA', 'SS', 'FM', 'XCS', 'XCE', 'CSP', 'CEP')) + if ('FA' %in% names(Dup.diagnostics)){ + num.reads['FA'] <- sum((di$FA$Position > Dup.diagnostics$FA[1,1]) & di$FA$Position < Dup.diagnostics$FA[1,2] + & (di$FA$Mate.position > Dup.diagnostics$FA[2,1]) & (di$FA$Mate.position < Dup.diagnostics$FA[2,2])) + } + if ('SS' %in% names(Dup.diagnostics)){ + num.reads['SS'] <- sum((di$SS$Position > Dup.diagnostics$SS[1,1]) & di$SS$Position < Dup.diagnostics$SS[1,2] + & (di$SS$Mate.position > Dup.diagnostics$SS[2,1]) & (di$SS$Mate.position < Dup.diagnostics$SS[2,2])) + } + if ('FM' %in% names(Dup.diagnostics)){ + num.reads['FM'] <- sum((di$FM$Position > Dup.diagnostics$FM[1,1]) & di$FM$Position < Dup.diagnostics$FM[1,2] + & (di$FM$Mate.position > Dup.diagnostics$FM[2,1]) & (di$FM$Mate.position < Dup.diagnostics$FM[2,2])) + } + if ('XC' %in% names(Dup.diagnostics)){ + these.XC <- di$XC + these.XC$Matechrom <- sub(':.*', '', these.XC$Mate.position) + these.XC$Matepos <- as.integer(sub('.*:' , '', these.XC$Mate.position)) + num.reads['XCS'] <- sum((these.XC$Position > Dup.diagnostics$XC[1,1]) & (these.XC$Position < Dup.diagnostics$XC[1,2]) + & (these.XC$Matechrom == Dup.diagnostics$XC[2,3]) & (these.XC$Matepos > Dup.diagnostics$XC[2,1]) & (these.XC$Matepos < Dup.diagnostics$XC[2,2]), na.rm = T) + num.reads['XCE'] <- sum((these.XC$Position > Dup.diagnostics$XC[3,1]) & (these.XC$Position < Dup.diagnostics$XC[3,2]) + & (these.XC$Matechrom == Dup.diagnostics$XC[4,3]) & (these.XC$Matepos > Dup.diagnostics$XC[4,1]) & (these.XC$Matepos < Dup.diagnostics$XC[4,2]), na.rm = T) + } + if ('BP' %in% names(Dup.diagnostics)){ + CEP.seq <- subset(di$BP$CEP, Position == Dup.diagnostics$BP$pos[1])$Clipped_sequence + num.reads['CEP'] <- sum(substr(reverse(CEP.seq), 1, 5) == Dup.diagnostics$BP$seq[1]) + CSP.seq <- subset(di$BP$CSP, Position == Dup.diagnostics$BP$pos[2])$Clipped_sequence + num.reads['CSP'] <- sum(substr(CSP.seq, 1, 5) == Dup.diagnostics$BP$seq[2]) + } + # There is a special case in Gste2 where one Dup isn't defined very effectively by diagnostic + # reads. The best diagnostic based on phase 2 data still includes some false positives, which + # were minimised by taking the minimum of CEP and CSP, rather than the sum + if ('BP.weak' %in% names(Dup.diagnostics)){ + CEP.seq <- subset(di$BP$CEP, Position == Dup.diagnostics$BP$pos[1])$Clipped_sequence + num.reads.cep <- sum(substr(reverse(CEP.seq), 1, 5) == Dup.diagnostics$BP$seq[1]) + CSP.seq <- subset(di$BP$CSP, Position == Dup.diagnostics$BP$pos[2])$Clipped_sequence + num.reads.csp <- sum(substr(CSP.seq, 1, 5) == Dup.diagnostics$BP$seq[2]) + num.reads['BP.weak'] <- min(num.reads.cep, num.reads.csp) + } + num.reads +} + +# Function to apply count.reads.per.dup to all CNVs for a given sample +count.diagnostic.reads <- function(diagnostic.reads.list, known.cnvs.list, all.breakpoint.positions, verbose = F){ + # Subset the BP reads so that they only include reads that match at least one of + # the breakpoints. That will greatly reduce the list that needs searching for every + # CNV. 
+    diagnostic.reads.list$BP$CSP <- subset(diagnostic.reads.list$BP$CSP, Position %in% all.breakpoint.positions)
+    diagnostic.reads.list$BP$CEP <- subset(diagnostic.reads.list$BP$CEP, Position %in% all.breakpoint.positions)
+    num.reads <- lapply(known.cnvs.list, count.reads.per.dup, diagnostic.reads.list)
+    if (verbose)
+        print(num.reads)
+    sapply(num.reads, sum)
+}
+
+# Function to apply count.diagnostic.reads to all samples
+count.diagnostic.reads.allsamples <- function(diagnostic.reads.allsamples, known.cnvs.list){
+    # Create a single object containing all of the breakpoint positions
+    breakpoint.summary <- unlist(sapply(known.cnvs.list, function(x) x$BP$pos))
+    t(sapply(diagnostic.reads.allsamples, count.diagnostic.reads, known.cnvs.list, breakpoint.summary))
+}
+
+# Function to produce a table of all observed CNVs in all samples, given a list of coverage-based calls,
+# a table of counts of diagnostic reads for each CNV in all samples, and a threshold number of diagnostic
+# reads required to call a CNV
+get.read.based.cnvs <- function(coverage.cnv.calls, diagnostic.read.counts.table, threshold.diagnostic.reads = 4){
+    read.based <- cbind(diagnostic.read.counts.table >= threshold.diagnostic.reads)
+    read.cnv.samples <- rownames(read.based)[apply(read.based, 1, any)]
+    # Unlike in phase2, we use Dup0 to indicate that there is ANY coverage call, not just the samples that
+    # have a coverage call but not a read-based call
+    cbind(Dup0 = rownames(read.based) %in% coverage.cnv.calls, read.based)
+}
+
+# Set the folders in which to look for FA, SS, FM and XC reads
+SSFA.folders <- setNames(paste(diagnostic.reads.folder, c('SSFA/2R/Ace1_region',
+                                                          'SSFA/2R/Cyp6_region',
+                                                          'SSFA/3R/Cyp6zm_region',
+                                                          'SSFA/3R/Gste_region',
+                                                          'SSFA/X/Cyp9k1_region'),
+                               sep = '/'),
+                         c('ace1', 'cyp6', 'cyp6mz', 'gste', 'cyp9k1')
+)
+
+# Set the folders in which to look for breakpoint reads.
+breakpoints.folders <- setNames(paste(diagnostic.reads.folder, c('breakpoints/2R/Ace1_region',
+                                                                 'breakpoints/2R/Cyp6_region',
+                                                                 'breakpoints/3R/Cyp6zm_region',
+                                                                 'breakpoints/3R/Gste_region',
+                                                                 'breakpoints/X/Cyp9k1_region'),
+                                      sep = '/'),
+                                c('ace1', 'cyp6', 'cyp6mz', 'gste', 'cyp9k1')
+)
+
+cat('Loading discordant and breakpoint reads\n')
+diagnostic.reads <- mcmapply(get.diagnostic.reads.allsamples, SSFA.folders, breakpoints.folders, region.coords, MoreArgs = list(sample.names = sample.names), SIMPLIFY = F, mc.preschedule = F, mc.cores = num.cores)
+
+cat('Counting diagnostic reads\n')
+diagnostic.read.counts <- mcmapply(count.diagnostic.reads.allsamples, diagnostic.reads, known.cnvs, mc.preschedule = F, mc.cores = num.cores)
+cat('Calling read-based CNVs\n')
+read.based.cnvs <- mapply(get.read.based.cnvs, cov.cnv.samples, diagnostic.read.counts, SIMPLIFY = F)
+# As a special case for cyp6, we don't score the presence of Dup1 if either Dup1a or Dup1b are present (because Dup1
+# just represents an ambiguous call for Dup1a / Dup1b).
+cyp6.Dup1ab <- read.based.cnvs$cyp6[, 'Dup1a'] | read.based.cnvs$cyp6[, 'Dup1b']
+read.based.cnvs$cyp6[cyp6.Dup1ab, 'Dup1'] <- F
+
+# Combine the separate CNV tables into a single table
+full.cnv.table <- do.call(cbind, read.based.cnvs)
+full.cnv.table <- cbind(rownames(full.cnv.table) %in% high.var.samples, full.cnv.table)
+gene.cluster.names <- setNames(c('Ace1', 'Cyp6aap', 'Cyp6mz', 'Gstue', 'Cyp9k1'),
+                               names(diagnostic.read.counts))
+# The column names of the full table are prefixed with the gene cluster codes that we used in the phase 2 analysis.
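+# For example, the Dup1 column of the cyp6 table becomes 'Cyp6aap_Dup1', and the
+# Dup0 column of the ace1 table becomes 'Ace1_Dup0'.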
+colnames(full.cnv.table) <- c('High.var.sample', unlist(sapply(names(gene.cluster.names), function(x) paste(gene.cluster.names[x], colnames(read.based.cnvs[[x]]), sep = '_')))) +# Get the results in a different format. Here as a list where, for each CNV, we have a vector of samples that +# carry it. +cnv.sample.lists <- lapply(read.based.cnvs, function(m) apply(m, 2, function(x) rownames(m)[x])) + +# Get a vector of samples that carry at least one CNV based on read calls. +# Dup0 is always the first column, so the -1 index removes Dup0 +read.cnv.samples <- lapply(read.based.cnvs, function(x) rownames(x)[apply(x[, -1, drop = F], 1, any)]) + +# For each cluster, get a vector of samples that have been called as carrying a cnv by reads but not coverage, +# ignoring samples with high variance. +cov.based.negatives <- lapply(mapply(setdiff, read.cnv.samples, cov.cnv.samples), setdiff, high.var.samples) +# And vice-versa +read.based.negatives <- lapply(mapply(setdiff, cov.cnv.samples, read.cnv.samples), setdiff, high.var.samples) + +dir.create('target_regions_analysis', showWarnings = FALSE) +write.table(full.cnv.table, file = 'target_regions_analysis/focal_region_CNV_table.csv', sep = '\t', col.names = NA, quote = F) +write.table(hmm.cnv.table, file = 'target_regions_analysis/HMM_gene_copy_number.csv', sep = '\t', col.names = NA, quote = F) + +source(plotting.functions.file) +save.image('target_regions_analysis/target_regions_analysis.Rdata') + diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis.sh b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis.sh new file mode 100755 index 00000000..4f3687b8 --- /dev/null +++ b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis.sh @@ -0,0 +1,33 @@ +workingfolder=$1 +manifest=$2 +species_id_file=$3 +coverage_variance_file=$4 +metadata=$5 +ncores=$6 + +coveragefolder=$workingfolder/coverage +diagnostic_reads_folder=$workingfolder/diagnostic_reads +gene_coordinates_file=~/personal/phase3_data/tables/gene_regions.csv +scriptsfolder=~/scripts/CNV_scripts/scripts +plotting_functions_file=$scriptsfolder/plotting_functions.r + +cd $workingfolder + +mkdir -p target_regions_analysis + +export R_LIBS_USER="~/R-modules:$R_LIBS_USER" + +R-3.6.1 --version + +R-3.6.1 --slave -f $scriptsfolder/target_regions_analysis.r --args $manifest \ + $gene_coordinates_file \ + $metadata \ + $species_id_file \ + $coverage_variance_file \ + $coveragefolder \ + $diagnostic_reads_folder \ + $plotting_functions_file \ + $ncores \ + > target_regions_analysis/target_regions_analysis.log 2>&1 + + diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_old.r b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_old.r new file mode 100644 index 00000000..91c22e58 --- /dev/null +++ b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_old.r @@ -0,0 +1,643 @@ +library(stringr) +library(stringi) +library(Biostrings) +library(parallel) + +arg.values <- commandArgs(trailingOnly=T) +cat('Arguments:', arg.values, '\n', sep = '\n') + +threshold.variance <- 0.2 + +sample.list <- arg.values[1] +sample.names <- setNames(nm = read.table(sample.list, stringsAsFactors = F)[[1]]) + +gene.regions.file <- arg.values[2] +all.gene.coordinates <- read.table(gene.regions.file, sep = '\t', header = T, row.names = 1) + +meta.file <- arg.values[3] +meta <- read.table(meta.file, sep = '\t', header = T, row.names = 1, quote = '', comment.char = '')[sample.names, ] +expected.copy.number.on.X <- c(2, 1)[(meta$sex_call == 'M') + 1] + +# Get the species calls 
+species.calls.file <- arg.values[4] +species.calls <- read.table(species.calls.file, sep = '\t', header = T, row.names = 1) + +cov.var.file <- arg.values[5] +cov.var <- read.table(cov.var.file, header = T, sep = '\t', row.names = 1) +high.var.samples <- intersect(sample.names, rownames(cov.var)[cov.var$autosomes > threshold.variance]) + +coverage.folder <- arg.values[6] +diagnostic.reads.folder <- arg.values[7] +plotting.functions.file <- arg.values[8] +num.cores <- as.numeric(arg.values[9]) + +# Function to load up each file and store the results in a list +load.hmm.file <- function(sample.name, folder = '.'){ + all.files <- list.files(folder) + # Find the file corresponding to this sample + this.file <- paste(folder, grep(sample.name, all.files, value = T), sep = '/') + if (length(this.file) == 0) + stop(paste('No matching files were found for sample', sample.name)) + if (length(this.file) > 1) + stop(paste('More than one matching file were found for sample', sample.name)) + # Load the file + cat('\t', this.file, '\n', sep = '') + read.table(this.file, header = T) +} + +# Function that will take a table of HMM output and shrink it to contain only a desired region +shrink.compact.hmm <- function(region.coords, input.hmm){ + subset(input.hmm, (Position >= region.coords[1]) & (Position <= region.coords[2])) +} + +# In order to process each hmm file only once, our function for multi-threading needs to output all +# objects needed from each hmm file +process_2R_regions <- function(sample.name){ + hmm.table <- load.hmm.file(sample.name, paste(coverage.folder, '2R/HMM_output', sep = '/')) + output.list <- list(ace1 = shrink.compact.hmm(region.coords$ace1, hmm.table), + cyp6 = shrink.compact.hmm(region.coords$cyp6, hmm.table) + ) + return(output.list) +} + +process_3R_regions <- function(sample.name){ + hmm.table <- load.hmm.file(sample.name, paste(coverage.folder, '3R/HMM_output', sep = '/')) + output.list <- list(cyp6mz = shrink.compact.hmm(region.coords$cyp6mz, hmm.table), + gste = shrink.compact.hmm(region.coords$gste, hmm.table) + ) + return(output.list) +} + +process_X_regions <- function(sample.name){ + hmm.table <- load.hmm.file(sample.name, paste(coverage.folder, 'X/HMM_output', sep = '/')) + return(shrink.compact.hmm(region.coords$cyp9k1, hmm.table)) +} + +# Function that will calculate the mode of the HMM coverage in a given region +get.gene.mode <- function(hmm.data, target.region, window.size = 300){ + target.region <- as.numeric(target.region) + hmm.in.region <- subset(hmm.data, Position >= (target.region[1] - window.size) & Position < target.region[2])$CNV + # We calculate the mode. Where there is more than one mode, we take the smallest one + un <- unique(hmm.in.region) + tab <- tabulate(match(hmm.in.region, un)) + hmm.mode <- min(un[tab == max(tab)]) + hmm.mode +} + +# Get the Agap numbers of the genes of interest, grouped by the cluster to which they belong. 
+focal.genes <- list(ace1 = c(Ace1 = 'AGAP001356'), + cyp6 = c(Cyp6aa1 = 'AGAP002862', + Cyp6aa2 = 'AGAP013128', + Coeae6o = 'AGAP002863', + AGAP002864 = 'AGAP002864', + Cyp6p1 = 'AGAP002868', + Cyp6p2 = 'AGAP002869', + Cyp6p3 = 'AGAP002865', + Cyp6p4 = 'AGAP002867', + Cyp6p5 = 'AGAP002866', + Cyp6ad1 = 'AGAP002870'), + cyp6mz = c(Cyp6m2 = 'AGAP008212', + Cyp6m3 = 'AGAP008213', + Cyp6m4 = 'AGAP008214', + Cyp6z1 = 'AGAP008219', + Cyp6z2 = 'AGAP008218', + Cyp6z3 = 'AGAP008217'), + gste = c(Gste1 = 'AGAP009195', + Gste2 = 'AGAP009194', + Gste3 = 'AGAP009197', + Gste4 = 'AGAP009193', + Gste5 = 'AGAP009192', + Gste6 = 'AGAP009191', + Gste7 = 'AGAP009196', + Gstu4 = 'AGAP009190'), + cyp9k1 = c(Cyp9k1 = 'AGAP000818') +) + +# Get the genetic coordinates of those genes. +gene.coords <- lapply(focal.genes, function(x) {X = all.gene.coordinates[x, ]; rownames(X) = names(x); X}) + +# Define the coordinates of the genetic regions that we will use for each cluster +region.coords = list(ace1 = c(3425000, 3650000), + cyp6 = c(28460000, 28800000), + cyp6mz = c(6900000, 7000000), + gste = c(28570000, 28620000), + cyp9k1 = c(15220000, 15255000)) + +plotting.ranges <- list() +compact.hmm <- list() +cov.cnv.samples <- list() +# Create an empty dataframe where each row is a sample +hmm.cnv.table <- data.frame(row.names = sample.names) +# We will include a table giving the sex of the sample, since this is relevant to the Cyp9k1 copy number +hmm.cnv.table$sex <- meta[rownames(hmm.cnv.table), 'sex_call'] +# And a column showing whether the sample has high coverage variance, since these will be dubious +hmm.cnv.table$high.var <- rownames(hmm.cnv.table) %in% high.var.samples + +# Load up the 2R data and get the tables for the Ace1 and Cyp6 regions +cat('Creating shrunk table for Ace1 and Cyp6 regions.\n') +temp.2R <- mclapply(sample.names, process_2R_regions, mc.cores = num.cores) +# Because the above line uses parallel processing, I am not 100% confident that it will correctly preserve +# sample order, so in the next couple of lines we use lapply on the sample.names vector to make sure the order +# is correct. 
+compact.hmm$ace1 <- lapply(sample.names, function(s) temp.2R[[s]]$ace1) +compact.hmm$cyp6 <- lapply(sample.names, function(s) temp.2R[[s]]$cyp6) +rm(temp.2R) +# Get the ranges within this region that we wish to use for plotting +plotting.ranges$ace1 <- region.coords$ace1 +plotting.ranges$cyp6 <- c(region.coords$cyp6[1], 28570000) +# Record the CNVs that were discovered based on coverage in these regions +hmm.cnv.table[['Ace1']] <- unlist(lapply(compact.hmm$ace1, get.gene.mode, target.region = gene.coords$ace1['Ace1', 2:3])) - 2 +cov.cnv.samples$ace1 <- rownames(subset(hmm.cnv.table, Ace1 > 0)) +cyp6aap.genes <- c('Cyp6aa1', 'Cyp6aa2', 'Cyp6p1', 'Cyp6p2', 'Cyp6p3', 'Cyp6p4', 'Cyp6p5') +for (g in cyp6aap.genes) + hmm.cnv.table[[g]] <- unlist(lapply(compact.hmm$cyp6, get.gene.mode, target.region = gene.coords$cyp6[g, 2:3])) - 2 +hmm.cnv.table[["Max_Cyp6aap"]] <- apply(hmm.cnv.table[, cyp6aap.genes], 1, max) +cov.cnv.samples$cyp6 <- rownames(subset(hmm.cnv.table, Max_Cyp6aap > 0)) + +# Load up the 3R data and get the tables for the Cyp6m-z and Gste regions +cat('Creating shrunk table for Cyp6M2-Z1 and Gste regions.\n') +temp.3R <- mclapply(sample.names, process_3R_regions, mc.cores = num.cores) +compact.hmm$cyp6mz <- lapply(sample.names, function(s) temp.3R[[s]]$cyp6mz) +compact.hmm$gste <- lapply(sample.names, function(s) temp.3R[[s]]$gste) +rm(temp.3R) +plotting.ranges$cyp6mz <- region.coords$cyp6mz +plotting.ranges$gste <- region.coords$gste +for (g in names(focal.genes$cyp6mz)) + hmm.cnv.table[[g]] <- unlist(lapply(compact.hmm$cyp6mz, get.gene.mode, target.region = gene.coords$cyp6mz[g, 2:3])) - 2 +hmm.cnv.table[["Max_Cyp6mz"]] <- apply(hmm.cnv.table[, names(focal.genes$cyp6mz)], 1, max) +cov.cnv.samples$cyp6mz <- rownames(subset(hmm.cnv.table, Max_Cyp6mz > 0)) +for (g in names(focal.genes$gste)) + hmm.cnv.table[[g]] <- unlist(lapply(compact.hmm$gste, get.gene.mode, target.region = gene.coords$gste[g, 2:3])) - 2 +hmm.cnv.table[["Max_Gstue"]] <- apply(hmm.cnv.table[, names(focal.genes$gste)], 1, max) +cov.cnv.samples$gste <- rownames(subset(hmm.cnv.table, Max_Gstue > 0)) + +# Load up the X data and get the tables for Cyp9k1 +cat('Creating shrunk table for Cyp9k1 region.\n') +temp.X <- mclapply(sample.names, process_X_regions, mc.cores = num.cores) +compact.hmm$cyp9k1 <- temp.X[sample.names] +rm(temp.X) +plotting.ranges$cyp9k1 <- region.coords$cyp9k1 +hmm.cnv.table[['Cyp9k1']] <- unlist(lapply(compact.hmm$cyp9k1, get.gene.mode, target.region = gene.coords$cyp9k1['Cyp9k1', 2:3])) - expected.copy.number.on.X +cov.cnv.samples$cyp9k1 <- rownames(subset(hmm.cnv.table, Cyp9k1 > 0)) + +# Get a names vector of the types of discordant reads that will be used +discordant.read.types <- c(FA = 'FA', SS = 'SS', FM = 'FM', XC = 'crosschrom') + +# Set the diagnostic reads that will be used to detect each CNV +known.cnvs <- list(ace1 = list(Dup1 = list(FA = matrix(c(3436850,3639550,3437150,3639850), 2, 2), + #BP = data.frame(pos = c(3436926, 3639836), seq = c('GCGAA', 'GGAAT'))) + BP = data.frame(pos = c(3436926, 3639836), seq = c('GCGAA', 'TTGTT'))), + Dup2 = list(FA = matrix(c(3447950,3518600,3448250,3518900), 2, 2)), + Del1 = list(FM = matrix(c(3501850,3598750,3502150,3599050), 2, 2)), + Del2 = list(FM = matrix(c(3539300,3573450,3539600,3573750), 2, 2)), + Del3 = list(FM = matrix(c(3535850,3618700,3536150,3619000), 2, 2)), + Del4 = list(FM = matrix(c(3512200,3615990,3512500,3616290), 2, 2))), + cyp6 = list(Dup1 = list(FA = matrix(c(28480150, 28483200, 28480450, 28483550), 2, 2)), + Dup1a = list(BP = 
data.frame(pos = c(28480189, 28483475), + seq = c('CGTAG', 'AATTG'))), + Dup1b = list(BP = data.frame(pos = c(28480193, 28483675), + seq = c('CTGCT', 'CCTTC'))), + Dup2 = list(FA = matrix(c(28493450, 28497000, 28493750, 28497300), 2, 2), + BP = data.frame(pos = c(28493547, 28497279), + seq = c('GCCGC','TTTAA'))), + Dup3 = list(FA = matrix(c(28479350, 28483100, 28479650, 28483400), 2, 2), + BP = data.frame(pos = c(28479407, 28483372), + seq = c('GCTTA', 'CAAAG'))), + Dup4 = list(FA = matrix(c(28478850, 28482750, 28479150, 28483050), 2, 2), + BP = data.frame(pos = c(28478925, 28483069), + seq = c('TACTT', 'CATGT'))), + Dup5 = list(FA = matrix(c(28480300, 28484200, 28480600, 28484500), 2, 2), + BP = data.frame(pos = c(28480372, 28484518), + seq = c('AAGAG', 'ACAAA'))), + Dup6 = list(FA = matrix(c(28478150, 28483850, 28478450, 28484150), 2, 2), + BP = data.frame(pos = c(28478272, 28484157), + seq = c('ATCAC', 'CTAGA'))), + Dup7 = list(SS = matrix(c(28478000, 28486000, 28478300, 28486300), 2, 2), + BP = data.frame(pos = c(28478057, 28486036), + seq = c('AGAGC','TTTTT'))), + Dup8 = list(FA = matrix(c(28475900, 28484700, 28476200, 28485000), 2, 2), + BP = data.frame(pos = c(28475996, 28485005), + seq = c('AGCGA', 'CAAAT'))), + Dup9 = list(FA = matrix(c(28479100, 28491200, 28479400, 28491500), 2, 2), + BP = data.frame(pos = c(28479181, 28491431), + seq = c('TGTTC', 'TGTGG'))), + Dup10 = list(FA = matrix(c(28477800, 28490850, 28478100, 28491150), 2, 2), + BP = data.frame(pos = c(28477889, 28491215), + # It turns out that Dup10 is not quite as simple as made out in the Supp Mat for the Genome Research paper. + # There is actually some kind of insertion / mutation happening around the breakpoint, and the aligners used + # for this experiment and in Ag1000G deal with this differently (so we need to check the phase3 data and + # onwards to see what happens there since they used a different aligner to phase 2). We therefore need to change + # the sequence slightly here. 
+ seq = c('TGTAG','AACTT'))), + #seq = c('TGTAG','ACTCT'))), + Dup11 = list(FA = matrix(c(28487450, 28517800, 28487750, 28518100), 2, 2), + BP = data.frame(pos = c(28487546, 28518123), + seq = c('AACAC', 'TTATC'))), + Dup12 = list(FA = matrix(c(28474450, 28519650, 28474750, 28519950), 2, 2), + BP = data.frame(pos = c(28474576, 28520016), + seq = c('CCGAC', 'ACGGT'))), + Dup13 = list(FA = matrix(c(28472650, 28522350, 28472950, 28522650), 2, 2), + BP = data.frame(pos = c(28472728, 28522671), + seq = c('ACCGC', 'AGCTG'))), + Dup14 = list(FA = matrix(c(28473800, 28563200, 28474100, 28563500), 2, 2), + BP = data.frame(pos = c(28473874, 28563596), + seq = c('CCCAC', 'AGTTG'))), + Dup15 = list(FA = matrix(c(28465600, 55958800, 28465900, 55959100), 2, 2), + BP = data.frame(pos = c(28465673, NA), + seq = c('CAGCC', NA))), + Dup16 = list(FA = matrix(c(28480500, 28483300, 28480800, 28483600), 2, 2), + BP = data.frame(pos = c(28480547, 28483236), + seq = c('CCATT', 'TTAGT'))), + Dup17 = list(FA = matrix(c(28477500, 28484900, 28477800, 28485200), 2, 2), + BP = data.frame(pos = c(28477540, 28485380), + seq = c('TGCTG', 'ATCGG'))), + Dup18 = list(FA = matrix(c(28479500, 28494200, 28479800, 28494500), 2, 2), + BP = data.frame(pos = c(28479548, 28494597), + seq = c('AGTCG', 'TTGTC'))), + Dup19 = list(FA = matrix(c(28475480, 28556250, 28475780, 28556550), 2, 2), + BP = data.frame(pos = c(28475490, 28556726), + seq = c('AATAG', 'TGTGT'))), + Dup20 = list(FA = matrix(c(28473590, 28794750, 28473890, 28795050), 2, 2), + BP = data.frame(pos = c(28473600, 28795255), + seq = c('ATACT', 'CAAAA'))), + Dup21 = list(FA = matrix(c(28475100, 28483000, 28475400, 28483300), 2, 2), + BP = data.frame(pos = c(28475128, 28483473), + seq = c('AGCCG', 'TGCAC'))), + Dup22 = list(FA = matrix(c(28477200, 28484000, 28477500, 28484300), 2, 2), + BP = data.frame(pos = c(28477220, 28484338), + seq = c('GTGGA', 'CGAGT'))), + Dup23 = list(FA = matrix(c(28497300, 28371800, 28497600, 28372100), 2, 2), + BP = data.frame(pos = c(NA, 28497740), + seq = c(NA, 'TTGGC'))), + Dup24 = list(SS = matrix(c(28479500, 28483000, 28479800, 28483300), 2, 2), + BP = data.frame(pos = c(28480585, 28483442), + seq = c('AAACA','TTAAC'))), + # For 25 and 26, the FA reads would overlap, so we just use the BP reads. 
+ Dup25 = list(BP = data.frame(pos = c(28480335, 28483384), + seq = c('GGCGT', 'CATAT'))), + Dup26 = list(BP = data.frame(pos = c(28480166, 28483253), + seq = c('AACGT', 'TGTGT'))), + Dup27 = list(FA = matrix(c(28496700, 28498900, 28497000, 28499200), 2, 2)), + Dup28 = list(FA = matrix(c(28477700, 28496600, 28478000, 28496900), 2, 2), + BP = data.frame(pos = c(28477710, 28496953), + seq = c('CTGTA', 'ATTCT'))), + Dup29 = list(FA = matrix(c(28494000, 28496000, 28494300, 28496300), 2, 2), + BP = data.frame(pos = c(28494017, 28496505), + seq = c('TGGAA', 'TTTGC'))), + Dup30 = list(FA = matrix(c(28478900, 28484700, 28479200, 28485000), 2, 2), + BP = data.frame(pos = c(28478987, 28485033), + seq = c('AACAG', 'ACGTT')))), + cyp6mz = list(Dupm1 = list(BP = data.frame(pos = c(6927942, NA), + seq = c('ATTAT', NA))), + Dupm2 = list(FA = matrix(c(6933100, 6934900, 6933400, 6935200), 2, 2)), + Dupm3 = list(FA = matrix(c(6929600, 6932500, 6929900, 6932800), 2, 2)), + Dupm4 = list(FA = matrix(c(6929900, 6936600, 6930200, 6936900), 2, 2), + BP = data.frame(pos = c(6929933, 6936902), + seq = c('TTAAA', 'TGTCG'))), + Dupm5 = list(FA = matrix(c(6933800, 6938300, 6934100, 6938600), 2, 2), + BP = data.frame(pos = c(6933972, 6938659), + seq = c('AAACC', 'GTCGG'))), + Dupz1 = list(FA = matrix(c(6968950, 6979300, 6969250, 6979600), 2, 2), + BP = data.frame(pos = c(6968962, 6979681), + seq = c('ACGCT', 'AGGTT'))), + Dupz2 = list(FA = matrix(c(6975100, 6977100, 6975400, 6977400), 2, 2), + BP = data.frame(pos = c(NA, 6977514), # clipped reads align at 6975066 + seq = c(NA, 'TAAGA'))), + Dupz3 = list(FA = matrix(c(6971450, 6977800, 6971750, 6978100), 2, 2), + BP = data.frame(pos = c(6971484, NA), # clipped reads align at 6978084 + seq = c('GCAAA', NA))), + Dupz4 = list(FA = matrix(c(6972700, 6977350, 6973000, 6977650), 2, 2), + BP = data.frame(pos = c(6972775, 6977699), + seq = c('GAATG', 'GTCCA'))), + Dupz5 = list(FA = matrix(c(6969800, 6975700, 6970100, 6976000), 2, 2)), + Dupmz1 = list(FA = matrix(c(6982700, 6879900, 6983000, 6880200), 2, 2))), + gste = list(Dup1 = list(FA = matrix(c(6968950, 6979300, 6969250, 6979600), 2, 2), + BP = data.frame(pos = c(28596818, 28598850), + seq = c('TTTTG', 'CGTTT'))), + # The following definition includes some false positives. 
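+                               # (Dup2 is therefore scored via BP.weak, which takes the minimum of the start- and
+                               # end-breakpoint read counts rather than their sum, minimising those false positives.)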
+ Dup2 = list(BP.weak = data.frame(pos = c(28596390, 28598923), + seq = c('GGGGG', 'TTCCC'))), + Dup3 = list(FA = matrix(c(28590500, 28592950, 28590800, 28593250), 2, 2), + BP = data.frame(pos = c(28590597, 28593254), + seq = c('TCAAA', 'AGGGC'))), + Dup4 = list(FA = matrix(c(28595050, 28598750, 28595350, 28599050), 2, 2), + BP = data.frame(pos = c(28595162, 28599081), + seq = c('TTCTA', 'AGAAC'))), + Dup5 = list(BP = data.frame(pos = c(28593122, 28598971), + seq = c('GTCAT', 'ATTTA'))), + Dup6 = list(FA = matrix(c(28596250, 28601900, 28596550, 28602200), 2, 2), + BP = data.frame(pos = c(28596241, 28602177), + seq = c('ACAAC', 'GAAGC'))), + # For the XC reads, the first two rows are the CNV start point, and the second two rows are the end point + Dup7 = list(XC = data.frame(c(28597400, 3696450, 28603950, 26597300), c(28597700, 3696750, 28604250, 26597600), c('3R', '2L', '3R', 'UNKN')), + BP = data.frame(pos = c(28597504, 28604250), + seq = c('GTCCA', 'GCTGT'))), + Dup8 = list(BP = data.frame(pos = c(28594797, 28602349), + seq = c('GTCCC', 'CAGGG'))), + Dup9 = list(FA = matrix(c(28591050, 28600850, 28591350, 28601150), 2, 2), + BP = data.frame(pos = c(28591140, 28601188), + seq = c('AGAAG', 'GATGA'))), + Dup10 = list(FA = matrix(c(28593550, 28603350, 28593850, 28603650), 2, 2), + BP = data.frame(pos = c(28593642, 28603786), + seq = c('TCGCT', 'AAGAC'))), + Dup11 = list(XC = data.frame(c(28581250, 29210650, 28604650, 29210650), c(28581550, 29210950, 28604950, 29210950), c('3R', 'UNKN', '3R', 'UNKN')), + BP = data.frame(pos = c(28581256, 28604994), + seq = c('CCATT', 'GGTAA'))), + Dup12 = list(FA = matrix(c(28597000, 28599950, 28597300, 28600250), 2, 2), + BP = data.frame(pos = c(28597030, 28600292), + seq = c('TACTG', 'CATCT'))), + Dup13 = list(FA = matrix(c(28597000, 28598900, 28597300, 28599200), 2, 2), + BP = data.frame(pos = c(28597181, NA), # clipped sequences align at 28599287 + seq = c('TACTC', NA))), + Dup14 = list(FA = matrix(c(28599800, 28607200, 28600100, 28607500), 2, 2), + BP = data.frame(pos = c(28599926, 28607500), + seq = c('CGACG', 'ATGCA'))), + Dup15 = list(FA = matrix(c(28596200, 28598500, 28596500, 28598800), 2, 2), + BP = data.frame(pos = c(28596252, 28598948), + seq = c('TTGGA', 'TTGAC'))), + Dup16 = list(FA = matrix(c(28597300, 28603200, 28597600, 28603500), 2, 2), + BP = data.frame(pos = c(28597383, 28603517), + seq = c('ACATT', 'ATTAC')))), + cyp9k1 = list(Dup1 = list(FA = matrix(c(15242500, 15244500, 15242800, 15244800), 2, 2), + BP = data.frame(pos = c(15242505,15244812), + seq = c('GTTTG', 'CATAT'))), + Dup2 = list(FA = matrix(c(15238300, 15240800, 15238600, 15241100), 2, 2), + BP = data.frame(pos = c(15238400, 15241082), + seq = c('CCGGC',' CGGTA'))), + Dup3 = list(FA = matrix(c(15240300, 15243450, 15240600, 15243750), 2, 2), + BP = data.frame(pos = c(NA, 15243860), + seq = c(NA, 'TGAAC'))), + Dup4 = list(FA = matrix(c(15240600, 15244200, 15240900, 15244500), 2, 2), + BP = data.frame(pos = c(15240608, 15244503), + seq = c('ATAAA', 'ACTGG'))), + Dup5 = list(FA = matrix(c(15238800, 15243850, 15239100, 15244150), 2, 2), + BP = data.frame(pos = c(15238911, 15244175), + seq = c('CACGT', 'AGTAA'))), + Dup6 = list(FA = matrix(c(15236400, 15243250, 15236700, 15243550), 2, 2), + BP = data.frame(pos = c(15236449, 15243646), + seq = c('TTTTT', 'GTTTT'))), + Dup7 = list(SS = matrix(c(15245400, 15246900, 15245700, 15247200), 2, 2), + BP = data.frame(pos = c(15245768, 15247258), + seq = c('TTTGT', 'TCTAA'))), + Dup8 = list(FA = matrix(c(15239200, 15247250, 15239500, 15247550), 
2, 2), + BP = data.frame(pos = c(15239276, 15247645), + seq = c('AACAT', 'TTGCT'))), + Dup9 = list(FA = matrix(c(15239100, 15248900, 15239400, 15249200), 2, 2), + BP = data.frame(pos = c(15239184, 15249314), + seq = c('GCACA', 'AGTAC'))), + Dup10 = list(FA = matrix(c(15234900, 15244750, 15235200, 15245050), 2, 2), + BP = data.frame(pos = c(15234989, 15245128), + seq = c('GCACC', 'ATTCT'))), + Dup11 = list(FA = matrix(c(15236900, 15246800, 15237200, 15247100), 2, 2), + BP = data.frame(pos = c(15236922, 15247159), + seq = c('CATTA', 'TATCT'))), + Dup12 = list(FA = matrix(c(15234400, 15244350, 15234700, 15244650), 2, 2), + BP = data.frame(pos = c(15234434, 15244702), + seq = c('AACAG', 'TACTA'))), + Dup13 = list(FA = matrix(c(15240100, 15250250, 15240400, 15250550), 2, 2), + BP = data.frame(pos = c(15240067, 15250575), + seq = c('CCTAA', 'GTGTA'))), + # Dup 14 seems to have a different endpoint to Dup15, but the same insertion point way upstream + Dup14 = list(FA = matrix(c(15244200, 9676400, 15244500, 9676700), 2, 2), + # Because Dup14 has the same start pos as Dup15, we don't use the start breakpoint as a diagnostic + #BP = data.frame(pos = c(15233807, 15244936), + # seq = c('GGGTT', 'CCCAA'))), + BP = data.frame(pos = c(NA, 15244936), + seq = c(NA, 'CCCAA'))), + Dup15 = list(FA = matrix(c(15246250, 9676400, 15246550, 9676700), 2, 2), + # Because Dup14 has the same start pos as Dup15, we don't use the start breakpoint as a diagnostic + #BP = data.frame(pos = c(15233807, 15246640), + # seq = c('GGGTT', 'CCCAA'))), + BP = data.frame(pos = c(NA, 15246640), + seq = c(NA, 'CCCAA'))), + Dup16 = list(FA = matrix(c(15222700, 15244300, 15223000, 15244600), 2, 2), + BP = data.frame(pos = c(NA, 15244755), + seq = c(NA, 'AAGTA'))), + Dup17 = list(FA = matrix(c(15237150, 15243650, 15237450, 15243950), 2, 2), + BP = data.frame(pos = c(15237138, 15243975), + seq = c('TTGCT', 'TTTCG'))), + Dup18 = list(FA = matrix(c(15236100, 15243500, 15236400, 15243800), 2, 2), + BP = data.frame(pos = c(NA, 15243915), # clipped sequence aligns to 15236175 + seq = c(NA, 'CGGCG'))), + Dup19 = list(FA = matrix(c(15238800, 15251100, 15239100, 15251400), 2, 2), + BP = data.frame(pos = c(15238878, 15251503), + seq = c('TAAAT', 'GTTAC'))), + Dup20 = list(FA = matrix(c(15237350, 15243100, 15237650, 15243400), 2, 2), + BP = data.frame(pos = c(15237397, 15243514), + seq = c('ATGTT', 'TTACG'))), + Dup21 = list(FA = matrix(c(15237450, 15250300, 15237750, 15250600), 2, 2), + BP = data.frame(pos = c(15237482, 15250699), + seq = c('CTCTG', 'TTCTC'))), + Dup22 = list(FA = matrix(c(15240650, 15250300, 15240950, 15250600), 2, 2), + BP = data.frame(pos = c(15240680, 15250670), + seq = c('TTCCA', 'ATTCT'))), + Dup23 = list(FA = matrix(c(15241800, 15248100, 15242100, 15248400), 2, 2), + BP = data.frame(pos = c(15241929, 15248352), + seq = c('AACAA', 'CACGT'))), + Dup24 = list(FA = matrix(c(15238550, 15254800, 15238850, 15255100), 2, 2)), + Dup25 = list(FA = matrix(c(15223500, 15246350, 15223800, 15246650), 2, 2)), + Dup26 = list(FA = matrix(c(15222700, 15247750, 15223000, 15248050), 2, 2)), + Dup27 = list(FA = matrix(c(15237400, 15248350, 15237700, 15248650), 2, 2), + BP = data.frame(pos = c(15237566, NA), + seq = c('AATGT', NA))), + Dup28 = list(BP = data.frame(pos = c(NA, 15246640), + seq = c(NA, 'TCGAG')))) +) + +# Write a function to load the results of discordant read analysis from file for a given sample +get.discordant.reads <- function(this.sample, target.folder, target.region.coords){ + this.file <- grep(paste(this.sample, '.*csv$', 
sep = ''), list.files(target.folder), value = T) + if (length(this.file) != 1) + stop('There should be one and only one file in the folder that matches the sample name.') + pos.table <- read.table(paste(target.folder, this.file, sep='/'), header = T, sep='\t', colClasses = c('character', 'integer', 'character')) + split.pos.table <- split(pos.table, pos.table$Type) + get.discordant.read.type <- function(read.type){ + full.read.type <- paste(read.type, 'mapq >= 10') + if (!(full.read.type %in% names(split.pos.table))){ + return(data.frame(Position = integer(), Mate.position = integer())) + } + allpos <- split.pos.table[[full.read.type]][, c('Position', 'Mate.position')] + if (read.type %in% c('FA', 'SS', 'FM')){ + allpos$Mate.position <- as.integer(allpos$Mate.position) + which.in.region <- ((allpos$Position >= target.region.coords[1]) & (allpos$Position <= target.region.coords[2])) | ((allpos$Mate.position >= target.region.coords[1]) & (allpos$Mate.position <= target.region.coords[2])) + } + else{ + which.in.region <- (allpos$Position >= target.region.coords[1]) & (allpos$Position <= target.region.coords[2]) + } + allpos <- allpos[which.in.region, ] + allpos[order(allpos$Position),] + } + lapply(discordant.read.types, get.discordant.read.type) +} + +# Write a function to load the results of breakpoint read analysis from file for a given sample +get.breakpoint.reads <- function(this.sample, target.folder, target.region.coords){ + this.file <- grep(paste(this.sample, '.*csv$', sep = ''), list.files(target.folder), value = T) + if (length(this.file) != 1) + stop('There should be one and only one file in the folder that matches the sample name.') + pos.table <- read.table(paste(target.folder, this.file, sep='/'), header = T, sep='\t', colClasses = c('character', 'integer', 'character')) + which.in.region <- (pos.table$Position >= target.region.coords[1]) & (pos.table$Position <= target.region.coords[2]) + pos.table <- pos.table[which.in.region, ] + clipping.start.point <- pos.table[pos.table$Type == 'soft clipping start point mapq >= 10', c('Position', 'Clipped_sequence')] + clipping.end.point <- pos.table[pos.table$Type == 'soft clipping end point mapq >= 10', c('Position', 'Clipped_sequence')] + # + list(CSP = clipping.start.point[order(clipping.start.point$Position), ], + CEP = clipping.end.point[order(clipping.end.point$Position), ]) +} + +get.diagnostic.reads <- function(this.sample, disc.target.folder, bp.target.folder, target.region.coords){ + cat('\tLoading diagnostic reads for ', this.sample, '.\n', sep = '') + output <- get.discordant.reads(this.sample, disc.target.folder, target.region.coords) + output[['BP']] <- get.breakpoint.reads (this.sample, bp.target.folder, target.region.coords) + output +} + +# Write a function to load the results of discordant and breakpoint reads from file for a vector of samples +get.diagnostic.reads.allsamples <- function(sample.names, disc.target.folder, bp.target.folder, target.region.coords){ + cat('Loading discordant reads from ', disc.target.folder, ' and breakpoint reads from ', bp.target.folder, '.\n', sep = '') + lapply(sample.names, get.diagnostic.reads, disc.target.folder, bp.target.folder, target.region.coords) +} + +# Function to count the diagnostic reads for a given CNV. +# di is a table of observed discordant and breakpoint reads for a given sample. 
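+# Dup.diagnostics is the known.cnvs entry for one CNV: at most one of the FA / SS / FM / XC
+# coordinate windows for discordant read pairs, plus optionally BP (breakpoint positions paired
+# with the first five expected clipped bases). The returned vector counts the diagnostic reads
+# observed for each read class.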
+count.reads.per.dup <- function(Dup.diagnostics, di){ + # There shouldn't be more than one of FA, SS, FM and XC + if (sum(names(Dup.diagnostics) %in% c('FA', 'SS', 'FM', 'XC')) > 1) + stop('There should not be more than one type of diagnostic read for a given Duplication.') + num.reads <- setNames(numeric(7), c('FA', 'SS', 'FM', 'XCS', 'XCE', 'CSP', 'CEP')) + if ('FA' %in% names(Dup.diagnostics)){ + num.reads['FA'] <- sum((di$FA$Position > Dup.diagnostics$FA[1,1]) & di$FA$Position < Dup.diagnostics$FA[1,2] + & (di$FA$Mate.position > Dup.diagnostics$FA[2,1]) & (di$FA$Mate.position < Dup.diagnostics$FA[2,2])) + } + if ('SS' %in% names(Dup.diagnostics)){ + num.reads['SS'] <- sum((di$SS$Position > Dup.diagnostics$SS[1,1]) & di$SS$Position < Dup.diagnostics$SS[1,2] + & (di$SS$Mate.position > Dup.diagnostics$SS[2,1]) & (di$SS$Mate.position < Dup.diagnostics$SS[2,2])) + } + if ('FM' %in% names(Dup.diagnostics)){ + num.reads['FM'] <- sum((di$FM$Position > Dup.diagnostics$FM[1,1]) & di$FM$Position < Dup.diagnostics$FM[1,2] + & (di$FM$Mate.position > Dup.diagnostics$FM[2,1]) & (di$FM$Mate.position < Dup.diagnostics$FM[2,2])) + } + if ('XC' %in% names(Dup.diagnostics)){ + these.XC <- di$XC + these.XC$Matechrom <- sub(':.*', '', these.XC$Mate.position) + these.XC$Matepos <- as.integer(sub('.*:' , '', these.XC$Mate.position)) + num.reads['XCS'] <- sum((these.XC$Position > Dup.diagnostics$XC[1,1]) & (these.XC$Position < Dup.diagnostics$XC[1,2]) + & (these.XC$Matechrom == Dup.diagnostics$XC[2,3]) & (these.XC$Matepos > Dup.diagnostics$XC[2,1]) & (these.XC$Matepos < Dup.diagnostics$XC[2,2]), na.rm = T) + num.reads['XCE'] <- sum((these.XC$Position > Dup.diagnostics$XC[3,1]) & (these.XC$Position < Dup.diagnostics$XC[3,2]) + & (these.XC$Matechrom == Dup.diagnostics$XC[4,3]) & (these.XC$Matepos > Dup.diagnostics$XC[4,1]) & (these.XC$Matepos < Dup.diagnostics$XC[4,2]), na.rm = T) + } + if ('BP' %in% names(Dup.diagnostics)){ + CEP.seq <- subset(di$BP$CEP, Position == Dup.diagnostics$BP$pos[1])$Clipped_sequence + num.reads['CEP'] <- sum(substr(reverse(CEP.seq), 1, 5) == Dup.diagnostics$BP$seq[1]) + CSP.seq <- subset(di$BP$CSP, Position == Dup.diagnostics$BP$pos[2])$Clipped_sequence + num.reads['CSP'] <- sum(substr(CSP.seq, 1, 5) == Dup.diagnostics$BP$seq[2]) + } + # There is a special case in Gste2 where one Dup isn't defined very effectively by diagnostic + # reads. The best diagnostic based on phase 2 data still includes some false positives, which + # were minimised by taking the minimum of CEP and CSP, rather than the sum + if ('BP.weak' %in% names(Dup.diagnostics)){ + CEP.seq <- subset(di$BP$CEP, Position == Dup.diagnostics$BP$pos[1])$Clipped_sequence + num.reads.cep <- sum(substr(reverse(CEP.seq), 1, 5) == Dup.diagnostics$BP$seq[1]) + CSP.seq <- subset(di$BP$CSP, Position == Dup.diagnostics$BP$pos[2])$Clipped_sequence + num.reads.csp <- sum(substr(CSP.seq, 1, 5) == Dup.diagnostics$BP$seq[2]) + num.reads['BP.weak'] <- min(num.reads.cep, num.reads.csp) + } + num.reads +} + +# Function to apply count.reads.per.dup to all CNVs for a given sample +count.diagnostic.reads <- function(diagnostic.reads.list, known.cnvs.list, all.breakpoint.positions, verbose = F){ + # Subset the BP reads so that they only include reads that match at least one of + # the breakpoints. That will greatly reduce the list that needs searching for every + # CNV. 
+  diagnostic.reads.list$BP$CSP <- subset(diagnostic.reads.list$BP$CSP, Position %in% all.breakpoint.positions)
+  diagnostic.reads.list$BP$CEP <- subset(diagnostic.reads.list$BP$CEP, Position %in% all.breakpoint.positions)
+  num.reads <- lapply(known.cnvs.list, count.reads.per.dup, diagnostic.reads.list)
+  if (verbose)
+    print(num.reads)
+  sapply(num.reads, sum)
+}
+
+# Function to apply count.diagnostic.reads to all samples
+count.diagnostic.reads.allsamples <- function(diagnostic.reads.allsamples, known.cnvs.list){
+  # Create a single object containing all of the breakpoint positions
+  breakpoint.summary <- unlist(sapply(known.cnvs.list, function(x) x$BP$pos))
+  t(sapply(diagnostic.reads.allsamples, count.diagnostic.reads, known.cnvs.list, breakpoint.summary))
+}
+
+# Function to produce a table of all observed CNVs in all samples, given a list of coverage-based calls,
+# a table of counts of diagnostic reads for each CNV in all samples, and a threshold number of diagnostic
+# reads required to call a CNV
+get.read.based.cnvs <- function(coverage.cnv.calls, diagnostic.read.counts.table, threshold.diagnostic.reads = 4){
+  read.based <- cbind(diagnostic.read.counts.table >= threshold.diagnostic.reads)
+  # Unlike in phase2, we use Dup0 to indicate that there is ANY coverage call, not just the samples that
+  # have a coverage call but not a read-based call
+  cbind(Dup0 = rownames(read.based) %in% coverage.cnv.calls, read.based)
+}
+
+# Set the folders in which to look for FA, SS, FM and XC reads
+SSFA.folders <- setNames(paste(diagnostic.reads.folder, c('SSFA/2R/Ace1_region',
+                                                          'SSFA/2R/Cyp6_region',
+                                                          'SSFA/3R/Cyp6zm_region',
+                                                          'SSFA/3R/Gste_region',
+                                                          'SSFA/X/Cyp9k1_region'),
+                               sep = '/'),
+                         c('ace1', 'cyp6', 'cyp6mz', 'gste', 'cyp9k1')
+)
+
+# Set the folders in which to look for breakpoint reads.
+breakpoints.folders <- setNames(paste(diagnostic.reads.folder, c('breakpoints/2R/Ace1_region',
+                                                                 'breakpoints/2R/Cyp6_region',
+                                                                 'breakpoints/3R/Cyp6zm_region',
+                                                                 'breakpoints/3R/Gste_region',
+                                                                 'breakpoints/X/Cyp9k1_region'),
+                                      sep = '/'),
+                                c('ace1', 'cyp6', 'cyp6mz', 'gste', 'cyp9k1')
+)
+
+cat('Loading discordant and breakpoint reads\n')
+diagnostic.reads <- mcmapply(get.diagnostic.reads.allsamples, SSFA.folders, breakpoints.folders, region.coords, MoreArgs = list(sample.names = sample.names), SIMPLIFY = F, mc.preschedule = F, mc.cores = num.cores)
+
+cat('Counting diagnostic reads\n')
+diagnostic.read.counts <- mcmapply(count.diagnostic.reads.allsamples, diagnostic.reads, known.cnvs, mc.preschedule = F, mc.cores = num.cores)
+cat('Calling read-based CNVs\n')
+read.based.cnvs <- mapply(get.read.based.cnvs, cov.cnv.samples, diagnostic.read.counts, SIMPLIFY = F)
+# As a special case for cyp6, we don't score the presence of Dup1 if either Dup1a or Dup1b are present
+# (because Dup1 just represents an ambiguous call for Dup1a / Dup1b).
+cyp6.Dup1ab <- read.based.cnvs$cyp6[, 'Dup1a'] | read.based.cnvs$cyp6[, 'Dup1b']
+read.based.cnvs$cyp6[cyp6.Dup1ab, 'Dup1'] <- F
+
+# Combine the separate CNV tables into a single table
+full.cnv.table <- do.call(cbind, read.based.cnvs)
+full.cnv.table <- cbind(rownames(full.cnv.table) %in% high.var.samples, full.cnv.table)
+gene.cluster.names <- setNames(c('Ace1', 'Cyp6aap', 'Cyp6mz', 'Gstue', 'Cyp9k1'),
+                               names(diagnostic.read.counts))
+# The column names for the full table use the gene cluster codes from the phase 2 analysis, and split
+# the Cyp6mz cnvs into their constituent clusters. These patterns are regular expressions (the character
+# class [^z] can never match under fixed-string matching), so we need stri_replace_all_regex here.
+colnames(full.cnv.table) <- c('High.var.sample', stri_replace_all_regex(unlist(sapply(names(gene.cluster.names), function(x) paste(gene.cluster.names[x], colnames(read.based.cnvs[[x]]), sep = '_'))),
+                                                                        c('Cyp6mz_Dupm([^z])', 'Cyp6mz_Dupz'),
+                                                                        c('Cyp6m_Dup$1', 'Cyp6z_Dup'),
+                                                                        vectorize_all = F)
+)
+# Get the results in a different format. Here as a list where, for each CNV, we have a vector of samples that
+# carry it.
+cnv.sample.lists <- lapply(read.based.cnvs, function(m) apply(m, 2, function(x) rownames(m)[x]))
+
+# Get a vector of samples that carry at least one CNV based on read calls.
+# Dup0 is always the first column, so the -1 index removes Dup0
+read.cnv.samples <- lapply(read.based.cnvs, function(x) rownames(x)[apply(x[, -1], 1, any)])
+
+# For each cluster, get a vector of samples that have been called as carrying a cnv by reads but not coverage,
+# ignoring samples with high variance.
+cov.based.negatives <- lapply(mapply(setdiff, read.cnv.samples, cov.cnv.samples), setdiff, high.var.samples)
+# And vice-versa
+read.based.negatives <- lapply(mapply(setdiff, cov.cnv.samples, read.cnv.samples), setdiff, high.var.samples)
+
+dir.create('target_regions_analysis', showWarnings = FALSE)
+write.table(full.cnv.table, file = 'target_regions_analysis/focal_region_CNV_table.csv', sep = '\t', col.names = NA, quote = F)
+write.table(hmm.cnv.table, file = 'target_regions_analysis/HMM_gene_copy_number.csv', sep = '\t', col.names = NA, quote = F)
+
+source(plotting.functions.file)
+save.image('target_regions_analysis/target_regions_analysis.Rdata')
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_old.sh b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_old.sh
new file mode 100755
index 00000000..48fde692
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_old.sh
@@ -0,0 +1,33 @@
+workingfolder=$1
+manifest=$2
+species_id_file=$3
+coverage_variance_file=$4
+metadata=$5
+ncores=$6
+
+coveragefolder=$workingfolder/coverage
+diagnostic_reads_folder=$workingfolder/diagnostic_reads
+gene_coordinates_file=~/personal/phase3_data/tables/gene_regions.csv
+scriptsfolder=~/scripts/CNV_scripts/scripts
+plotting_functions_file=$scriptsfolder/plotting_functions.r
+
+cd $workingfolder
+
+mkdir -p target_regions_analysis
+
+export R_LIBS_USER="~/R-modules:$R_LIBS_USER"
+
+R --version
+
+R --slave -f $scriptsfolder/target_regions_analysis.r --args $manifest \
+           $gene_coordinates_file \
+           $metadata \
+           $species_id_file \
+           $coverage_variance_file \
+           $coveragefolder \
+           $diagnostic_reads_folder \
+           $plotting_functions_file \
+           $ncores \
+           > target_regions_analysis/target_regions_analysis.log 2>&1
+
diff --git a/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_vobs.sh b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_vobs.sh
new file mode 100755
index 00000000..1cf3ef51
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/Rscripts/target_regions_analysis_vobs.sh
@@ -0,0 +1,33 @@
+workingfolder=$1
+manifest=$2
+species_id_file=$3
+coverage_variance_file=$4
+gene_coordinates_file=$5
+metadata=$6
+ncores=$7
+
+coveragefolder=$workingfolder/coverage
+diagnostic_reads_folder=$workingfolder/diagnostic_reads
+scriptsfolder=~/scripts/CNV_scripts/scripts
+plotting_functions_file=$scriptsfolder/plotting_functions.r
+
+cd $workingfolder
+
+mkdir -p target_regions_analysis
+
+export R_LIBS_USER="~/R-modules:$R_LIBS_USER"
+
+R-3.6.1 --version
+
+R-3.6.1 --slave -f $scriptsfolder/target_regions_analysis.r --args $manifest \
+           $gene_coordinates_file \
+           $metadata \
+           $species_id_file \
+           $coverage_variance_file \
+           $coveragefolder \
+           $diagnostic_reads_folder \
+           $plotting_functions_file \
+           $ncores \
+           > target_regions_analysis/target_regions_analysis.log 2>&1
+
diff --git a/dockerfiles/CNV/R3.6.1/docker_build.sh b/dockerfiles/CNV/R3.6.1/docker_build.sh
new file mode 100755
index 00000000..82b3da4d
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/docker_build.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#fail-fast
+set -e -x
+
+# Update version when changes to Dockerfile are made
+DOCKER_IMAGE_VERSION=1.0.0
+TIMESTAMP=$(date +"%s")
+DIR=$(cd "$(dirname "$0")" && pwd)
+TAG=$1
+CACHING=$2
+
+# Registries and tags
+GCR_URL="us.gcr.io/broad-gotc-prod/cnv/r"
+
+
+# Necessary tools and help text
+TOOLS=(docker gcloud)
+HELP="$(basename "$0") [-h|--help] [-t|--tools] [TAG] [CACHING] -- script to build the image and push to GCR
+where:
+    -h|--help Show help text
+    -t|--tools Show tools needed to run script
+    TAG : Set a fixed tag string (this will override our tagging convention). Useful for development or on CI/CD to reduce wasted images
+    CACHING : Set to ON to enable caching. Any other value or simply not setting it will default to no-cache builds.
+    "
+
+function main(){
+    for t in "${TOOLS[@]}"; do which "$t" >/dev/null || ok=no; done
+    if [[ $ok == no ]]; then
+        echo "Missing one of the following tools: "
+        for t in "${TOOLS[@]}"; do echo "$t"; done
+        exit 1
+    fi
+
+    while [[ $# -gt 0 ]]
+    do
+        key="$1"
+        case $key in
+            -h|--help)
+                echo "$HELP"
+                exit 0
+                ;;
+            -t|--tools)
+                for t in "${TOOLS[@]}"; do echo "$t"; done
+                exit 0
+                ;;
+            *)
+                shift
+                ;;
        esac
+    done
+    if [ -n "$TAG" ];
+    then
+        IMAGE_TAG="$TAG"
+        echo "Tag is set to $TAG. Overriding default."
+    else
+        echo "Tag is unset. Using default."
+        IMAGE_TAG="$DOCKER_IMAGE_VERSION-$TIMESTAMP"
+    fi
+
+    echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG"
+
+    # Build with the cache only when CACHING is ON, matching the help text above
+    if [[ "$CACHING" == "ON" ]]; then
+        docker build -t "$GCR_URL:$IMAGE_TAG" "$DIR"
+    else
+        docker build -t "$GCR_URL:$IMAGE_TAG" --no-cache "$DIR"
+    fi
+    docker push "$GCR_URL:$IMAGE_TAG"
+
+    echo -e "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv"
+    echo "done"
+}
+
+main "$@"
\ No newline at end of file
diff --git a/dockerfiles/CNV/R3.6.1/installRDeps.R b/dockerfiles/CNV/R3.6.1/installRDeps.R
new file mode 100644
index 00000000..a158d03a
--- /dev/null
+++ b/dockerfiles/CNV/R3.6.1/installRDeps.R
@@ -0,0 +1,42 @@
+#!/usr/bin/env Rscript
+
+## Default repo
+local({r <- getOption("repos")
+       r["CRAN"] <- "https://cloud.r-project.org"
+       options(repos=r)
+})
+
+
+## Install Biostrings
+if (!require("BiocManager", quietly = TRUE))
+    install.packages("BiocManager")
+
+BiocManager::install("Biostrings")
+
+library(BiocManager)
+BiocManager::valid()
+
+print("checking Biostrings")
+
library("Biostrings", character.only=TRUE, logical.return=TRUE) ) { + quit(status=1, save='no') + } + + +## Install other libraries +install.packages("stringi", dependencies=TRUE, INSTALL_opts = c('--no-lock')) +install.packages("stringr", dependencies=TRUE, INSTALL_opts = c('--no-lock')) +install.packages("RColorBrewer", dependencies=TRUE, INSTALL_opts = c('--no-lock')) + +print("checking stringi") +if ( ! library('stringi', character.only=TRUE, logical.return=TRUE) ) { + quit(status=1, save='no') + } +print("checking stringr") +if ( ! library("stringr", character.only=TRUE, logical.return=TRUE) ) { + quit(status=1, save='no') + } +print("checking RColorBrewer") +if ( ! library("RColorBrewer", character.only=TRUE, logical.return=TRUE) ) { + quit(status=1, save='no') + } \ No newline at end of file diff --git a/dockerfiles/CNV/docker_build.sh b/dockerfiles/CNV/docker_build.sh new file mode 100755 index 00000000..c508f2a7 --- /dev/null +++ b/dockerfiles/CNV/docker_build.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#fail-fast +set -e -x + +# Update version when changes to Dockerfile are made +DOCKER_IMAGE_VERSION=1.0.0 +TIMESTAMP=$(date +"%s") +DIR=$(cd "$(dirname "$0")" && pwd) +TAG=$1 +CACHING=$2 + +# Registries and tags +GCR_URL="us.gcr.io/broad-gotc-prod/cnv" + + +# Necessary tools and help text +TOOLS=(docker gcloud) +HELP="$(basename "$0") [-h|--help] [-t|tools] [TAG] [CACHING] -- script to build the image and push to GCR +where: + -h|--help Show help text + -t|--tools Show tools needed to run script + TAG : Set a fixed tag string (this will override our tagging convention). Useful for development or on CI/CD to reduce wasted images + CACHING : Set to ON to enable caching. Any other value or simply not setting it will default to no-cache builds. + " + +function main(){ + for t in "${TOOLS[@]}"; do which "$t" >/dev/null || ok=no; done + if [[ $ok == no ]]; then + echo "Missing one of the following tools: " + for t in "${TOOLS[@]}"; do echo "$t"; done + exit 1 + fi + + while [[ $# -gt 0 ]] + do + key="$1" + case $key in + -h|--help) + echo "$HELP" + exit 0 + ;; + -t|--tools) + for t in "${TOOLS[@]}"; do echo "$t"; done + exit 0 + ;; + *) + shift + ;; + esac + done + if [ -n "$TAG" ]; + then + IMAGE_TAG="$TAG" + echo "Tag is set to $TAG. Overriding default." + else + echo "Tag is unset. Using default." 
+ IMAGE_TAG="$DOCKER_IMAGE_VERSION-$TIMESTAMP" + fi + + echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG" + + if [[ "$CACHING" == "ON" ]]; then + docker build -t "$GCR_URL:$IMAGE_TAG" "$DIR" + else + docker build -t "$GCR_URL:$IMAGE_TAG" --no-cache "$DIR" + fi + docker push "$GCR_URL:$IMAGE_TAG" + + echo -e "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv" + echo "done" +} + +main "$@" \ No newline at end of file diff --git a/dockerfiles/CNV/docker_versions.tsv b/dockerfiles/CNV/docker_versions.tsv new file mode 100644 index 00000000..ae4252ad --- /dev/null +++ b/dockerfiles/CNV/docker_versions.tsv @@ -0,0 +1,5 @@ +us.gcr.io/broad-gotc-prod/cnv:0.1 +us.gcr.io/broad-gotc-prod/cnv:1.0.0-1677515293 +us.gcr.io/broad-gotc-prod/cnv:1.0.0-1677557222 +us.gcr.io/broad-gotc-prod/cnv:1.0.0-1679431881 +us.gcr.io/broad-gotc-prod/cnv:1.0.0-1690999960 diff --git a/dockerfiles/CNV/requirements.txt b/dockerfiles/CNV/requirements.txt new file mode 100644 index 00000000..6f2a4e00 --- /dev/null +++ b/dockerfiles/CNV/requirements.txt @@ -0,0 +1,103 @@ +argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1602546578258/work +asciitree==0.3.3 +async-generator==1.10 +attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1599308529326/work +backports.functools-lru-cache==1.6.1 +bcolz==1.2.1 +biopython @ file:///tmp/build/80754af9/biopython_1600298863563/work +bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1600454382015/work +bokeh @ file:///home/conda/feedstock_root/build_artifacts/bokeh_1602690186583/work +certifi @ file:///croot/certifi_1671487769961/work/certifi +cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1602537219008/work +click==7.1.2 +cloudpickle @ file:///home/conda/feedstock_root/build_artifacts/cloudpickle_1598400192773/work +cycler==0.10.0 +cytoolz==0.11.0 +dask @ file:///home/conda/feedstock_root/build_artifacts/dask-core_1602029610262/work +decorator==4.4.2 +defusedxml==0.6.0 +distributed @ file:///home/conda/feedstock_root/build_artifacts/distributed_1602493186453/work +entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1602701733603/work/dist/entrypoints-0.3-py2.py3-none-any.whl +fasteners==0.14.1 +fsspec @ file:///home/conda/feedstock_root/build_artifacts/fsspec_1602700749102/work +h5py @ file:///home/conda/feedstock_root/build_artifacts/h5py_1602551881630/work +HeapDict==1.0.1 +hmmlearn @ file:///home/conda/feedstock_root/build_artifacts/hmmlearn_1603039319943/work +importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1602263269022/work +ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1602682802500/work/dist/ipykernel-5.3.4-py3-none-any.whl +ipython==5.8.0 +ipython-genutils==0.2.0 +ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1599554010055/work +Jinja2==2.11.2 +joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1601671685479/work +jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema_1602551949684/work +jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1598486169312/work +jupyter-console==5.2.0 +jupyter-core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1602537277085/work +jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work +kiwisolver @ file:///home/conda/feedstock_root/build_artifacts/kiwisolver_1602517221725/work +locket==0.2.0 +MarkupSafe @ 
file:///home/conda/feedstock_root/build_artifacts/markupsafe_1602267316845/work +matplotlib @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-suite_1602600750896/work +mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1602381812692/work +mock @ file:///home/conda/feedstock_root/build_artifacts/mock_1602394920017/work +monotonic==1.5 +msgpack @ file:///home/conda/feedstock_root/build_artifacts/msgpack-python_1602380760823/work +nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1598558657104/work +nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert_1602715396354/work +nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1602732862338/work +nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1601342677072/work +networkx @ file:///home/conda/feedstock_root/build_artifacts/networkx_1598210780226/work +notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1602720128568/work +numcodecs @ file:///home/conda/feedstock_root/build_artifacts/numcodecs_1602514772924/work +numexpr @ file:///home/conda/feedstock_root/build_artifacts/numexpr_1602671016982/work +numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1602429044575/work +olefile==0.46 +packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1589925210001/work +pandas @ file:///home/conda/feedstock_root/build_artifacts/pandas_1602502751364/work +pandocfilters==1.4.2 +partd==1.1.0 +patsy==0.5.1 +pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1602535608087/work +pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work +Pillow @ file:///home/conda/feedstock_root/build_artifacts/pillow_1602708615436/work +pomegranate @ file:///home/conda/feedstock_root/build_artifacts/pomegranate_1591385582992/work +prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1590412252446/work +prompt-toolkit==1.0.15 +psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1602264040045/work +ptyprocess==0.6.0 +pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work +Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1600347314331/work +pyparsing==2.4.7 +pyrsistent @ file:///home/conda/feedstock_root/build_artifacts/pyrsistent_1602259985647/work +pysam==0.15.3 +python-dateutil==2.8.1 +pytz==2020.1 +PyYAML==5.3.1 +pyzmq==19.0.2 +qtconsole @ file:///home/conda/feedstock_root/build_artifacts/qtconsole_1599147533948/work +QtPy==1.9.0 +scikit-allel @ file:///home/conda/feedstock_root/build_artifacts/scikit-allel_1597004364242/work +scikit-learn @ file:///home/conda/feedstock_root/build_artifacts/scikit-learn_1596546074663/work +scipy @ file:///home/conda/feedstock_root/build_artifacts/scipy_1602441086235/work +seaborn @ file:///home/conda/feedstock_root/build_artifacts/seaborn-base_1599592695803/work +Send2Trash==1.5.0 +simplegeneric==0.8.1 +six @ file:///home/conda/feedstock_root/build_artifacts/six_1590081179328/work +sortedcontainers @ file:///home/conda/feedstock_root/build_artifacts/sortedcontainers_1591999956871/work +statsmodels @ file:///home/conda/feedstock_root/build_artifacts/statsmodels_1602599914091/work +tables==3.6.1 +tblib==1.6.0 +terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1602679584439/work +testpath==0.4.4 +threadpoolctl @ file:///tmp/tmp79xdzxkt/threadpoolctl-2.1.0-py3-none-any.whl +toolz @ 
file:///home/conda/feedstock_root/build_artifacts/toolz_1600973991856/work +tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1602488893411/work +traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1600970644261/work +typing-extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1602702424206/work +wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work +webencodings==0.5.1 +widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1594164347302/work +zarr @ file:///tmp/build/80754af9/zarr_1602185171386/work +zict==2.0.0 +zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1601765966131/work diff --git a/dockerfiles/CNV/requirements_conda.yml b/dockerfiles/CNV/requirements_conda.yml new file mode 100644 index 00000000..47a2631e --- /dev/null +++ b/dockerfiles/CNV/requirements_conda.yml @@ -0,0 +1,191 @@ +name: cnv37 +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - argon2-cffi=20.1.0=py37h8f50634_2 + - asciitree=0.3.3=py_2 + - async_generator=1.10=py_0 + - attrs=20.2.0=pyh9f0ad1d_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.1=py_0 + - bcolz=1.2.1=py37hb3f55d8_1001 + - biopython=1.78=py37h7b6447c_0 + - bleach=3.2.1=pyh9f0ad1d_0 + - blosc=1.20.1=he1b5a44_0 + - bokeh=2.2.2=py37hc8dfbb8_0 + - bzip2=1.0.8=h516909a_3 + - ca-certificates=2022.10.11=h06a4308_0 + - certifi=2022.12.7=py37h06a4308_0 + - cffi=1.14.3=py37h00ebd2e_1 + - click=7.1.2=pyh9f0ad1d_0 + - cloudpickle=1.6.0=py_0 + - curl=7.71.1=hbc83047_1 + - cycler=0.10.0=py_2 + - cytoolz=0.11.0=py37h8f50634_1 + - dask=2.30.0=py_0 + - dask-core=2.30.0=py_0 + - dbus=1.13.6=he372182_0 + - decorator=4.4.2=py_0 + - defusedxml=0.6.0=py_0 + - distributed=2.30.0=py37hc8dfbb8_1 + - entrypoints=0.3=py37hc8dfbb8_1002 + - expat=2.2.9=he1b5a44_2 + - fasteners=0.14.1=py_3 + - fontconfig=2.13.0=h9420a91_0 + - freetype=2.10.3=he06d7ca_0 + - fsspec=0.8.4=py_0 + - gettext=0.19.8.1=hc5be6a0_1002 + - glib=2.66.1=h680cd38_0 + - gst-plugins-base=1.14.5=h0935bb2_2 + - gstreamer=1.14.5=h36ae1b5_2 + - h5py=2.10.0=nompi_py37hf7afa78_105 + - hdf5=1.10.6=nompi_h3c11f04_101 + - heapdict=1.0.1=py_0 + - hmmlearn=0.2.4=py37h161383b_1 + - icu=58.2=he6710b0_3 + - importlib-metadata=2.0.0=py_1 + - importlib_metadata=2.0.0=1 + - ipykernel=5.3.4=py37hc6149b9_1 + - ipython=5.8.0=py37_1 + - ipython_genutils=0.2.0=py_1 + - ipywidgets=7.5.1=pyh9f0ad1d_1 + - jinja2=2.11.2=pyh9f0ad1d_0 + - joblib=0.17.0=py_0 + - jpeg=9d=h516909a_0 + - jsonschema=3.2.0=py_2 + - jupyter=1.0.0=py_2 + - jupyter_client=6.1.7=py_0 + - jupyter_console=5.2.0=py37_1 + - jupyter_core=4.6.3=py37hc8dfbb8_2 + - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 + - kiwisolver=1.2.0=py37h99015e2_1 + - krb5=1.18.2=h173b8e3_0 + - lcms2=2.11=hbd6801e_0 + - ld_impl_linux-64=2.35=h769bd43_9 + - libblas=3.8.0=17_openblas + - libcblas=3.8.0=17_openblas + - libclang=10.0.1=default_hde54327_1 + - libcurl=7.71.1=h20c2e04_1 + - libdeflate=1.0=h14c3975_1 + - libedit=3.1.20191231=he28a2e2_2 + - libevent=2.1.10=hcdb4288_3 + - libffi=3.2.1=he1b5a44_1007 + - libgcc-ng=9.3.0=h5dbcf3e_17 + - libgfortran-ng=7.5.0=hae1eefd_17 + - libgfortran4=7.5.0=hae1eefd_17 + - libgomp=9.3.0=h5dbcf3e_17 + - libiconv=1.16=h516909a_0 + - liblapack=3.8.0=17_openblas + - libllvm10=10.0.1=he513fc3_3 + - libopenblas=0.3.10=pthreads_hb3c22a3_5 + - libpng=1.6.37=hed695b0_2 + - libsodium=1.0.18=h516909a_1 + - libssh2=1.9.0=h1ba5d50_1 
+ - libstdcxx-ng=9.3.0=h2ae2ef3_17 + - libtiff=4.1.0=hc7e4089_6 + - libuuid=1.0.3=h1bed415_2 + - libwebp-base=1.1.0=h516909a_3 + - libxcb=1.13=h14c3975_1002 + - libxkbcommon=0.10.0=he1b5a44_0 + - libxml2=2.9.10=he19cac6_1 + - locket=0.2.0=py_2 + - lz4-c=1.9.2=he1b5a44_3 + - lzo=2.10=h516909a_1000 + - markupsafe=1.1.1=py37hb5d75c8_2 + - matplotlib-base=3.3.2=py37hc9afd2a_1 + - mistune=0.8.4=py37h8f50634_1002 + - mock=4.0.2=py37hc8dfbb8_1 + - monotonic=1.5=py_0 + - msgpack-python=1.0.0=py37h99015e2_2 + - mysql-common=8.0.21=2 + - mysql-libs=8.0.21=hf3661c5_2 + - nbclient=0.5.0=py_0 + - nbconvert=6.0.7=py37hc8dfbb8_1 + - nbformat=5.0.8=py_0 + - ncurses=6.2=he1b5a44_2 + - nest-asyncio=1.4.1=py_0 + - networkx=2.5=py_0 + - notebook=6.1.4=py37hc8dfbb8_1 + - nspr=4.29=he1b5a44_0 + - nss=3.57=he751ad9_0 + - numcodecs=0.7.2=py37h3340039_1 + - numexpr=2.7.1=py37h9fdb41a_3 + - numpy=1.19.2=py37h7ea13bd_1 + - olefile=0.46=py_0 + - openssl=1.1.1s=h7f8727e_0 + - packaging=20.4=pyh9f0ad1d_0 + - pandas=1.1.3=py37h9fdb41a_2 + - pandoc=2.11.0.1=hd18ef5c_0 + - pandocfilters=1.4.2=py_1 + - partd=1.1.0=py_0 + - patsy=0.5.1=py_0 + - pcre=8.44=he1b5a44_0 + - pexpect=4.8.0=pyh9f0ad1d_2 + - pickleshare=0.7.5=py_1003 + - pillow=8.0.0=py37h718be6c_0 + - pip=20.2.3=py_0 + - pomegranate=0.13.3=py37h99015e2_0 + - prometheus_client=0.8.0=pyh9f0ad1d_0 + - prompt_toolkit=1.0.15=py_1 + - psutil=5.7.2=py37hb5d75c8_1 + - pthread-stubs=0.4=h14c3975_1001 + - ptyprocess=0.6.0=py_1001 + - pycparser=2.20=pyh9f0ad1d_2 + - pygments=2.7.1=py_0 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - pyqt=5.9.2=py37h05f1152_2 + - pyrsistent=0.17.3=py37h8f50634_1 + - pysam=0.15.3=py37hda2845c_1 + - pytables=3.6.1=py37h56451d4_2 + - python=3.7.8=h6f2ec95_1_cpython + - python-dateutil=2.8.1=py_0 + - python_abi=3.7=1_cp37m + - pytz=2020.1=pyh9f0ad1d_0 + - pyyaml=5.3.1=py37hb5d75c8_1 + - pyzmq=19.0.2=py37hac76be4_2 + - qt=5.9.7=h5867ecd_1 + - qtconsole=4.7.7=pyh9f0ad1d_0 + - qtpy=1.9.0=py_0 + - readline=8.0=he28a2e2_2 + - scikit-allel=1.3.2=py37h9fdb41a_0 + - scikit-learn=0.23.2=py37h6785257_0 + - scipy=1.5.2=py37hb14ef9d_1 + - seaborn=0.11.0=0 + - seaborn-base=0.11.0=py_0 + - send2trash=1.5.0=py_0 + - setuptools=49.6.0=py37he5f6b98_2 + - simplegeneric=0.8.1=py_1 + - sip=4.19.8=py37hf484d3e_0 + - six=1.15.0=pyh9f0ad1d_0 + - sortedcontainers=2.2.2=pyh9f0ad1d_0 + - sqlite=3.33.0=h4cf870e_1 + - statsmodels=0.12.0=py37h161383b_1 + - tblib=1.6.0=py_0 + - terminado=0.9.1=py37hc8dfbb8_1 + - testpath=0.4.4=py_0 + - threadpoolctl=2.1.0=pyh5ca1d4c_0 + - tk=8.6.10=hed695b0_1 + - toolz=0.11.1=py_0 + - tornado=6.0.4=py37h8f50634_2 + - traitlets=5.0.4=py_1 + - typing_extensions=3.7.4.3=py_0 + - wcwidth=0.2.5=pyh9f0ad1d_2 + - webencodings=0.5.1=py_1 + - wheel=0.35.1=pyh9f0ad1d_0 + - widgetsnbextension=3.5.1=py37hc8dfbb8_1 + - xorg-libxau=1.0.9=h14c3975_0 + - xorg-libxdmcp=1.1.3=h516909a_0 + - xz=5.2.5=h516909a_1 + - yaml=0.2.5=h516909a_0 + - zarr=2.5.0=py_0 + - zeromq=4.3.3=he1b5a44_2 + - zict=2.0.0=py_0 + - zipp=3.3.0=py_0 + - zlib=1.2.11=h516909a_1010 + - zstd=1.4.5=h6597ccf_2 +prefix: /cnv + diff --git a/dockerfiles/CNV/scripts/HMM_process.py b/dockerfiles/CNV/scripts/HMM_process.py new file mode 100755 index 00000000..9f69c1a8 --- /dev/null +++ b/dockerfiles/CNV/scripts/HMM_process.py @@ -0,0 +1,187 @@ +# This script runs an mHMM on coverage counts obtained from a bamfile, using normalisation based on the +# mean coverage per GC bin over the whole genome of the individual. 
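+# In outline (see normalise_coverage_by_GC and fit_hmm below): windowed read counts are normalised as
+# 2 * counts / mean_coverage_for_the_window's_GC_bin, so a normalised value of ~2 corresponds to the
+# normal diploid state, and a Gaussian HMM with one state per copy number (0-12, mean = copy number,
+# variance = copy_number * variance_per_copy + fixed_variance) is used to decode copy-number states.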
+# The equivalent script used in the phase 2 paper was mHMM_v5_python2_fullgcnorm_varvar_trans000001.py
+
+import time  # required to output the time at which the script was run
+from sys import stdout  # this import is needed to flush python output to the stdout (instead of leaving it
+# in the buffer)
+from sys import argv  # this import is needed in order for the script to handle command line arguments
+import socket  # this import allows us to access the name of the computer that the script is being run on
+from hmmlearn.hmm import GaussianHMM
+import numpy as np
+import pandas as pd
+from re import *
+
+# Check how many arguments you have and do anything necessary
+if (len(argv) == 9):
+    samplenames_file = argv[1]
+    chrom = argv[2]
+    workingfolder = argv[3]
+    input_gc_filename = argv[4]
+    mean_coverage_by_gc_filename = argv[5]
+    variance_by_sample_filename = argv[6]
+    mapq_proportions_filename = argv[7]
+    max_mapq = argv[8]
+else:
+    raise Exception("Fail. There should be eight command line arguments (samplenames_file, chrom, workingfolder, input_gc_filename, mean_coverage_by_gc_filename, variance_by_sample_filename, mapq_proportions_filename, max_mapq)")
+
+print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n\n')
+print('Input arguments:')
+print('\tsamplenames_file: ' + samplenames_file)
+print('\tchrom: ' + chrom)
+print('\tworkingfolder: ' + workingfolder)
+print('\tinput_gc_filename: ' + input_gc_filename)
+print('\tmean_coverage_by_gc_filename: ' + mean_coverage_by_gc_filename)
+print('\tvariance_by_sample_filename: ' + variance_by_sample_filename)
+print('\tmapq_proportions_filename: ' + mapq_proportions_filename)
+print('\tmax_mapq: ' + max_mapq + '\n')
+stdout.flush()
+
+# Write a function to fit the HMM given a set of parameters and data
+def fit_hmm(depth_normed,  # normalised coverage array
+            transition_probability,  # probability of state transition
+            variance,  # variance per copy
+            variance_fixed,  # variance for the zero copy number state
+            max_copy_number=12,  # maximum copy number to consider in the model
+            n_iter=0,  # number of iterations to perform when fitting the model
+            params='st',  # parameters that can be changed through fitting
+            init_params=''  # parameters that are initialised from the data
+            ):
+
+    # convenience variable
+    min_copy_number = 0  # minimum copy number to consider in the model
+    n_states = max_copy_number - min_copy_number + 1
+
+    # construct the transition matrix
+    transmat = np.zeros((n_states, n_states))
+    transmat[:] = transition_probability
+    transmat[np.diag_indices(n_states)] = 1-((n_states-1)*transition_probability)
+
+    # construct means and covariance
+    means_list = range(n_states)
+    means = np.array([[n] for n in means_list])
+    covars = np.array([[variance*n + variance_fixed] for n in means_list])
+
+    # setup HMM
+    model = GaussianHMM(n_states,
+                        covariance_type='diag',
+                        n_iter=n_iter,
+                        params=params,
+                        init_params=init_params)
+    model.means_ = means
+    model.covars_ = covars
+    model.transmat_ = transmat
+
+    # fit HMM
+    obs = np.column_stack([depth_normed])
+    model.fit(obs)
+
+    # predict hidden states
+    h = model.predict(obs)
+
+    return h
+
+# Write a function to normalise the coverage for each GC content bin
+def normalise_coverage_by_GC(coverage, mean_coverage_by_GC, ploidy = 2):
+    output = coverage.copy()
+    # For each counts value in the output object, associate it with the mean coverage for its GC bin
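+    # (for example, a window assigned to GC bin 35 gets this sample's genome-wide mean coverage
+    # at 35% GC as its expected coverage; the bin value here is illustrative only)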
+    output['Expected_coverage'] = [mean_coverage_by_GC.loc[x] for x in output['GC']]
+    # Now divide each counts value by the coverage associated with its GC bin; we multiply by the
+    # ploidy (2) because the expected coverage was calculated from diploid individuals
+    output['Normalised_coverage'] = ploidy * output['Counts'] / output['Expected_coverage']
+    # Return the new dataframe
+    return output
+
+with open(samplenames_file, 'r') as f:
+    samplenames = [x.rstrip('\n') for x in f.readlines()]
+
+max_mapq = float(max_mapq)
+# Load up the file containing the mapq information and calculate the proportion of mapq0 reads in each bin
+mapq_prop = pd.read_csv(mapq_proportions_filename, sep = '\t')
+mapq_prop['mapq0_prop'] = mapq_prop['Count mapq = 0'] / (mapq_prop['Count mapq = 0'] + mapq_prop['Count mapq > 0'])
+# Create a boolean for whether the mapq0 proportion is low enough
+mapq_prop['mapq0_acceptable'] = mapq_prop['mapq0_prop'] <= max_mapq
+# Split the data by chromosome
+mapq_prop_bychrom = mapq_prop.groupby('Chrom')
+
+# Get the GC content
+gc_all = pd.read_csv(input_gc_filename, sep = '\t', header = None)
+# Take the floor of the GC content as an integer percentage
+gc_all.iloc[:,2] = (gc_all.iloc[:,2]*100).astype(int)
+# Load up the table of mean coverage per GC, removing bins that don't have at least n = 100 windows
+# in the accessible genome
+mean_cov_by_gc_all_bins = pd.read_csv(mean_coverage_by_gc_filename, sep = '\t', index_col = 'GC')
+mean_cov_by_gc = mean_cov_by_gc_all_bins.loc[mean_cov_by_gc_all_bins['bin_freq'] >= 100, :]
+
+# Load up the table of mean variance per sample
+var_by_sample = pd.read_csv(variance_by_sample_filename, sep = '\t', index_col = 0)
+
+# Function to run the HMM on a given file
+def process_sample(sample_name, chrom):
+    # Load up the counts data
+    input_counts_filename = workingfolder + '/' + chrom + '/counts_for_HMM_' + sample_name + '_' + chrom + '_output.csv'
+    print('\nLoading file ' + input_counts_filename)
+    stdout.flush()
+    input_coverage = pd.read_csv(input_counts_filename, sep = '\t').loc[:, ['Position', 'Counts total']]
+    input_coverage.rename(columns={'Counts total': 'Counts'}, inplace = True)
+
+    # Get the GC content for this chromosome
+    gc_thischrom = gc_all.groupby(0).get_group(chrom)
+    # Get the mapq0 information for this chromosome
+    mapq_prop_thischrom = mapq_prop_bychrom.get_group(chrom)
+    # Check that the length of the coverage and GC_content dataframes are the same
+    if input_coverage.shape[0] != gc_thischrom.shape[0]:
+        raise Exception('Fail. Coverage and GC_content should have the same number of elements')
+    # Check that the length of the coverage and mapq0 dataframes are the same
+    if input_coverage.shape[0] != mapq_prop_thischrom.shape[0]:
+        raise Exception('Fail. Coverage and mapq0 proportions should have the same number of elements')
+
+    # We add the GC content data to the counts table
+    input_coverage['GC'] = gc_thischrom.iloc[:,2].values
+
+    # Get the mean coverage by GC bin for this sample
+    this_sample_mean_cov = mean_cov_by_gc[sample_name]
+
+    # Mask out the positions with too high a proportion of mapq0 reads.
+    input_coverage_mapq_masked = input_coverage.loc[np.array(mapq_prop_thischrom['mapq0_acceptable']),:]
+    print(str(np.sum(~mapq_prop_thischrom['mapq0_acceptable'])) + ' out of ' + str(input_coverage.shape[0]) + ' were masked for having more than ' + str(max_mapq * 100) + '% mapq0 reads.')
+    # Mask out the positions that don't have a GC bin with mean coverage data.
+    input_coverage_mapqandgc_masked = input_coverage_mapq_masked.loc[input_coverage_mapq_masked['GC'].isin(this_sample_mean_cov.index),:]
+    print('Of these, ' + str(np.sum(~input_coverage_mapq_masked['GC'].isin(this_sample_mean_cov.index))) + ' were masked for having a GC bin with too little coverage information.')
+
+    # Normalise coverage according to the GC content
+    print('\tNormalising coverage')
+    stdout.flush()
+    main_table = normalise_coverage_by_GC(input_coverage_mapqandgc_masked, this_sample_mean_cov)
+
+    # Get the autosomal variance in coverage for this sample
+    this_mean_variance = var_by_sample.loc[(sample_name, 'autosomes')]
+
+    # Fit the CNV model. The variance we have calculated is for the diploid state, but the model expects
+    # the variance increment per unit of copy number, so we feed it the variance / 2.
+    # We fit the model with a single, fixed transition probability value.
+    trans_prob = 0.00001
+    print('\tFitting CNV model with trans_prob = ' + str(trans_prob) + '.')
+    stdout.flush()
+    cnv = fit_hmm(main_table['Normalised_coverage'].values,
+                  transition_probability = trans_prob,
+                  variance = this_mean_variance/2,
+                  variance_fixed = 0.001,
+                  max_copy_number=12)
+    # Add the CNV values to the table
+    main_table['CNV'] = cnv
+
+    # Build a unique output filename for this sample
+    output_filename = workingfolder + '/' + chrom + '/HMM_output/' + sample_name + '_' + chrom + '_HMM.csv'
+    # Write that table to file
+    print('\tSaving output to file ' + output_filename)
+    stdout.flush()
+    main_table.to_csv(output_filename, sep = '\t', index = False)
+
+for sample_name in samplenames:
+    process_sample(sample_name, chrom)
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
+stdout.flush()
+
diff --git a/dockerfiles/CNV/scripts/SSFA.py b/dockerfiles/CNV/scripts/SSFA.py
new file mode 100755
index 00000000..afbf68e3
--- /dev/null
+++ b/dockerfiles/CNV/scripts/SSFA.py
@@ -0,0 +1,208 @@
+# This script goes through an alignment file and records the positions of reads within a specified region
+# whose mates map to a different chromosome or discordantly on the same chromosome
+
+import time  # required to output the time at which the script was run
+from sys import stdout  # this import is needed to flush python output to the stdout (instead of leaving it
+# in the buffer)
+from sys import argv  # this import is needed in order for the script to handle command line arguments
+import socket  # this import allows us to access the name of the computer that the script is being run on
+import pysam
+
+if (len(argv) == 4):
+    input_filename = argv[1]
+    chromosome = argv[2]
+    region = argv[3]
+    output_filename = 'counts_for_mHMM_python2_v3.txt'
+    minqual = 10
+elif (len(argv) == 5):
+    input_filename = argv[1]
+    chromosome = argv[2]
+    region = argv[3]
+    output_filename = argv[4]
+    minqual = 10
+elif (len(argv) == 6):
+    input_filename = argv[1]
+    chromosome = argv[2]
+    region = argv[3]
+    output_filename = argv[4]
+    minqual = argv[5]
+else:
+    raise Exception("Fail. There should be between 3 and 5 command line arguments (input_filename, chromosome, region [, output_filename [, minqual]])")
+
+print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n')
+print('Input arguments:\n\tinput_filename: ' + input_filename + '\n\tchromosome: ' + chromosome + '\n\tregion: ' + region + '\n\tminqual: ' + str(minqual) + '\n\toutput_filename: ' + output_filename + '\n\n')
+stdout.flush()
+
+print('Loading alignment file ' + input_filename)
+stdout.flush()
+bambam = pysam.Samfile(input_filename, 'rb')
+
+# Check that the requested chromosome is a valid reference and get its length
+if chromosome not in bambam.references:
+    raise Exception('Fail. Requested chromosome was not present in the reference')
+else:
+    chrom_id = bambam.get_tid(chromosome)
+    chromosome_length = bambam.lengths[chrom_id]
+
+print('Going through chromosome, looking for Cross-chrom, Face Away, Same Strand and Far Mapped reads')
+stdout.flush()
+
+output_crosschrom = []
+output_crosschrom_mapq0 = []
+output_crosschrom_belowT = []
+output_FA = []
+output_FA_mapq0 = []
+output_FA_belowT = []
+output_SS = []
+output_SS_mapq0 = []
+output_SS_belowT = []
+output_FM = []
+output_FM_mapq0 = []
+output_FM_belowT = []
+count = 0
+count_mapq0 = 0
+count_belowT = 0
+
+# Get the info from the bamfile
+if region == ':':
+    alignments = bambam.fetch(chromosome)
+else:
+    region_start, region_end = [int(x) for x in region.split(':')]
+    alignments = bambam.fetch(chromosome, region_start, region_end)
+
+# To avoid counting the same pair twice, we record the reads we have already counted.
+done_reads = set()
+for al in alignments:
+    # If the reads are a proper pair, just move to the next one
+    if al.is_proper_pair:
+        continue
+    # We check that the read and its mate are both mapped
+    if al.is_unmapped or al.mate_is_unmapped:
+        continue
+    # Set all statuses to False
+    pair_is_crosschrom = False
+    pair_is_SS = False
+    pair_is_FA = False
+    pair_is_FM = False
+    # Check we haven't already counted this pair (the two mates have the same name, so the following
+    # check is correct).
+    if al.query_name not in done_reads:
+        done_reads.add(al.query_name)
+        # Look for reads whose mates map to a different chromosome.
+        this_mate = bambam.mate(al)
+        if bambam.references[this_mate.rname] != chromosome:
+            pair_is_crosschrom = True
+        else:
+            # If the reads map to the same chromosome, look at whether they are FA, SS or FM.
+            # We check whether the read pair is same strand
+            if al.is_reverse == al.mate_is_reverse:
+                pair_is_SS = True
+            else:
+                # If the positions of the read and its mate are the same, we have no information about FA
+                # (and there is clearly no FM), so the final else below skips the pair.
+                if al.pos < al.mpos:
+                    # We check whether the read pair is face-away
+                    if al.is_reverse:
+                        pair_is_FA = True
+                    else:
+                        # We check how far apart the reads are. If it's more than 1000bp, we consider it FM
+                        if al.mpos > (al.pos + 1000):
+                            pair_is_FM = True
+                        else:
+                            continue
+                elif al.pos > al.mpos:
+                    # We check whether the read pair is face-away
+                    if al.is_reverse:
+                        # We check how far apart the reads are. If it's more than 1000bp, we consider it FM
+                        if al.pos > (al.mpos + 1000):
+                            pair_is_FM = True
+                        else:
+                            continue
+                    else:
+                        pair_is_FA = True
+                else:
+                    continue
+        # We check the mapq value. In order to pass a certain threshold, the mapq of both the read
+        # and its mate need to be high enough
+        lowest_mapq = min([al.mapq, this_mate.mapq])
+        if lowest_mapq >= float(minqual):
+            count += 1
+            if pair_is_crosschrom:
+                output_crosschrom += [[al.pos, bambam.references[this_mate.rname], al.mpos]]
+            if pair_is_SS:
+                output_SS += [[al.pos, al.mpos]]
+            if pair_is_FA:
+                if al.pos == al.mpos:
+                    raise Exception('This shouldn\'t be happening')
+                output_FA += [[al.pos, al.mpos]]
+            if pair_is_FM:
+                if al.pos == al.mpos:
+                    raise Exception('This shouldn\'t be happening')
+                output_FM += [[al.pos, al.mpos]]
+        else:
+            if lowest_mapq == 0:
+                count_mapq0 += 1
+                if pair_is_crosschrom:
+                    output_crosschrom_mapq0 += [[al.pos, bambam.references[this_mate.rname], al.mpos]]
+                if pair_is_SS:
+                    output_SS_mapq0 += [[al.pos, al.mpos]]
+                if pair_is_FA:
+                    if al.pos == al.mpos:
+                        raise Exception('This shouldn\'t be happening')
+                    output_FA_mapq0 += [[al.pos, al.mpos]]
+                if pair_is_FM:
+                    if al.pos == al.mpos:
+                        raise Exception('This shouldn\'t be happening')
+                    output_FM_mapq0 += [[al.pos, al.mpos]]
+            else:
+                count_belowT += 1
+                if pair_is_crosschrom:
+                    output_crosschrom_belowT += [[al.pos, bambam.references[this_mate.rname], al.mpos]]
+                if pair_is_SS:
+                    output_SS_belowT += [[al.pos, al.mpos]]
+                if pair_is_FA:
+                    if al.pos == al.mpos:
+                        raise Exception('This shouldn\'t be happening')
+                    output_FA_belowT += [[al.pos, al.mpos]]
+                if pair_is_FM:
+                    if al.pos == al.mpos:
+                        raise Exception('This shouldn\'t be happening')
+                    output_FM_belowT += [[al.pos, al.mpos]]
+
+print('Writing output to file ' + output_filename)
+stdout.flush()
+outputfile = open(output_filename, 'w')
+outputfile.write('#Total mapq >= ' + str(minqual) + ': ' + str(count) + '\n')
+outputfile.write('#Total mapq < ' + str(minqual) + ': ' + str(count_belowT) + '\n')
+outputfile.write('#Total mapq0: ' + str(count_mapq0) + '\n')
+outputfile.write('Type\tPosition\tMate position\n')
+for pos in output_crosschrom:
+    outputfile.write('crosschrom mapq >= ' + str(minqual) + '\t' + str(pos[0]) + '\t' + pos[1] + ':' + str(pos[2]) + '\n')
+for pos in output_crosschrom_mapq0:
+    outputfile.write('crosschrom mapq0\t' + str(pos[0]) + '\t' + pos[1] + ':' + str(pos[2]) + '\n')
+for pos in output_crosschrom_belowT:
+    outputfile.write('crosschrom mapq < ' + str(minqual) + '\t' + str(pos[0]) + '\t' + pos[1] + ':' + str(pos[2]) + '\n')
+for pos in output_SS:
+    outputfile.write('SS mapq >= ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_SS_mapq0:
+    outputfile.write('SS mapq0\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_SS_belowT:
+    outputfile.write('SS mapq < ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_FA:
+    outputfile.write('FA mapq >= ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_FA_mapq0:
+    outputfile.write('FA mapq0\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_FA_belowT:
+    outputfile.write('FA mapq < ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_FM:
+    outputfile.write('FM mapq >= ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_FM_mapq0:
+    outputfile.write('FM mapq0\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in output_FM_belowT:
+    outputfile.write('FM mapq < ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+outputfile.close()
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
time.strftime("%d/%m/%Y") + '\n') +stdout.flush() + + diff --git a/dockerfiles/CNV/scripts/breakpoint_detector.py b/dockerfiles/CNV/scripts/breakpoint_detector.py new file mode 100755 index 00000000..726463dd --- /dev/null +++ b/dockerfiles/CNV/scripts/breakpoint_detector.py @@ -0,0 +1,159 @@ +# breakpoint_detector.py + +# This script goes through an alignment file and records the positions at which soft_clipping is detected +# in the aligned reads + +import time # required to output the time at which the script was run +from sys import stdout # this import is need to flush python output to the stdout (instead of leaving it +# in the buffer +from sys import argv # this import is needed in order for the script to handle command line arguments +import socket # this import allows us to access the name of the computer that the script is being run on +import pysam + +# Run the script: + +if (len(argv) == 4): + input_filename = argv[1] + chromosome = argv[2] + region = argv[3] + output_filename_root = 'breakpoint_detector' + minqual = 10 +elif (len(argv) == 5): + input_filename = argv[1] + chromosome = argv[2] + region = argv[3] + output_filename_root = argv[4] + minqual = 10 +elif (len(argv) == 6): + input_filename = argv[1] + chromosome = argv[2] + region = argv[3] + output_filename_root = argv[4] + minqual = argv[5] +else: + raise Exception("Fail. There should be between 3 and 5 command line arguments (input_filename, chromosome, region [, output_filename_root [,minqual]])") + +print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n') +print('Input arguments:\n\tinput_filename: ' + input_filename + '\n\tchromosome: ' + chromosome + '\n\tregion: ' + region + '\n\tminqual: ' + str(minqual) + '\n\toutput_filename_root: ' + output_filename_root + '\n\n') +stdout.flush() + +print('Loading alignment file ' + input_filename) +stdout.flush() +bambam = pysam.Samfile(input_filename, 'rb') + +output_filename = output_filename_root + '.csv' +pre_clipping_fastq = output_filename_root + '_preclipping.fastq' +pre_clipping_fastq_file = open(pre_clipping_fastq, 'w') +post_clipping_fastq = output_filename_root + '_postclipping.fastq' +post_clipping_fastq_file = open(post_clipping_fastq, 'w') + +# Check that the requested chromosome is a valid reference and get its length +if chromosome not in bambam.references: + raise Exception('Fail. 
+else:
+    chrom_id = bambam.get_tid(chromosome)
+    chromosome_length = bambam.lengths[chrom_id]
+
+print('Going through chromosome, looking for soft-clipped reads')
+stdout.flush()
+
+soft_clipping_start_points = []
+soft_clipping_start_points_mapq0 = []
+soft_clipping_start_points_belowT = []
+soft_clipping_end_points = []
+soft_clipping_end_points_mapq0 = []
+soft_clipping_end_points_belowT = []
+count = 0
+count_mapq0 = 0
+count_belowT = 0
+
+# Get the info from the bamfile
+if region == ':':
+    alignments = bambam.fetch(chromosome)
+else:
+    region_start, region_end = [int(x) for x in region.split(':')]
+    alignments = bambam.fetch(chromosome, region_start, region_end)
+
+for al in alignments:
+    # Set the possible states as False
+    soft_clipping_start_point = False
+    soft_clipping_end_point = False
+    # We check that the read is mapped
+    if al.is_unmapped:
+        continue
+    # To deal with the weird situation that required this change from the previous version of this script
+    # (see top of the script), we check that the positions vector is not empty
+    if len(al.positions) == 0:
+        continue
+    cigar = al.cigar
+    # Check the first and last entries of the cigar to see if it is soft clipped. If there is only
+    # one element in the cigar, it can't be soft clipped
+    if len(cigar) == 1:
+        continue
+    # We require soft-clipping to be at least 10 bases long
+    if (cigar[0][0] == 4) & (cigar[0][1] >= 10):
+        # We have soft-clipping at the start of the read. Record it as a soft-clipping end point
+        soft_clipping_end_point = al.positions[0]
+        # We will output a fastq file containing the soft-clipped reads.
+        pre_clipping_fastq_file.write('@' + al.query_name + '_bases_soft_clipped_left_of_pos_' + str(soft_clipping_end_point) + '\n')
+        pre_clipping_fastq_file.write(al.seq[:cigar[0][1]] + '\n')
+        pre_clipping_fastq_file.write('+\n')
+        pre_clipping_fastq_file.write(al.qual[:cigar[0][1]] + '\n')
+    if (cigar[-1][0] == 4) & (cigar[-1][1] >= 10):
+        # We have soft-clipping at the end of the read. Record it as a soft-clipping start point
+        # (we're adding 1 because of python indexing, and 1 because al.positions[-1] is the last base
+        # that matches, not the first base that is soft-clipped)
+        soft_clipping_start_point = al.positions[-1]+2
+        # We will output a fastq file containing the soft-clipped reads.
+        post_clipping_fastq_file.write('@' + al.query_name + '_bases_soft_clipped_right_of_pos_' + str(soft_clipping_start_point) + '\n')
+        post_clipping_fastq_file.write(al.seq[(100-cigar[-1][1]):] + '\n')
+        post_clipping_fastq_file.write('+\n')
+        post_clipping_fastq_file.write(al.qual[(100-cigar[-1][1]):] + '\n')
+
+    # We check the mapq value.
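+    # Note two assumptions in the soft-clipping code above: reads are taken to be 100 bp long
+    # (the al.seq[(100-cigar[-1][1]):] slicing would be wrong for other read lengths), and,
+    # unlike SSFA.py, only the read's own mapq is checked (there is no mate lookup here).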
+    if al.mapq >= float(minqual):
+        count += 1
+        if soft_clipping_start_point:
+            soft_clipping_start_points += [[soft_clipping_start_point, al.seq[(100-cigar[-1][1]):]]]
+        if soft_clipping_end_point:
+            soft_clipping_end_points += [[soft_clipping_end_point, al.seq[:cigar[0][1]]]]
+    else:
+        if al.mapq == 0:
+            count_mapq0 += 1
+            if soft_clipping_start_point:
+                soft_clipping_start_points_mapq0 += [[soft_clipping_start_point, al.seq[(100-cigar[-1][1]):]]]
+            if soft_clipping_end_point:
+                soft_clipping_end_points_mapq0 += [[soft_clipping_end_point, al.seq[:cigar[0][1]]]]
+        else:
+            count_belowT += 1
+            if soft_clipping_start_point:
+                soft_clipping_start_points_belowT += [[soft_clipping_start_point, al.seq[(100-cigar[-1][1]):]]]
+            if soft_clipping_end_point:
+                soft_clipping_end_points_belowT += [[soft_clipping_end_point, al.seq[:cigar[0][1]]]]
+post_clipping_fastq_file.close()
+pre_clipping_fastq_file.close()
+
+print('Writing output to file ' + output_filename)
+stdout.flush()
+outputfile = open(output_filename, 'w')
+outputfile.write('#Total mapq >= ' + str(minqual) + ': ' + str(count) + '\n')
+outputfile.write('#Total mapq < ' + str(minqual) + ': ' + str(count_belowT) + '\n')
+outputfile.write('#Total mapq0: ' + str(count_mapq0) + '\n')
+outputfile.write('Type\tPosition\tClipped_sequence\n')
+for pos in soft_clipping_start_points:
+    outputfile.write('soft clipping start point mapq >= ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in soft_clipping_start_points_mapq0:
+    outputfile.write('soft clipping start point0\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in soft_clipping_start_points_belowT:
+    outputfile.write('soft clipping start point mapq < ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in soft_clipping_end_points:
+    outputfile.write('soft clipping end point mapq >= ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in soft_clipping_end_points_mapq0:
+    outputfile.write('soft clipping end point0\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+for pos in soft_clipping_end_points_belowT:
+    outputfile.write('soft clipping end point mapq < ' + str(minqual) + '\t' + str(pos[0]) + '\t' + str(pos[1]) + '\n')
+outputfile.close()
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
+stdout.flush()
+
+
+
diff --git a/dockerfiles/CNV/scripts/calculate_median_coverage_by_GC.py b/dockerfiles/CNV/scripts/calculate_median_coverage_by_GC.py
new file mode 100755
index 00000000..1d364bcd
--- /dev/null
+++ b/dockerfiles/CNV/scripts/calculate_median_coverage_by_GC.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python3
+
+# This script calculates the median coverage by GC window for a list of samples from Ag1000G
+
+import time  # required to output the time at which the script was run
+from sys import stdout  # this import is needed to flush python output to the stdout (instead of leaving it
+# in the buffer)
+from sys import argv  # this import is needed in order for the script to handle command line arguments
+import socket  # this import allows us to access the name of the computer that the script is being run on
+import numpy as np
+import pandas as pd
+from re import *
+
+
+# Write a function to normalise the coverage for each GC content bin
+def normalise_coverage_by_GC(coverage, median_coverage_by_GC, ploidy=2):
+    output = coverage.copy()
+    # For each counts value in the output object, associate it with the median coverage for its GC bin
+    output['expcov'] = [median_coverage_by_GC.loc[x] for x in output['GC']]
+    # Now divide each counts value by the coverage associated with its GC bin; we multiply by the
+    # ploidy (2) because the median coverage was calculated from diploid individuals
+    output['Normalised_coverage'] = ploidy * output['Counts total'] / output['expcov']
+    # In very rare cases, the expected coverage ends up being 0 (because none of the few windows with a given
+    # GC content have any coverage). In these cases, we manually set the coverage to be 0 (rather than NaN)
+    which_zeros = (output['expcov'] == 0)
+    output.loc[which_zeros, 'Normalised_coverage'] = 0
+    # Return the new dataframe
+    return output
+
+
+# Write a function to load and mask the counts data for a given chromosome in a given sample
+def load_and_mask(working_folder, chrom, sample_name, gc, filterpass):
+    this_filename = working_folder + '/' + chrom + '/counts_for_HMM_' + sample_name + '_' + chrom + '_output.csv'
+    print('\tLoading file ' + this_filename)
+    stdout.flush()
+    these_counts = pd.read_csv(this_filename, sep='\t')
+    # Check that the number of bins for the coverage is correct (we have already checked that accessibility
+    # and GC have the same number of bins).
+    if these_counts.shape[0] != gc.shape[0]:
+        raise Exception(
+            'Fail. Coverage and GC_content on chromosome ' + chrom + ' should have the same number of bins.')
+    # Add the GC content information to the counts table.
+    these_counts['GC'] = gc.iloc[:, 2].values
+    these_counts['Chrom'] = chrom
+    # Get the accessibility-masked coverage
+    output_masked_counts = these_counts.loc[np.array(filterpass), :]
+    return (output_masked_counts)
+
+
+# Check how many arguments you have and do anything necessary
+if (len(argv) == 9):
+    accessibility_threshold = float(argv[1])
+    accessibility_mask_file = argv[2]
+    mapq0_threshold = float(argv[3])
+    mapq0_file = argv[4]
+    sample_manifest = argv[5]
+    gc_content_filename = argv[6]
+    working_folder = argv[7]
+    output_file_key = argv[8]
+else:
+    raise Exception(
+        "Fail. There should be eight command line arguments (accessibility_threshold, accessibility_mask_file, mapq0_threshold, mapq0_file, sample_manifest, gc_content_filename, working_folder, output_file_key)"
+    )
+
+print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime(
+    "%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n\n')
+print('Input arguments:')
+print('\taccessibility_threshold: ' + str(accessibility_threshold))
+print('\taccessibility_mask_file: ' + accessibility_mask_file)
+print('\tmapq0_threshold: ' + str(mapq0_threshold))
+print('\tmapq0_file: ' + mapq0_file)
+print('\tsample_manifest: ' + sample_manifest)
+print('\tgc_content_filename: ' + gc_content_filename)
+print('\tworking_folder: ' + working_folder)
+print('\toutput_file_key: ' + output_file_key + '\n')
+stdout.flush()
+
+# Set the chromosomes
+autosomes = ['2L', '2R', '3L', '3R']
+sex_chrom = 'X'
+chroms = autosomes + [sex_chrom]
+
+with open(sample_manifest, 'r') as f:
+    sample_names = [x.rstrip('\n') for x in f.readlines()]
+
+mapq0_prop = pd.read_csv(mapq0_file, sep='\t')
+mapq0_prop['prop_mapq0'] = mapq0_prop['Count mapq = 0'] / (mapq0_prop['Count mapq > 0'] + mapq0_prop['Count mapq = 0'])
+# Create the mask for mapq0. There are windows where prop_mapq0 is NA (because there are no reads mapping
+# in that window). They get removed later anyway because they are inaccessible, but even in the event that
+# they are not removed, the output of prop_mapq0 <= mapq0_threshold will be False in these cases.
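+# (For example, a window with 60 mapq0 reads out of 100 has prop_mapq0 = 0.6 and fails a 0.5 threshold;
+# a window with no reads at all has prop_mapq0 = NaN and also fails, since NaN <= threshold is False.)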
+mapq0_prop['filterpass'] = mapq0_prop.prop_mapq0 <= mapq0_threshold
+mapq0_prop_grouped = mapq0_prop.groupby('Chrom')
+
+# Load up the GC composition for the Agam genome.
+gc_all = pd.read_csv(gc_content_filename, sep='\t', header=None)
+# Take the floor of the percentage GC content
+gc_all.iloc[:, 2] = (gc_all.iloc[:, 2] * 100).astype(int)
+gc_grouped = gc_all.groupby(0)
+
+# Load up the accessibility data
+acc_all = pd.read_csv(accessibility_mask_file, sep='\t')
+acc_all['filterpass'] = acc_all.Mean_accessibility >= accessibility_threshold
+acc_all['doublefilterpass'] = acc_all.filterpass & mapq0_prop.filterpass
+acc_grouped = acc_all.groupby('Chrom')
+
+# Check that the GC content and accessibility have the same number of bins for each chromosome
+for chrom in chroms:
+    if gc_grouped.get_group(chrom).shape[0] != acc_grouped.get_group(chrom).shape[0]:
+        raise Exception('Fail. GC content and accessibility on chromosome ' + chrom + ' should have the same number of bins.')
+
+# Get the number of windows in each GC bin after filtering
+gc_mainchroms = gc_all.loc[gc_all[0].apply(lambda x: x in chroms), :].copy()
+gc_mainchroms['filterpass'] = np.array(acc_all.doublefilterpass)
+# I don't know why, but "query" doesn't work here, so need to do this in two steps.
+output_table = pd.DataFrame(gc_mainchroms.groupby(2).apply(lambda x: sum(x.filterpass)), columns=['bin_freq'])
+output_table = output_table.loc[output_table.bin_freq > 0, :].copy()
+output_table.index = output_table.index.rename('GC')
+
+# Here is where we will store the variance data
+output_variance = pd.DataFrame(0, columns=chroms + ['autosomes'], index=sample_names)
+
+# Now for each sample get the counts data for all the chromosomes
+for this_sample in sample_names:
+    print('\nAnalysing sample ' + this_sample + '.')
+    stdout.flush()
+    # Load and mask the data for each chromosome
+    autosomal_masked_raw_counts = pd.concat(map(lambda c: load_and_mask(working_folder,
+                                                                        c,
+                                                                        this_sample,
+                                                                        gc_grouped.get_group(c),
+                                                                        acc_grouped.get_group(c).doublefilterpass),
+                                                autosomes))
+    sexchrom_masked_raw_counts = load_and_mask(working_folder, sex_chrom, this_sample,
+                                               gc_grouped.get_group(sex_chrom),
+                                               acc_grouped.get_group(sex_chrom).doublefilterpass)
+
+    # Now we have combined the results from all of the autosomes, we can group the coverage by GC bin
+    # and then compute the median
+    counts_by_GC = autosomal_masked_raw_counts.groupby('GC')
+    median_counts_by_GC = counts_by_GC['Counts total'].median()
+    # Add these to the global data
+    output_table[this_sample] = median_counts_by_GC
+
+    # Get the normalised coverage
+    autosomal_masked_counts = normalise_coverage_by_GC(autosomal_masked_raw_counts, median_counts_by_GC)
+    # We still use the autosomal median_counts_by_GC for the X, because we want to normalise by the diploid expectation
+    sexchrom_masked_counts = normalise_coverage_by_GC(sexchrom_masked_raw_counts, median_counts_by_GC)
+
+    # Now we want to calculate the variance in coverage for this sample, both for each chromosome
+    # and overall. We filter out the 1% of windows with the highest coverage before calculating
+    # the variance.
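+    # (filter_by_quantile below keeps only windows strictly below the chosen coverage quantile, so
+    # with quantile_threshold = 0.99 the variance is computed over the lowest 99% of windows.)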
+    print('\tCalculating normalised coverage and variance.')
+    stdout.flush()
+    masked_counts_by_autosome = autosomal_masked_counts.groupby('Chrom')
+
+    def filter_by_quantile(x, thresh):
+        return x[x < np.quantile(x, thresh)]
+
+    quantile_threshold = 0.99
+    for chrom in autosomes:
+        output_variance.loc[this_sample, chrom] = np.var(filter_by_quantile(masked_counts_by_autosome.get_group(chrom).Normalised_coverage, quantile_threshold))
+    output_variance.loc[this_sample, sex_chrom] = np.var(filter_by_quantile(sexchrom_masked_counts.Normalised_coverage, quantile_threshold))
+    # Now calculate the variance across the autosomes
+    output_variance.loc[this_sample, 'autosomes'] = np.var(filter_by_quantile(autosomal_masked_counts.Normalised_coverage, quantile_threshold))
+
+# Write the output to file
+output_filename = working_folder + '/median_coverage_by_GC_masked_' + sub(r'\.', '', str(accessibility_threshold)) + '_' + sub(r'\.', '', str(mapq0_threshold)) + '_' + output_file_key + '.csv'
+output_variance_filename = working_folder + '/coverage_variance_masked_' + sub(r'\.', '', str(accessibility_threshold)) + '_' + sub(r'\.', '', str(mapq0_threshold)) + '_' + output_file_key + '.csv'
+
+print('\nSaving coverage output to file ' + output_filename)
+stdout.flush()
+output_table.to_csv(output_filename, sep='\t')
+
+print('\nSaving variance output to file ' + output_variance_filename)
+stdout.flush()
+output_variance.to_csv(output_variance_filename, sep='\t')
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
+
diff --git a/dockerfiles/CNV/scripts/calculate_windowed_accessibility.sh b/dockerfiles/CNV/scripts/calculate_windowed_accessibility.sh
new file mode 100755
index 00000000..496afbd8
--- /dev/null
+++ b/dockerfiles/CNV/scripts/calculate_windowed_accessibility.sh
@@ -0,0 +1,14 @@
+
+output_dir=/lustre/scratch118/malaria/team112/personal/el10/phase3_data/windowed_accessibility
+path_to_release=/lustre/scratch118/malaria/team112/personal/el10/vo_agam_release/v3
+path_to_snp_sites=${path_to_release}/snp_genotypes/all/sites
+path_to_gambcolu_accessibility=${path_to_release}/site_filters/dt_20200416/gamb_colu
+path_to_arab_accessibility=${path_to_release}/site_filters/dt_20200416/arab
+
+mkdir -p $output_dir
+
+source activate cnv37
+
+python ../scripts/calculate_windowed_mean_accessibility.py $path_to_gambcolu_accessibility $path_to_snp_sites ${output_dir}/mean_accessibility_gambcolu.csv 300 > ${output_dir}/mean_accessibility_gambcolu.log 2>&1
+python ../scripts/calculate_windowed_mean_accessibility.py $path_to_arab_accessibility $path_to_snp_sites ${output_dir}/mean_accessibility_arabiensis.csv 300 > ${output_dir}/mean_accessibility_arabiensis.log 2>&1
+
diff --git a/dockerfiles/CNV/scripts/calculate_windowed_mean_accessibility.py b/dockerfiles/CNV/scripts/calculate_windowed_mean_accessibility.py
new file mode 100755
index 00000000..57278961
--- /dev/null
+++ b/dockerfiles/CNV/scripts/calculate_windowed_mean_accessibility.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+import time        # required to output the time at which the script was run
+import os
+from sys import stdout        # this import is needed to flush python output to the stdout (instead of leaving it
+# in the buffer)
+from sys import argv        # this import is needed in order for the script to handle command line arguments
+import socket        # this import allows us to access the name of the computer that the script is being run on
+import numpy as np
+import pandas as pd
+from re import *
+import zarr
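+# scikit-allel provides SortedIndex, used below for fast positional range queries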
+import allel
+
+
+if (len(argv) == 4):
+    input_accessibility_filename = argv[1]
+    positions_filename = argv[2]
+    output_filename = argv[3]
+    window_size = 300
+elif (len(argv) == 5):
+    input_accessibility_filename = argv[1]
+    positions_filename = argv[2]
+    output_filename = argv[3]
+    window_size = int(argv[4])
+else:
+    raise Exception("Fail. There should be three or four command line arguments (input_accessibility_filename, positions_filename, output_filename [, window_size])")
+
+print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n\n')
+print('Input arguments:\n\tinput_accessibility_filename: ' + input_accessibility_filename + '\n\tpositions_filename: ' + positions_filename + '\n\toutput_filename: ' + output_filename + '\n\twindow_size: ' + str(window_size) + '\n\n')
+stdout.flush()
+
+chroms = ['2L', '2R', '3L', '3R', 'X']
+acc = zarr.open(os.path.expanduser(input_accessibility_filename), mode = 'r')
+pos = zarr.open(os.path.expanduser(positions_filename), mode = 'r')
+
+outfile = open(output_filename, 'w')
+outfile.write('Chrom\tPosition\tMean_accessibility\n')
+for chrom in chroms:
+    print('Calculating average accessibility over chromosome ' + chrom + '.\n')
+    this_acc = acc[chrom + '/variants/filter_pass']
+    these_pos = allel.SortedIndex(pos[chrom + '/variants/POS'])
+    # Do all but the last window in the for loop
+    for p in range(1, these_pos[-1] - (window_size - 1), window_size):
+        endp = p + window_size - 1
+        # If there are no positions in the range with a value, then there are no accessible positions in the
+        # window
+        try:
+            this_window_acc = this_acc[these_pos.locate_range(p, endp)]
+        except KeyError:
+            this_window_acc = 0
+        # We always divide by the window size, even if there are fewer than window_size accessibility
+        # values in this interval. That's because any missing values would be N positions, which should
+        # not be counted as accessible.
+        outfile.write(chrom + '\t' + str(p - 1) + '\t' + str(float(np.sum(this_window_acc))/window_size) + '\n')
+    # Do the last window on its own, because it may be smaller than window_size
+    p = endp + 1
+    this_window_acc = this_acc[these_pos.locate_range(p, p + window_size - 1)]
+    window_len = len(this_window_acc)
+    outfile.write(chrom + '\t' + str(p - 1) + '\t' + str(float(np.sum(this_window_acc)) / window_len) + '\n')
+
+outfile.close()
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
+stdout.flush()
+
diff --git a/dockerfiles/CNV/scripts/changes_from_phase2_method.txt b/dockerfiles/CNV/scripts/changes_from_phase2_method.txt
new file mode 100644
index 00000000..56a8333e
--- /dev/null
+++ b/dockerfiles/CNV/scripts/changes_from_phase2_method.txt
@@ -0,0 +1,16 @@
+Differences in the methods in phase3 compared to phase2:
+
+
+We now calculate median instead of mean coverage to get the normalisation per GC window. Windows used for calculation of median coverage and coverage variance are filtered based on having at least 90% accessible sites (same as for phase 2), but we now also add a mapq0 filter (windows are excluded if they have > 50% mapq0 reads).
+
+Coverage variance is calculated after excluding the 1% highest coverage windows.
+
+HMM process filtered with mapq0 <= 50%, rather than 2%.
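+
+(Worked example: under the new filter, a window with 40 of its 100 reads at mapq0 is kept, since 40% <= 50%; the old 2% threshold would have excluded it.)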
+
+Coverage-based CNV calls for each of the focal genes (Ace-1, Cyp6 regions, Gste regions, Cyp9k1 region) are made using the mode of CNV state for each gene, instead of using the definition of encompassed gene duplications.
+
+Dup0 now means any coverage-based call for the focal region, instead of coverage-based call and lack of discordant read based call.
+
+When matching CNVs across samples, in phase2 there was a sanity check that the same CNV allele was never called twice in the same sample. But it is actually possible for this to happen if the range of start and end positions for a given CNV allele becomes wide enough. That sanity check has therefore been removed, and instead a measure of how consistent the start and end points are is given by the 90% quantile range size of the start and end points.
+
+Read-based calling is more stringent, requiring 4 supporting diagnostic reads instead of 2.
diff --git a/dockerfiles/CNV/scripts/counts_for_HMM.py b/dockerfiles/CNV/scripts/counts_for_HMM.py
new file mode 100755
index 00000000..d6e489ce
--- /dev/null
+++ b/dockerfiles/CNV/scripts/counts_for_HMM.py
@@ -0,0 +1,89 @@
+# This script will calculate the coverage at every x bp along a genomic region for a given bamfile
+
+import time        # required to output the time at which the script was run
+from sys import stdout        # this import is needed to flush python output to the stdout (instead of leaving it
+# in the buffer)
+from sys import argv        # this import is needed in order for the script to handle command line arguments
+import socket        # this import allows us to access the name of the computer that the script is being run on
+import pysam
+
+# Parse and validate the command line arguments
+if (len(argv) == 5):
+    input_filename = argv[1]
+    chromosome = argv[2]
+    interval = argv[3]
+    window_size = argv[4]
+    minqual = 10
+    output_filename = 'counts_for_HMM_python3.txt'
+elif (len(argv) == 6):
+    input_filename = argv[1]
+    chromosome = argv[2]
+    interval = argv[3]
+    window_size = argv[4]
+    minqual = argv[5]
+    output_filename = 'counts_for_HMM_python3.txt'
+elif (len(argv) == 7):
+    input_filename = argv[1]
+    chromosome = argv[2]
+    interval = argv[3]
+    window_size = argv[4]
+    minqual = argv[5]
+    output_filename = argv[6]
+else:
+    raise Exception("Fail. There should be between four and six command line arguments (input_filename, chromosome, interval, window_size [, minqual [, output_filename]])")
+
+
+print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n')
+print('Input arguments:\n\tinput_filename: ' + input_filename + '\n\tchromosome: ' + chromosome + '\n\tinterval: ' + str(interval) + '\n\twindow_size: ' + str(window_size) + '\n\tminqual: ' + str(minqual) + '\n\toutput_filename: ' + output_filename + '\n\n')
+stdout.flush()
+
+print('Loading alignment file ' + input_filename)
+stdout.flush()
+bambam = pysam.Samfile(input_filename, 'rb')
+
+# Check that the requested chromosome is a valid reference and get its length
+if chromosome not in bambam.references:
+    raise Exception('Fail. Requested chromosome was not present in the reference')
+else:
+    chrom_id = bambam.get_tid(chromosome)
+    chromosome_length = bambam.lengths[chrom_id]
+
+print('Calculating coverage along chromosome ' + chromosome + ' from start to end (' + str(chromosome_length) + ') in intervals of ' + str(interval) + 'bp.')
+stdout.flush()
+
+# We start from position 0 and move up through the positions in steps of size "interval", calculating
+# the coverage at each step.
+positions = range(0, chromosome_length-1, int(interval))
+output = []
+output_mapq0 = []
+output_belowT = []
+output_total = []
+for i in positions:
+    alignments = bambam.fetch(chromosome, i, i+int(window_size))
+    count = 0
+    count_mapq0 = 0
+    count_belowT = 0
+    for al in alignments:
+        if not al.is_unmapped:
+            # We count coverage as reads that start in the current interval
+            if al.reference_start >= i:
+                if al.mapq >= float(minqual):
+                    count += 1
+                elif al.mapq == 0:
+                    count_mapq0 += 1
+                else:
+                    count_belowT += 1
+    output += [str(count)]
+    output_mapq0 += [str(count_mapq0)]
+    output_belowT += [str(count_belowT)]
+    output_total += [str(count + count_mapq0 + count_belowT)]
+outputfile = open(output_filename, 'w')
+outputfile.write('Position\tCounts mapq >= ' + str(minqual) + '\tCounts 0 < mapq < ' + str(minqual) + '\tCounts mapq0\tCounts total\n')
+for i in range(len(positions)):
+    outputfile.write(str(positions[i]) + '\t' + output[i] + '\t' + output_belowT[i] + '\t' + output_mapq0[i] + '\t' + output_total[i] + '\n')
+outputfile.close()
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
+stdout.flush()
+
diff --git a/dockerfiles/CNV/scripts/coverage_CNVs.sh b/dockerfiles/CNV/scripts/coverage_CNVs.sh
new file mode 100755
index 00000000..1b6defdf
--- /dev/null
+++ b/dockerfiles/CNV/scripts/coverage_CNVs.sh
@@ -0,0 +1,27 @@
+coveragefolder=$1
+manifest=$2
+chrom=$3
+output_name=$4
+coverage_variance_file=$5
+ncores=$6
+metadata=$7
+
+gene_coordinates_file=~/personal/phase3_data/tables/gene_regions.csv
+detox_genes_file=~/personal/phase3_data/tables/detox_genes.txt
+workingfolder=$coveragefolder/$chrom/HMM_output
+outputfolder=$workingfolder/$output_name
+scriptsfolder=~/scripts/CNV_scripts/scripts
+
+R --version
+
+R --slave -f $scriptsfolder/CNV_analysis.r --args $chrom \
+    $manifest \
+    $coverage_variance_file \
+    $gene_coordinates_file \
+    $detox_genes_file \
+    $workingfolder \
+    $ncores \
+    $outputfolder \
+    $metadata \
+    > $coveragefolder/$chrom/CNV_analysis_logs/CNV_analysis_${output_name}.log 2>&1
+
diff --git a/dockerfiles/CNV/scripts/coverage_CNVs_vobs.sh b/dockerfiles/CNV/scripts/coverage_CNVs_vobs.sh
new file mode 100755
index 00000000..b85fdebb
--- /dev/null
+++ b/dockerfiles/CNV/scripts/coverage_CNVs_vobs.sh
@@ -0,0 +1,27 @@
+coveragefolder=$1
+manifest=$2
+chrom=$3
+output_name=$4
+coverage_variance_file=$5
+ncores=$6
+metadata=$7
+gene_coordinates_file=$8
+detox_genes_file=$9
+
+workingfolder=$coveragefolder/$chrom/HMM_output
+outputfolder=$workingfolder/$output_name
+scriptsfolder=~/scripts/CNV_scripts/scripts
+
+R-3.6.1 --version
+
+R-3.6.1 --slave -f $scriptsfolder/CNV_analysis.r --args $chrom \
+    $manifest \
+    $coverage_variance_file \
+    $gene_coordinates_file \
+    $detox_genes_file \
+    $workingfolder \
+    $ncores \
+    $outputfolder \
+    $metadata \
+    > $coveragefolder/$chrom/CNV_analysis_logs/CNV_analysis_${output_name}.log 2>&1
+
diff --git a/dockerfiles/CNV/scripts/coverage_HMM.sh b/dockerfiles/CNV/scripts/coverage_HMM.sh
new file mode 100755
index
00000000..15bc1b2b --- /dev/null +++ b/dockerfiles/CNV/scripts/coverage_HMM.sh @@ -0,0 +1,24 @@ +coveragefolder=$1 +manifest=$2 +chrom=$3 +species_id_file=$4 +GC_content_file=$5 +coverage_by_GC_file=$coveragefolder/$6 +coverage_variance_file=$coveragefolder/$7 +mapq_prop_file=$coveragefolder/$8 + +scriptsfolder=~/scripts/CNV_scripts/scripts + +source activate cnv37 + +python ${scriptsfolder}/HMM_process.py \ + $manifest \ + $chrom \ + $coveragefolder \ + $GC_content_file \ + $coverage_by_GC_file \ + $coverage_variance_file \ + $mapq_prop_file \ + 0.5 \ + > ${coveragefolder}/${chrom}/HMM_logs_${species_id_file}/HMM_${chrom}.log 2>&1 + diff --git a/dockerfiles/CNV/scripts/coverage_HMM_vobs.sh b/dockerfiles/CNV/scripts/coverage_HMM_vobs.sh new file mode 100755 index 00000000..ca3bc1e8 --- /dev/null +++ b/dockerfiles/CNV/scripts/coverage_HMM_vobs.sh @@ -0,0 +1,24 @@ +coveragefolder=$1 +manifest=$2 +chrom=$3 +species=$4 +GC_content_file=$5 +coverage_by_GC_file=$coveragefolder/$6 +coverage_variance_file=$coveragefolder/$7 +mapq_prop_file=$8 + +scriptsfolder=~/scripts/CNV_scripts/scripts + +source activate cnv37 + +python ${scriptsfolder}/HMM_process.py \ + $manifest \ + $chrom \ + $coveragefolder \ + $GC_content_file \ + $coverage_by_GC_file \ + $coverage_variance_file \ + $mapq_prop_file \ + 0.5 \ + > ${coveragefolder}/${chrom}/HMM_logs_${species}/HMM_${chrom}.log 2>&1 + diff --git a/dockerfiles/CNV/scripts/get_coverage_stats_by_sample_set.sh b/dockerfiles/CNV/scripts/get_coverage_stats_by_sample_set.sh new file mode 100755 index 00000000..c404f462 --- /dev/null +++ b/dockerfiles/CNV/scripts/get_coverage_stats_by_sample_set.sh @@ -0,0 +1,43 @@ +workingfolder=$1 +manifest=$2 +accessibility_file=$3 +GC_content_file=$4 +sample_group_id=$5 +scriptsfolder=~/scripts/CNV_scripts/scripts +allchrom=(2L 2R 3L 3R X) + +source activate cnv37 + +startingfolder=$(pwd) + +# We will combine all of the mapq0 prop files into a single file +mapqprop_output_filepath=$workingfolder/mapq_proportions_allchrom_${sample_group_id}.csv +echo -e 'Chrom\tPosition\tCount mapq > 0\tCount mapq = 0' > $mapqprop_output_filepath + +# Calculate the windowed proportion of mapq0 reads across samples +echo "Calculating windowed proportion of mapq0 reads for sample group ${sample_group_id}" +for chrom in ${allchrom[@]} +do + cd $workingfolder/$chrom + python $scriptsfolder/get_prop_mapq0.py \ + $manifest \ + mapq_proportions_${chrom}_${sample_group_id}.csv \ + . 
\ + > get_prop_mapq0_${chrom}_${sample_group_id}.log 2>&1 + cd $startingfolder + tail -n +2 $workingfolder/$chrom/mapq_proportions_${chrom}_${sample_group_id}.csv | sed "s/^/${chrom}\t/" >> $mapqprop_output_filepath +done + +# Now calculate median coverage by GC +echo "Calculating median and variance of coverage by GC bin for sample group ${sample_group_id}" +python $scriptsfolder/calculate_median_coverage_by_GC.py 0.9 \ + $accessibility_file \ + 0.5 \ + $mapqprop_output_filepath \ + $manifest \ + $GC_content_file \ + $workingfolder \ + $sample_group_id \ + > $workingfolder/calculate_mean_coverage_by_GC_09_05_${sample_group_id}.log 2>&1 + + diff --git a/dockerfiles/CNV/scripts/get_coverage_stats_by_sample_set_vobs.sh b/dockerfiles/CNV/scripts/get_coverage_stats_by_sample_set_vobs.sh new file mode 100755 index 00000000..afebf651 --- /dev/null +++ b/dockerfiles/CNV/scripts/get_coverage_stats_by_sample_set_vobs.sh @@ -0,0 +1,24 @@ +workingfolder=$1 +manifest=$2 +accessibility_file=$3 +mapqprop_file=$4 +GC_content_file=$5 +sample_group_id=$6 +scriptsfolder=~/scripts/CNV_scripts/scripts +allchrom=(2L 2R 3L 3R X) + +source activate cnv37 + +# Now calculate median coverage by GC +echo "Calculating median and variance of coverage by GC bin for sample group ${sample_group_id}" +python $scriptsfolder/calculate_median_coverage_by_GC.py 0.9 \ + $accessibility_file \ + 0.5 \ + $mapqprop_file \ + $manifest \ + $GC_content_file \ + $workingfolder \ + $sample_group_id \ + > $workingfolder/calculate_mean_coverage_by_GC_09_05_${sample_group_id}.log 2>&1 + + diff --git a/dockerfiles/CNV/scripts/get_diagnostic_reads.sh b/dockerfiles/CNV/scripts/get_diagnostic_reads.sh new file mode 100755 index 00000000..3b195a92 --- /dev/null +++ b/dockerfiles/CNV/scripts/get_diagnostic_reads.sh @@ -0,0 +1,29 @@ +bamfilefolder=$1 +manifest=$2 +samplenum=$3 +outputfolder=$4 +scriptsfolder=~/scripts/CNV_scripts/scripts + +samplename=($(head -n$samplenum $manifest | tail -n1)) +bamfile=${bamfilefolder}/${samplename}.bam + +echo $bamfile + +source activate cnv37 + +SSFA_script=$scriptsfolder/SSFA.py +SSFAfolder=$outputfolder/SSFA +python $SSFA_script $bamfile 2R 3425000:3650000 ${SSFAfolder}/2R/Ace1_region/${samplename}_Ace1_SSFA_output.csv 10 > ${SSFAfolder}/2R/Ace1_region/SSFAlogs/${samplename}_Ace1_SSFA_output.log 2>&1 +python $SSFA_script $bamfile 2R 28460000:28570000 ${SSFAfolder}/2R/Cyp6_region/${samplename}_CYP6_SSFA_output.csv 10 > ${SSFAfolder}/2R/Cyp6_region/SSFAlogs/${samplename}_CYP6_SSFA_output.log 2>&1 +python $SSFA_script $bamfile 3R 6900000:7000000 ${SSFAfolder}/3R/Cyp6zm_region/${samplename}_CYP6ZM_SSFA_output.csv 10 > ${SSFAfolder}/3R/Cyp6zm_region/SSFAlogs/${samplename}_CYP6ZM_SSFA_output.log 2>&1 +python $SSFA_script $bamfile 3R 28570000:28620000 ${SSFAfolder}/3R/Gste_region/${samplename}_GST_SSFA_output.csv 10 > ${SSFAfolder}/3R/Gste_region/SSFAlogs/${samplename}_GST_SSFA_output.log 2>&1 +python $SSFA_script $bamfile X 15220000:15255000 ${SSFAfolder}/X/Cyp9k1_region/${samplename}_CYP9K1_SSFA_output.csv 10 > ${SSFAfolder}/X/Cyp9k1_region/SSFAlogs/${samplename}_CYP9K1_SSFA_output.log 2>&1 + +breakpoints_script=$scriptsfolder/breakpoint_detector.py +breakpointsfolder=$outputfolder/breakpoints +python $breakpoints_script $bamfile 2R 3425000:3650000 ${breakpointsfolder}/2R/Ace1_region/${samplename}_Ace1_breakpoints_output 10 > ${breakpointsfolder}/2R/Ace1_region/breakpointlogs/${samplename}_Ace1_breakpoints_output.log 2>&1 +python $breakpoints_script $bamfile 2R 28460000:28570000 
${breakpointsfolder}/2R/Cyp6_region/${samplename}_CYP6_breakpoints_output 10 > ${breakpointsfolder}/2R/Cyp6_region/breakpointlogs/${samplename}_CYP6_breakpoints_output.log 2>&1
+python $breakpoints_script $bamfile 3R 6900000:7000000 ${breakpointsfolder}/3R/Cyp6zm_region/${samplename}_CYP6ZM_breakpoints_output 10 > ${breakpointsfolder}/3R/Cyp6zm_region/breakpointlogs/${samplename}_CYP6ZM_breakpoints_output.log 2>&1
+python $breakpoints_script $bamfile 3R 28570000:28620000 ${breakpointsfolder}/3R/Gste_region/${samplename}_GST_breakpoints_output 10 > ${breakpointsfolder}/3R/Gste_region/breakpointlogs/${samplename}_GST_breakpoints_output.log 2>&1
+python $breakpoints_script $bamfile X 15220000:15255000 ${breakpointsfolder}/X/Cyp9k1_region/${samplename}_CYP9K1_breakpoints_output 10 > ${breakpointsfolder}/X/Cyp9k1_region/breakpointlogs/${samplename}_CYP9K1_breakpoints_output.log 2>&1
+
diff --git a/dockerfiles/CNV/scripts/get_prop_mapq0.py b/dockerfiles/CNV/scripts/get_prop_mapq0.py
new file mode 100755
index 00000000..062626b4
--- /dev/null
+++ b/dockerfiles/CNV/scripts/get_prop_mapq0.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python3
+#get_prop_mapq0.py
+
+# This script takes the coverage data for all individuals and calculates the proportion of reads that have mapq=0
+# across all individuals in the dataset.
+
+import time        # required to output the time at which the script was run
+from sys import stdout        # this import is needed to flush python output to the stdout (instead of leaving it
+# in the buffer)
+from sys import argv        # this import is needed in order for the script to handle command line arguments
+import socket        # this import allows us to access the name of the computer that the script is being run on
+from re import *
+import pandas as pd
+import os
+
+if (len(argv) == 3):
+    list_of_samples = argv[1]
+    output_filename = argv[2]
+    coverage_counts_directory = '.'
+elif (len(argv) == 4):
+    list_of_samples = argv[1]
+    output_filename = argv[2]
+    coverage_counts_directory = argv[3]
+else:
+    raise Exception("Fail. There should be two or three command line arguments (list_of_samples, output_filename [, coverage_counts_directory])")
+
+
+print('Running ' + argv[0] + ' at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + ' using machine ' + socket.gethostname() + '\n\n')
+stdout.flush()
+
+print('Input arguments:')
+print('\tlist_of_samples: ' + list_of_samples)
+print('\toutput_filename: ' + output_filename)
+print('\tcoverage_counts_directory: ' + coverage_counts_directory + '\n')
+stdout.flush()
+
+# Read the sample names from the manifest
+with open(list_of_samples, 'r') as f:
+    samples = [x.rstrip('\n') for x in f.readlines()]
+
+# Identify the files relating to the sample manifest
+all_files = os.listdir(coverage_counts_directory)
+files_to_load = [os.path.join(coverage_counts_directory, x) for x in all_files for y in samples if search(y + '.*output.csv', x)]
+# Check all of the samples are represented:
+if len(files_to_load) != len(samples):
+    raise Exception('Manifest does not match available files in folder')
+
+# Get the data from the first file
+print('Loading ' + files_to_load[0])
+stdout.flush()
+ftable = pd.read_csv(files_to_load[0], sep = '\t')
+positions = ftable['Position']
+total_mapqpositive = ftable['Counts mapq >= 10'] + ftable['Counts 0 < mapq < 10']
+total_mapq0 = ftable['Counts mapq0']
+counts_table = pd.concat([positions, total_mapqpositive.copy(), total_mapq0.copy()], axis = 1)
+del(ftable)
+
+def add_to_table(input_table, filename):
+    print('Loading ' + filename)
+    stdout.flush()
+    ftable = pd.read_csv(filename, sep = '\t')
+    if not all(ftable['Position'] == input_table['Position']):
+        raise Exception('Positions do not match between input table and new table')
+    total_mapqpositive = ftable['Counts mapq >= 10'] + ftable['Counts 0 < mapq < 10']
+    total_mapq0 = ftable['Counts mapq0']
+    input_table.iloc[:, 1:] = input_table.iloc[:, 1:] + pd.concat([total_mapqpositive, total_mapq0], axis = 1)
+    return(input_table)
+
+
+# Add the data from subsequent files
+for f in files_to_load[1:]:
+    counts_table = add_to_table(counts_table, f)
+
+# Change the column names to match previous versions of the script and subsequent scripts in the pipeline
+counts_table.columns = ['Position', 'Count_mapq_gr0', 'Count_mapq_eq0']
+
+print('Writing output file')
+stdout.flush()
+counts_table.to_csv(output_filename, sep = '\t', index = False)
+
+print('\n\nScript finished running at ' + time.strftime("%H:%M") + ' on ' + time.strftime("%d/%m/%Y") + '\n')
+stdout.flush()
+
diff --git a/dockerfiles/CNV/scripts/get_windowed_coverage.sh b/dockerfiles/CNV/scripts/get_windowed_coverage.sh
new file mode 100755
index 00000000..20c587da
--- /dev/null
+++ b/dockerfiles/CNV/scripts/get_windowed_coverage.sh
@@ -0,0 +1,25 @@
+bamfilefolder=$1
+manifest=$2
+samplenum=$3
+outputfolder=$4
+scriptsfolder=~/scripts/CNV_scripts/scripts
+allchrom=(2L 2R 3L 3R X)
+
+samplename=($(head -n$samplenum $manifest | tail -n1))
+bamfile=${bamfilefolder}/${samplename}.bam
+
+echo $bamfile
+
+source activate cnv37
+
+for chrom in ${allchrom[@]}
+do
+    python ${scriptsfolder}/counts_for_HMM.py \
+        $bamfile \
+        $chrom \
+        300 300 10 \
+        ${outputfolder}/${chrom}/counts_for_HMM_${samplename}_${chrom}_output.csv \
+        > ${outputfolder}/${chrom}/coveragelogs/counts_for_HMM_${samplename}_${chrom}.log 2>&1
+done
+
+
diff --git a/dockerfiles/CNV/scripts/get_windowed_coverage_and_diagnostic_reads.sh b/dockerfiles/CNV/scripts/get_windowed_coverage_and_diagnostic_reads.sh
new file mode 100755
index 00000000..01d662ef
--- /dev/null
+++ b/dockerfiles/CNV/scripts/get_windowed_coverage_and_diagnostic_reads.sh
@@ -0,0 +1,73 @@
+# This shell script was originally written by Eric Lucas and has been modified by Kevin Palis to run in Terra
+# as part of the CNV pipeline.
+
+
+# bamfilefolder=$1
+# bamfilepaths=$2
+# samplenum=$3
+# outputfolder=$4
+
+# scriptsfolder stays active: it is used by the counts_for_HMM.py call below
+scriptsfolder=~/scripts/CNV_scripts/scripts
+
+# samplename=($(head -n$samplenum $bamfilepaths | tail -n1 | cut -f1))
+# bampath=($(head -n$samplenum $bamfilepaths | tail -n1 | cut -f2))
+# bamfile=${bamfilefolder}/${samplename}.bam
+
+bamfile=$1
+samplename=$2
+outputfolder=$3
+
+
+#echo Downloading bam file for $samplename from $bampath to $bamfile
+
+# Removing with the assumption that in Terra, the input files are already localized for us
+# Download the required bam
+#curl ${bampath} > ${bamfile} 2> ${bamfilefolder}/${samplename}_download.log
+#curl ${bampath}.bai > ${bamfile}.bai 2>> ${bamfilefolder}/${samplename}_download.log
+
+# Start the conda env
+source activate cnv37
+
+# Get the coverage data
+echo Calculating coverage
+allchrom=(2L 2R 3L 3R X)
+coveragefolder=$outputfolder/coverage
+for chrom in ${allchrom[@]}
+do
+    # create the directory structure needed (this was likely pre-provisioned in the baremetal version of this pipeline)
+    mkdir -p ${coveragefolder}/${chrom}/coveragelogs/
+    python ${scriptsfolder}/counts_for_HMM.py \
+        $bamfile \
+        $chrom \
+        300 300 10 \
+        ${coveragefolder}/${chrom}/counts_for_HMM_${samplename}_${chrom}_output.csv \
+        > ${coveragefolder}/${chrom}/coveragelogs/counts_for_HMM_${samplename}_${chrom}.log 2>&1
+done
+
+# Commenting out all the code for discordant reads as we've identified that we don't need it for this pipeline.
+# TODO: Remove the code block entirely once this sub-pipeline is in working order.
+# echo Identifying discordant reads +# Get the discordant reads +# SSFA_script=$scriptsfolder/SSFA.py +# SSFAfolder=$outputfolder/diagnostic_reads/SSFA +# python $SSFA_script $bamfile 2R 3425000:3650000 ${SSFAfolder}/2R/Ace1_region/${samplename}_Ace1_SSFA_output.csv 10 > ${SSFAfolder}/2R/Ace1_region/SSFAlogs/${samplename}_Ace1_SSFA_output.log 2>&1 +# python $SSFA_script $bamfile 2R 28460000:28570000 ${SSFAfolder}/2R/Cyp6_region/${samplename}_CYP6_SSFA_output.csv 10 > ${SSFAfolder}/2R/Cyp6_region/SSFAlogs/${samplename}_CYP6_SSFA_output.log 2>&1 +# python $SSFA_script $bamfile 3R 6900000:7000000 ${SSFAfolder}/3R/Cyp6zm_region/${samplename}_CYP6ZM_SSFA_output.csv 10 > ${SSFAfolder}/3R/Cyp6zm_region/SSFAlogs/${samplename}_CYP6ZM_SSFA_output.log 2>&1 +# python $SSFA_script $bamfile 3R 28570000:28620000 ${SSFAfolder}/3R/Gste_region/${samplename}_GST_SSFA_output.csv 10 > ${SSFAfolder}/3R/Gste_region/SSFAlogs/${samplename}_GST_SSFA_output.log 2>&1 +# python $SSFA_script $bamfile X 15220000:15255000 ${SSFAfolder}/X/Cyp9k1_region/${samplename}_CYP9K1_SSFA_output.csv 10 > ${SSFAfolder}/X/Cyp9k1_region/SSFAlogs/${samplename}_CYP9K1_SSFA_output.log 2>&1 + +# # Get the soft clipped reads +# breakpoints_script=$scriptsfolder/breakpoint_detector.py +# breakpointsfolder=$outputfolder/diagnostic_reads/breakpoints +# python $breakpoints_script $bamfile 2R 3425000:3650000 ${breakpointsfolder}/2R/Ace1_region/${samplename}_Ace1_breakpoints_output 10 > ${breakpointsfolder}/2R/Ace1_region/breakpointlogs/${samplename}_Ace1_breakpoints_output.log 2>&1 +# python $breakpoints_script $bamfile 2R 28460000:28570000 ${breakpointsfolder}/2R/Cyp6_region/${samplename}_CYP6_breakpoints_output 10 > ${breakpointsfolder}/2R/Cyp6_region/breakpointlogs/${samplename}_CYP6_breakpoints_output.log 2>&1 +# python $breakpoints_script $bamfile 3R 6900000:7000000 ${breakpointsfolder}/3R/Cyp6zm_region/${samplename}_CYP6ZM_breakpoints_output 10 > ${breakpointsfolder}/3R/Cyp6zm_region/breakpointlogs/${samplename}_CYP6ZM_breakpoints_output.log 2>&1 +# python $breakpoints_script $bamfile 3R 28570000:28620000 ${breakpointsfolder}/3R/Gste_region/${samplename}_GST_breakpoints_output 10 > ${breakpointsfolder}/3R/Gste_region/breakpointlogs/${samplename}_GST_breakpoints_output.log 2>&1 +# python $breakpoints_script $bamfile X 15220000:15255000 ${breakpointsfolder}/X/Cyp9k1_region/${samplename}_CYP9K1_breakpoints_output 10 > ${breakpointsfolder}/X/Cyp9k1_region/breakpointlogs/${samplename}_CYP9K1_breakpoints_output.log 2>&1 + +# Delete the bam file +# echo Deleting bam file +# rm ${bamfile}* + +echo "Windowed Coverage finished successfully." 
+ diff --git a/dockerfiles/CNV/scripts/indexwrapper.sh b/dockerfiles/CNV/scripts/indexwrapper.sh new file mode 100755 index 00000000..a617044d --- /dev/null +++ b/dockerfiles/CNV/scripts/indexwrapper.sh @@ -0,0 +1,12 @@ +bamfilefolder=$1 +manifest=$2 +samplenum=$3 + +samplename=($(head -n$samplenum $manifest | tail -n1)) +bamfile=${bamfilefolder}/${samplename}.bam + +echo $bamfile + +module load -s samtools/1.9 + +samtools index $bamfile diff --git a/dockerfiles/CNV/scripts/join_CNV_output_files.sh b/dockerfiles/CNV/scripts/join_CNV_output_files.sh new file mode 100755 index 00000000..9c5357a7 --- /dev/null +++ b/dockerfiles/CNV/scripts/join_CNV_output_files.sh @@ -0,0 +1,35 @@ +study=$1 + +coveragefolder=/lustre/scratch118/malaria/team112/personal/el10/$study/coverage +outputfolder=/lustre/scratch118/malaria/team112/personal/el10/$study/coverage_CNVs +combined_full_CNV_table_gambcolu=$outputfolder/full_coverage_CNV_table_gambcolu.csv +combined_full_raw_CNV_table_gambcolu=$outputfolder/full_raw_CNV_table_gambcolu.csv +combined_full_CNV_table_arabiensis=$outputfolder/full_coverage_CNV_table_arabiensis.csv +combined_full_raw_CNV_table_arabiensis=$outputfolder/full_raw_CNV_table_arabiensis.csv + +mkdir -p $outputfolder + +cat $coveragefolder/2L/HMM_output/gambcolu_CNV/full_coverage_CNV_table_2L.csv > $combined_full_CNV_table_gambcolu +tail -n +2 $coveragefolder/2R/HMM_output/gambcolu_CNV/full_coverage_CNV_table_2R.csv >> $combined_full_CNV_table_gambcolu +tail -n +2 $coveragefolder/3L/HMM_output/gambcolu_CNV/full_coverage_CNV_table_3L.csv >> $combined_full_CNV_table_gambcolu +tail -n +2 $coveragefolder/3R/HMM_output/gambcolu_CNV/full_coverage_CNV_table_3R.csv >> $combined_full_CNV_table_gambcolu +tail -n +2 $coveragefolder/X/HMM_output/gambcolu_CNV/full_coverage_CNV_table_X.csv >> $combined_full_CNV_table_gambcolu + +cat $coveragefolder/2L/HMM_output/gambcolu_CNV/full_raw_CNV_table_2L.csv > $combined_full_raw_CNV_table_gambcolu +tail -n +2 $coveragefolder/2R/HMM_output/gambcolu_CNV/full_raw_CNV_table_2R.csv >> $combined_full_raw_CNV_table_gambcolu +tail -n +2 $coveragefolder/3L/HMM_output/gambcolu_CNV/full_raw_CNV_table_3L.csv >> $combined_full_raw_CNV_table_gambcolu +tail -n +2 $coveragefolder/3R/HMM_output/gambcolu_CNV/full_raw_CNV_table_3R.csv >> $combined_full_raw_CNV_table_gambcolu +tail -n +2 $coveragefolder/X/HMM_output/gambcolu_CNV/full_raw_CNV_table_X.csv >> $combined_full_raw_CNV_table_gambcolu + +cat $coveragefolder/2L/HMM_output/arabiensis_CNV/full_coverage_CNV_table_2L.csv > $combined_full_CNV_table_arabiensis +tail -n +2 $coveragefolder/2R/HMM_output/arabiensis_CNV/full_coverage_CNV_table_2R.csv >> $combined_full_CNV_table_arabiensis +tail -n +2 $coveragefolder/3L/HMM_output/arabiensis_CNV/full_coverage_CNV_table_3L.csv >> $combined_full_CNV_table_arabiensis +tail -n +2 $coveragefolder/3R/HMM_output/arabiensis_CNV/full_coverage_CNV_table_3R.csv >> $combined_full_CNV_table_arabiensis +tail -n +2 $coveragefolder/X/HMM_output/arabiensis_CNV/full_coverage_CNV_table_X.csv >> $combined_full_CNV_table_arabiensis + +cat $coveragefolder/2L/HMM_output/arabiensis_CNV/full_raw_CNV_table_2L.csv > $combined_full_raw_CNV_table_arabiensis +tail -n +2 $coveragefolder/2R/HMM_output/arabiensis_CNV/full_raw_CNV_table_2R.csv >> $combined_full_raw_CNV_table_arabiensis +tail -n +2 $coveragefolder/3L/HMM_output/arabiensis_CNV/full_raw_CNV_table_3L.csv >> $combined_full_raw_CNV_table_arabiensis +tail -n +2 $coveragefolder/3R/HMM_output/arabiensis_CNV/full_raw_CNV_table_3R.csv >> 
$combined_full_raw_CNV_table_arabiensis +tail -n +2 $coveragefolder/X/HMM_output/arabiensis_CNV/full_raw_CNV_table_X.csv >> $combined_full_raw_CNV_table_arabiensis + diff --git a/dockerfiles/CNV/scripts/join_CNV_output_files_vobs.sh b/dockerfiles/CNV/scripts/join_CNV_output_files_vobs.sh new file mode 100755 index 00000000..a1aad3d1 --- /dev/null +++ b/dockerfiles/CNV/scripts/join_CNV_output_files_vobs.sh @@ -0,0 +1,29 @@ +study=$1 + +rootfolder=/lustre/scratch118/malaria/team112/personal/el10 +coveragefolder=$rootfolder/$study/coverage +outputfolder=$rootfolder/$study/coverage_CNVs + +mkdir -p $outputfolder + +all_CNV_folders=($(ls -d $coveragefolder/2L/HMM_output/*_CNV)) +ss=(${all_CNV_folders[@]#$coveragefolder/2L/HMM_output/}) +specieslist=(${ss[@]%_CNV}) + +for species in ${specieslist[@]} +do + combined_full_CNV_table=$outputfolder/full_coverage_CNV_table_${species}.csv + combined_full_raw_CNV_table=$outputfolder/full_raw_CNV_table_${species}.csv + + cat $coveragefolder/2L/HMM_output/${species}_CNV/full_coverage_CNV_table_2L.csv > $combined_full_CNV_table + tail -n +2 $coveragefolder/2R/HMM_output/${species}_CNV/full_coverage_CNV_table_2R.csv >> $combined_full_CNV_table + tail -n +2 $coveragefolder/3L/HMM_output/${species}_CNV/full_coverage_CNV_table_3L.csv >> $combined_full_CNV_table + tail -n +2 $coveragefolder/3R/HMM_output/${species}_CNV/full_coverage_CNV_table_3R.csv >> $combined_full_CNV_table + tail -n +2 $coveragefolder/X/HMM_output/${species}_CNV/full_coverage_CNV_table_X.csv >> $combined_full_CNV_table + + cat $coveragefolder/2L/HMM_output/${species}_CNV/full_raw_CNV_table_2L.csv > $combined_full_raw_CNV_table + tail -n +2 $coveragefolder/2R/HMM_output/${species}_CNV/full_raw_CNV_table_2R.csv >> $combined_full_raw_CNV_table + tail -n +2 $coveragefolder/3L/HMM_output/${species}_CNV/full_raw_CNV_table_3L.csv >> $combined_full_raw_CNV_table + tail -n +2 $coveragefolder/3R/HMM_output/${species}_CNV/full_raw_CNV_table_3R.csv >> $combined_full_raw_CNV_table + tail -n +2 $coveragefolder/X/HMM_output/${species}_CNV/full_raw_CNV_table_X.csv >> $combined_full_raw_CNV_table +done diff --git a/dockerfiles/CNV/scripts/join_gambiae_arabiensis_coverage_variance_files.sh b/dockerfiles/CNV/scripts/join_gambiae_arabiensis_coverage_variance_files.sh new file mode 100755 index 00000000..4cf732c0 --- /dev/null +++ b/dockerfiles/CNV/scripts/join_gambiae_arabiensis_coverage_variance_files.sh @@ -0,0 +1,9 @@ +study=$1 + +coveragefolder=/lustre/scratch118/malaria/team112/personal/el10/$study/coverage +arabiensis_coverage_variance_file=$coveragefolder/coverage_variance_masked_09_05_arabiensis.csv +gambcolu_coverage_variance_file=$coveragefolder/coverage_variance_masked_09_05_gambcolu.csv +combined_coverage_variance_file=$coveragefolder/coverage_variance_masked_09_05_all.csv + +(cat $arabiensis_coverage_variance_file; tail -n +2 $gambcolu_coverage_variance_file) > $combined_coverage_variance_file + diff --git a/dockerfiles/CNV/scripts/join_species_coverage_variance_files_vobs.sh b/dockerfiles/CNV/scripts/join_species_coverage_variance_files_vobs.sh new file mode 100755 index 00000000..e8186ba9 --- /dev/null +++ b/dockerfiles/CNV/scripts/join_species_coverage_variance_files_vobs.sh @@ -0,0 +1,14 @@ +study=$1 + +# turn on extended globbing +shopt -s extglob + +coveragefolder=/lustre/scratch118/malaria/team112/personal/el10/$study/coverage +coverage_variance_files=($(ls $coveragefolder/coverage_variance_masked_09_05_!(all).csv)) 
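+# (The extglob pattern !(all) matches every per-species variance file while excluding the combined
+# "all" output created below, so re-running this script does not fold its own output back in.)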
+combined_coverage_variance_file=$coveragefolder/coverage_variance_masked_09_05_all.csv
+
+if [ ${#coverage_variance_files[@]} -eq 1 ]; then
+    cp ${coverage_variance_files[0]} $combined_coverage_variance_file
+else
+    (cat ${coverage_variance_files[0]}; sed -s 1d ${coverage_variance_files[@]:1}) > $combined_coverage_variance_file
+fi
diff --git a/dockerfiles/CNV/scripts/setup_folder.sh b/dockerfiles/CNV/scripts/setup_folder.sh
new file mode 100755
index 00000000..19713ce8
--- /dev/null
+++ b/dockerfiles/CNV/scripts/setup_folder.sh
@@ -0,0 +1,58 @@
+# This script prepares a folder in which a given sample set will be run
+folder_location=$1
+sample_metadata=$2
+sample_speciescalls=$3
+bamfile_folder=$4
+bamindex_folder=$5
+
+# turn on extended globbing
+shopt -s extglob
+
+# Create the folder
+mkdir -p $folder_location/data
+# Copy the metadata to the folder
+cp $sample_metadata $folder_location/data
+metadata_filename="$(basename -- $sample_metadata)"
+# Create a sample manifest from the metadata
+sed -e '1d' -e '2,$s/,.*//' $folder_location/data/$metadata_filename > $folder_location/data/sample_manifest.txt
+# Copy the species calls
+cp $sample_speciescalls $folder_location/data
+speciescalls_filename="$(basename -- $sample_speciescalls)"
+# Some of the scripts will expect tab-delimited metadata and species calls
+sed -e "s/,/\t/g" $folder_location/data/$metadata_filename > $folder_location/data/sample_metadata_tabs.tsv
+sed -e "s/,/\t/g" $folder_location/data/$speciescalls_filename > $folder_location/data/sample_speciescalls_tabs.tsv
+
+# To create species-specific manifests, we want to pull out the gambcolu/arabiensis
+# species call data. We could do this by column number, but I don't know whether the
+# column numbers will be consistent. So we do it by column names. The first step for
+# this is to cut down the species calls file to just two columns (sample name and
+# species call). Then we can be sure that the species call will be column 2.
+species_manifest (){
+    oldIFS=$IFS
+    IFS=","
+    read names
+    while read $names; do
+        echo -e ${sample_id}\\t${species_gambcolu_arabiensis/_/}
+    done
+    IFS=$oldIFS
+}
+
+# So now we apply that function, and then split the sample names into two files based
+# on column 2 (which is guaranteed to be the species calls, as long as the column names
+# are always the same).
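+# (The "read $names" trick above: $names expands to the comma-split header fields, so each data
+# row is read into variables named after its column headers, e.g. $sample_id.)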
+cat $sample_speciescalls | species_manifest | awk -v output_folder="$folder_location/data" '{print $1 > output_folder"/sample_manifest_"$2".txt"}' +cat $folder_location/data/sample_manifest_!(known_species).txt > $folder_location/data/sample_manifest_known_species.txt + +# Create a folder for the bamlinks +mkdir -p $folder_location/bamlinks +# Make symlinks to all the required bams +cd $folder_location/bamlinks +while read p +do + ln -s $bamfile_folder/${p}.bam + ln -s $bamindex_folder/${p}.bam.bai +done < $folder_location/data/sample_manifest.txt + + + + diff --git a/dockerfiles/CNV/scripts/setup_folder_gsutil.sh b/dockerfiles/CNV/scripts/setup_folder_gsutil.sh new file mode 100755 index 00000000..423e2c6f --- /dev/null +++ b/dockerfiles/CNV/scripts/setup_folder_gsutil.sh @@ -0,0 +1,78 @@ +# This script prepares a folder in which a given sample set will be run +folder_location=$1 +gsutil_path=$2 +speciescalls_folder=$3 +sampleset=$4 + +# turn on extended globbing +shopt -s extglob + +# Create the folder +mkdir -p $folder_location/data +# Download the metadata to the folder +sample_metadata=$gsutil_path/metadata/general/$sampleset/samples.meta.csv +gsutil cp $sample_metadata $folder_location/data +metadata_filename="$(basename -- $sample_metadata)" +# Create a sample manifest from the metadata +sed -e '1d' -e '2,$s/,.*//' $folder_location/data/$metadata_filename > $folder_location/data/sample_manifest.txt + +# Download the table linking sample names with bam files +sample_bampaths=$gsutil_path/metadata/general/$sampleset/wgs_snp_data.csv +gsutil cp $sample_bampaths $folder_location/data +bampaths_filename="$(basename -- $sample_bampaths)" + +# Download the species calls +sample_speciescalls=$gsutil_path/metadata/$speciescalls_folder/$sampleset/samples.species_aim.csv +gsutil cp $sample_speciescalls $folder_location/data +speciescalls_filename="$(basename -- $sample_speciescalls)" +# Some of the scripts will expect tab-delimited metadata and species calls +sed -e "s/,/\t/g" $folder_location/data/$metadata_filename > $folder_location/data/sample_metadata_tabs.tsv +sed -e "s/,/\t/g" $folder_location/data/$speciescalls_filename > $folder_location/data/sample_speciescalls_tabs.tsv + +# To create species-specific manifests, we want to pull out the gambcolu/arabiensis +# species call data. We could do this by column number, but I don't know whether the +# columns numbers will be consistent. So we do it by column names. The first step for +# this is to cut down the species calls file to just two columns (sample name and +# species call). Then we can be sure that the species call will be column 2. +species_manifest (){ + oldIFS=$IFS + IFS="," + read names + while read $names; do + echo -e ${sample_id}\\t${species_gambcolu_arabiensis/_/} + done + IFS=$oldIFS +} + +# So now we apply that function, and then split the sample names into two files based +# on column 2 (which is guaranteed to be the species calls, as long as the column names +# are always the same). 
+cat $folder_location/data/$speciescalls_filename | species_manifest | awk -v output_folder="$folder_location/data" '{print $1 > output_folder"/sample_manifest_"$2".txt"}' +cat $folder_location/data/sample_manifest_!(intermediate).txt > $folder_location/data/sample_manifest_known_species.txt + +# We want to pull out the paths to the bamfile for each sample +bamfiles (){ + oldIFS=$IFS + IFS="," + read names + while read $names; do + echo -e ${sample_id}\\t${alignments_bam} + done + IFS=$oldIFS +} + +cat $folder_location/data/$bampaths_filename | bamfiles > $folder_location/data/bampaths.csv + +# Create a folder for the bamfiles +mkdir -p $folder_location/bamlinks +# Download all the required bams +cd $folder_location/bamlinks +while IFS=$'\t' read sample bampath +do + curl ${bampath} > ${sample}.bam + curl ${bampath}.bai > ${sample}.bam.bai +done < ../data/bampaths.csv + + + + diff --git a/dockerfiles/CNV/scripts/setup_folder_gsutil_nobams.sh b/dockerfiles/CNV/scripts/setup_folder_gsutil_nobams.sh new file mode 100755 index 00000000..b96d09f3 --- /dev/null +++ b/dockerfiles/CNV/scripts/setup_folder_gsutil_nobams.sh @@ -0,0 +1,65 @@ +# This script prepares a folder in which a given sample set will be run +folder_location=$1 +gsutil_path=$2 +speciescalls_folder=$3 +sampleset=$4 + +# turn on extended globbing +shopt -s extglob + +# Create the folder +mkdir -p $folder_location/data +# Download the metadata to the folder +sample_metadata=$gsutil_path/metadata/general/$sampleset/samples.meta.csv +gsutil cp $sample_metadata $folder_location/data +metadata_filename="$(basename -- $sample_metadata)" +# Create a sample manifest from the metadata +sed -e '1d' -e '2,$s/,.*//' $folder_location/data/$metadata_filename > $folder_location/data/sample_manifest.txt + +# Download the table linking sample names with bam files +sample_bampaths=$gsutil_path/metadata/general/$sampleset/wgs_snp_data.csv +gsutil cp $sample_bampaths $folder_location/data +bampaths_filename="$(basename -- $sample_bampaths)" + +# Download the species calls +sample_speciescalls=$gsutil_path/metadata/$speciescalls_folder/$sampleset/samples.species_aim.csv +gsutil cp $sample_speciescalls $folder_location/data +speciescalls_filename="$(basename -- $sample_speciescalls)" +# Some of the scripts will expect tab-delimited metadata and species calls +sed -e "s/,/\t/g" $folder_location/data/$metadata_filename > $folder_location/data/sample_metadata_tabs.tsv +sed -e "s/,/\t/g" $folder_location/data/$speciescalls_filename > $folder_location/data/sample_speciescalls_tabs.tsv + +# To create species-specific manifests, we want to pull out the gambcolu/arabiensis +# species call data. We could do this by column number, but I don't know whether the +# columns numbers will be consistent. So we do it by column names. The first step for +# this is to cut down the species calls file to just two columns (sample name and +# species call). Then we can be sure that the species call will be column 2. +species_manifest (){ + oldIFS=$IFS + IFS="," + read names + while read $names; do + echo -e ${sample_id}\\t${species_gambcolu_arabiensis/_/} + done + IFS=$oldIFS +} + +# So now we apply that function, and then split the sample names into different files based +# on column 2 (which is guaranteed to be the species calls, as long as the column names +# are always the same). 
+cat $folder_location/data/$speciescalls_filename | species_manifest | awk -v output_folder="$folder_location/data" '{print $1 > output_folder"/sample_manifest_"$2".txt"}'
+cat $folder_location/data/sample_manifest_!(intermediate).txt > $folder_location/data/sample_manifest_known_species.txt
+
+# We want to pull out the paths to the bamfile for each sample
+bamfiles (){
+    oldIFS=$IFS
+    IFS=","
+    read names
+    while read $names; do
+        echo -e ${sample_id}\\t${alignments_bam}
+    done
+    IFS=$oldIFS
+}
+
+cat $folder_location/data/$bampaths_filename | bamfiles > $folder_location/data/bampaths.csv
+
diff --git a/dockerfiles/CNV/scripts/setup_folder_gsutil_nobams_wrapper.sh b/dockerfiles/CNV/scripts/setup_folder_gsutil_nobams_wrapper.sh
new file mode 100755
index 00000000..c79a85a8
--- /dev/null
+++ b/dockerfiles/CNV/scripts/setup_folder_gsutil_nobams_wrapper.sh
@@ -0,0 +1,9 @@
+release=$1
+study_id=$2
+
+workingfolder=~/personal/${release}_${study_id}
+gsutil_path=gs://vo_agam_release/${release}
+logfolder=$workingfolder/folder_setup_log
+mkdir -p $logfolder
+speciescalls_folder=species_calls_20200422
+~/scripts/CNV_scripts/scripts/setup_folder_gsutil_nobams.sh $workingfolder $gsutil_path $speciescalls_folder $study_id > $logfolder/setup_folder_gsutil.log 2>&1 &
diff --git a/dockerfiles/CNV/scripts/setup_folder_gsutil_wrapper.sh b/dockerfiles/CNV/scripts/setup_folder_gsutil_wrapper.sh
new file mode 100755
index 00000000..d9af5856
--- /dev/null
+++ b/dockerfiles/CNV/scripts/setup_folder_gsutil_wrapper.sh
@@ -0,0 +1,9 @@
+release=$1
+study_id=$2
+
+workingfolder=~/personal/${release}_${study_id}
+gsutil_path=gs://vo_agam_release/${release}
+logfolder=$workingfolder/folder_setup_log
+mkdir -p $logfolder
+speciescalls_folder=species_calls_20200422
+~/scripts/CNV_scripts/scripts/setup_folder_gsutil.sh $workingfolder $gsutil_path $speciescalls_folder $study_id > $logfolder/setup_folder_gsutil.log 2>&1 &
diff --git a/docs/specs/cnv-vector.md b/docs/specs/cnv-vector.md
index d508aa69..1757e65b 100644
--- a/docs/specs/cnv-vector.md
+++ b/docs/specs/cnv-vector.md
@@ -41,7 +41,7 @@ cohort of multiple samples. The pipeline includes an HMM step followed by cover
#### Step: Windowed Coverage
**Description:** Divides the genome into 300bp windows and counts the number of reads for each window.\
**Inputs:** Bam file, 1 per sample\
-**Outputs:** Windowed count reads file, 1 row per window, 1 file per sample \
+**Outputs:** Windowed count reads file, 1 row per window, 1 file per sample per chromosome. \
**Software:** Python virtualenv cnv37\
Steps in CNV_pipeline/scripts/get_windowed_coverage_and_diagnostic_reads.sh\
```bash
@@ -58,7 +58,7 @@ Steps in CNV_pipeline/scripts/get_windowed_coverage_and_diagnostic_reads.sh\
#### Step: Calculate Coverage Summary Stats
**Description:** Analyzes the read count for each sample. Computes a GC normalization table that describes the mean read count for each percentile of GC. Computes overall coverage variance \
**Inputs:** Windowed count reads file, 1 per sample\
-**Outputs:** Summary statistics file, 1 per sample\
+**Outputs:** Summary statistics files per sample: median_coverage_by_GC_masked and coverage_variance_masked CSVs \
**Software:** Python virtualenv cnv37\
Steps in CNV_pipeline/scripts/get_coverage_stats_by_sample_set_vobs.sh\
```bash
@@ -76,7 +76,7 @@ Steps in CNV_pipeline/scripts/get_coverage_stats_by_sample_set_vobs.sh\
#### Step: CNV HMM
**Description:** Normalizes the read count for each sample by dividing by the modal coverage for each window. Fits an HMM to the normalized read counts. \
-**Inputs:** Windowed count reads file, 1 per sample. Coverage variance statistics, 1 per sample\
+**Inputs:** Windowed count reads file, 1 per sample per chromosome. Coverage variance statistics, 2 CSVs per sample\
**Outputs:** Normalized coverage values, 1 per window, per sample. Copy number state, 1 per window, per sample. \
**Software:** Python virtualenv cnv37\
Steps in CNV_pipeline/scripts/coverage_HMM_vobs.sh\
@@ -152,7 +152,7 @@ Steps in CNV_pipeline/scripts/get_windowed_coverage_and_diagnostic_reads.sh\
#### Step: Target Regions CNV Calls
**Description:** \
**Inputs:** \
-**Outputs:** \
+**Outputs:** text file with genotypes \
**Software:** R version 3.6.1\
Steps in CNV_pipeline/scripts/target_regions_analysis_vobs.sh\
```bash
@@ -170,6 +170,8 @@ Steps in CNV_pipeline/scripts/target_regions_analysis_vobs.sh\
```
 
+## Convert outputs to zarr and vcf
+
## Implementation notes
 
diff --git a/pipelines/copy-number-variation-vector/gcp/CNV.wdl b/pipelines/copy-number-variation-vector/gcp/CNV.wdl
new file mode 100644
index 00000000..312bf09e
--- /dev/null
+++ b/pipelines/copy-number-variation-vector/gcp/CNV.wdl
@@ -0,0 +1,129 @@
+version 1.0
+
+## Copyright Wellcome Sanger Institute, Oxford University, and the Broad Institute 2023
+##
+## This WDL pipeline implements the Copy Number Variation Pipeline as described in:
+## https://github.com/malariagen/pipelines/blob/add_cnv_vector_spec/docs/specs/cnv-vector.md
+##
+
+import "HMM.wdl" as HMM
+import "TargetRegions.wdl" as TargetRegions
+import "CNVCoverageCalls.wdl" as CNVCoverageCalls
+import "CNVTasks.wdl" as CNVTasks
+
+workflow CNV {
+  meta {
+    description: "This is a pipeline for calling Copy Number Variants (CNVs) for a cohort of multiple samples. This includes an HMM step followed by coverage calls and target regions pipelines to improve accuracy."
+ allowNestedInputs: true + } + + String pipeline_version = "1.0.0" + + input { + # windowed coverage inputs + Array[File] input_bams + Array[File] input_bais + Array[String] sample_ids + String output_dir="coverage" + Int interval = 300 + Int window_size = 300 + Int min_qual = 10 + # coverage summary stats and coverage HMM inputs + Float accessibility_threshold = 0.9 + Float mapq_threshold = 0.5 + File accessibility_mask_file + File mapq_file + File gc_content_file + String sample_group_id + # coverage HMM inputs + String species + # target regions inputs + File gene_coordinates_file + File sample_metadata + File species_id_file + File plotting_functions_file + # coverage calls inputs + Array[String] chromosomes = ["2L", "2R", "3L", "3R", "X"] + File detox_genes_file + # runtime inputs + Int preemptible_tries + String runtime_zones = "us-central1-b" + } + + # Scatter over samples + scatter(idx in range(length(input_bams))) { + # Run windowed coverage on each sample (for each chromosome) + call HMM.HMM as hmm { + input: + input_bam = input_bams[idx], + input_bai = input_bais[idx], + sample_id = sample_ids[idx], + output_dir = output_dir, + interval = interval, + window_size = window_size, + min_qual = min_qual, + accessibility_threshold = accessibility_threshold, + mapq_threshold = mapq_threshold, + accessibility_mask_file = accessibility_mask_file, + mapq_file = mapq_file, + gc_content_file = gc_content_file, + sample_group_id = sample_group_id, + species = species, + runtime_zones = runtime_zones + } + + call TargetRegions.TargetRegions as target_regions { + input: + sample_id = sample_ids[idx], + input_bam = input_bams[idx], + input_bam_index = input_bais[idx], + gene_coordinates_file = gene_coordinates_file, + sample_metadata = sample_metadata, + species_id_file = species_id_file, + CNV_HMM_output = hmm.output_gz, # zip of the coveragefolder + HMM_coverage_variance_file = hmm.coverage_variance, + plotting_functions_file = plotting_functions_file, + preemptible_tries = preemptible_tries, + runtime_zones = runtime_zones + } + } + + call CNVTasks.ConsolidateHMMOutput as CHMM { + input: + hmm_tarballs = hmm.output_gz, + output_dir = output_dir, + runtime_zones = runtime_zones + } + + call CNVTasks.CreateSampleManifest as create_sample_manifest { + input: + sample_ids = sample_ids, + runtime_zones = runtime_zones + } + + # Runs separately for each chromosome (2L, 2R, 3L, 3R, X) + scatter(idx in range(length(chromosomes))) { + call CNVCoverageCalls.CNVCoverageCalls as CNV_coverage_calls { + input: + chromosome = chromosomes[idx], + sample_species_manifest = create_sample_manifest.sample_manifest, + gene_coordinates_file = gene_coordinates_file, + detox_genes_file = detox_genes_file, + consolidated_coverage_dir_tar = CHMM.consolidated_gz, + sample_metadata = sample_metadata, + species = species, + num_samples = length(sample_ids), + preemptible_tries = preemptible_tries, + runtime_zones = runtime_zones + } + } + + output { + File hmm_tar = CHMM.consolidated_gz + Array[File] targeted_regions_focal_region_cnv_table = target_regions.focal_region_CNV_table + Array[File] targeted_regions_hmm_gene_copy_number = target_regions.HMM_gene_copy_number + Array[File] cnv_coverage_calls_tables = CNV_coverage_calls.cnv_coverage_table + Array[File] cnv_coverage_calls_raw_tables = CNV_coverage_calls.cnv_coverage_raw_table + Array[File] cnv_coverage_calls_Rdata = CNV_coverage_calls.cnv_coverage_Rdata + } +} \ No newline at end of file diff --git a/pipelines/copy-number-variation-vector/gcp/CNVCoverageCalls.wdl 
b/pipelines/copy-number-variation-vector/gcp/CNVCoverageCalls.wdl
new file mode 100644
index 00000000..ae45cfac
--- /dev/null
+++ b/pipelines/copy-number-variation-vector/gcp/CNVCoverageCalls.wdl
@@ -0,0 +1,140 @@
+version 1.0
+
+## Copyright Wellcome Sanger Institute, Oxford University, and the Broad Institute 2023
+##
+## This WDL pipeline implements the CNV Coverage Calls portion of the vector Copy Number Variation Pipeline as described in:
+## https://github.com/malariagen/pipelines/blob/add_cnv_vector_spec/docs/specs/cnv-vector.md
+##
+
+workflow CNVCoverageCalls {
+  String pipeline_version = "1.0.0"
+
+  ## Sub-pipeline: CNV Coverage Calls
+
+  input {
+    String chromosome   # chrom: Runs separately for each species and chromosome (2L, 2R, 3L, 3R, X)
+    File sample_species_manifest   # manifest: species specific manifest file - NOTE different from other manifests
+    File gene_coordinates_file   # gene_coordinates_file: pipeline input
+    File detox_genes_file   # detox_genes_file: pipeline input
+    File consolidated_coverage_dir_tar   # working folder: Specifies the folder containing the HMM output files - will be a tarred output here
+    File sample_metadata   # metadata: pipeline input
+    String species   # species: pipeline input
+    Int num_samples   # number of samples - used to calculate the memory and disk requested for the coverage calls step
+    Int preemptible_tries
+    String runtime_zones
+  }
+
+  # Step 1: CNV Coverage Calls
+  call CoverageCalls {
+    input:
+      chromosome = chromosome,
+      sample_species_manifest = sample_species_manifest,
+      gene_coordinates_file = gene_coordinates_file,
+      detox_genes_file = detox_genes_file,
+      consolidated_coverage_dir_tar = consolidated_coverage_dir_tar,
+      sample_metadata = sample_metadata,
+      species = species,
+      num_samples = num_samples,
+      preemptible_tries = preemptible_tries,
+      runtime_zones = runtime_zones
+  }
+
+  meta {
+    allowNestedInputs: true
+  }
+
+  output {
+    File cnv_coverage_table = CoverageCalls.full_coverage_CNV_table
+    File cnv_coverage_raw_table = CoverageCalls.full_raw_CNV_table
+    File cnv_coverage_Rdata = CoverageCalls.coverage_CNV_Rdata
+  }
+}
+
+task CoverageCalls {
+  input {
+    String chromosome   # chrom: Runs separately for each species and chromosome (2L, 2R, 3L, 3R, X)
+    File sample_species_manifest   # manifest: species specific manifest file - NOTE different from other manifests
+    File gene_coordinates_file   # gene_coordinates_file: pipeline input
+    File detox_genes_file   # detox_genes_file: pipeline input
+    File consolidated_coverage_dir_tar   # working folder: Specifies the folder containing the HMM output files - will be a tarred output here
+    File sample_metadata   # metadata: pipeline input
+    String species   # species: pipeline input
+    Int num_samples   # number of samples - used to calculate the memory and disk requested for the coverage calls step
+
+    # Runtime Settings
+    Int preemptible_tries
+    String runtime_zones
+    String docker = "us.gcr.io/broad-gotc-prod/cnv/r:1.0.0-1692386236"
+    Int num_cpu = 1
+    Int memory_gb = if (ceil(num_samples * 0.2) + 3) < 120 then (ceil(num_samples * 0.2) + 3) else 120
+    Int disk_size_gb = if (ceil(num_samples * 0.75) + 10) < 1200 then (ceil(num_samples * 0.75) + 10) else 1200
+  }
+  # ncores: number of CPUs - has not been optimized
+  String output_dir = species + "_CNV"
+
+  # These will be named by the script and cannot be changed here
+  String full_coverage_CNV = output_dir + '/full_coverage_CNV_table_' + chromosome + '.csv'
+  String full_raw_CNV = output_dir + '/full_raw_CNV_table_' + chromosome + '.csv'
+  String Rdata = output_dir + '/CNV_analysis_' + chromosome + '.Rdata'
output_dir + '/CNV_analysis_' + chromosome + '.Rdata' + # The output directory structure does not include the species name, so the output files are renamed below to include it + String species_full_coverage_CNV = output_dir + '/full_coverage_CNV_table_' + species + '_' + chromosome + '.csv' + String species_full_raw_CNV = output_dir + '/full_raw_CNV_table_' + species + '_' + chromosome + '.csv' + String species_Rdata = output_dir + '/CNV_analysis_' + species + '_' + chromosome + '.Rdata' + + # once we untar the HMM dir, the path to the untarred dir will be local to where the command runs + String coverage_dir = basename(consolidated_coverage_dir_tar, ".tar.gz") + String HMM_working_dir = coverage_dir + '/' + chromosome + '/HMM_output' + + + + command <<< + # unpack the consolidated coverage tarball (its directory name is derived above with basename) + tar -zxvf ~{consolidated_coverage_dir_tar} + + # set up the output dir + mkdir -p ~{HMM_working_dir}/~{output_dir} + + # The following combines the coverage_variance files into a single file + # Get the name of the first coverage variance file - doesn't matter which one + FIRST_COVERAGE_VARIANCE_FILE=$(ls ~{coverage_dir}/coverage_variance_masked_* | head -n 1) + + # Add the header to the new combined file (the header is in every file, but we only want it once in the combined file) + HEADER=$(head -n 1 $FIRST_COVERAGE_VARIANCE_FILE) + echo "sample_id$HEADER" > single_coverage_variance_masked.csv + + # Skip the header and add the remaining contents of each coverage variance file to the combined file + for f in $(ls ~{coverage_dir}/coverage_variance_masked_*); do grep -v "$HEADER" $f >> single_coverage_variance_masked.csv; done + + echo "single_coverage_variance_masked.csv" + cat single_coverage_variance_masked.csv + + echo "Starting R script" + /opt/R/3.6.1/bin/R --slave -f /usr/local/Rscripts/CNV_analysis.r --args ~{chromosome} \ + ~{sample_species_manifest} \ + single_coverage_variance_masked.csv \ + ~{gene_coordinates_file} \ + ~{detox_genes_file} \ + ~{HMM_working_dir} \ + ~{num_cpu} \ + ~{output_dir} \ + ~{sample_metadata} + echo "R script complete" + + mv ~{HMM_working_dir}/~{full_coverage_CNV} ~{species_full_coverage_CNV} + mv ~{HMM_working_dir}/~{full_raw_CNV} ~{species_full_raw_CNV} + mv ~{HMM_working_dir}/~{Rdata} ~{species_Rdata} + >>> + runtime { + docker: docker + preemptible: preemptible_tries + cpu: num_cpu + memory: memory_gb + " GiB" + disks: "local-disk " + disk_size_gb + " HDD" + zones: runtime_zones + } + output { + File full_coverage_CNV_table = species_full_coverage_CNV + File full_raw_CNV_table = species_full_raw_CNV + File coverage_CNV_Rdata = species_Rdata + } +} diff --git a/pipelines/copy-number-variation-vector/gcp/CNVTasks.wdl b/pipelines/copy-number-variation-vector/gcp/CNVTasks.wdl new file mode 100644 index 00000000..00996024 --- /dev/null +++ b/pipelines/copy-number-variation-vector/gcp/CNVTasks.wdl @@ -0,0 +1,84 @@ +version 1.0 + +task ConsolidateHMMOutput { + meta { + description: "This task takes the output from the HMM sub-pipeline and combines it into a single tarball." + } + parameter_meta { + hmm_tarballs: "The output files from the HMM sub-pipeline. This is an array of tarballs, one for each sample." + output_dir: "The output directory for the tarball."
+ } + input { + Array[File] hmm_tarballs + String output_dir + # runtime values + String docker = "us.gcr.io/broad-gotc-prod/cnv:1.0.0-1679431881" + Int ram = if (ceil(size(hmm_tarballs, "MiB") * 5) + 8000) < 120000 then (ceil(size(hmm_tarballs, "MiB") * 5) + 8000) else 120000 + Int cpu = 16 + Int disk = ceil(size(hmm_tarballs, "GiB") * 10) + 50 + Int preemptible = 3 + String runtime_zones + } + command <<< + set -x + echo "Current directory: " + pwd + # Extract each per-sample tarball into the shared output directory; --backup=numbered keeps any files that would otherwise be overwritten + for tarball in ~{sep=' ' hmm_tarballs} ; do + echo "Extracting tarball: " $tarball + tar --backup=numbered -zxvf $tarball + ls -lht + done + ls -lht + tar -zcvf ~{output_dir}.tar.gz ~{output_dir} + >>> + runtime { + docker: docker + memory: "${ram} MiB" + disks: "local-disk ${disk} HDD" + cpu: cpu + preemptible: preemptible + zones: runtime_zones + } + output { + File consolidated_gz = "~{output_dir}.tar.gz" + } +} + + +task CreateSampleManifest { + meta { + description: "This task creates a newline-separated sample manifest, where each sample id is a separate line." + } + parameter_meta { + sample_ids: "An Array of sample ids for all samples being run together" + } + + input { + Array[String] sample_ids + # runtime values + String docker = "ubuntu:18.04" + String ram = "3000 MiB" + Int cpu = 1 + Int disk = 10 + Int preemptible = 3 + String runtime_zones + } + + command <<< + # No work is needed here; the manifest is created by write_lines() in the output section + echo "writing sample ids to output file" + >>> + + runtime { + docker: docker + memory: ram + disks: "local-disk ${disk} HDD" + cpu: cpu + preemptible: preemptible + zones: runtime_zones + } + output { + File sample_manifest = write_lines(sample_ids) + } +} \ No newline at end of file diff --git a/pipelines/copy-number-variation-vector/gcp/HMM.wdl b/pipelines/copy-number-variation-vector/gcp/HMM.wdl new file mode 100644 index 00000000..2861dd06 --- /dev/null +++ b/pipelines/copy-number-variation-vector/gcp/HMM.wdl @@ -0,0 +1,329 @@ +version 1.0 + +## Copyright Wellcome Sanger Institute, Oxford University, and the Broad Institute 2021 +## +## This WDL pipeline implements the coverage HMM portion of the vector Copy Number Variation Pipeline as described in: +## https://github.com/malariagen/pipelines/blob/add_cnv_vector_spec/docs/specs/cnv-vector.md +## + + +workflow HMM { + String pipeline_version = "1.0.0" + + input { + # windowed coverage parameters + File input_bam + File input_bai + String sample_id + String output_dir + Int interval + Int window_size + Int min_qual + # coverage summary parameters + Float accessibility_threshold + Float mapq_threshold + File accessibility_mask_file + File mapq_file + File gc_content_file + String sample_group_id + String species + String runtime_zones + } + + call WindowedCoverage { + input: + input_bam = input_bam, + input_bai = input_bai, + sample_id = sample_id, + output_dir = output_dir, + interval = interval, + window_size = window_size, + min_qual = min_qual, + runtime_zones = runtime_zones + } + + call CoverageSummary { + input: + coverage_tarball = WindowedCoverage.output_gz, + accessibility_threshold = accessibility_threshold, + mapq_threshold = mapq_threshold, + accessibility_mask_file = accessibility_mask_file, + mapq_file = mapq_file, + gc_content_file = gc_content_file, + sample_group_id = sample_group_id, + output_dir = output_dir, + sample_id = sample_id, + runtime_zones = runtime_zones + } + + call CoverageHMM { + input: + coverage_tarball = CoverageSummary.output_gz, + gc_content_file =
gc_content_file, + coverage_gc = CoverageSummary.coverage_output, + coverage_variance = CoverageSummary.variance_output, + mapq_file = mapq_file, + mapq_threshold = mapq_threshold, + species = species, + output_dir = output_dir, + sample_id = sample_id, + runtime_zones = runtime_zones + } + + output { + File output_gz = CoverageHMM.output_gz + File coverage_variance = CoverageSummary.variance_output + } +} + +task WindowedCoverage { + input { + File input_bam + File input_bai + String sample_id + String output_dir + Int interval + Int window_size + Int min_qual + + # runtime values + String docker = "us.gcr.io/broad-gotc-prod/cnv:1.0.0-1679431881" + String ram = "8000 MiB" + Int cpu = 16 + Int disk = ceil(size(input_bam, "GiB")) + 50 + Int preemptible = 3 + String runtime_zones + } + meta { + description: "Compute aligned read counts across the genome in 300 bp windows. The output is a set of windowed read count files, 1 row per window, 1 file per sample - compressed in a tar.gz file to keep the directory structure." + } + parameter_meta { + input_bam: "The input BAM file" + input_bai: "The BAM file's corresponding index file" + sample_id: "The sample name. This is typically the same as the input bam file name but without the extension." + output_dir: "The output directory name. This is set to `coverage` by default." + interval: "The interval size (bp) to use for the coverage calculations. Default is 300." + window_size: "The window size (bp) to use for the coverage calculations. Default is 300." + min_qual: "The minimum quality to use for the coverage calculations. Default is 10." + docker: "(optional) the docker image containing the runtime environment for this task" + ram: "(optional) the amount of memory (MiB) to provision for this task" + cpu: "(optional) the number of cpus to provision for this task" + disk: "(optional) the amount of disk space (GiB) to provision for this task" + preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non-preemptible machine" + runtime_zones: "The ordered list of zone preference for running in GCP" + } + command <<< + set -x + echo "Calculating coverage for: " + basename ~{input_bam} + echo "Sample Name: ~{sample_id}" + echo "Current directory: " + pwd + ls -lht + # Make the output directory + mkdir -p ~{output_dir} + # Activate the conda environment + source activate cnv37 + # Get the coverage data for each chromosome + allchrom=(2L 2R 3L 3R X) + for chrom in ${allchrom[@]} + do + # create the directory structure needed (this was likely pre-provisioned in the baremetal version of this pipeline) + mkdir -p ~{output_dir}/${chrom}/coveragelogs/ + python /cnv/scripts/counts_for_HMM.py \ + ~{input_bam} \ + $chrom \ + ~{interval} ~{window_size} ~{min_qual} \ + ~{output_dir}/${chrom}/counts_for_HMM_~{sample_id}_${chrom}_output.csv \ + > ~{output_dir}/${chrom}/coveragelogs/counts_for_HMM_~{sample_id}_${chrom}.log 2>&1 + done + # Compress the output directory + tar -zcvf ~{output_dir}.tar.gz ~{output_dir} + ls -lht + >>> + runtime { + docker: docker + memory: ram + disks: "local-disk ${disk} HDD" + cpu: cpu + preemptible: preemptible + zones: runtime_zones + } + output { + # Compressed output directory + File output_gz = "~{output_dir}.tar.gz" + } +} + +task CoverageSummary { + meta { + description: "Analyzes the read count for each sample. Computes a GC normalization table that describes the mean read count for each percentile of GC. Computes overall coverage variance.
The output is a summary statistics file, 1 per sample" + } + parameter_meta { + coverage_tarball: "The input tarball containing the coverage files" + accessibility_threshold: "The accessibility threshold to use for the coverage summary calculations. Default is 0.9." + mapq_threshold: "The mapq threshold to use for the coverage summary calculations. Default is 0.5." + accessibility_mask_file: "The accessibility mask file to use for the coverage summary calculations" + mapq_file: "The mapq file to use for the coverage summary calculations" + gc_content_file: "The gc content file to use for the coverage summary calculations" + sample_group_id: "The sample group id to use for the coverage summary calculations" + sample_id: "The sample name. This is typically the same as the input bam file name but without the extension." + output_dir: "The output directory name. This is set to `coverage` by default." + docker: "(optional) the docker image containing the runtime environment for this task" + ram: "(optional) the amount of memory (MiB) to provision for this task" + cpu: "(optional) the number of cpus to provision for this task" + disk: "(optional) the amount of disk space (GiB) to provision for this task" + preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non-preemptible machine" + runtime_zones: "The ordered list of zone preference for running in GCP" + } + input { + File coverage_tarball + Float accessibility_threshold + Float mapq_threshold + File accessibility_mask_file + File mapq_file + File gc_content_file + String sample_group_id + String output_dir + String sample_id + # runtime values + String docker = "us.gcr.io/broad-gotc-prod/cnv:1.0.0-1679431881" + String ram = "8000 MiB" + Int cpu = 16 + # TODO: Make disk space dynamic based on input size + Int disk = 70 + Int preemptible = 3 + String runtime_zones + } + String acc_threshold = sub(accessibility_threshold + "_", "[.]", "") + String m_threshold = sub(mapq_threshold + "_", "[.]", "") + String coverage_output_filename = "median_coverage_by_GC_masked_" + acc_threshold + m_threshold + sample_group_id + ".csv" + String variance_output_filename = "coverage_variance_masked_" + acc_threshold + m_threshold + sample_group_id + ".csv" + command <<< + set -x + echo "Current directory: " + pwd + # unpack the coverage tarball + tar -zxvf ~{coverage_tarball} + echo "Creating temporary manifest: " + echo ~{sample_id} > manifest.txt + ls -lht + allchrom=(2L 2R 3L 3R X) + source activate cnv37 + # Calculate median coverage by GC + echo "Calculating median and variance of coverage by GC bin for sample group ~{sample_group_id}" + python /cnv/scripts/calculate_median_coverage_by_GC.py \ + ~{accessibility_threshold} \ + ~{accessibility_mask_file} \ + ~{mapq_threshold} \ + ~{mapq_file} \ + manifest.txt \ + ~{gc_content_file} \ + ~{output_dir} \ + ~{sample_group_id} \ + > calculate_median_coverage_by_GC_~{sample_group_id}.log 2>&1 + ls -lht + tar -zcvf ~{output_dir}.tar.gz ~{output_dir} + echo "Numbers: ~{acc_threshold} ~{m_threshold} ~{sample_group_id}" + + >>> + runtime { + docker: docker + memory: ram + disks: "local-disk ${disk} HDD" + cpu: cpu + preemptible: preemptible + zones: runtime_zones + } + output { + File logs = "calculate_median_coverage_by_GC_~{sample_group_id}.log" + File coverage_output = "~{output_dir}/~{coverage_output_filename}" + File variance_output = "~{output_dir}/~{variance_output_filename}" + File output_gz = "~{output_dir}.tar.gz" + }
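+ # Note on the derived filenames above (an editorial gloss, not author text): sub(threshold + "_", "[.]", "") strips the
+ # decimal point, so with the top-level workflow defaults (accessibility_threshold = 0.9, mapq_threshold = 0.5) this task
+ # expects the Python script to write e.g. "median_coverage_by_GC_masked_09_05_<sample_group_id>.csv"
+ # (exact Float-to-String rendering is engine-dependent).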
+} + +task CoverageHMM { + meta { + description: "Runs an mHMM on coverage counts obtained from a BAM file, using normalisation based on the mean coverage per GC bin over the whole genome of the individual. The outputs are normalized coverage values and copy number states, one per window, per sample." + } + parameter_meta { + coverage_tarball: "The input tarball containing the coverage files" + gc_content_file: "The gc content file to use for the HMM calculations" + coverage_gc: "The coverage gc file to use for the HMM calculations" + coverage_variance: "The coverage variance file to use for the HMM calculations" + mapq_file: "The mapq file to use for the HMM calculations" + mapq_threshold: "The mapq threshold to use for the HMM calculations. Default is 0.5." + species: "The species of the samples specified in the manifest." + sample_id: "The sample name. This is typically the same as the input bam file name but without the extension." + docker: "(optional) the docker image containing the runtime environment for this task" + ram: "(optional) the amount of memory (MiB) to provision for this task" + cpu: "(optional) the number of cpus to provision for this task" + disk: "(optional) the amount of disk space (GiB) to provision for this task" + preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non-preemptible machine" + runtime_zones: "The ordered list of zone preference for running in GCP" + } + input { + File coverage_tarball + File gc_content_file + File coverage_gc + File coverage_variance + File mapq_file + Float mapq_threshold + String species + String output_dir + String sample_id + # runtime values + String docker = "us.gcr.io/broad-gotc-prod/cnv:1.0.0-1679431881" + String ram = "8000 MiB" + Int cpu = 16 + Int disk = 70 + Int preemptible = 3 + String runtime_zones + } + command <<< + set -x + echo "Current directory: " + pwd + # unpack the coverage tarball + tar -zxvf ~{coverage_tarball} + echo "Creating temporary manifest: " + echo ~{sample_id} > manifest.txt + ls -lht + allchrom=(2L 2R 3L 3R X) + # Activate the conda environment + source activate cnv37 + for chrom in ${allchrom[@]} + do + mkdir -p ~{output_dir}/$chrom/HMM_logs_~{species}/ + mkdir -p ~{output_dir}/$chrom/HMM_output/ + # run an mHMM on coverage counts + python /cnv/scripts/HMM_process.py \ + manifest.txt \ + $chrom \ + ~{output_dir} \ + ~{gc_content_file} \ + ~{coverage_gc} \ + ~{coverage_variance} \ + ~{mapq_file} \ + ~{mapq_threshold} \ + > ~{output_dir}/$chrom/HMM_logs_~{species}/HMM_$chrom.log 2>&1 + done + ls -lht + tar -zcvf ~{output_dir}.tar.gz ~{output_dir} + >>> + runtime { + docker: docker + memory: ram + disks: "local-disk ${disk} HDD" + cpu: cpu + preemptible: preemptible + zones: runtime_zones + } + output { + # log outputs + Array[File] logs = glob("*.log") + File output_gz = "~{output_dir}.tar.gz" + } +} diff --git a/pipelines/copy-number-variation-vector/gcp/TargetRegions.wdl b/pipelines/copy-number-variation-vector/gcp/TargetRegions.wdl new file mode 100644 index 00000000..bd8c5f30 --- /dev/null +++ b/pipelines/copy-number-variation-vector/gcp/TargetRegions.wdl @@ -0,0 +1,202 @@ +version 1.0 + +## Copyright Wellcome Sanger Institute, Oxford University, and the Broad Institute 2023 +## +## This WDL pipeline implements the Target Regions portion of the vector Copy Number Variation Pipeline as described in: +## https://github.com/malariagen/pipelines/blob/add_cnv_vector_spec/docs/specs/cnv-vector.md +##
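+## +## For reference (editorial summary): the ExtractDiagnosticReads task below hard-codes five focal regions, with region +## names inferred from its output paths: Ace1 2R:3425000-3650000, Cyp6 2R:28460000-28570000, +## Cyp6zm 3R:6900000-7000000, Gste 3R:28570000-28620000, Cyp9k1 X:15220000-15255000.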
+ + +workflow TargetRegions { + String pipeline_version = "1.0.0" + + input { + String sample_id + File input_bam + File input_bam_index + File gene_coordinates_file + File sample_metadata + File species_id_file + + File CNV_HMM_output # tarball of the coverage folder + File HMM_coverage_variance_file + File plotting_functions_file + + Int preemptible_tries + String runtime_zones + } + + # Step 1: Extract Diagnostic Reads + call ExtractDiagnosticReads as ExtractDiagnosticReads { + input: + input_bam = input_bam, + input_bam_index = input_bam_index, + sample_id = sample_id, + preemptible_tries = preemptible_tries, + runtime_zones = runtime_zones + } + + # Step 2: Target Regions CNV calling + call TargetRegionsCNVCalling as CNVCalling { + input: + sample_id = sample_id, + gene_coordinates_file = gene_coordinates_file, + sample_metadata = sample_metadata, + species_id_file = species_id_file, + coverage_variance_file = HMM_coverage_variance_file, + coverage_tar = CNV_HMM_output, + diagnostic_reads_tar = ExtractDiagnosticReads.diagnostic_reads_tar, + plotting_functions_file = plotting_functions_file, + preemptible_tries = preemptible_tries, + runtime_zones = runtime_zones + } + + meta { + allowNestedInputs: true + } + + output { + File diagnostic_reads_tar = ExtractDiagnosticReads.diagnostic_reads_tar + File focal_region_CNV_table = CNVCalling.focal_region_CNV_table + File HMM_gene_copy_number = CNVCalling.HMM_gene_copy_number + #File target_regions_Rdata = CNVCalling.target_regions_Rdata + } +} + +task ExtractDiagnosticReads { + input { + File input_bam + File input_bam_index + String sample_id + + Int preemptible_tries + String runtime_zones + String docker = "us.gcr.io/broad-gotc-prod/cnv:1.0.0-1679431881" + Int num_cpu = 1 + Float mem_gb = 3.75 + Int disk_gb = 50 + } + + + command <<< + set -euo pipefail + # Get the discordant reads + # Runs SSFA.py for each focal region: the script goes through an alignment file and records the positions of reads within a specified region whose mates map to a different chromosome or discordantly on the same chromosome + + # create directories for SSFA output files and logs + mkdir -p diagnostic_reads/SSFA/2R/Ace1_region/SSFAlogs + mkdir -p diagnostic_reads/SSFA/2R/Cyp6_region/SSFAlogs + mkdir -p diagnostic_reads/SSFA/3R/Cyp6zm_region/SSFAlogs + mkdir -p diagnostic_reads/SSFA/3R/Gste_region/SSFAlogs + mkdir -p diagnostic_reads/SSFA/X/Cyp9k1_region/SSFAlogs + + # create directories for breakpoint output files and logs + mkdir -p diagnostic_reads/breakpoints/2R/Ace1_region/breakpointlogs + mkdir -p diagnostic_reads/breakpoints/2R/Cyp6_region/breakpointlogs + mkdir -p diagnostic_reads/breakpoints/3R/Cyp6zm_region/breakpointlogs + mkdir -p diagnostic_reads/breakpoints/3R/Gste_region/breakpointlogs + mkdir -p diagnostic_reads/breakpoints/X/Cyp9k1_region/breakpointlogs + + # activate the conda environment + source activate cnv37 + + python /cnv/scripts/SSFA.py ~{input_bam} 2R 3425000:3650000 diagnostic_reads/SSFA/2R/Ace1_region/~{sample_id}_Ace1_SSFA_output.csv 10 > diagnostic_reads/SSFA/2R/Ace1_region/SSFAlogs/~{sample_id}_Ace1_SSFA_output.log 2>&1 + python /cnv/scripts/SSFA.py ~{input_bam} 2R 28460000:28570000 diagnostic_reads/SSFA/2R/Cyp6_region/~{sample_id}_CYP6_SSFA_output.csv 10 > diagnostic_reads/SSFA/2R/Cyp6_region/SSFAlogs/~{sample_id}_CYP6_SSFA_output.log 2>&1 + python /cnv/scripts/SSFA.py ~{input_bam} 3R 6900000:7000000 diagnostic_reads/SSFA/3R/Cyp6zm_region/~{sample_id}_CYP6ZM_SSFA_output.csv 10 >
diagnostic_reads/SSFA/3R/Cyp6zm_region/SSFAlogs/~{sample_id}_CYP6ZM_SSFA_output.log 2>&1 + python /cnv/scripts/SSFA.py ~{input_bam} 3R 28570000:28620000 diagnostic_reads/SSFA/3R/Gste_region/~{sample_id}_GST_SSFA_output.csv 10 > diagnostic_reads/SSFA/3R/Gste_region/SSFAlogs/~{sample_id}_GST_SSFA_output.log 2>&1 + python /cnv/scripts/SSFA.py ~{input_bam} X 15220000:15255000 diagnostic_reads/SSFA/X/Cyp9k1_region/~{sample_id}_CYP9K1_SSFA_output.csv 10 > diagnostic_reads/SSFA/X/Cyp9k1_region/SSFAlogs/~{sample_id}_CYP9K1_SSFA_output.log 2>&1 + + # Get the soft-clipped reads + # Runs breakpoint_detector.py for each focal region: the script goes through an alignment file and records the positions at which soft clipping is detected in the aligned reads + + python /cnv/scripts/breakpoint_detector.py ~{input_bam} 2R 3425000:3650000 diagnostic_reads/breakpoints/2R/Ace1_region/~{sample_id}_Ace1_breakpoints_output 10 > diagnostic_reads/breakpoints/2R/Ace1_region/breakpointlogs/~{sample_id}_Ace1_breakpoints_output.log 2>&1 + python /cnv/scripts/breakpoint_detector.py ~{input_bam} 2R 28460000:28570000 diagnostic_reads/breakpoints/2R/Cyp6_region/~{sample_id}_CYP6_breakpoints_output 10 > diagnostic_reads/breakpoints/2R/Cyp6_region/breakpointlogs/~{sample_id}_CYP6_breakpoints_output.log 2>&1 + python /cnv/scripts/breakpoint_detector.py ~{input_bam} 3R 6900000:7000000 diagnostic_reads/breakpoints/3R/Cyp6zm_region/~{sample_id}_CYP6ZM_breakpoints_output 10 > diagnostic_reads/breakpoints/3R/Cyp6zm_region/breakpointlogs/~{sample_id}_CYP6ZM_breakpoints_output.log 2>&1 + python /cnv/scripts/breakpoint_detector.py ~{input_bam} 3R 28570000:28620000 diagnostic_reads/breakpoints/3R/Gste_region/~{sample_id}_GST_breakpoints_output 10 > diagnostic_reads/breakpoints/3R/Gste_region/breakpointlogs/~{sample_id}_GST_breakpoints_output.log 2>&1 + python /cnv/scripts/breakpoint_detector.py ~{input_bam} X 15220000:15255000 diagnostic_reads/breakpoints/X/Cyp9k1_region/~{sample_id}_CYP9K1_breakpoints_output 10 > diagnostic_reads/breakpoints/X/Cyp9k1_region/breakpointlogs/~{sample_id}_CYP9K1_breakpoints_output.log 2>&1 + + ls -lht + tar -zcvf diagnostic_reads.tar.gz diagnostic_reads + >>> + runtime { + docker: docker + preemptible: preemptible_tries + cpu: num_cpu + memory: mem_gb + " GiB" + disks: "local-disk " + disk_gb + " HDD" + zones: runtime_zones + } + output { + File diagnostic_reads_tar = "diagnostic_reads.tar.gz" + } +} + +task TargetRegionsCNVCalling { + input { + String sample_id + File gene_coordinates_file # gene_coordinates_file: pipeline input + File sample_metadata # metadata: pipeline input + File species_id_file # species_id_file: pipeline input + File coverage_variance_file # coverage_variance_file: output from the CoverageSummary step in the HMM pipeline + File coverage_tar # coveragefolder: tarred coverage folder output from the CoverageHMM step in the HMM pipeline + File diagnostic_reads_tar # diagnostic_reads_folder: output of ExtractDiagnosticReads + File plotting_functions_file # plotting_functions_file + + String docker = "us.gcr.io/broad-gotc-prod/cnv/r:1.0.0-1692386236" + Int num_cpu = 8 + Int preemptible_tries + String runtime_zones = "us-central1-b" + Float mem_gb = 3.75 + Int disk_gb = 50 + } + + String coverage_dir = basename(coverage_tar, ".tar.gz") + String diagnostic_reads_dir = basename(diagnostic_reads_tar, ".tar.gz") + + command <<< + set -euo pipefail + + # get the manifest, metadata, and species_id files for just this sample + echo "~{sample_id}" > ~{sample_id}_manifest.tsv + + head -n 1 ~{sample_metadata} >
~{sample_id}_metadata.tsv + grep "~{sample_id}" ~{sample_metadata} >> ~{sample_id}_metadata.tsv + + head -n 1 ~{species_id_file} > ~{sample_id}_species_id.tsv + grep "~{sample_id}" ~{species_id_file} >> ~{sample_id}_species_id.tsv + + # unpack the coverage tarball (the directory name is derived above with basename) + tar -zxvf ~{coverage_tar} + + # unpack the diagnostic reads tarball + tar -zxvf ~{diagnostic_reads_tar} + + # create output directory + mkdir target_regions_analysis + + echo "Starting R script" + /opt/R/3.6.1/bin/R --slave -f /usr/local/Rscripts/target_regions_analysis.r --args ~{sample_id}_manifest.tsv \ + ~{gene_coordinates_file} \ + ~{sample_id}_metadata.tsv \ + ~{sample_id}_species_id.tsv \ + ~{coverage_variance_file} \ + ~{coverage_dir} \ + ~{diagnostic_reads_dir} \ + ~{plotting_functions_file} \ + ~{num_cpu} + >>> + + runtime { + docker: docker + preemptible: preemptible_tries + cpu: num_cpu + memory: mem_gb + " GiB" + disks: "local-disk " + disk_gb + " HDD" + zones: runtime_zones + } + + output { + File focal_region_CNV_table = "target_regions_analysis/focal_region_CNV_table.csv" + File HMM_gene_copy_number = "target_regions_analysis/HMM_gene_copy_number.csv" + } +}
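Reviewer note: the capped linear resource formulas in the CoverageCalls task scale with cohort size and are easy to mis-read in WDL's if/then/else form. A minimal Python sketch of the same arithmetic (illustrative only; the function name is invented, not part of the pipeline):

import math

def coverage_calls_resources(num_samples: int):
    # Mirror CoverageCalls sizing: ~0.2 GiB RAM and ~0.75 GiB disk per sample,
    # plus a base allowance, capped at 120 GiB RAM / 1200 GiB disk.
    memory_gb = min(math.ceil(num_samples * 0.2) + 3, 120)
    disk_size_gb = min(math.ceil(num_samples * 0.75) + 10, 1200)
    return memory_gb, disk_size_gb

# Example: a 500-sample cohort requests 103 GiB of memory and 385 GiB of disk.
print(coverage_calls_resources(500))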