Merge pull request #5 from PolyOmica/VariousFixes

Various fixes, including fix for Issue #4
GreenwoodLab · Feb 11, 2016 · 820844a · 820844a
2 parents 1335100 + 73248c6
commit 820844a
Show file tree

Hide file tree

Showing 25 changed files with 238 additions and 110 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -13,21 +13,34 @@ Authors@R: c(person("Karim", "Oualkacha", email =
              "[email protected]", role = "aut")
              )
 Author: Karim Oualkacha [aut, cre],
-  M'Hamed Lajmi Lakhal-Chaieb [aut],
-  Celia M.T. Greenwood [aut],
-  Lennart C. Karssen [aut],
-  Sodbo Sharapov [aut]
+    M'Hamed Lajmi Lakhal-Chaieb [aut],
+    Celia M.T. Greenwood [aut],
+    Lennart C. Karssen [aut],
+    Sodbo Sharapov [aut]
 Maintainer: Karim Oualkacha <[email protected]>
 Description: This is a collection of the five region-based
-  rare-variant genetic association tests. The following tests are
-  currently implemented: ASKAT, ASKAT-Normalized, VC-C1, VC-C2 and
-  VC-C3.
-Depends: R (>= 3.2.2), foreach, doParallel
-Imports: ks, CompQuadForm, Matrix, snpStats, kinship2
-Suggests: GenABEL, knitr, methods
+    rare-variant genetic association tests. The following tests are
+    currently implemented: ASKAT, ASKAT-Normalized, VC-C1, VC-C2 and
+    VC-C3.
+Depends:
+    R (>= 3.2.2),
+    foreach,
+    doParallel
+Imports:
+    ks,
+    CompQuadForm,
+    data.table,
+    Matrix,
+    snpStats,
+    kinship2
+Suggests:
+    GenABEL,
+    knitr,
+    methods
 License: GPL (>=3)
 Copyright: Celia M.T. Greenwood
 LazyData: true
 VignetteBuilder: knitr
 NeedsCompilation: no
 Packaged: 2015-12-04 01:53:06 UTC; karimoualkacha
+RoxygenNote: 5.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -17,6 +17,7 @@ import(foreach)
 import(snpStats)
 importFrom(CompQuadForm,davies)
 importFrom(Matrix,nearPD)
+importFrom(data.table,fread)
 importFrom(kinship2,kindepth)
 importFrom(ks,hns)
 importFrom(ks,kdde)
diff --git a/R/ASKAT.region.R b/R/ASKAT.region.R
@@ -72,7 +72,7 @@ ASKAT.region <- function(y=NULL,
                           startpos=startpos,
                           endpos=endpos)
 
-    if (ncol(haplotypes) == 0) {
+    if (is.null(haplotypes) || ncol(haplotypes) == 0) {
         warning("No genotypes available in the region from ",
                       startpos, " to ", endpos, " on chromosome ", chr)
         result.df <- data.frame(Score.Test=NA,
@@ -101,7 +101,7 @@ ASKAT.region <- function(y=NULL,
 
     result.df <- data.frame(Score.Test=pval$score,
                             P.value=pval$p.value,
-                            N.Markers=ncol(haplotypes))
+                            N.Markers=ncol(G))
 
     if (!is.null(regionname)) {
         rownames(result.df) <- regionname

diff --git a/R/NormalizedASKAT.region.R b/R/NormalizedASKAT.region.R
@@ -1,5 +1,7 @@
-##' Runs the normalized ASKAT method on a given genomic region. Rank-based normalization is applied
-##' to the phenotype residauls under the null model, after adjusting for covariate effects 
+##' Runs the normalized ASKAT method on a given genomic region.
+##'
+##' Rank-based normalization is applied to the phenotype residuals
+##' under the null model, after adjusting for covariate effects.
 ##'
 ##' @title Run the normalized ASKAT method on a genomic region defined
 ##'     by a start and a stop base pair coordinate
@@ -17,7 +19,8 @@
 ##'     \item \code{regionname}: Name of the region/gene on which you
 ##'     are running the association test
 ##'     }
-##' @author Lennart C. Karssen, Sodbo Sharapov
+##' @author Lennart C. Karssen
+##' @author Sodbo Sharapov
 ##' @export
 NormalizedASKAT.region <- function(y=NULL,
                                    X=NULL,
@@ -56,7 +59,7 @@ NormalizedASKAT.region <- function(y=NULL,
                           startpos=startpos,
                           endpos=endpos)
 
-    if (ncol(haplotypes) == 0) {
+    if (is.null(haplotypes) || ncol(haplotypes) == 0) {
         warning("No genotypes available in the region from ",
                       startpos, " to ", endpos, " on chromosome ", chr)
         result.df <- data.frame(Score.Test=NA,
@@ -85,7 +88,7 @@ NormalizedASKAT.region <- function(y=NULL,
 
     result.df <- data.frame(Score.Test=pval$score,
                             P.value=pval$p.value,
-                            N.Markers=ncol(haplotypes))
+                            N.Markers=ncol(G))
 
     if (!is.null(regionname)) {
         rownames(result.df) <- regionname

diff --git a/R/VCC1.region.R b/R/VCC1.region.R
@@ -16,7 +16,8 @@
 ##'     \item \code{regionname}: Name of the region/gene on which you
 ##'     are running the association test
 ##'     }
-##' @author Sodbo Sharapov, Lennart C. Karssen
+##' @author Sodbo Sharapov
+##' @author Lennart C. Karssen
 ##' @export
 VCC1.region <- function(y=NULL,
                         X=NULL,
@@ -57,7 +58,7 @@ VCC1.region <- function(y=NULL,
                           startpos=startpos,
                           endpos=endpos)
 
-    if (ncol(haplotypes) == 0) {
+    if (is.null(haplotypes) || ncol(haplotypes) == 0) {
         warning("No genotypes available in the region from ",
                       startpos, " to ", endpos, " on chromosome ", chr)
         result.df <- data.frame(Score.Test=NA,
@@ -85,7 +86,7 @@ VCC1.region <- function(y=NULL,
 
     result.df <- data.frame(Score.Test=pval$score,
                             P.value=pval$p.value,
-                            N.Markers=ncol(haplotypes))
+                            N.Markers=ncol(G))
 
     if (!is.null(regionname)) {
         rownames(result.df) <- regionname

diff --git a/R/VCC2.region.R b/R/VCC2.region.R
@@ -1,7 +1,7 @@
 ##' Runs the VC-C2 method on a given genomic region
 ##'
 ##' @title Run the VC-C2 method on a genomic region defined by a start
-##' and a stop base pair coordinate
+##'     and a stop base pair coordinate
 ##' @inheritParams read.haplo
 ##' @inheritParams VCC1.region
 ##' @param Nperm Integer, number of permutations to use for empirical
@@ -60,7 +60,7 @@ VCC2.region <- function(y=NULL,
                           startpos=startpos,
                           endpos=endpos)
 
-    if (ncol(haplotypes) == 0) {
+    if (is.null(haplotypes) || ncol(haplotypes) == 0) {
         warning("No genotypes available in the region from ",
                       startpos, " to ", endpos, " on chromosome ", chr)
         result.df <- data.frame(Score.Test=NA,
@@ -113,7 +113,7 @@ VCC2.region <- function(y=NULL,
 
     result.df <- data.frame(Score.Test=pval$score,
                             P.value=pval$p.value,
-                            N.Markers=ncol(haplotypes))
+                            N.Markers=ncol(G))
 
     if (!is.null(regionname)) {
         rownames(result.df) <- regionname

diff --git a/R/VCC3.region.R b/R/VCC3.region.R
@@ -1,7 +1,7 @@
 ##' Runs the VC-C3 method on a given genomic region
 ##'
 ##' @title Run the VC-C3 method on a genomic region defined by a start
-##' and a stop base pair coordinate
+##'     and a stop base pair coordinate
 ##' @inheritParams VCC2.region
 ##' @return A data frame containing the results of the association
 ##'     test. The data frame contains the following columns:
@@ -53,7 +53,7 @@ VCC3.region <- function(y=NULL,
                           startpos=startpos,
                           endpos=endpos)
 
-    if (ncol(haplotypes) == 0) {
+    if (is.null(haplotypes) || ncol(haplotypes) == 0) {
         warning("No genotypes available in the region from ",
                       startpos, " to ", endpos, " on chromosome ", chr)
         result.df <- data.frame(Score.Test=NA,
@@ -106,7 +106,7 @@ VCC3.region <- function(y=NULL,
 
     result.df <- data.frame(Score.Test=pval$score,
                             P.value=pval$p.value.VCC3,
-                            N.Markers=ncol(haplotypes))
+                            N.Markers=ncol(G))
 
     if (!is.null(regionname)) {
         rownames(result.df) <- regionname

diff --git a/R/ff.hap.R b/R/ff.hap.R
@@ -0,0 +1,8 @@
+ff.hap <- function(k){
+    if (k==1) {
+        x <- 6 + c(1, 2)
+    } else {
+        x <- 6 + c((2*k - 1), (2*k))
+    }
+    return(x)
+}
diff --git a/R/lettersTo12.R b/R/lettersTo12.R
@@ -0,0 +1,24 @@
+lettersTo12 <- function(x){
+    x1 <- c(x[, 1], x[, 2])
+    Letters.SNP <- unique(x1)
+
+    if (length(Letters.SNP) == 1) {
+        x[, 1] <- x[, 2] <- 2
+    }
+
+    if (length(Letters.SNP) > 1) {
+        if ( ( length(which(x1 == Letters.SNP[1])) / length(x1) ) > 0.5 ) {
+            x[which(x[, 1] == Letters.SNP[1]), 1] <- 2
+            x[which(x[, 2] == Letters.SNP[1]), 2] <- 2
+            x[which(x[, 1] == Letters.SNP[2]), 1] <- 1
+            x[which(x[, 2] == Letters.SNP[2]), 2] <- 1
+        } else {
+            x[which(x[, 1] == Letters.SNP[2]), 1] <- 2
+            x[which(x[, 2] == Letters.SNP[2]), 2] <- 2
+            x[which(x[, 1] == Letters.SNP[1]), 1] <- 1
+            x[which(x[, 2] == Letters.SNP[1]), 2] <- 1
+        }
+    }
+
+    return(x)
+}
diff --git a/R/pvalue.ASKAT.R b/R/pvalue.ASKAT.R
@@ -1,8 +1,8 @@
 ##' Compute p-value and score for the ASKAT method
 ##'
 ##' @param RH0 a vector of length 2 which the results (output) of the
-##' \code{\link{Estim.H0.ASKAT}} function (i.e. variance components
-##' estimates under the null model)
+##'     \code{\link{Estim.H0.ASKAT}} function (i.e. variance
+##'     components estimates under the null model)
 ##' @inheritParams RVPedigree
 ##' @param G matrix of genotypes
 ##' @return A list with score and p-value for the ASKAT test on the

diff --git a/R/read.haplo.R b/R/read.haplo.R
@@ -2,24 +2,26 @@
 #' files or ShapeIt output files
 #'
 #' @param type character, \code{'ped'}, \code{'bed'} (default) or
-#' \code{'shapeit-haps'} format of input file containing haplotype
-#' data
-#' @param filename character, path to input file containing haplotype data
+#'     \code{'shapeit-haps'} format of input file containing haplotype
+#'     data
+#' @param filename character, path to input file containing haplotype
+#'     data
 #' @param map object, data.frame contains 3 columns: rsID, chromosome,
-#' position in bp as output by e.g. \code{\link{readMapFile}}.
-#' @param chr character, chromosome number (basically from 1 to 22 as used by
-#' \href{http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped}{Plink}),
-#' on which the region of interest is located
+#'     position in bp as output by e.g. \code{\link{readMapFile}}.
+#' @param chr character, chromosome number (basically from 1 to 22 as
+#'     used by
+#'     \href{http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped}{Plink}),
+#'     on which the region of interest is located
 #' @param startpos numeric, start position (in bp, base pairs) of the
 #'     region of interest (default: 0)
 #' @param endpos numeric, end position (in bp, base pairs) of the
 #'     region of interest (default: 0)
 #' @return matrix object containing the haplotypes selected by the
-#' region of interest
+#'     region of interest
 #' @seealso \code{\link{read.haplo.pedfile}},
-#' \code{\link{read.haplo.bedfile}},
-#' \code{\link{read.haplo.shapeit_haps}},
-#' \code{\link{readMapFile}}
+#'     \code{\link{read.haplo.bedfile}},
+#'     \code{\link{read.haplo.shapeit_haps}},
+#'     \code{\link{readMapFile}}
 #' @keywords internal
 read.haplo <- function(type="bed",
                        filename,

diff --git a/R/read.haplo.bedfile.R b/R/read.haplo.bedfile.R
@@ -2,15 +2,17 @@
 #' (regular PLINK binary files)
 #'
 #' @param filename character, path to BED file containing haplotype
-#' data. Attention! filename should contain path to \code{<file>.bed}
-#' with full file name. For example \code{../mydata/inputplink.bed}.
+#'     data. Attention! filename should contain path to
+#'     \code{<file>.bed} with full file name. For example
+#'     \code{../mydata/inputplink.bed}.
 #' @inheritParams read.haplo
 #' @return matrix object containing the haplotypes selected by the
-#' region of interest
+#'     region of interest, or \code{NULL} if there are no variants in
+#'     the region
 #' @seealso \code{\link{read.haplo}},
-#' \code{\link{read.haplo.pedfile}},
-#' \code{\link{read.haplo.shapeit_haps}},
-#' \code{\link{readMapFile}}
+#'     \code{\link{read.haplo.pedfile}},
+#'     \code{\link{read.haplo.shapeit_haps}},
+#'     \code{\link{readMapFile}}
 #' @import snpStats
 #' @keywords internal
 read.haplo.bedfile <- function(filename = "NULL",
@@ -24,6 +26,13 @@ read.haplo.bedfile <- function(filename = "NULL",
                                   map[, 3] < endpos),
                     2]
 
+    if ( length(snps2out) < 1 ) {
+        warning("No genotypes available in the region from ",
+                startpos, " to ", endpos, " on chromosome ",
+                chr, " (filename ", filename, ")")
+        return(NULL)
+    }
+
     basefile <- sub(".bed$", "", filename)
     plink.input <- snpStats::read.plink(bed = paste0(basefile, ".bed"),
                                         bim = paste0(basefile, ".bim"),

diff --git a/R/read.haplo.pedfile.R b/R/read.haplo.pedfile.R
@@ -2,36 +2,65 @@
 #' (regular PLINK untransposed text files)
 #'
 #' @param filename character, path to PED file containing haplotype
-#' data. Attention! filename should contain path to \code{<file>.ped}
-#' with full file name. For example \code{../mydata/inputplink.ped}
+#'     data. Attention! filename should contain path to
+#'     \code{<file>.ped} with full file name. For example
+#'     \code{../mydata/inputplink.ped}
+#' @param recode character, designates if data are in 1/2 format or in
+#'     letters (A, C, G, T) format. The defaut is 1/2 where 1
+#'     designating the minor allele
 #' @inheritParams read.haplo
 #' @return matrix object containing the haplotypes selected by the
-#' region of interest
+#'     region of interest, or \code{NULL} if there are no variants in
+#'     the region
 #' @seealso \code{\link{read.haplo}},
-#' \code{\link{read.haplo.bedfile}},
-#' \code{\link{read.haplo.shapeit_haps}},
-#' \code{\link{readMapFile}}
-#' @import snpStats
+#'     \code{\link{read.haplo.bedfile}},
+#'     \code{\link{read.haplo.shapeit_haps}},
+#'     \code{\link{readMapFile}}
+#' @author Lennart C. Karssen
+#' @author Sodbo Sharapov
+#' @author Karim Oualkacha
+#' @importFrom data.table fread
 #' @keywords internal
 read.haplo.pedfile <- function(filename = "NULL",
                                map,
                                chr = "NULL",
                                startpos = "NULL",
-                               endpos = "NULL"){
-    # TODO: probably we want to automatically append ".ped" to filename
-    # TODO: add extensions
-    snps2out <- map[which(map[, 1] == chr &
-                              map[, 3] > startpos &
-                                  map[, 3] < endpos),
-                    2]
+                               endpos = "NULL",
+                               recode = "recodeLetters"){
 
-    plink.input <- snpStats::read.pedfile(file = filename,
-                                          snps = snps2out)
+    snps2out <- as.numeric(rownames(map)[which(map[, 1] == chr &
+                                               map[, 3] > startpos &
+                                               map[, 3] < endpos)])
 
-    GenotypeMatrix <- methods::as(plink.input$genotypes, "numeric")
-    # TODO: added 'rownames(GenotypeMatrix) <- idNames' may required
-    return(GenotypeMatrix)
-}
+    if ( length(snps2out) < 1 ) {
+        warning("No genotypes available in the region from ",
+                startpos, " to ", endpos, " on chromosome ",
+                chr, " (filename ", filename, ")")
+        return(NULL)
+    }
+
+    cols2out = as.vector(sapply(snps2out, ff.hap))
 
-# tmp = read.haplo.pedfile(filename = "data.ped",map = tmp1,chr = 1,
-#	startpos = 9000, endpos = 25000)
+    switch(recode,
+           recode12={
+               haplotypes = as.data.frame(fread(filename,
+                                                select = cols2out,
+                                                colClasses = 'character'))
+           },
+           recodeLetters={
+               haplotypes = as.data.frame(fread(filename,
+                                                select = cols2out,
+                                                colClasses = 'character'))
+               list.haplo = lapply(seq_len(ncol(haplotypes)/2), function(i){
+                   haplotypes[, c(((2*i) - 1):(2*i))]
+               }
+               )
+               haplotypes = lapply(list.haplo, lettersTo12)
+               haplotypes = matrix(as.numeric(unlist(haplotypes)),
+                                   nrow = dim(haplotypes[[1]])[1],
+                                   byrow = FALSE)
+           }
+           )
+
+    return(haplotypes)
+}