From dba17876857cbc0e404b973137eb36f29798f4be Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sun, 17 Dec 2023 10:59:09 +0100 Subject: [PATCH] more unit tests and a bug fixed --- .Rbuildignore | 1 + NEWS.md | 7 ++ R/RcppExports.R | 3 +- codecov.yaml | 3 + man/vcfreader.Rd | 6 +- src/vcf-reader.cpp | 54 ++++++++++----- tests/testthat/test-modify-vcf.R | 42 ++++++++++++ tests/testthat/test-popgen.R | 10 ++- tests/testthat/test-vcf-reader.R | 114 +++++++++++++++++++++++++++++++ tests/testthat/test-vcf-table.R | 12 ++++ tests/testthat/test-vcf-writer.R | 2 + 11 files changed, 234 insertions(+), 20 deletions(-) create mode 100644 codecov.yaml diff --git a/.Rbuildignore b/.Rbuildignore index 6e0e30d..6bc35cb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,4 +11,5 @@ ^README.html .*\.tar\.gz$ ^.covrignore$ +^codecov.yaml$ ^\.github$ diff --git a/NEWS.md b/NEWS.md index a169626..b84076a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# vcfppR 0.3.6 + +* add `vcfreader::line` +* remove `vcfreader::setVariant` +* bug fix `setFormatStr` +* more units tests + # vcfppR 0.3.5 * add vcfreader and vcfwriter diff --git a/R/RcppExports.R b/R/RcppExports.R index 17a9679..a543017 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -61,8 +61,9 @@ heterozygosity <- function(vcffile, region = "", samples = "-", pass = FALSE, qu #' @field hasOTHER Test if current variant has a OTHER or not #' @field hasOVERLAP Test if current variant has a OVERLAP or not #' @field nsamples Return the number of samples -#' @field string Return the raw string of current variant #' @field header Return the raw string of the vcf header +#' @field string Return the raw string of current variant including newline +#' @field line Return the raw string of current variant without newline #' @field output Init an output object for streaming out the variants to another vcf #' @field write Streaming out current variant the output vcf #' @field close Close the connection to the output vcf diff --git a/codecov.yaml b/codecov.yaml new file mode 100644 index 0000000..e883d64 --- /dev/null +++ b/codecov.yaml @@ -0,0 +1,3 @@ +# sample regex patterns +ignore: + - "src/htslib-1.18" # ignore folders and all its contents diff --git a/man/vcfreader.Rd b/man/vcfreader.Rd index 74552f8..f927016 100644 --- a/man/vcfreader.Rd +++ b/man/vcfreader.Rd @@ -91,10 +91,12 @@ Type the name of the class to see the details and methods \item{\code{nsamples}}{Return the number of samples} -\item{\code{string}}{Return the raw string of current variant} - \item{\code{header}}{Return the raw string of the vcf header} +\item{\code{string}}{Return the raw string of current variant including newline} + +\item{\code{line}}{Return the raw string of current variant without newline} + \item{\code{output}}{Init an output object for streaming out the variants to another vcf} \item{\code{write}}{Streaming out current variant the output vcf} diff --git a/src/vcf-reader.cpp b/src/vcf-reader.cpp index 32565c0..ded1234 100644 --- a/src/vcf-reader.cpp +++ b/src/vcf-reader.cpp @@ -51,8 +51,9 @@ using namespace std; //' @field hasOTHER Test if current variant has a OTHER or not //' @field hasOVERLAP Test if current variant has a OVERLAP or not //' @field nsamples Return the number of samples -//' @field string Return the raw string of current variant //' @field header Return the raw string of the vcf header +//' @field string Return the raw string of current variant including newline +//' @field line Return the raw string of current variant without newline //' @field output Init an output object for streaming out the variants to another vcf //' @field write Streaming out current variant the output vcf //' @field close Close the connection to the output vcf @@ -243,13 +244,19 @@ class vcfreader { auto v = genotypes(false); return v.size() / nsamples(); } - inline std::string string() const { return var.asString(); } inline std::string header() const { return br.header.asString(); } + inline std::string string() const { return var.asString(); } + inline std::string line() { + std::string s = var.asString(); + s.pop_back(); + return s; + } // WRITE inline void output(const std::string& vcffile) { bw.open(vcffile); bw.initalHeader(br.header); + writable = true; } inline void write() { bw.writeRecord(var); } inline void close() { bw.close(); } @@ -258,22 +265,30 @@ class vcfreader { inline void setID(std::string s) { var.setID(s.c_str()); } inline void setPOS(int pos) { var.setPOS(pos); } inline void setRefAlt(const std::string& s) { var.setRefAlt(s.c_str()); } - inline void setInfoInt(std::string tag, int v) { var.setINFO(tag, v); } - inline void setInfoFloat(std::string tag, double v) { var.setINFO(tag, v); } - inline void setInfoStr(std::string tag, const std::string& s) { var.setINFO(tag, s); } - inline void setFormatInt(std::string tag, const vector& v) { var.setFORMAT(tag, v); } - inline void setFormatFloat(std::string tag, const vector& v) { + inline bool setInfoInt(std::string tag, int v) { return var.setINFO(tag, v); } + inline bool setInfoFloat(std::string tag, double v) { return var.setINFO(tag, v); } + inline bool setInfoStr(std::string tag, const std::string& s) { return var.setINFO(tag, s); } + inline bool setFormatInt(std::string tag, const vector& v) { + return var.setFORMAT(tag, v); + } + inline bool setFormatFloat(std::string tag, const vector& v) { vector f(v.begin(), v.end()); - var.setFORMAT(tag, f); + return var.setFORMAT(tag, f); + } + inline bool setFormatStr(std::string tag, const std::string& s) { + if (s.length() % nsamples() != 0) { + Rcpp::Rcout << "the length of s must be divisable by nsamples()"; + return false; + } + return var.setFORMAT(tag, s); } - inline void setFormatStr(std::string tag, const std::string& s) { var.setINFO(tag, s); } - inline void setGenotypes(const vector& v) { + inline bool setGenotypes(const vector& v) { if ((int)v.size() != nsamples() * ploidy()) { Rcpp::Rcout << "nsamples: " << nsamples() << ", ploidy: " << ploidy() << "\n"; - throw std::runtime_error( - "the size of genotype vector is not equal to nsamples * ploidy"); + Rcpp::Rcout << "the size of genotype vector is not equal to nsamples * ploidy\n"; + return false; } - var.setGenotypes(v); + return var.setGenotypes(v); } inline void setPhasing(const vector& v) { vector c(v.begin(), v.end()); @@ -281,17 +296,23 @@ class vcfreader { } inline void rmInfoTag(std::string s) { var.removeINFO(s); } - inline void setVariant(const std::string& s) { var.addLineFromString(s); } inline void addINFO(const std::string& id, const std::string& number, const std::string& type, const std::string& desc) { - bw.header.addINFO(id, number, type, desc); + if (writable) + bw.header.addINFO(id, number, type, desc); + else + Rcpp::Rcout << "please call the `output(filename)` function first\n"; } inline void addFORMAT(const std::string& id, const std::string& number, const std::string& type, const std::string& desc) { - bw.header.addFORMAT(id, number, type, desc); + if (writable) + bw.header.addFORMAT(id, number, type, desc); + else + Rcpp::Rcout << "please call the `output(filename)` function first\n"; } private: + bool writable = false; vcfpp::BcfReader br; vcfpp::BcfRecord var; vcfpp::BcfWriter bw; @@ -344,6 +365,7 @@ RCPP_MODULE(vcfreader) { .method("nsamples", &vcfreader::nsamples) .method("header", &vcfreader::header) .method("string", &vcfreader::string) + .method("line", &vcfreader::line) .method("output", &vcfreader::output) .method("write", &vcfreader::write) .method("close", &vcfreader::close) diff --git a/tests/testthat/test-modify-vcf.R b/tests/testthat/test-modify-vcf.R index e25ba76..bd39f61 100644 --- a/tests/testthat/test-modify-vcf.R +++ b/tests/testthat/test-modify-vcf.R @@ -24,3 +24,45 @@ test_that("modify the genotypes", { expect_false(isTRUE(all.equal(g0, g2))) expect_identical(g1, g2) }) + +test_that("modify item in FORMAT", { + ## creat a VCF with GP in FORMAT + outvcf <- paste0(tempfile(), ".vcf.gz") + bw <- vcfwriter$new(outvcf, "VCF4.3") + bw$addContig("chr20") + bw$addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); + bw$addFORMAT("GP", "3", "Float", "Posterior genotype probability of 0/0, 0/1, and 1/1"); + bw$addSample("NA12878") + bw$addSample("NA12879") + s1 <- "chr20\t2006060\trs146931526\tG\tC\t100\tPASS\tAF=0.000998403\tGP\t0.966,0.034,0\t0.003,0.872,0.125" + bw$writeline(s1) + bw$close() + ## tests + br <- vcfreader$new(outvcf) + expect_true(br$variant()) ## get a variant record + br$string() + gp <- br$formatFloat("GP") + gp <- array(gp, c(3, br$nsamples())) + ds <- gp[2,] + gp[3,] * 2 + ## will prompt uses a message if `output` is not called + br$addFORMAT("DS", "1", "Float", "Diploid dosage") + br$addINFO("INFO", "1", "Float", "INFO score of imputation") + ## now open another file for output + newvcf <- paste0(tempfile(), ".vcf.gz") + br$output(newvcf) + ## add INFO, DS in header first + br$addINFO("INFO", "1", "Float", "INFO score of imputation") + br$addFORMAT("DS", "1", "Float", "Diploid dosage") + br$addFORMAT("AC", "1", "Integer", "Allele counts") + br$addFORMAT("STR", "1", "String", "Test String type") + print(br$header()) + ## set DS in FORMAT now + br$setFormatFloat("DS", ds) + ## test if DS presents + expect_identical(br$formatFloat("DS"), ds) + ## more tests + br$setFormatInt("AC", c(3L, 4L)) + expect_false(br$setFormatStr("STR","HHH,JJJ")) ## length(s) %% nsamples != 0 + expect_true(br$setFormatStr("STR","HHHJJJ")) ## length(s) %% nsamples == 0 + print(br$string()) +}) diff --git a/tests/testthat/test-popgen.R b/tests/testthat/test-popgen.R index 51d7117..5209ba1 100644 --- a/tests/testthat/test-popgen.R +++ b/tests/testthat/test-popgen.R @@ -2,7 +2,7 @@ vcffile <- system.file("extdata", "raw.gt.vcf.gz", package="vcfppR") test_that("both vcftable and vcfreader work for calculating heterzygosity", { vcf <- vcftable(vcffile, vartype = "snps") - res1 <- colSums(vcf[["gt"]]==1, na.rm = TRUE) + res1 <- as.integer(colSums(vcf[["gt"]]==1, na.rm = TRUE)) br <- vcfreader$new(vcffile) res2 <- rep(0L, br$nsamples()) while(br$variant()) { @@ -14,3 +14,11 @@ test_that("both vcftable and vcfreader work for calculating heterzygosity", { } expect_identical(as.integer(res1), res2) }) + +test_that("heterzygosity only counts snps", { + vcf <- vcftable(vcffile, vartype = "snps", pass = TRUE) + res1 <- as.integer(colSums(vcf[["gt"]]==1, na.rm = TRUE)) + vcf <- vcfpopgen(vcffile, pass = TRUE, fun = "heterozygosity") + res2 <- vcf$hets + expect_identical(res1, res2) +}) diff --git a/tests/testthat/test-vcf-reader.R b/tests/testthat/test-vcf-reader.R index fd1b4c1..20eb365 100644 --- a/tests/testthat/test-vcf-reader.R +++ b/tests/testthat/test-vcf-reader.R @@ -1,6 +1,117 @@ library(testthat) vcffile <- system.file("extdata", "raw.gt.vcf.gz", package="vcfppR") +svfile <- system.file("extdata", "sv.vcf.gz", package="vcfppR") + +test_that("vcfreader: constructor with vcfffile only", { + br <- vcfreader$new(vcffile) + br$variant() + expect_identical(br$chr(), "chr21") + expect_identical(br$pos(), 5030082L) + expect_identical(br$id(), "chr21:5030082:G:A") + expect_identical(br$ref(), "G") + expect_identical(br$alt(), "A") +}) + +test_that("vcfreader: constructor with vcfffile and region", { + br <- vcfreader$new(vcffile, "chr21:5030082-") + br$variant() + expect_identical(br$chr(), "chr21") + expect_identical(br$pos(), 5030082L) + expect_identical(br$id(), "chr21:5030082:G:A") + expect_identical(br$ref(), "G") + expect_identical(br$alt(), "A") +}) + +test_that("vcfreader: constructor with vcfffile, region and samples", { + br <- vcfreader$new(vcffile, "chr21:5030082-", "HG00097,HG00099") + br$variant() + expect_identical(br$genotypes(F), rep(0L, 4)) + expect_identical(br$chr(), "chr21") + expect_identical(br$pos(), 5030082L) + expect_identical(br$id(), "chr21:5030082:G:A") + expect_identical(br$ref(), "G") + expect_identical(br$alt(), "A") +}) + +test_that("vcfreader: get FORMAT item with Float type", { + br <- vcfreader$new(vcffile, "chr21:5030347-", "HG00097,HG00099") + br$variant() + expect_identical(br$pos(), 5030347L) + expect_identical(all(is.na(br$formatFloat("AB"))), TRUE) +}) + +test_that("vcfreader: get FORMAT item with Integer type", { + br <- vcfreader$new(vcffile, "chr21:5030347-", "HG00097,HG00099") + br$variant() + expect_identical(br$pos(), 5030347L) + expect_identical(br$formatInt("AD"), c(4L, 0L, 16L, 0L)) +}) + +test_that("vcfreader: get FORMAT item with String type", { + br <- vcfreader$new(svfile, "chr21:5114000-", "HG00096,HG00097") + expect_true(br$variant()) + expect_identical(br$pos(), 5114000L) + expect_identical(br$formatStr("EV"), c("RD","RD")) +}) + +test_that("vcfreader: get FORMAT item for the unexpected and error", { + br <- vcfreader$new(vcffile, "chr21:5030347-", "HG00097,HG00099") + br$variant() + expect_identical(br$pos(), 5030347L) + expect_error(br$formatInt("AA")) ## AA not exists + expect_identical(br$formatInt("AD"), c(4L, 0L, 16L, 0L)) + ## use formatFloat will return unexpected values if AD in FORMAT if Integer + expect_error(expect_identical(br$formatFloat("AD"), c(4L, 0L, 16L, 0L))) ## AA exists but of Integer type +}) + +test_that("vcfreader: get variants type", { + br <- vcfreader$new(vcffile) + i <- 0 + s <- 0 + m <- 0 + ms <- 0 + while(br$variant()) { + if(br$isIndel()) i <- i + 1 + if(br$isSV()) s <- s + 1 + if(br$isMultiAllelics()) m <- m + 1 + if(br$isMultiAllelicSNP()) ms <- ms + 1 + } + expect_identical(i, 6) + expect_identical(s, 0) + expect_identical(m, 4) + expect_identical(ms, 3) +}) + +test_that("vcfreader: test variants type", { + br <- vcfreader$new(vcffile) + i1 <- 0 + i2 <- 0 + i3 <- 0 + i4 <- 0 + i5 <- 0 + i6 <- 0 + i7 <- 0 + i8 <- 0 + while(br$variant()) { + if(br$hasSNP()) i1 <- i1 + 1 + if(br$hasINDEL()) i2 <- i2 + 1 + if(br$hasINS()) i3 <- i3 + 1 + if(br$hasDEL()) i4 <- i4 + 1 + if(br$hasMNP()) i5 <- i5 + 1 + if(br$hasBND()) i6 <- i6 + 1 + if(br$hasOTHER()) i7 <- i7 + 1 + if(br$hasOVERLAP()) i8 <- i8 + 1 + } + expect_identical(i1, 66) + expect_identical(i2, 6) + expect_identical(i3, 3) + expect_identical(i4, 4) + expect_identical(i5, 0) + expect_identical(i6, 0) + expect_identical(i7, 0) + expect_identical(i8, 1) +}) test_that("vcfreader: reading variant only", { br <- vcfreader$new(vcffile) @@ -13,6 +124,9 @@ test_that("vcfreader: reading variant only", { expect_equal(br$qual(), 70.06, tolerance = 1e-6) expect_identical(br$filter(), "VQSRTrancheSNP99.80to100.00") expect_identical(br$info(), "AC=2;AF=0.000616523;AN=3244;DP=2498;FS=0;MLEAC=1;MLEAF=0.0003083;MQ=17.07;MQ0=0;QD=17.52;SOR=3.258;VQSLOD=-32.6;culprit=MQ;AN_EUR=658;AN_EAS=548;AN_AMR=520;AN_SAS=624;AN_AFR=894;AF_EUR=0;AF_EAS=0;AF_AMR=0.00384615;AF_SAS=0;AF_AFR=0;AC_EUR=0;AC_EAS=0;AC_AMR=2;AC_SAS=0;AC_AFR=0;AC_Het_EUR=0;AC_Het_EAS=0;AC_Het_AMR=0;AC_Het_SAS=0;AC_Het_AFR=0;AC_Het=0;AC_Hom_EUR=0;AC_Hom_EAS=0;AC_Hom_AMR=2;AC_Hom_SAS=0;AC_Hom_AFR=0;AC_Hom=2;HWE_EUR=1;ExcHet_EUR=1;HWE_EAS=1;ExcHet_EAS=1;HWE_AMR=0.00192678;ExcHet_AMR=1;HWE_SAS=1;ExcHet_SAS=1;HWE_AFR=1;ExcHet_AFR=1;HWE=0.000308356;ExcHet=1;ME=0;AN_EUR_unrel=508;AN_EAS_unrel=448;AN_AMR_unrel=330;AN_SAS_unrel=484;AN_AFR_unrel=666;AF_EUR_unrel=0;AF_EAS_unrel=0;AF_AMR_unrel=0;AF_SAS_unrel=0;AF_AFR_unrel=0;AC_EUR_unrel=0;AC_EAS_unrel=0;AC_AMR_unrel=0;AC_SAS_unrel=0;AC_AFR_unrel=0;AC_Het_EUR_unrel=0;AC_Het_EAS_unrel=0;AC_Het_AMR_unrel=0;AC_Het_SAS_unrel=0;AC_Het_AFR_unrel=0;AC_Hom_EUR_unrel=0;AC_Hom_EAS_unrel=0;AC_Hom_AMR_unrel=0;AC_Hom_SAS_unrel=0;AC_Hom_AFR_unrel=0") + br$rmInfoTag("AC") + print(br$info()) + print(br$infoInt("AC")) ## TODO: AC value still exists af <- br$infoFloat("AF") ## expect_equal(0.00061652297, 0.00061652300, tolerance = 1e-6) expect_equal(af, 0.000616523, tolerance = 1e-6) diff --git a/tests/testthat/test-vcf-table.R b/tests/testthat/test-vcf-table.R index 6414e17..87e5a1b 100644 --- a/tests/testthat/test-vcf-table.R +++ b/tests/testthat/test-vcf-table.R @@ -26,6 +26,18 @@ test_that("extract GT for variant with ID=chr21:5030516:G:A", { expect_equal(res$alt, "A") }) +test_that("extract GT for all multisnps", { + res <- vcftable(vcffile, vartype = "multisnps") + expect_identical(res$ref, c("C", "C", "G")) + expect_identical(res$alt, c("G,T", "T,G", "T,*")) +}) + +test_that("extract GT for all multiallelics", { + res <- vcftable(vcffile, vartype = "multiallelics") + expect_identical(res$ref, c("C", "C", "CTTTTTT","G")) + expect_identical(res$alt, c("G,T", "T,G", "CTTTTT,CTTTTTTT,C", "T,*")) +}) + test_that("extract AD (Integer, number=.) for variant with ID=chr21:5030247:G:A", { res <- vcftable(vcffile, id = c("chr21:5030347:G:A"), pass = TRUE, format = "AD") expect_equal(nrow(res$AD), 1) diff --git a/tests/testthat/test-vcf-writer.R b/tests/testthat/test-vcf-writer.R index 242ec61..f87c43c 100644 --- a/tests/testthat/test-vcf-writer.R +++ b/tests/testthat/test-vcf-writer.R @@ -19,6 +19,8 @@ test_that("vcfwriter: writing variant works", { s2 <- gsub("\n", "", br$string()) expect_identical(br$chr(), "chr20") expect_identical(s1, s2) + s2 <- br$line() + expect_identical(s1, s2) })