From a167537c4853a162fbf8d1812eb499f40ec45149 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 8 Jan 2024 20:13:44 +0100 Subject: [PATCH] add vcfwriter$copyHeader and update readme --- R/RcppExports.R | 2 ++ README.Rmd | 13 ++++++------- README.md | 15 +++++++-------- man/vcfwriter.Rd | 3 +++ src/vcf-writer.cpp | 15 ++++++++++++--- tests/testthat/test-modify-vcf.R | 6 ++++++ tests/testthat/test-vcf-writer.R | 16 ++++++++++++++-- 7 files changed, 50 insertions(+), 20 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 5f3baa7..49a5690 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -177,6 +177,8 @@ tableFormat <- function(vcffile, region, samples, format, ids, qualval, pass, IN #' \itemize{ \item Parameter: str - A string for a SAMPLE name } #' @field addLine Add a line in the header of the vcf #' \itemize{ \item Parameter: str - A string for a line in the header of VCF } +#' @field copyHeader Copy header of given VCF into the output VCF +#' \itemize{ \item Parameter: vcffile - A string of the VCF path} #' @field writeline Write a variant record given a line #' \itemize{ \item Parameter: line - A string for a line in the variant of VCF. Not ended with "newline" } #' @field close Close and save the vcf file diff --git a/README.Rmd b/README.Rmd index c08f730..d95600c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -45,7 +45,7 @@ In addtion to two classes for R-bindings of vcfpp.h, there are some useful funct ## Read VCF as tabular data in R -This example shows how to read only SNP variants with genotypes in the VCF/BCF into R tables: +This example shows how to read only SNP variants with genotypes in the VCF/BCF into R list: ```r library(vcfppR) vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz" @@ -53,7 +53,7 @@ res <- vcftable(vcffile, "chr21:1-5100000", vartype = "snps") str(res) ``` -This example shows how to read only SNP variants with PL format and drop the INFO column in the VCF/BCF into R tables: +This example shows how to read only SNP variants with PL format into R list and drop the INFO column in the VCF/BCF: ```r vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" @@ -61,7 +61,7 @@ res <- vcftable(vcffile, "chr21:1-5100000", vartype = "snps", format = "PL", inf str(res) ``` -This example shows how to read only indels variants with DP format in the VCF/BCF into R tables: +This example shows how to read only indels variants with DP format in the VCF/BCF into R list: ```r vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" @@ -75,7 +75,7 @@ This example shows how to summarize small variants discovered by GATK. The data ```r vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" -region <- "chr21:10000000-10500000" +region <- "chr21:10000000-10010000" res <- vcfsummary(vcffile, region) str(res) # get labels and do plottiing @@ -94,10 +94,9 @@ This example shows how to summarize complex structure variants discovered by GAT ```r svfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz" -sv <- vcfsummary(svfile, svtype = TRUE) +sv <- vcfsummary(svfile, svtype = TRUE, region = "chr20") str(sv) allsvs <- sv$summary[-1] bar <- barplot(allsvs, ylim = c(0, 1.1*max(allsvs)), - main = "Variant Count (all SVs)") -text(bar, allsvs+4500, paste("n: ", allsvs, sep="")) + main = "Variant Counts on chr20 (all SVs)") ``` diff --git a/README.md b/README.md index 3e738dd..34c4e1b 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ depend on your connection to the servers. ## Read VCF as tabular data in R This example shows how to read only SNP variants with genotypes in the -VCF/BCF into R tables: +VCF/BCF into R list: ``` r library(vcfppR) @@ -54,8 +54,8 @@ res <- vcftable(vcffile, "chr21:1-5100000", vartype = "snps") str(res) ``` -This example shows how to read only SNP variants with PL format and drop -the INFO column in the VCF/BCF into R tables: +This example shows how to read only SNP variants with PL format into R +list and drop the INFO column in the VCF/BCF: ``` r vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" @@ -64,7 +64,7 @@ str(res) ``` This example shows how to read only indels variants with DP format in -the VCF/BCF into R tables: +the VCF/BCF into R list: ``` r vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" @@ -79,7 +79,7 @@ The data is from the 1000 Genome Project. ``` r vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" -region <- "chr21:10000000-10500000" +region <- "chr21:10000000-10010000" res <- vcfsummary(vcffile, region) str(res) # get labels and do plottiing @@ -99,10 +99,9 @@ discovered by GATK-SV. ``` r svfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz" -sv <- vcfsummary(svfile, svtype = TRUE) +sv <- vcfsummary(svfile, svtype = TRUE, region = "chr20") str(sv) allsvs <- sv$summary[-1] bar <- barplot(allsvs, ylim = c(0, 1.1*max(allsvs)), - main = "Variant Count (all SVs)") -text(bar, allsvs+4500, paste("n: ", allsvs, sep="")) + main = "Variant Counts on chr20 (all SVs)") ``` diff --git a/man/vcfwriter.Rd b/man/vcfwriter.Rd index d677c52..2710a14 100644 --- a/man/vcfwriter.Rd +++ b/man/vcfwriter.Rd @@ -45,6 +45,9 @@ Type the name of the class to see the details and methods \item{\code{addLine}}{Add a line in the header of the vcf \itemize{ \item Parameter: str - A string for a line in the header of VCF }} +\item{\code{copyHeader}}{Copy header of given VCF into the output VCF +\itemize{ \item Parameter: vcffile - A string of the VCF path}} + \item{\code{writeline}}{Write a variant record given a line \itemize{ \item Parameter: line - A string for a line in the variant of VCF. Not ended with "newline" }} diff --git a/src/vcf-writer.cpp b/src/vcf-writer.cpp index 3a3a0b4..580419a 100644 --- a/src/vcf-writer.cpp +++ b/src/vcf-writer.cpp @@ -33,6 +33,8 @@ using namespace std; //' \itemize{ \item Parameter: str - A string for a SAMPLE name } //' @field addLine Add a line in the header of the vcf //' \itemize{ \item Parameter: str - A string for a line in the header of VCF } +//' @field copyHeader Copy header of given VCF into the output VCF +//' \itemize{ \item Parameter: vcffile - A string of the VCF path} //' @field writeline Write a variant record given a line //' \itemize{ \item Parameter: line - A string for a line in the variant of VCF. Not ended with "newline" } //' @field close Close and save the vcf file @@ -67,12 +69,18 @@ class vcfwriter { const std::string& desc) { bw.header.addFORMAT(id, number, type, desc); } - inline void addSample(const std::string & str) { bw.header.addSample(str); } - inline void addLine(const std::string & str) { bw.header.addLine(str); } - inline void writeline(const std::string & line) { bw.writeLine(line); } + inline void addSample(const std::string& str) { bw.header.addSample(str); } + inline void addLine(const std::string& str) { bw.header.addLine(str); } + inline void copyHeader(const std::string& vcffile) { + br.open(vcffile); + bw.initalHeader(br.header); + br.close(); + } + inline void writeline(const std::string& line) { bw.writeLine(line); } private: vcfpp::BcfWriter bw; + vcfpp::BcfReader br; }; RCPP_EXPOSED_CLASS(vcfwriter) @@ -82,6 +90,7 @@ RCPP_MODULE(vcfwriter) { .constructor("construct vcfwriter given vcf file and the version") .method("close", &vcfwriter::close) .method("writeline", &vcfwriter::writeline) + .method("copyHeader", &vcfwriter::copyHeader) .method("addLine", &vcfwriter::addLine) .method("addSample", &vcfwriter::addSample) .method("addContig", &vcfwriter::addContig) diff --git a/tests/testthat/test-modify-vcf.R b/tests/testthat/test-modify-vcf.R index bd39f61..b232919 100644 --- a/tests/testthat/test-modify-vcf.R +++ b/tests/testthat/test-modify-vcf.R @@ -23,6 +23,12 @@ test_that("modify the genotypes", { (g2 <- br$genotypes(F)) expect_false(isTRUE(all.equal(g0, g2))) expect_identical(g1, g2) + br$close() + ## the original vcf can not be modified + br <- vcfreader$new(outvcf) + br$variant() ## get a variant record + (g3 <- br$genotypes(F)) + expect_identical(g0, g3) }) test_that("modify item in FORMAT", { diff --git a/tests/testthat/test-vcf-writer.R b/tests/testthat/test-vcf-writer.R index f87c43c..3bc2d02 100644 --- a/tests/testthat/test-vcf-writer.R +++ b/tests/testthat/test-vcf-writer.R @@ -23,5 +23,17 @@ test_that("vcfwriter: writing variant works", { expect_identical(s1, s2) }) - - +test_that("vcfwriter: copy header from another vcf", { + outvcf <- paste0(tempfile(), ".vcf.gz") + vcffile <- system.file("extdata", "raw.gt.vcf.gz", package="vcfppR") + bw <- vcfwriter$new(outvcf, "VCF4.3") + bw$copyHeader(vcffile) + bw$close() + br <- vcfreader$new(outvcf) + h1 <- br$header() + br$close() + br <- vcfreader$new(vcffile) + h2 <- br$header() + br$close() + expect_identical(h1, h2) +})