Skip to content

Commit

Permalink
update vcfplot
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed May 7, 2024
1 parent 7059964 commit 969e985
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 65 deletions.
53 changes: 27 additions & 26 deletions R/vcf-plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,25 @@
#'
#' @param which.sample which sample among all to be plotted
#'
#' @param pop file contains population information
#' @param variant which types of variant are desired
#'
#' @param variant which type of variant are desired
#' @param pop file contains population information
#'
#' @param ... parameters passed to graphics
#'
#' @export
vcfplot <- function(obj,
what = "r2",
which.sample = 1,
pop = NULL,
variant = c("SNP","INDEL"),
pop = NULL,
...) {
stopifnot(is(obj, "vcfcomp") | is(obj, "vcfsummary"))
if(!is(obj, "vcfcomp") & !is(obj, "vcfsummary"))
stop("the input object is not vcfcomp or vcfsummary class")
colorpalette("colorblind")

if(is(obj, "vcfcomp")){
obj.names <- names(obj)
message("input is an object with vcfcomp class")
if("r2" %in% obj.names & what == "r2")
plot_r2(obj$r2, ...)
if("pse" %in% obj.names & what == "pse")
Expand All @@ -37,15 +37,13 @@ vcfplot <- function(obj,
}

if(is(obj, "vcfsummary")){
message("input is an object with vcfsummary class")
if(is.null(pop)){
if(is.null(pop)) {
svs <- obj$summary[-1]
return(barplot(svs, ylim = c(0, 1.1*max(svs)),...))
}else{
barplot(svs, ylim = c(0, 1.1*max(svs)),...)
} else {
# get labels and do plottiing
ped <- read.table(pop, header=TRUE)
i <- grep("Super|Population", colnames(ped))

if(length(i)>1)
i <- grep("Super", colnames(ped))[1]
upops <- unique(ped[,i])
Expand All @@ -55,12 +53,14 @@ vcfplot <- function(obj,
ord <- match(id, obj$samples)
mat[variant,ord]
})
return(boxplot(out, ...))
boxplot(out, ...)
}
}
}
}

plot_mat <- function(d, which.sample,rm.na = TRUE, w = NULL,

plot_mat <- function(d, which.sample,
rm.na = TRUE, bin = NULL,
ylim = c(0,1), main = "",
xlab = "Minor allele frequency %",
ylab = expression(italic(r^2)),
Expand All @@ -70,53 +70,53 @@ plot_mat <- function(d, which.sample,rm.na = TRUE, w = NULL,
bins <- get_bin(d)
x <- log10(bins)
labels <- 100 * bins
if(is.null(w)) w <- seq_along(x)
if(is.null(bin)) bin <- seq_along(x)
plot(0, 0,
col = "transparent",
axes = FALSE,
xlim = range(x[w]),
xlim = range(x[bin]),
ylim = ylim,
xlab = xlab,
ylab = ylab,
main = main)
for(i in seq_along(which.sample))
points(x=x[w], y=d[w, which.sample[i]], col = i, ...)
axis(side = 1, at = x[w], labels = labels[w], cex.axis=2)
points(x=x[bin], y=d[bin, which.sample[i]], col = i, ...)
axis(side = 1, at = x[bin], labels = labels[bin], cex.axis=2)
axis(side = 2, at = seq(0, 1, 0.2), cex.axis=2)
}


plot_r2 <- function(d, rm.na = TRUE, w = NULL, ylim = c(0,1),
main = "",
plot_r2 <- function(d, rm.na = TRUE, bin = NULL,
ylim = c(0,1), main = "",
xlab = "Minor allele frequency %",
ylab = expression(italic(r^2)),
...) {
if(rm.na) d <- d[complete.cases(d[,"concordance"]),]
bins <- get_bin(d)
x <- log10(bins)
labels <- 100 * bins
if(is.null(w)) w <- seq_along(x)
if(is.null(bin)) bin <- seq_along(x)
plot(0, 0,
col = "transparent",
axes = FALSE,
xlim = range(x[w]),
xlim = range(x[bin]),
ylim = ylim,
xlab = xlab,
ylab = ylab,
main = main)
points(x=x[w], y=d[w, "concordance"], ...)
axis(side = 1, at = x[w], labels = labels[w], cex.axis=2)
points(x=x[bin], y=d[bin, "concordance"], ...)
axis(side = 1, at = x[bin], labels = labels[bin], cex.axis=2)
axis(side = 2, at = seq(0, 1, 0.2), cex.axis=2)
}


plot_pse <- function(pse, extra = 100, col = 2, xaxt = "s",
plot_pse <- function(pse, extra = 10, col = 2, xaxt = "s",
xlab = "Genomic coordinates", ylab = "",
main = "", at = NULL,...) {
COL <- palette()[col]
pos <- lapply(pse, function(p) split_coordinates(p$pos))
xmax <- max(unlist(pos)) + extra
xmin <- 0
xmin <- max(c(0, min(unlist(pos))))
plot(0, 0, col = "white", axes=FALSE,
xlim = c(xmin, xmax), ylim = c(0, length(pse)),
xlab = xlab, ylab = ylab, main = main)
Expand All @@ -125,10 +125,11 @@ plot_pse <- function(pse, extra = 100, col = 2, xaxt = "s",
axis(1, at = at, ...)
## axis(1, at = axTicks(1))
for(n in seq_along(pos)) {
a <- c(0, pos[[n]], xmax) ## pad 0 and xmax
a <- c(xmin, pos[[n]], xmax) ## pad xmin and xmax
for(l in 2:length(a)) {
rect(a[l-1], n-1, a[l], n, col = ifelse(l %% 2 == 0, COL, add_alpha(COL, 0.5)))
}
}
}


23 changes: 17 additions & 6 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ output: github_document
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
dpi = 100,
fig.dim = c(6, 4),
fig.path = "man/figures/README-",
out.width = "100%"
)
Expand Down Expand Up @@ -66,10 +68,19 @@ Want to investigate the concordance between two VCF files? `vcfcomp` is the util
```{r r2}
library(vcfppR)
res <- vcfcomp(test = rawvcf, truth = phasedvcf,
region = "chr21:1-5100000", stats = "r2",
stats = "r2", region = "chr21:1-5100000",
formats = c("GT","GT"))
par(mar=c(5,5,2,2), cex.lab = 2)
vcfplot(res, what = "r2", col = 2, cex = 3, lwd = 4, type = "b")
vcfplot(res, what = "r2", col = 2, cex = 2, lwd = 4, type = "b")
```

```{r pse}
res <- vcfcomp(test = rawvcf, truth = phasedvcf,
stats = "pse",
region = "chr21:5000000-6000000",
samples = "HG00673,NA10840",
return_pse_sites = TRUE)
vcfplot(res, what = "pse", which=1:2, main = "Phasing switch error", ylab = "HG00673,NA10840")
```

Check out the [vignettes](https://zilong-li.github.io/vcfppR/articles/) for more!
Expand All @@ -96,10 +107,10 @@ vcfplot(res, main = "Structure Variant Counts", col = 1:7)

`vcftable` gives you fine control over what you want to extract from VCF/BCF files.

**Read SNP variants with GT format**
**Read SNP variants with GT format and two samples**

```{r snp}
res <- vcftable(phasedvcf, "chr21:1-5100000", vartype = "snps")
res <- vcftable(phasedvcf, "chr21:1-5100000", samples = "HG00673,NA10840", vartype = "snps")
str(res)
```

Expand All @@ -110,10 +121,10 @@ res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "snps", format = "PL", info
str(res)
```

**Read INDEL variants with DP format**
**Read INDEL variants with DP format and QUAL>100**

```{r indel}
res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "indels", format = "DP")
res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "indels", format = "DP", qual=100)
str(res)
```

Expand Down
69 changes: 39 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ The vcfppR package implements various powerful functions for fast
genomics analyses with VCF/BCF files using the C++ API of
[vcfpp.h](https://github.com/Zilong-Li/vcfpp).

- Load/save content of VCF/BCF into R objects with highly customizable
options
- Visualize and chracterize variants
- Compare two VCF/BCF files and report various statistics
- Streaming read/write VCF/BCF files with fine control of everything
- [Paper](https://doi.org/10.1093/bioinformatics/btae049) shows vcfppR
is 20x faster than `vcfR`. Also, much faster than `cyvcf2`
- Load/save content of VCF/BCF into R objects with highly customizable
options
- Visualize and chracterize variants
- Compare two VCF/BCF files and report various statistics
- Streaming read/write VCF/BCF files with fine control of everything
- [Paper](https://doi.org/10.1093/bioinformatics/btae049) shows vcfppR
is 20x faster than `vcfR`. Also, much faster than `cyvcf2`

## Installation

Expand Down Expand Up @@ -57,35 +57,45 @@ popfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_25
## `vcfcomp`: compare two VCF files and report concordance

Want to investigate the concordance between two VCF files? `vcfcomp` is
the utility function you need\! For example, in benchmarkings, we intend
the utility function you need! For example, in benchmarkings, we intend
to calculate the genotype correlation between the test and the truth.

``` r
library(vcfppR)
res <- vcfcomp(test = rawvcf, truth = phasedvcf,
region = "chr21:1-5100000", stats = "r2",
stats = "r2", region = "chr21:1-5100000",
formats = c("GT","GT"))
par(mar=c(5,5,2,2), cex.lab = 2)
vcfplot(res, what = "r2", col = 2, cex = 3, lwd = 4, type = "b")
#> input is an object with vcfcomp class
vcfplot(res, what = "r2", col = 2, cex = 2, lwd = 4, type = "b")
```

<img src="man/figures/README-r2-1.png" width="100%" />

``` r
res <- vcfcomp(test = rawvcf, truth = phasedvcf,
stats = "pse",
region = "chr21:5000000-6000000",
samples = "HG00673,NA10840",
return_pse_sites = TRUE)
#> stats F1 or NRC or PSE only uses GT format
vcfplot(res, what = "pse", which=1:2, main = "Phasing switch error", ylab = "HG00673,NA10840")
```

<img src="man/figures/README-pse-1.png" width="100%" />

Check out the [vignettes](https://zilong-li.github.io/vcfppR/articles/)
for more\!
for more!

## `vcfsummary`: variants characterization

Want to summarize variants discovered by genotype caller e.g. GATK?
`vcfsummary` is the utility function you need\!
`vcfsummary` is the utility function you need!

**Small variants**

``` r
res <- vcfsummary(rawvcf,"chr21:10000000-10010000")
vcfplot(res, pop = popfile, col = 1:5, main = "Number of SNP & INDEL variants per population")
#> input is an object with vcfsummary class
```

<img src="man/figures/README-summary_sm-1.png" width="100%" />
Expand All @@ -95,7 +105,6 @@ vcfplot(res, pop = popfile, col = 1:5, main = "Number of SNP & INDEL variants pe
``` r
res <- vcfsummary(svfile, svtype = TRUE, region = "chr20")
vcfplot(res, main = "Structure Variant Counts", col = 1:7)
#> input is an object with vcfsummary class
```

<img src="man/figures/README-summary_sv-1.png" width="100%" />
Expand All @@ -105,13 +114,13 @@ vcfplot(res, main = "Structure Variant Counts", col = 1:7)
`vcftable` gives you fine control over what you want to extract from
VCF/BCF files.

**Read SNP variants with GT format**
**Read SNP variants with GT format and two samples**

``` r
res <- vcftable(phasedvcf, "chr21:1-5100000", vartype = "snps")
res <- vcftable(phasedvcf, "chr21:1-5100000", samples = "HG00673,NA10840", vartype = "snps")
str(res)
#> List of 10
#> $ samples: chr [1:3202] "HG00096" "HG00097" "HG00099" "HG00100" ...
#> $ samples: chr [1:2] "HG00673" "NA10840"
#> $ chr : chr [1:194] "chr21" "chr21" "chr21" "chr21" ...
#> $ pos : int [1:194] 5030578 5030588 5030596 5030673 5030957 5030960 5031004 5031031 5031194 5031224 ...
#> $ id : chr [1:194] "21:5030578:C:T" "21:5030588:T:C" "21:5030596:A:G" "21:5030673:G:A" ...
Expand All @@ -120,7 +129,7 @@ str(res)
#> $ qual : num [1:194] 2.14e+09 2.14e+09 2.14e+09 2.14e+09 2.14e+09 ...
#> $ filter : chr [1:194] "." "." "." "." ...
#> $ info : chr [1:194] "AC=74;AF=0.0115553;CM=0;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_unrel=1006;AN"| __truncated__ "AC=53;AF=0.00827608;CM=1.78789e-05;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_un"| __truncated__ "AC=2;AF=0.000312305;CM=3.21821e-05;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_un"| __truncated__ "AC=2;AF=0.000312305;CM=0.00016985;AN=6404;AN_EAS=1170;AN_AMR=980;AN_EUR=1266;AN_AFR=1786;AN_SAS=1202;AN_EUR_unr"| __truncated__ ...
#> $ gt : int [1:194, 1:3202] 0 0 0 0 0 0 0 0 0 0 ...
#> $ gt : int [1:194, 1:2] 0 0 0 0 0 0 0 0 0 0 ...
#> - attr(*, "class")= chr "vcftable"
```

Expand All @@ -143,22 +152,22 @@ str(res)
#> - attr(*, "class")= chr "vcftable"
```

**Read INDEL variants with DP format**
**Read INDEL variants with DP format and QUAL\>100**

``` r
res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "indels", format = "DP")
res <- vcftable(rawvcf, "chr21:1-5100000", vartype = "indels", format = "DP", qual=100)
str(res)
#> List of 10
#> $ samples: chr [1:3202] "HG00096" "HG00097" "HG00099" "HG00100" ...
#> $ chr : chr [1:195] "chr21" "chr21" "chr21" "chr21" ...
#> $ pos : int [1:195] 5030240 5030912 5030937 5031018 5031125 5031147 5031747 5031881 5031919 5031984 ...
#> $ id : chr [1:195] "." "." "." "." ...
#> $ ref : chr [1:195] "AC" "CA" "T" "C" ...
#> $ alt : chr [1:195] "A" "C" "TGGTGCACGCCTGCAGTCCCGGC" "CCTATGATCACACCGT" ...
#> $ qual : num [1:195] 6872 321 233 270 58361 ...
#> $ filter : chr [1:195] "VQSRTrancheINDEL99.00to100.00" "PASS" "PASS" "PASS" ...
#> $ info : chr [1:195] "AC=82;AF=0.0136439;AN=6010;BaseQRankSum=-0.55;ClippingRankSum=0;DP=18067;FS=0;InbreedingCoeff=0.0664;MLEAC=76;M"| __truncated__ "AC=19;AF=0.0030586;AN=6212;BaseQRankSum=0.358;ClippingRankSum=-0.594;DP=30725;FS=44.58;InbreedingCoeff=-1.1608;"| __truncated__ "AC=1;AF=0.00015753;AN=6348;BaseQRankSum=-1.793;ClippingRankSum=-0.48;DP=32486;FS=27.337;InbreedingCoeff=-0.0212"| __truncated__ "AC=2;AF=0.000314268;AN=6364;BaseQRankSum=1.59;ClippingRankSum=0.358;DP=44916;FS=0;InbreedingCoeff=-0.0092;MLEAC"| __truncated__ ...
#> $ DP : int [1:195, 1:3202] 3 15 13 16 19 20 5 34 29 24 ...
#> $ chr : chr [1:180] "chr21" "chr21" "chr21" "chr21" ...
#> $ pos : int [1:180] 5030240 5030912 5030937 5031018 5031125 5031147 5031747 5031881 5031919 5031984 ...
#> $ id : chr [1:180] "." "." "." "." ...
#> $ ref : chr [1:180] "AC" "CA" "T" "C" ...
#> $ alt : chr [1:180] "A" "C" "TGGTGCACGCCTGCAGTCCCGGC" "CCTATGATCACACCGT" ...
#> $ qual : num [1:180] 6872 321 233 270 58361 ...
#> $ filter : chr [1:180] "VQSRTrancheINDEL99.00to100.00" "PASS" "PASS" "PASS" ...
#> $ info : chr [1:180] "AC=82;AF=0.0136439;AN=6010;BaseQRankSum=-0.55;ClippingRankSum=0;DP=18067;FS=0;InbreedingCoeff=0.0664;MLEAC=76;M"| __truncated__ "AC=19;AF=0.0030586;AN=6212;BaseQRankSum=0.358;ClippingRankSum=-0.594;DP=30725;FS=44.58;InbreedingCoeff=-1.1608;"| __truncated__ "AC=1;AF=0.00015753;AN=6348;BaseQRankSum=-1.793;ClippingRankSum=-0.48;DP=32486;FS=27.337;InbreedingCoeff=-0.0212"| __truncated__ "AC=2;AF=0.000314268;AN=6364;BaseQRankSum=1.59;ClippingRankSum=0.358;DP=44916;FS=0;InbreedingCoeff=-0.0092;MLEAC"| __truncated__ ...
#> $ DP : int [1:180, 1:3202] 3 15 13 16 19 20 5 34 29 24 ...
#> - attr(*, "class")= chr "vcftable"
```

Expand Down
Binary file added man/figures/README-pse-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-r2-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-summary_sm-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-summary_sv-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions man/vcfplot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 969e985

Please sign in to comment.