diff --git a/.Rbuildignore b/.Rbuildignore index 66f49db8..9e759fda 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,6 @@ ^\.gitignore$ ^\.travis.yml$ ^\.Rhistory$ +^vignettes/*.bbl$ +^vignettes/*.log$ +^vignettes/*.toc$ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 9514fc74..e2c525a8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ install: - "[ ! -d ~/R ] && mkdir ~/R" - R --version - R -e '.libPaths(); sessionInfo()' - - Rscript -e 'install.packages(c("adegenet", "pegas", "vegan", "ggplot2", "phangorn", "ape", "igraph"), repos="http://cran.r-project.org")' + - Rscript -e 'install.packages(c("adegenet", "pegas", "vegan", "ggplot2", "phangorn", "ape", "igraph", "reshape", "seqinr", "testthat", "knitr", "polysat"), repos="http://cran.at.r-project.org")' # run tests script: diff --git a/DESCRIPTION b/DESCRIPTION index 389a1652..764aaaff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: poppr Type: Package Title: an R package for genetic analysis of populations with mixed reproduction -Version: 1.0.6 -Date: 2014-03-09 +Version: 1.1.0 +Date: 2014-07-23 Author: Zhian N. Kamvar , Javier F. Tabima , Niklaus J. Grunwald @@ -13,15 +13,23 @@ URL: http://cran.r-project.org/package=poppr, http://grunwaldlab.cgrb.oregonstate.edu/poppr-r-package-population-genetics Description: Population genetic analyses for hierarchical analysis of partially clonal populations built upon the architecture of the adegenet package. +MailingList: http://groups.google.com/group/poppr Depends: R (>= 2.15.1), - adegenet + adegenet (>= 1.4-2) Imports: vegan, ggplot2, phangorn, ape, igraph, + methods, + ade4, pegas, - methods -License: GPL-2 + reshape2 +Suggests: + testthat, + knitr, + polysat +License: GPL-2 | GPL-3 +VignetteBuilder: knitr diff --git a/Makefile b/Makefile index c45609e9..dbe499ef 100644 --- a/Makefile +++ b/Makefile @@ -43,5 +43,5 @@ checkdevel: build cd R-devel;\ ./configure;\ make;\ - bin/./R -e 'install.packages(c("colorspace", "stringr", "RColorBrewer", "dichromat", "munsell", "labeling", "ade4", "network", "permute", "plyr", "digest", "gtable", "reshape2", "scales", "proto", "rgl", "quadprog", "adegenet", "pegas", "vegan", "ggplot2", "phangorn", "ape", "igraph", "seqinr", "testthat"), repos="http://cran.at.r-project.org", lib = "library")';\ + bin/./R -e 'install.packages(c("colorspace", "stringr", "RColorBrewer", "dichromat", "munsell", "labeling", "ade4", "network", "permute", "plyr", "digest", "gtable", "reshape2", "scales", "proto", "rgl", "quadprog", "adegenet", "pegas", "vegan", "ggplot2", "phangorn", "ape", "igraph", "seqinr", "testthat", "knitr", "polysat", "caTools", "xtable"), repos="http://cran.at.r-project.org", lib = "library")';\ bin/./R CMD check ../$(PKGNAME)_$(PKGVERS).tar.gz --as-cran \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index b8a047c4..fce57193 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,31 +1,71 @@ +# Generated by roxygen2 (4.0.1): do not edit by hand + +S3method(print,amova) +S3method(print,ialist) +S3method(print,locustable) +S3method(print,popprtable) +export("addhierarchy<-") +export("namehierarchy<-") +export("sethierarchy<-") +export("setpop<-") +export("splithierarchy<-") +export(aboot) +export(addhierarchy) +export(as.genclone) export(bruvo.boot) export(bruvo.dist) export(bruvo.msn) export(clonecorrect) export(diss.dist) +export(edwards.dist) export(genind2genalex) +export(genotype_curve) export(getfile) +export(gethierarchy) export(greycurve) export(ia) +export(info_table) export(informloci) +export(is.genclone) +export(locus_table) export(missingno) export(mlg) export(mlg.crosspop) +export(mlg.id) export(mlg.table) export(mlg.vector) +export(namehierarchy) +export(nei.dist) +export(plot_poppr_msn) export(poppr) export(poppr.all) +export(poppr.amova) export(poppr.msn) export(popsub) +export(private_alleles) +export(provesti.dist) export(read.genalex) +export(recode_polyploids) +export(reynolds.dist) +export(rogers.dist) +export(sethierarchy) +export(setpop) export(shufflepop) export(splitcombine) +export(splithierarchy) +exportClasses(bootgen) exportClasses(bruvomat) +exportClasses(genclone) +exportMethods(print) import(adegenet) import(ggplot2) import(methods) -import(pegas) import(vegan) +importFrom(ade4,amova) +importFrom(ade4,cailliez) +importFrom(ade4,is.euclid) +importFrom(ade4,lingoes) +importFrom(ade4,quasieuclid) importFrom(ape,add.scale.bar) importFrom(ape,axisPhylo) importFrom(ape,boot.phylo) @@ -38,10 +78,15 @@ importFrom(igraph,"E<-") importFrom(igraph,"V<-") importFrom(igraph,E) importFrom(igraph,V) +importFrom(igraph,delete.edges) importFrom(igraph,graph.adjacency) +importFrom(igraph,layout.auto) importFrom(igraph,minimum.spanning.tree) importFrom(igraph,plot.igraph) importFrom(igraph,print.igraph) +importFrom(pegas,as.loci) importFrom(phangorn,midpoint) importFrom(phangorn,upgma) +importFrom(reshape2,colsplit) +importFrom(reshape2,melt) useDynLib(poppr) diff --git a/NEWS b/NEWS index 19ee89e4..9a672bab 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,56 @@ +poppr 1.1.0 +=========== +NEW FEATURES +* Polyploids with ambiguous genotypes are now supported in poppr. See documentation for `recode_polyploids` for details. +* Calculations of Bruvo's distance now features correction for partial missing data utilizing genome addition and genome loss models as presented in Bruvo et al. 2004. +* `diss.dist` now has options to return raw distances and a matrix instead of a dist object. +* `read.genalex` now has the option to import as a genclone object. This is the default action. +* `poppr.all` will be able to analyze lists of genind or genclone objects. +* `ia` now has the argument valuereturn which will return the sampled data. +* `[bruvo,poppr].msn` functions now give the user the choice to show the graph. +* `bruvo.boot` has a cleaner plot style. + +NEW DATA CLASSES +* The `genclone` object is a new extension of the `genind` object from adegenet. This object contains slots containing population hierarchies and multilocus genotype definitions and will work with all analyses in adegenet and poppr. + +NEW FUNCTIONS +* [get,set,name,split,add]hierarchy - functions that will manipulate the hierarchy slot in a `genclone` object utilizing hierarchical formulae as arguments for simplification. +* `setpop` will set the population of a `genclone` object utilizing model formulae regarding the hierarchy slot. +* `as.genclone` will automatically convert genind objects to genclone objects. +* `is.genclone` checks the validity of genclone objects. +* `poppr.amova` will run amova on any hierarchical level. This also includes the feature to run amova on clone censored data sets. It utilizes the ade4 version of amova. +* `info_table` will calculate missing data per population per locus or ploidy per individual per locus and gives the user the option to visualize this as a heatmap. +* `locus_table` will calculate diversity and evenness statistics over all loci in a genind or genclone object. +* `*.dist` functions will calculate Nei's distance, Rogers' Distance, Edwards' Distance, Reynolds' Distance, and Provestis' Distance. +* `aboot` will allow the user to create bootstrapped dendrograms for ANY distance that can be calculated on genind or genpop objects. +* `plot_poppr_msn` will plot minimum spanning networks produced with poppr. +* `private_alleles` will give information about the presence of private alleles within a genind or genclone object. +* `recode_polyploids` will take in a polyploid genind/genclone object (with missing alleles coded as extra zero-value allele) and recode them to have frequencies relative to the observed number of alleles. +* `genotype_curve` will create a genotype accumulation curve for increasing number of loci. +* `mlg.id` will return a list indicating the samples belonging to a specific multilocus genotype. + +NEW DATA SETS +* Pinf - a data set of 86 isolates from different populations of the late blight pathogen, Phytophthora infestans. Provided by Erica Goss +* monpop - a large data set of 694 Monilinia fructicola isolates from a single orchard over three years. Provided by Sydney E. Everhart + +NEW CAR +* Not really. + +NAMESPACE CHANGES +* poppr no longer depends on pegas. +* ade4 and reshape2 are now explicitly required. + +IMPROVEMENTS +* default shuffling algorithm has been implemented in C to increase speed. +* output of the mlg functions are now represented as integers to decrease their size in memory. +* `mlg.matrix` is now calculated faster utilizing R's internal tabulating capabilities. +* The function `poppr` will no longer return rounded results, but rather is printed with three significant digits. + +MISC +* Added unit tests. +* The poppr user manual has been shortened to only include instructions on data manipulation. +* A new vignette, "Algorithms and Equations" gives algorithmic details for calculations performed in poppr. + poppr 1.0.7 =========== UPDATE @@ -17,7 +70,7 @@ MISC poppr 1.0.5 =========== NOTABLE CHANGE -* The default shuffling algorithm for calculating the index of association has changed from multilocus-style sampling to permutation of alleles. All of the 4 methods are available, but new assignments are as follows: Method 1: permute alleles, Method 2: parametric bootstrap, Method 3: non-parametric bootstrap, Method 4: Multilocus-style sampling. Previously, Multilocus was 1 and the rest followed in the same order. There should be no compatability issues with this change. Functions affected: `ia`, `poppr` `shufflepop` +* The default shuffling algorithm for calculating the index of association has changed from multilocus-style sampling to permutation of alleles. All of the 4 methods are available, but new assignments are as follows: Method 1: permute alleles, Method 2: parametric bootstrap, Method 3: non-parametric bootstrap, Method 4: Multilocus-style sampling. Previously, Multilocus was 1 and the rest followed in the same order. There should be no compatibility issues with this change. Functions affected: `ia`, `poppr` `shufflepop` BUG FIX * Bootstrapping algorithm for `bruvo.boot` function was not shuffling the repeat lengths for each locus resulting in potentially erroneous bootstrap support values. This has been fixed by implementing an internal S4 class that will allow direct bootstrapping of the data and repeat lengths together. @@ -33,7 +86,7 @@ poppr 1.0.4 =========== BUG FIX * A previous error where bootstrap values greater than 100 were reported from `bruvo.boot` on UPGMA trees has been fixed. -* Fixed correction of negative branch lenghts using Kuhner and Felsenstein (1994) normalization for NJ trees. +* Fixed correction of negative branch lengths using Kuhner and Felsenstein (1994) normalization for NJ trees. MISC * github repository for poppr has changed from github.com/poppr/poppr to github.com/grunwaldlab/poppr diff --git a/R/.gitignore b/R/.gitignore new file mode 100644 index 00000000..62103d2c --- /dev/null +++ b/R/.gitignore @@ -0,0 +1 @@ +sandbox.r~ diff --git a/R/Index_calculations.r b/R/Index_calculations.r index c36f74ef..ec4fb385 100755 --- a/R/Index_calculations.r +++ b/R/Index_calculations.r @@ -42,155 +42,158 @@ #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #==============================================================================# -# The calculation of the Index of Association and standardized Index of -# Association. -#' #' Produce a basic summary table for population genetic analyses. -#' -#' This function allows the user to quickly view indecies of distance, -#' heterozygosity, and inbreeding to aid in the decision of a path to further -#' analyze a specified dataset. It natively takes \code{\link{genind}} formatted -#' files, but can convert any raw data formats that adegenet can take (fstat, -#' structure, genetix, and genpop) as well as genalex files exported into a csv -#' format (see \code{\link{read.genalex}} for details). -#' -#' -#' @param pop a \code{\link{genind}} object OR any fstat, structure, genetix, -#' genpop, or genalex formatted file. -#' -#' @param total default \code{TRUE}. Should indecies be calculated for the -#' combined populations represented in the entire file? #' -#' @param sublist a list of character strings or integers to indicate specific -#' population names (located in \code{$pop.names} within the -#' \code{\link{genind}} object) Defaults to "ALL". -#' -#' @param blacklist a list of character strings or integers to indicate specific -#' populations to be removed from analysis. Defaults to NULL. +#' This function allows the user to quickly view indicies of heterozygosity, +#' evenness, and inbreeding to aid in the decision of a path to further analyze +#' a specified dataset. It natively takes \code{\linkS4class{genind}} and +#' \code{\linkS4class{genclone}} objects, but can convert any raw data formats +#' that adegenet can take (fstat, structure, genetix, and genpop) as well as +#' genalex files exported into a csv format (see \code{\link{read.genalex}} for +#' details). #' -#' @param sample an integer indicating the number of permutations desired to -#' obtain p-values. Sampling will shuffle genotypes at each locus to simulate -#' a panmictic population using the observed genotypes. Calculating the p-value -#' includes the observed statistics, so set your sample number to one off for a -#' round p-value (eg. \code{sample = 999} will give you p = 0.001 and -#' \code{sample = 1000} will give you p = 0.000999001). -#' -#' @param method an integer from 1 to 4 indicating the method of sampling desired. -#' see \code{\link{shufflepop}} for details. #' +#' @param dat a \code{\linkS4class{genind}} object OR a +#' \code{\linkS4class{genclone}} object OR any fstat, structure, genetix, +#' genpop, or genalex formatted file. +#' +#' @param total When \code{TRUE} (default), indices will be calculated for the +#' pooled populations. +#' +#' @param sublist a list of character strings or integers to indicate specific +#' population names (located in \code{$pop.names} within the +#' \code{\link{genind}} object) Defaults to "ALL". +#' +#' @param blacklist a list of character strings or integers to indicate specific +#' populations to be removed from analysis. Defaults to NULL. +#' +#' @param sample an integer indicating the number of permutations desired to +#' obtain p-values. Sampling will shuffle genotypes at each locus to simulate +#' a panmictic population using the observed genotypes. Calculating the +#' p-value includes the observed statistics, so set your sample number to one +#' off for a round p-value (eg. \code{sample = 999} will give you p = 0.001 +#' and \code{sample = 1000} will give you p = 0.000999001). +#' +#' @param method an integer from 1 to 4 indicating the method of sampling +#' desired. see \code{\link{shufflepop}} for details. +#' #' @param missing how should missing data be treated? \code{"zero"} and -#' \code{"mean"} will set the missing values to those documented in -#' \code{\link{na.replace}}. \code{"loci"} and \code{"geno"} will remove any -#' loci or genotypes with missing data, respectively (see -#' \code{\link{missingno}} for more information. -#' -#' @param cutoff \code{numeric} a number from 0 to 1 indicating the percent -#' missing data allowed for analysis. This is to be used in conjunction with the -#' flag \code{missing} (see \code{\link{missingno}} for details) -#' -#' @param quiet Should the function print anything to the screen while it is -#' performing calculations? \code{TRUE} prints nothing, -#' \code{FALSE} (defualt) will print the population name and a progress bar. -#' -#' @param clonecorrect default \code{FALSE}. -#' must be used with the \code{hier} and \code{dfname} parameters, or the user -#' will potentially get undesiered results. see \code{\link{clonecorrect}} for -#' details. -#' -#' @param hier a \code{numeric or character list}. This is the list of vectors -#' within a data frame (specified in \code{dfname}) in the 'other' slot of the -#' \code{\link{genind}} object. The list should indicate the population -#' hierarchy to be used for clone correction. -#' -#' @param dfname a \code{character string}. This is the name of the data frame -#' or list containing the vectors of the population hierarchy within the -#' \code{other} slot of the \code{\link{genind}} object. -#' -#' @param keep an \code{integer}. This indicates the levels of the population -#' hierarchy you wish to keep after clone correcting your data sets. To combine -#' the hierarchy, just set keep from 1 to the length of your hierarchy. see -#' \code{\link{clonecorrect}} for details. -#' -#' @param hist \code{logical} if \code{TRUE} a histogram will be produced for -#' each population. -#' +#' \code{"mean"} will set the missing values to those documented in +#' \code{\link{na.replace}}. \code{"loci"} and \code{"geno"} will remove any +#' loci or genotypes with missing data, respectively (see +#' \code{\link{missingno}} for more information. +#' +#' @param cutoff \code{numeric} a number from 0 to 1 indicating the percent +#' missing data allowed for analysis. This is to be used in conjunction with +#' the flag \code{missing} (see \code{\link{missingno}} for details) +#' +#' @param quiet \code{FALSE} (default) will display a progress bar for each +#' population analyzed. +#' +#' @param clonecorrect default \code{FALSE}. must be used with the \code{hier} +#' and \code{dfname} parameters, or the user will potentially get undesired +#' results. see \code{\link{clonecorrect}} for details. +#' +#' @param hier \itemize{ \item \strong{for genclone objects} - a \code{formula} +#' indicating the hierarchical levels to be used. The hierarchies should be +#' present in the \code{hierarchy} slot. See \code{\link{sethierarchy}} for +#' details. \item \strong{for genind objects} - a \code{numeric or character} +#' vector OR a hierarchical formula. This is the list of columns within a data +#' frame (specified in \code{dfname}) in the 'other' slot of the +#' \code{\link{genind}} object. The list should indicate the population +#' hierarchy to be used for clone correction. } +#' +#' @param dfname a \code{character string}. (Only for genind objects) This is +#' the name of the data frame or heirarchy containing the vectors of the +#' population hierarchy within the \code{other} slot of the +#' \code{\link{genind}} object. +#' +#' @param keep an \code{integer}. This indicates the levels of the population +#' hierarchy you wish to keep after clone correcting your data sets. To +#' combine the hierarchy, just set keep from 1 to the length of your +#' hierarchy. see \code{\link{clonecorrect}} for details. +#' +#' @param hist \code{logical} if \code{TRUE} (default) and \code{sampling > 0}, +#' a histogram will be produced for each population. +#' #' @param minsamp an \code{integer} indicating the minimum number of individuals -#' to resample for rarefaction analysis. -#' -#' @return -#' \item{Pop}{A vector indicating the pouplation factor} -#' \item{N}{An integer vector indicating the number of individuals/isolates in -#' the specified population.} -#' \item{MLG}{An integer vector indicating the number of multilocus genotypes -#' found in the specified poupulation, (see: \code{\link{mlg}})} -#' \item{eMLG}{The expected number of MLG at the lowest common sample size (set -#' by the parameter \code{minsamp}.} -#' \item{SE}{The standard error for the rarefaction analysis} -#' \item{H}{Shannon-Weiner Diversity index} -#' \item{G}{Stoddard and Taylor's Index} -#' \item{Hexp}{Expected heterozygosity or Nei's 1987 genotypic diversity corrected for sample size.} -#' \item{E.5}{Evenness} -#' \item{Ia}{A numeric vector giving the value of the Index of Association for -#' each population factor, (see \code{\link{ia}}).} -#' \item{p.Ia}{A numeric vector indicating the p-value for Ia from the -#' number of reshufflings indicated in \code{sample}. Lowest value is 1/n where -#' n is the number of observed values.} -#' \item{rbarD}{A numeric vector giving the value of the Standardized Index of -#' Association for each population factor, (see \code{\link{ia}}).} -#' \item{p.rD}{A numeric vector indicating the p-value for rbarD from the -#' number of reshufflings indicated in \code{sample}. Lowest value is 1/n where -#' n is the number of observed values.} -#' \item{File}{A vector indicating the name of the original data file.} -#' -#' @note All values are rounded to three significant digits for the final table. -#' +#' to resample for rarefaction analysis. See \code{\link[vegan]{rarefy}} for +#' details. +#' +#' @param legend \code{logical}. When this is set to \code{TRUE}, a legend +#' describing the resulting table columns will be printed. Defaults to +#' \code{FALSE} +#' +#' @return \item{Pop}{A vector indicating the pouplation factor} \item{N}{An +#' integer vector indicating the number of individuals/isolates in the +#' specified population.} \item{MLG}{An integer vector indicating the number +#' of multilocus genotypes found in the specified poupulation, (see: +#' \code{\link{mlg}})} \item{eMLG}{The expected number of MLG at the lowest +#' common sample size (set by the parameter \code{minsamp}.} \item{SE}{The +#' standard error for the rarefaction analysis} \item{H}{Shannon-Weiner +#' Diversity index} \item{G}{Stoddard and Taylor's Index} \item{Hexp}{Expected +#' heterozygosity or Nei's 1987 genotypic diversity corrected for sample +#' size.} \item{E.5}{Evenness} \item{Ia}{A numeric vector giving the value of +#' the Index of Association for each population factor, (see +#' \code{\link{ia}}).} \item{p.Ia}{A numeric vector indicating the p-value for +#' Ia from the number of reshufflings indicated in \code{sample}. Lowest value +#' is 1/n where n is the number of observed values.} \item{rbarD}{A numeric +#' vector giving the value of the Standardized Index of Association for each +#' population factor, (see \code{\link{ia}}).} \item{p.rD}{A numeric vector +#' indicating the p-value for rbarD from the number of reshuffles indicated +#' in \code{sample}. Lowest value is 1/n where n is the number of observed +#' values.} \item{File}{A vector indicating the name of the original data +#' file.} +#' #' @seealso \code{\link{clonecorrect}}, \code{\link{poppr.all}}, -#' \code{\link{ia}}, \code{\link{missingno}}, \code{\link{mlg}} -#' +#' \code{\link{ia}}, \code{\link{missingno}}, \code{\link{mlg}} +#' #' @export #' @author Zhian N. Kamvar #' @references Paul-Michael Agapow and Austin Burt. Indices of multilocus -#' linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102, 2001 -#' -#' A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus structure of natural -#' populations of hordeum spontaneum. \emph{Genetics}, 96(2):523-536, 1980. -#' -#' Niklaus J. Gr\"unwald, Stephen B. Goodwin, Michael G. Milgroom, and William E. Fry. -#' Analysis of genotypic diversity data for populations of microorganisms. -#' Phytopathology, 93(6):738-46, 2003 -#' -#' Bernhard Haubold and Richard R. Hudson. Lian 3.0: detecting linkage disequilibrium -#' in multilocus data. Bioinformatics, 16(9):847-849, 2000. -#' -#' Kenneth L.Jr. Heck, Gerald van Belle, and Daniel Simberloff. Explicit calculation -#' of the rarefaction diversity measurement and the determination of sufficient sample -#' size. Ecology, 56(6):pp. 1459-1461, 1975 -#' -#' S H Hurlbert. The nonconcept of species diversity: a critique and alternative -#' parameters. Ecology, 52(4):577-586, 1971. -#' -#' J.A. Ludwig and J.F. Reynolds. Statistical Ecology. A Primer on Methods and -#' Computing. New York USA: John Wiley and Sons, 1988. -#' -#' Masatoshi Nei. Estimation of average heterozygosity and genetic distance from -#' a small number of individuals. Genetics, 89(3):583-590, 1978. -#' -#' Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, Peter R. -#' Minchin, R. B. O'Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. Stevens, -#' and Helene Wagner. vegan: Community Ecology Package, 2012. R package version 2.0-5. -#' -#' E.C. Pielou. Ecological Diversity. Wiley, 1975. -#' -#' Claude Elwood Shannon. A mathematical theory of communication. Bell Systems -#' Technical Journal, 27:379-423,623-656, 1948 -#' -#' J M Smith, N H Smith, M O'Rourke, and B G Spratt. How clonal are bacteria? -#' Proceedings of the National Academy of Sciences, 90(10):4384-4388, 1993. -#' -#' J.A. Stoddart and J.F. Taylor. Genotypic diversity: estimation and prediction -#' in samples. Genetics, 118(4):705-11, 1988. -#' -#' +#' linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102, +#' 2001 +#' +#' A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus structure of natural +#' populations of \emph{Hordeum spontaneum}. \emph{Genetics}, 96(2):523-536, 1980. +#' +#' Niklaus J. Gr\"unwald, Stephen B. Goodwin, Michael G. Milgroom, and William +#' E. Fry. Analysis of genotypic diversity data for populations of +#' microorganisms. Phytopathology, 93(6):738-46, 2003 +#' +#' Bernhard Haubold and Richard R. Hudson. Lian 3.0: detecting linkage +#' disequilibrium in multilocus data. Bioinformatics, 16(9):847-849, 2000. +#' +#' Kenneth L.Jr. Heck, Gerald van Belle, and Daniel Simberloff. Explicit +#' calculation of the rarefaction diversity measurement and the determination +#' of sufficient sample size. Ecology, 56(6):pp. 1459-1461, 1975 +#' +#' S H Hurlbert. The nonconcept of species diversity: a critique and +#' alternative parameters. Ecology, 52(4):577-586, 1971. +#' +#' J.A. Ludwig and J.F. Reynolds. Statistical Ecology. A Primer on Methods and +#' Computing. New York USA: John Wiley and Sons, 1988. +#' +#' Masatoshi Nei. Estimation of average heterozygosity and genetic distance +#' from a small number of individuals. Genetics, 89(3):583-590, 1978. +#' +#' Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, Peter +#' R. Minchin, R. B. O'Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. +#' Stevens, and Helene Wagner. vegan: Community Ecology Package, 2012. R +#' package version 2.0-5. +#' +#' E.C. Pielou. Ecological Diversity. Wiley, 1975. +#' +#' Claude Elwood Shannon. A mathematical theory of communication. Bell Systems +#' Technical Journal, 27:379-423,623-656, 1948 +#' +#' J M Smith, N H Smith, M O'Rourke, and B G Spratt. How clonal are bacteria? +#' Proceedings of the National Academy of Sciences, 90(10):4384-4388, 1993. +#' +#' J.A. Stoddart and J.F. Taylor. Genotypic diversity: estimation and +#' prediction in samples. Genetics, 118(4):705-11, 1988. +#' +#' #' @examples #' data(nancycats) #' poppr(nancycats) @@ -205,146 +208,114 @@ #' clonecorrect=TRUE, hier="country", dfname="x") #' } #==============================================================================# -#' @import adegenet pegas ggplot2 vegan -poppr <- function(pop,total=TRUE, sublist=c("ALL"), blacklist=c(NULL), sample=0, +#' @import adegenet ggplot2 vegan +poppr <- function(dat, total=TRUE, sublist="ALL", blacklist=NULL, sample=0, method=1, missing="ignore", cutoff=0.05, quiet=FALSE, - clonecorrect=FALSE, hier=c(1), dfname="population_hierarchy", - keep = 1, hist=TRUE, minsamp=10){ + clonecorrect=FALSE, hier=1, dfname="population_hierarchy", + keep = 1, hist=TRUE, minsamp=10, legend=FALSE){ METHODS = c("permute alleles", "parametric bootstrap", "non-parametric bootstrap", "multilocus") - x <- .file.type(pop, missing=missing, cutoff=cutoff, clonecorrect=clonecorrect, - hier=hier, dfname=dfname, keep=keep, quiet=TRUE) + x <- process_file(dat, missing = missing, cutoff = cutoff, + clonecorrect = clonecorrect, hier = hier, dfname = dfname, + keep = keep, quiet = TRUE) # The namelist will contain information such as the filename and population # names so that they can easily be ported around. namelist <- NULL callpop <- match.call() - if(!is.na(grep("system.file", callpop)[1])){ - popsplt <- unlist(strsplit(pop, "/")) + if (!is.na(grep("system.file", callpop)[1])){ + popsplt <- unlist(strsplit(dat, "/")) namelist$File <- popsplt[length(popsplt)] - } - else if(is.genind(pop)){ - namelist$File <- x$X - } - else{ + } else if (is.genind(dat)){ + namelist$File <- as.character(callpop[2]) + } else { namelist$File <- basename(x$X) } #poplist <- x$POPLIST if(toupper(sublist[1]) == "TOTAL" & length(sublist) == 1){ - pop <- x$GENIND - pop(pop) <- NULL + dat <- x$GENIND + pop(dat) <- NULL poplist <- NULL - poplist$Total <- pop + poplist$Total <- dat } else{ - pop <- popsub(x$GENIND, sublist=sublist, blacklist=blacklist) - if (any(levels(pop(pop)) == "")){ - levels(pop(pop))[levels(pop(pop)) == ""] <- "?" + dat <- popsub(x$GENIND, sublist=sublist, blacklist=blacklist) + if (any(levels(pop(dat)) == "")){ + levels(pop(dat))[levels(pop(dat)) == ""] <- "?" warning("missing population factor replaced with '?'") } - poplist <- .pop.divide(pop) + poplist <- .pop.divide(dat) } # Creating the genotype matrix for vegan's diversity analysis. - pop.mat <- mlg.matrix(pop) + pop.mat <- mlg.matrix(dat) if (total==TRUE & !is.null(poplist) & length(poplist) > 1){ - poplist$Total <- pop + poplist$Total <- dat pop.mat <- rbind(pop.mat, colSums(pop.mat)) } sublist <- names(poplist) Iout <- NULL - result <- NULL origpop <- x$GENIND - rm(x) total <- toupper(total) missing <- toupper(missing) - type <- pop@type + type <- dat@type # For presence/absences markers, a different algorithm is applied. if(type=="PA"){ .Ia.Rd <- .PA.Ia.Rd } - if (is.null(poplist)){ - MPI <- NULL - } - else{ - MPI <- 1 - } - if (!is.null(MPI)){ - MLG.vec <- vapply(sublist, function(x) mlg(poplist[[x]], quiet=TRUE), 1) - N.vec <- vapply(sublist, function(x) length(poplist[[x]]@ind.names), 1) - # Shannon-Weiner diversity index. - H <- vegan::diversity(pop.mat) - # inverse Simpson's index aka Stoddard and Taylor: 1/lambda - G <- vegan::diversity(pop.mat, "inv") - Hexp <- (N.vec/(N.vec-1))*vegan::diversity(pop.mat, "simp") - # E_5 - E.5 <- (G-1)/(exp(H)-1) + if (legend) poppr_message() + + MLG.vec <- rowSums(ifelse(pop.mat > 0, 1, 0)) + N.vec <- rowSums(pop.mat) + # Shannon-Weiner diversity index. + H <- vegan::diversity(pop.mat) + # inverse Simpson's index aka Stoddard and Taylor: 1/lambda + G <- vegan::diversity(pop.mat, "inv") + Hexp <- (N.vec/(N.vec-1))*vegan::diversity(pop.mat, "simp") + # E_5 + E.5 <- (G-1)/(exp(H)-1) + + if (!is.null(poplist)){ # rarefaction giving the standard errors. This will use the minimum pop size # above a user-defined threshold. raremax <- ifelse(is.null(nrow(pop.mat)), sum(pop.mat), ifelse(min(rowSums(pop.mat)) > minsamp, min(rowSums(pop.mat)), minsamp)) - - N.rare <- suppressWarnings(rarefy(pop.mat, raremax, se=TRUE)) - IaList <- NULL - invisible(lapply(sublist, function(x) - IaList <<- rbind(IaList, - .ia(poplist[[x]], - sample=sample, - method=method, - quiet=quiet, - missing=missing, - namelist=list(File=namelist$File, population = x), - hist=hist - )))) - + N.rare <- suppressWarnings(rarefy(pop.mat, raremax, se=TRUE)) + IaList <- NULL + invisible(lapply(sublist, function(x){ + namelist <- list(File = namelist$File, population = x) + IaList <<- rbind(IaList, + .ia(poplist[[x]], sample=sample, method=method, + quiet=quiet, missing=missing, namelist=namelist, + hist=hist) + ) + })) Iout <- as.data.frame(list(Pop=sublist, N=N.vec, MLG=MLG.vec, - eMLG=round(N.rare[1, ], 3), - SE=round(N.rare[2, ], 3), - H=round(H, 3), - G=round(G,3), - Hexp=round(Hexp, 3), - E.5=round(E.5,3), - round(IaList, 3), - File=namelist$File)) + eMLG=N.rare[1, ], SE=N.rare[2, ], H=H, + G=G, Hexp=Hexp, E.5=E.5, IaList, + File=namelist$File)) rownames(Iout) <- NULL - return(final(Iout, result)) } else { - MLG.vec <- mlg(pop, quiet=TRUE) - N.vec <- length(pop@ind.names) - # Shannon-Weiner diversity index. - H <- vegan::diversity(pop.mat) - # E_1, Pielou's evenness. - # J <- H / log(rowSums(pop.mat > 0)) - # inverse Simpson's index aka Stoddard and Taylor: 1/lambda - G <- vegan::diversity(pop.mat, "inv") - Hexp <- (N.vec/(N.vec-1))*vegan::diversity(pop.mat, "simp") - # E_5 - E.5 <- (G-1)/(exp(H)-1) # rarefaction giving the standard errors. No population structure means that # the sample is equal to the number of individuals. N.rare <- rarefy(pop.mat, sum(pop.mat), se=TRUE) - IaList <- .ia(pop, sample=sample, method=method, quiet=quiet, missing=missing, + IaList <- .ia(dat, sample=sample, method=method, quiet=quiet, missing=missing, namelist=(list(File=namelist$File, population="Total")), hist=hist) Iout <- as.data.frame(list(Pop="Total", N=N.vec, MLG=MLG.vec, - eMLG=round(N.rare[1, ], 3), - SE=round(N.rare[2, ], 3), - H=round(H, 3), - G=round(G,3), - Hexp=round(Hexp, 3), - E.5=round(E.5,3), - round(as.data.frame(t(IaList)), 3), - File=namelist$File)) + eMLG=N.rare[1, ], SE=N.rare[2, ], H=H, G=G, + Hexp=Hexp, E.5=E.5, as.data.frame(t(IaList)), + File=namelist$File)) rownames(Iout) <- NULL - return(final(Iout, result)) } + class(Iout) <- c("popprtable", "data.frame") + return(Iout) } #==============================================================================# -# This will process a list of files given by filelist #' Process a list of files with poppr #' #' poppr.all is a wrapper function that will loop through a list of files from -#' the workind directory, execute \code{\link{poppr}}, and concatenate the +#' the working directory, execute \code{\link{poppr}}, and concatenate the #' output into one data frame. #' #' @param filelist a list of files in the current working directory @@ -368,76 +339,157 @@ poppr <- function(pop,total=TRUE, sublist=c("ALL"), blacklist=c(NULL), sample=0, #' poppr.all(x$files) #' } #==============================================================================# -poppr.all <- function(filelist, ...) { +poppr.all <- function(filelist, ...){ result <- NULL - for(a in filelist){ - cat("| File: ",basename(a),"\n") - result <- rbind(result, poppr(a, ...)) + for(a in seq(length(filelist))){ + cat(" \\ \n") + input <- filelist[[a]] + if (is.genind(input)){ + file <- names(filelist)[a] + if (is.null(file)){ + file <- a + } + cat(" | Data: ") + } else { + file <- basename(input) + cat(" | File: ") + } + cat(file, "\n / \n") + res <- poppr(input, ...) + res$File <- file + result <- rbind(result, res) } return(result) } #==============================================================================# -# -# This will now calculate the index of associaton and also perform the necessary -# permutation analysis, printing out a table of raw information. -# #' Index of Association #' -#' Calculate the Index of Association and Standardized Index of Association. -#' Obtain p-values from one-sided permutation tests. +#' Calculate the Index of Association and Standardized Index of Association. +#' Obtain p-values from one-sided permutation tests. #' #' @param pop a \code{\link{genind}} object OR any fstat, structure, genetix, -#' genpop, or genalex formatted files. -#' -#' @param sample an integer indicating the number of permutations desired (eg -#' 999). -#' -#' @param method an integer from 1 to 4 indicating the sampling method desired. -#' see \code{\link{shufflepop}} for details. -#' -#' @param quiet Should the function print anything to the screen while it is -#' performing calculations? -#' -#' \code{TRUE} prints nothing. -#' -#' \code{FALSE} (defualt) will print the population name and progress bar. -#' +#' genpop, or genalex formatted files. +#' +#' @param sample an integer indicating the number of permutations desired (eg +#' 999). +#' +#' @param method an integer from 1 to 4 indicating the sampling method desired. +#' see \code{\link{shufflepop}} for details. +#' +#' @param quiet Should the function print anything to the screen while it is +#' performing calculations? +#' +#' \code{TRUE} prints nothing. +#' +#' \code{FALSE} (defualt) will print the population name and progress bar. +#' #' @param missing a character string. see \code{\link{missingno}} for details. -#' -#' @param hist \code{logical} if \code{TRUE}, a histogram will be printed for -#' each population if there is sampling. -#' -#' @return -#' \emph{If no sampling has occured:} -#' -#' A named number vector of length 2 giving the Index of Association, -#' "Ia"; and the Standardized Index of Association, "rbarD" -#' -#' \emph{If there is sampling:} -#' -#' A a named number vector of length 4 with the following values: -#' \item{Ia}{numeric. The index of association.} -#' \item{p.Ia}{A number indicating the p-value resulting from a one-sided -#' permutation test based on the number of samples indicated in the original -#' call.} -#' \item{rbarD}{numeric. The standardized index of association.} -#' \item{p.rD}{A factor indicating the p-value resutling from a one-sided -#' permutation test based on the number of samples indicated in the original -#' call.} -#' +#' +#' @param hist \code{logical} if \code{TRUE}, a histogram will be printed for +#' each population if there is sampling. +#' +#' @param valuereturn \code{logical} if \code{TRUE}, the index values from the +#' reshuffled data is returned. If \code{FALSE} (default), the index is +#' returned with associated p-values in a 4 element numeric vector. +#' +#' @return \subsection{If no sampling has occured:}{ A named number vector of +#' length 2 giving the Index of Association, "Ia"; and the Standardized Index +#' of Association, "rbarD" } \subsection{If there is sampling:}{ A a named +#' number vector of length 4 with the following values: \itemize{\item{Ia - +#' }{numeric. The index of association.} \item{p.Ia - }{A number indicating +#' the p-value resulting from a one-sided permutation test based on the number +#' of samples indicated in the original call.} \item{rbarD - }{numeric. The +#' standardized index of association.} \item{p.rD - }{A factor indicating the +#' p-value resulting from a one-sided permutation test based on the number of +#' samples indicated in the original call.}} } \subsection{If there is +#' sampling and valureturn = TRUE}{ A list with the following +#' elements: \itemize{ \item{index}{The above vector} \item{samples}{A data +#' frame with s by 2 column data frame where s is the number of samples +#' defined. The columns are for the values of Ia and rbarD, respectively.}}} +#' +#' @details The index of association was originally developed by A.H.D. Brown +#' analyzing population structure of wheat (Brown, 1980). It has been widely +#' used as a tool to detect clonal reproduction within populations . +#' Populations whose members are undergoing sexual reproduction, whether it be +#' selfing or out-crossing, will produce gametes via meiosis, and thus have a +#' chance to shuffle alleles in the next generation. Populations whose members +#' are undergoing clonal reproduction, however, generally do so via mitosis. +#' This means that the most likely mechanism for a change in genotype is via +#' mutation. The rate of mutation varies from species to species, but it is +#' rarely sufficiently high to approximate a random shuffling of alleles. The +#' index of association is a calculation based on the ratio of the variance of +#' the raw number of differences between individuals and the sum of those +#' variances over each locus . You can also think of it as the observed +#' variance over the expected variance. If they are the same, then the index +#' is zero after subtracting one (from Maynard-Smith, 1993): \deqn{I_A = +#' \frac{V_O}{V_E}-1}{Ia = Vo/Ve} Since the distance is more or less a binary +#' distance, any sort of marker can be used for this analysis. In the +#' calculation, phase is not considered, and any difference increases the +#' distance between two individuals. Remember that each column represents a +#' different allele and that each entry in the table represents the fraction +#' of the genotype made up by that allele at that locus. Notice also that the +#' sum of the rows all equal one. Poppr uses this to calculate distances by +#' simply taking the sum of the absolute values of the differences between +#' rows. +#' +#' The calculation for the distance between two individuals at a single locus +#' with \emph{a} allelic states and a ploidy of \emph{k} is as follows (except +#' for Presence/Absence data): \deqn{ d = \displaystyle +#' \frac{k}{2}\sum_{i=1}^{a} \mid A_{i} - B_{i}\mid }{d(A,B) = (k/2)*sum(abs(Ai - Bi))} +#' To find the total number of differences +#' between two individuals over all loci, you just take \emph{d} over \emph{m} +#' loci, a value we'll call \emph{D}: +#' +#' \deqn{D = \displaystyle \sum_{i=1}^{m} d_i }{D = sum(di)} +#' +#' These values are calculated over all possible combinations of individuals +#' in the data set, \eqn{{n \choose 2}}{choose(n, 2)} after which you end up +#' with \eqn{{n \choose 2}\cdot{}m}{choose(n, 2) * m}. values of \emph{d} and +#' \eqn{{n \choose 2}}{choose(n, 2)} values of \emph{D}. Calculating the +#' observed variances is fairly straightforward (modified from Agapow and +#' Burt, 2001): +#' +#' \deqn{ V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - +#' \frac{(\displaystyle\sum_{i=1}^{n \choose 2} D_{i})^2}{{n \choose 2}}}{{n +#' \choose 2}}}{Vo = var(D)} +#' +#' Calculating the expected variance is the sum of each of the variances of +#' the individual loci. The calculation at a single locus, \emph{j} is the +#' same as the previous equation, substituting values of \emph{D} for +#' \emph{d}: +#' +#' \deqn{ var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - +#' \frac{(\displaystyle\sum_{i=1}^{n \choose 2} d_i)^2}{{n \choose 2}}}{{n +#' \choose 2}} }{Varj = var(dj)} +#' +#' The expected variance is then the sum of all the variances over all +#' \emph{m} loci: +#' +#' \deqn{ V_E = \displaystyle \sum_{j=1}^{m} var_j }{Ve = sum(var(dj))} +#' +#' Agapow and Burt showed that \eqn{I_A}{Ia} increases steadily with the +#' number of loci, so they came up with an approximation that is widely used, +#' \eqn{\bar r_d}{rbarD}. For the derivation, see the manual for +#' \emph{multilocus}. +#' +#' \deqn{ \bar{r_d} = \frac{V_O - V_E} {2\displaystyle +#' \sum_{j=1}^{m}\displaystyle \sum_{k \neq j}^{m}\sqrt{var_j\cdot{}var_k}} +#' }{rbarD = (Vo - Ve)/(2*sum(sum(sqrt(var(dj)*var(dk))))} +#' #' @references Paul-Michael Agapow and Austin Burt. Indices of multilocus -#' linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102, 2001 -#' -#' A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus structure of natural -#' populations of hordeum spontaneum. \emph{Genetics}, 96(2):523-536, 1980. -#' -#' J M Smith, N H Smith, M O'Rourke, and B G Spratt. How clonal are bacteria? -#' Proceedings of the National Academy of Sciences, 90(10):4384-4388, 1993. -#' -#' @seealso \code{\link{poppr}}, \code{\link{missingno}}, -#' \code{\link{import2genind}}, -#' \code{\link{read.genalex}}, \code{\link{clonecorrect}} -#' +#' linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102, +#' 2001 +#' +#' A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus structure of natural +#' populations of \emph{Hordeum spontaneum}. \emph{Genetics}, 96(2):523-536, 1980. +#' +#' J M Smith, N H Smith, M O'Rourke, and B G Spratt. How clonal are bacteria? +#' Proceedings of the National Academy of Sciences, 90(10):4384-4388, 1993. +#' +#' @seealso \code{\link{poppr}}, \code{\link{missingno}}, +#' \code{\link{import2genind}}, \code{\link{read.genalex}}, +#' \code{\link{clonecorrect}} +#' #' @export #' @author Zhian N. Kamvar #' @examples @@ -445,6 +497,12 @@ poppr.all <- function(filelist, ...) { #' ia(nancycats) #' #' \dontrun{ +#' # Get the indices back and plot them using base R graphics: +#' nansamp <- ia(nancycats, sample = 999, valuereturn = TRUE) +#' layout(matrix(c(1,1,2,2,), 2, 2, byrow = TRUE)) +#' hist(nansamp$samples$Ia); abline(v = nansamp$index[1]) +#' hist(nansamp$samples$rbarD); abline(v = nansamp$index[3]) +#' #' # Get the index for each population. #' lapply(seppop(nancycats), ia) #' # With sampling @@ -453,14 +511,14 @@ poppr.all <- function(filelist, ...) { #==============================================================================# ia <- function(pop, sample=0, method=1, quiet=FALSE, missing="ignore", - hist=TRUE){ + hist=TRUE, valuereturn = FALSE){ METHODS = c("permute alleles", "parametric bootstrap", "non-parametric bootstrap", "multilocus") namelist <- NULL namelist$population <- ifelse(length(levels(pop@pop)) > 1 | is.null(pop@pop), "Total", pop@pop.names) - namelist$File <- as.character(pop@call[2]) + namelist$File <- as.character(match.call()[2]) popx <- pop missing <- toupper(missing) @@ -468,21 +526,19 @@ ia <- function(pop, sample=0, method=1, quiet=FALSE, missing="ignore", if(type=="PA"){ .Ia.Rd <- .PA.Ia.Rd - } - else { + } else { popx <- seploc(popx) } # if there are less than three individuals in the population, the calculation # does not proceed. if (nInd(pop) < 3){ - IarD <- as.numeric(c(NA,NA)) + IarD <- as.numeric(c(NA, NA)) names(IarD) <- c("Ia", "rbarD") - if(sample==0){ + if (sample==0){ return(IarD) - } - else{ - IarD <- as.numeric(rep(NA,4)) + } else { + IarD <- as.numeric(rep(NA, 4)) names(IarD) <- c("Ia","p.Ia","rbarD","p.rD") return(IarD) } @@ -494,18 +550,16 @@ ia <- function(pop, sample=0, method=1, quiet=FALSE, missing="ignore", if (sample==0){ Iout <- IarD result <- NULL - } + } else { # sampling will perform the iterations and then return a data frame indicating # the population, index, observed value, and p-value. It will also produce a # histogram. - else{ Iout <- NULL idx <- as.data.frame(list(Index=names(IarD))) samp <- .sampling(popx, sample, missing, quiet=quiet, type=type, method=method) - samp2 <- rbind(samp, IarD) - p.val <- ia.pval(index="Ia", samp2, IarD[1]) - p.val[2] <- ia.pval(index="rbarD", samp2, IarD[2]) - if(hist == TRUE){ + p.val <- sum(IarD[1] <= c(samp$Ia, IarD[1]))/(sample + 1)#ia.pval(index="Ia", samp2, IarD[1]) + p.val[2] <- sum(IarD[2] <= c(samp$rbarD, IarD[2]))/(sample + 1)#ia.pval(index="rbarD", samp2, IarD[2]) + if (hist == TRUE){ poppr.plot(samp, observed=IarD, pop=namelist$population, file=namelist$File, pval=p.val, N=nrow(pop@tab)) } @@ -513,7 +567,175 @@ ia <- function(pop, sample=0, method=1, quiet=FALSE, missing="ignore", result[c(1,3)] <- IarD result[c(2,4)] <- p.val names(result) <- c("Ia","p.Ia","rbarD","p.rD") + if (valuereturn == TRUE){ + iaobj <- list(index = final(Iout, result), samples = samp) + class(iaobj) <- "ialist" + return(iaobj) + } } return(final(Iout, result)) } + + +#==============================================================================# +#' Create a table of summary statistics per locus. +#' +#' @param x a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} +#' object. +#' +#' @param index Which diversity index to use. Choices are \itemize{ \item +#' \code{"simpson"} (Default) to give Simpson's index \item \code{"shannon"} +#' to give the Shannon-Wiener index \item \code{"invsimpson"} to give the +#' Inverse Simpson's index aka the Stoddard and Tayor index.} +#' +#' @param lev At what level do you want to analyze diversity? Choices are +#' \code{"allele"} (Default) or \code{"genotype"}. +#' +#' @param population Select the populations to be analyzed. This is the +#' parameter \code{sublist} passed on to the function \code{\link{popsub}}. +#' Defaults to \code{"ALL"}. +#' +#' @param information When \code{TRUE} (Default), this will print out a header +#' of information to the R console. +#' +#' @return a table with 4 columns indicating the Number of alleles/genotypes +#' observed, Diversity index chosen, Nei's 1978 expected heterozygosity, and +#' Evenness. +#' +#' @seealso \code{\link[vegan]{diversity}}, \code{\link{poppr}} +#' +#' @note This will calculate statistics for polyploids as well by only counting +#' observed allelic states. +#' +#' @author Zhian N. Kamvar +#' +#' @references +#' Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, Peter +#' R. Minchin, R. B. O'Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. +#' Stevens, and Helene Wagner. vegan: Community Ecology Package, 2012. R +#' package version 2.0-5. +#' +#' Niklaus J. Gr\"unwald, Stephen B. Goodwin, Michael G. Milgroom, and William +#' E. Fry. Analysis of genotypic diversity data for populations of +#' microorganisms. Phytopathology, 93(6):738-46, 2003 +#' +#' J.A. Ludwig and J.F. Reynolds. Statistical Ecology. A Primer on Methods and +#' Computing. New York USA: John Wiley and Sons, 1988. +#' +#' E.C. Pielou. Ecological Diversity. Wiley, 1975. +#' +#' J.A. Stoddart and J.F. Taylor. Genotypic diversity: estimation and +#' prediction in samples. Genetics, 118(4):705-11, 1988. +#' +#' Masatoshi Nei. Estimation of average heterozygosity and genetic distance +#' from a small number of individuals. Genetics, 89(3):583-590, 1978. +#' +#' Claude Elwood Shannon. A mathematical theory of communication. Bell Systems +#' Technical Journal, 27:379-423,623-656, 1948 +#' +#' @export +#' @examples +#' # Analyze locus statistics for the North American population of P. infestans. +#' data(Pinf) +#' locus_table(Pinf, population = "North America") +#==============================================================================# +locus_table <- function(x, index = "simpson", lev = "allele", + population = "ALL", information = TRUE){ + INDICES <- c("shannon", "simpson", "invsimpson") + index <- match.arg(index, INDICES) + x <- popsub(x, population, drop = FALSE) + x.loc <- summary(as.loci(x)) + outmat <- vapply(x.loc, locus_table_pegas, numeric(4), index, lev, x@type) + loci <- colnames(outmat) + divs <- rownames(outmat) + res <- matrix(0.0, nrow = ncol(outmat) + 1, ncol = nrow(outmat)) + dimlist <- list(`locus` = c(loci, "mean"), `summary` = divs) + res[-nrow(res), ] <- t(outmat) + res[nrow(res), ] <- colMeans(res[-nrow(res), ], na.rm = TRUE) + attr(res, "dimnames") <- dimlist + if (information){ + if (index == "simpson"){ + msg <- "Simpson index" + } else if (index == "shannon"){ + msg <- "Shannon-Wiener index" + } else { + msg <- "Stoddard and Taylor index" + } + cat("\n", divs[1], "= Number of observed", paste0(divs[1], "s")) + cat("\n", divs[2], "=", msg) + cat("\n", divs[3], "= Nei's 1978 expected heterozygosity\n") + cat("------------------------------------------\n") + } + class(res) <- c("locustable", "matrix") + return(res) +} + +#==============================================================================# +#' Tabulate alleles the occur in only one population. +#' +#' @param gid a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} +#' object. +#' +#' @param report one of \code{"table", "vector",} or \code{"data.frame"}. Tables +#' (Default) and data frame will report counts along with populations or +#' individuals. Vectors will simply report which populations or individuals +#' contain private alleles. Tables are matrices with populations or +#' individuals in rows and alleles in columns. Data frames are long form. +#' +#' @param level one of \code{"population"} (Default) or \code{"individual"}. +#' +#' @return a matrix, data.frame, or vector defining the populations or +#' individuals containing private alleles. If vector is chosen, alleles are +#' not defined. +#' +#' @export +#' @examples +#' +#' data(Pinf) # Load P. infestans data. +#' setpop(Pinf) <- ~Country # Set the population to be at the country level +#' private_alleles(Pinf) +#' \dontrun{ +#' # An example of how this data can be displayed. +#' library(ggplot2) +#' Pinfpriv <- private_alleles(Pinf, report = "data.frame") +#' ggplot(Pinfpriv) + geom_tile(aes(x = population, y = allele, fill = count)) +#' } +#==============================================================================# +private_alleles <- function(gid, report = "table", level = "population"){ + REPORTARGS <- c("table", "vector", "data.frame") + LEVELARGS <- c("individual", "population") + report <- match.arg(report, REPORTARGS) + level <- match.arg(level, LEVELARGS) + if (!is.genind(gid) & !is.genpop(gid)){ + stop(paste(gid, "is not a genind or genpop object.")) + } + if (is.genind(gid) & !is.null(pop(gid)) | is.genpop(gid) & nrow(gid@tab) > 1){ + if (is.genind(gid)){ + gid.pop <- truenames(genind2genpop(gid, quiet = TRUE)) + } else { + gid.pop <- truenames(gid) + } + privates <- gid.pop[, colSums(ifelse(gid.pop > 0, 1, 0), na.rm = TRUE) < 2] + privates <- privates[rowSums(privates) > 0, ] + if (level == "individual" & is.genind(gid)){ + gid.tab <- truenames(gid)$tab + privates <- gid.tab[, colnames(gid.pop) %in% colnames(privates)] + privates <- privates[rowSums(privates, na.rm = TRUE) > 0, ] + } + if (length(privates) == 0){ + privates <- NULL + cat("No private alleles detected.") + return(invisible(NULL)) + } + if (report == "vector"){ + privates <- rownames(privates) + } else if (report == "data.frame"){ + privates <- melt(privates, varnames = c(level, "allele"), + value.name = "count") + } + return(privates) + } else { + stop("There are no populations detected") + } +} diff --git a/R/amova.r b/R/amova.r new file mode 100644 index 00000000..b950dfe0 --- /dev/null +++ b/R/amova.r @@ -0,0 +1,285 @@ +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# +# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate +# students at Oregon State University; and Dr. Nik Grünwald, an employee of +# USDA-ARS. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for educational, research and non-profit purposes, without fee, +# and without a written agreement is hereby granted, provided that the statement +# above is incorporated into the material, giving appropriate attribution to the +# authors. +# +# Permission to incorporate this software into commercial products may be +# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for +# Commercialization and Corporate Development. +# +# The software program and documentation are supplied "as is", without any +# accompanying services from the USDA or the University. USDA ARS or the +# University do not warrant that the operation of the program will be +# uninterrupted or error-free. The end-user understands that the program was +# developed for research purposes and is advised not to rely exclusively on the +# program for any reason. +# +# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY +# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY +# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#==============================================================================# +# Implementation of ade4's AMOVA function. Note that this cannot be used at the +# moment to calculate within individual variances. It will either compute a +# distance matrix or take in a distance matrix. Missing data must be treated +# here and is currently treated as extra alleles, but this can be modified by +# the user. Since ade4 needs a euclidean matrix, this function will, by default, +# correct the distance via the cailliez correction, which will add a number to +# all the distances to satisfy euclidean nature. +# Clone correction at the lowest level of the hierarchy is possible. +# +# Note: This takes a nested formula argument. If you want to analyze the +# hierarchy of Year to Population to Subpopulation, you should make sure you +# have the right data frame in your "other" slot and then write the formula +# thusly: ~ Year/Population/Subpopulation +# +# arguments: +# +# x = a genind object +# hier = a formula such as ~Pop/Subpop +# clonecorrect = This refers to clone correction of the final output relative to +# The lowest hierararchical level. FALSE +# within = should witin individual variation be calculated? TRUE +# dist = A user provided distance matrix NULL +# squared = Is the distance matrix squared? TRUE +# correction = A correction for non-euclidean distances provided by ade4. +# The default, "quasieuclid", seems to give the best results. +# dfname = the data frame containing the population hierarchy. +# "population_hierarchy" +# sep = the separator for the population hierarchy levels. "_" +# missing = how to deal with missing data. Default is "loci". +# cutoff = a cutoff for percent missing data to tolerate. 0.05 +# quiet = Should messages be printed? TRUE +#==============================================================================# +#' Perform Analysis of Molecular Variance (AMOVA) on genind or genclone objects. +#' +#' This function utilizes the ade4 implementation of AMOVA. See +#' \code{\link[ade4]{amova}} for details on the specific implementation. +#' +#' @param x a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} +#' object +#' +#' @param hier a hierarchical \code{\link{formula}} that defines your population +#' hieararchy. (e.g.: ~Population/Subpopulation). \strong{See Details below.} +#' +#' @param clonecorrect \code{logical} if \code{TRUE}, the data set will be clone +#' corrected with respect to the lowest level of the hierarchy. The default is +#' set to \code{FALSE}. See \code{\link{clonecorrect}} for details. +#' +#' @param within \code{logical}. When this is set to \code{TRUE} (Default), +#' variance within individuals are calculated as well. If this is set to +#' \code{FALSE}, The lowest level of the hierarchy will be the sample level. +#' See Details below. +#' +#' @param dist an optional distance matrix calculated on your data. +#' +#' @param squared if a distance matrix is supplied, this indicates whether or +#' not it represents squared distances. +#' +#' @param correction a \code{character} defining the correction method for +#' non-euclidean distances. Options are \code{\link[ade4]{quasieuclid}} +#' (Default), \code{\link[ade4]{lingoes}}, and \code{\link[ade4]{cailliez}}. +#' See Details below. +#' +#' @param dfname if the input data set is a \code{\linkS4class{genind}} object, +#' specify the name of the data frame in the \code{\link[adegenet]{other}} +#' slot defining the population hierarchy. Defaults to +#' \code{"population_hierarchy"} +#' +#' @param sep A single character used to separate the hierarchical levels. This +#' defaults to "_". +#' +#' @param missing specify method of correcting for missing data utilizing +#' options given in the function \code{\link{missingno}}. Default is +#' \code{"loci"}. +#' +#' @param cutoff specify the level at which missing data should be +#' removed/modified. See \code{\link{missingno}} for details. +#' +#' @param quiet \code{logical} If \code{FALSE} (Default), messages regarding any +#' corrections will be printed to the screen. If \code{TRUE}, no messages will +#' be printed. +#' +#' @return a list of class \code{amova} from the ade4 package. See +#' \code{\link[ade4]{amova}} for details. +#' +#' @details The poppr implementation of AMOVA is a very detailed wrapper for the +#' ade4 implementation. The output is an \code{\link[ade4]{amova}} class list +#' that contains the results in the first four elements. The inputs are contained in the +#' last three elements. The inputs required for the ade4 implementation are: +#' \enumerate{ +#' \item a distance matrix on all unique genotypes (haplotypes) +#' \item a data frame defining the hierarchy of the distance matrix +#' \item a genotype (haplotype) frequency table.} +#' All of this data can be constructed from a +#' \code{\linkS4class{genind}} object, but can be daunting for a novice R +#' user. \emph{This function automates the entire process}. Since there are many +#' variables regarding genetic data, some points need to be highlighted: +#' +#' \subsection{On Hierarchies:}{The hierarchy is defined by different hierarchical +#' levels that separate your data. In a \code{\linkS4class{genclone}} object, +#' these levels are inherently defined in the \code{hierarchy} slot. For +#' \code{\linkS4class{genind}} objects, these levels must be defined in a data +#' frame located within the \code{\link[adegenet]{other}} slot. It is best +#' practice to name this data frame \code{"population_hierarchy"}.} +#' +#' \subsection{On Within Individual Variance:}{ Heterozygosities within diploid +#' genotypes are sources of variation from within individuals and can be +#' quantified in AMOVA. When \code{within = TRUE}, poppr will split diploid +#' genotypes into haplotypes and use those to calculate within-individual +#' variance. No estimation of phase is made. This acts much like the default +#' settings for AMOVA in the Arlequin software package. Within individual +#' variance will not be calculated for haploid individuals or dominant +#' markers.} +#' +#' \subsection{On Euclidean Distances:}{ AMOVA, as defined by +#' Excoffier et al., utilizes an absolute genetic distance measured in the +#' number of differences between two samples across all loci. With the ade4 +#' implementation of AMOVA (utilized by poppr), distances must be Euclidean +#' (due to the nature of the calculations). Unfortunately, many genetic +#' distance measures are not always euclidean and must be corrected for before +#' being analyzed. Poppr automates this with three methods implemented in +#' ade4, \code{\link{quasieuclid}}, \code{\link{lingoes}}, and +#' \code{\link{cailliez}}. The correction of these distances should not +#' adversely affect the outcome of the analysis.} +#' +#' @keywords amova +#' @aliases amova +#' +#' @references Excoffier, L., Smouse, P.E. and Quattro, J.M. (1992) Analysis of +#' molecular variance inferred from metric distances among DNA haplotypes: +#' application to human mitochondrial DNA restriction data. \emph{Genetics}, +#' \strong{131}, 479–491. +#' +#' @seealso \code{\link[ade4]{amova}} \code{\link{clonecorrect}} +#' \code{\link{diss.dist}} \code{\link{missingno}} +#' \code{\link[ade4]{is.euclid}} \code{\link{sethierarchy}} +#' @export +#' @examples +#' data(Aeut) +#' agc <- as.genclone(Aeut) +#' agc +#' amova.result <- poppr.amova(agc, ~Pop/Subpop) +#' amova.result +#' amova.test <- randtest(amova.result) # Test for significance +#' plot(amova.test) +#' amova.test +#' \dontrun{ +#' amova.cc.result <- poppr.amova(agc, ~Pop/Subpop, clonecorrect = TRUE) +#' amova.cc.result +#' amova.cc.test <- randtest(amova.cc.result) +#' plot(amova.cc.test) +#' amova.cc.test +#' } +#==============================================================================# +#' @importFrom ade4 amova is.euclid cailliez quasieuclid lingoes +poppr.amova <- function(x, hier = NULL, clonecorrect = FALSE, within = TRUE, + dist = NULL, squared = TRUE, correction = "quasieuclid", + dfname = "population_hierarchy", sep = "_", + missing = "loci", cutoff = 0.05, quiet = FALSE){ + if (!is.genind(x)) stop(paste(substitute(x), "must be a genind object.")) + if (is.null(hier)) stop("A population hierarchy must be specified") + parsed_hier <- gsub(":", sep, attr(terms(hier), "term.labels")) + full_hier <- parsed_hier[length(parsed_hier)] + + if (is.genclone(x)){ + setpop(x) <- hier + other(x)[[dfname]] <- gethierarchy(x, hier, combine = FALSE) + } else { + if (!dfname %in% names(other(x))){ + stop(paste(dfname, "is not present in the 'other' slot")) + } + if (!full_hier %in% names(other(x)[[dfname]])){ + hiers <- all.vars(hier) + if (!all(hiers %in% names(other(x)[[dfname]]))){ + hier_incompatible_warning(hiers, df) + } + x <- splitcombine(x, hier = hiers, dfname = dfname, method = 2) + } else { + pop(x) <- other(x)[[dfname]][[full_hier]] + } + } + # Treat missing data. This is a part I do not particularly like. The distance + # matrix must be euclidean, but the dissimilarity distance will not allow + # missing data to contribute to the distance. + # + # This is corrected by setting missing to zero: more diversity + # missing to mean of the columns: indiscreet distances. + # remove loci at cutoff + # remove individuals at cutoff + if (clonecorrect){ + x <- clonecorrect(x, hier = hier, keep = 1:length(all.vars(hier))) + } + if (within & ploidy(x) == 2 & check_Hs(x)){ + hier <- update(hier, ~./Individual) + x <- pool_haplotypes(x, dfname = dfname) + } + x <- missingno(x, type = missing, cutoff = cutoff, quiet = quiet) + hierdf <- make_hierarchy(hier, other(x)[[dfname]]) + xstruct <- make_ade_df(hier, hierdf) + if (is.null(dist)){ + xdist <- sqrt(diss.dist(clonecorrect(x, hier = NA), percent = FALSE)) + } else { + datalength <- choose(nInd(x), 2) + mlgs <- mlg(x, quiet = TRUE) + mlglength <- choose(mlgs, 2) + if (length(dist) > mlglength & length(dist) == datalength){ + corrected <- .clonecorrector(x) + xdist <- as.dist(as.matrix(dist)[corrected, corrected]) + } else if(length(dist) == mlglength){ + xdist <- dist + } else { + msg <- paste("\nDistance matrix does not match the data.", + "\nUncorrected observations expected..........", nInd(x), + "\nClone corrected observations expected......", mlgs, + "\nObservations in provided distance matrix...", ceiling(sqrt(length(dist)*2)), + ifelse(within == TRUE, "\nTry setting within = FALSE.", "\n")) + stop(msg) + } + if (squared){ + xdist <- sqrt(xdist) + } + } + if (!is.euclid(xdist)){ + CORRECTIONS <- c("cailliez", "quasieuclid", "lingoes") + try(correct <- match.arg(correction, CORRECTIONS)) + if (!exists("correct")){ + stop(not_euclid_msg(correction)) + } else { + correct_fun <- match.fun(correct) + if (correct == CORRECTIONS[2]){ + cat("Distance matrix is non-euclidean.\n") + cat("Utilizing quasieuclid correction method. See ?quasieuclid for details.\n") + xdist <- correct_fun(xdist) + } else { + xdist <- correct_fun(xdist, print = TRUE, cor.zero = FALSE) + } + } + } + xtab <- t(mlg.matrix(x)) + xtab <- as.data.frame(xtab[unique(mlg.vector(x)), ]) + return(ade4::amova(samples = xtab, distances = xdist, structures = xstruct)) +} \ No newline at end of file diff --git a/R/bruvo.r b/R/bruvo.r index de2d0b24..25d0db05 100755 --- a/R/bruvo.r +++ b/R/bruvo.r @@ -50,47 +50,99 @@ #==============================================================================# #==============================================================================# # -#' Calculate the average Bruvo's Distance over all loci in a population. -#' +#' Bruvo's distance for microsatellites +#' +#' Calculate the average Bruvo's distance over all loci in a population. +#' #' @param pop a \code{\link{genind}} object -#' +#' #' @param replen a \code{vector} of \code{integers} indicating the length of the -#' nucleotide repeats for each microsatellite locus. -#' +#' nucleotide repeats for each microsatellite locus. +#' +#' @param add if \code{TRUE}, genotypes with zero values will be treated under +#' the genome addition model presented in Bruvo et al. 2004. +#' +#' @param loss if \code{TRUE}, genotypes with zero values will be treated under +#' the genome loss model presented in Bruvo et al. 2004. +#' #' @return a \code{distance matrix} -#' -#' @seealso \code{\link{nancycats}} -#' -#' @note This function calculates bruvo's distance for non-special cases (ie. -#' the ploidy and all alleles are known). Currently there is no way to import -#' polyploid partial heterozygote data into adegenet. For Bruvo's Distance -#' concerning special cases, see the package \code{polysat}. -#' -#' If the user does not provide a vector of appropriate length for \code{replen} -#' , it will be estimated by taking the minimum difference among represented -#' alleles at each locus. IT IS NOT RECOMMENDED TO RELY ON THIS ESTIMATION. -#' +#' +#' @note The result of both \code{add = TRUE} and \code{loss = TRUE} is that the +#' distance is averaged over both values. If both are set to \code{FALSE}, +#' then the infinite alleles model is used. For genotypes with all missing +#' values, the result will be NA. +#' +#' If the user does not provide a vector of appropriate length for +#' \code{replen} , it will be estimated by taking the minimum difference among +#' represented alleles at each locus. IT IS NOT RECOMMENDED TO RELY ON THIS +#' ESTIMATION. +#' +#' @details Ploidy is irrelevant with respect to calculation of Bruvo's +#' distance. However, since it makes a comparison between all alleles at a +#' locus, it only makes sense that the two loci need to have the same ploidy +#' level. Unfortunately for polyploids, it's often difficult to fully separate +#' distinct alleles at each locus, so you end up with genotypes that appear to +#' have a lower ploidy level than the organism. +#' +#' To help deal with these situations, Bruvo has suggested three methods for +#' dealing with these differences in ploidy levels: \itemize{ \item Infinite +#' Model - The simplest way to deal with it is to count all missing alleles as +#' infinitely large so that the distance between it and anything else is 1. +#' Aside from this being computationally simple, it will tend to +#' \strong{inflate distances between individuals}. \item Genome Addition Model +#' - If it is suspected that the organism has gone through a recent genome +#' expansion, \strong{the missing alleles will be replace with all possible +#' combinations of the observed alleles in the shorter genotype}. For example, +#' if there is a genotype of [69, 70, 0, 0] where 0 is a missing allele, the +#' possible combinations are: [69, 70, 69, 69], [69, 70, 69, 70], and [69, 70, +#' 70, 70]. The resulting distances are then averaged over the number of +#' comparisons. \item Genome Loss Model - This is similar to the genome +#' addition model, except that it assumes that there was a recent genome +#' reduction event and uses \strong{the observed values in the full genotype +#' to fill the missing values in the short genotype}. As with the Genome +#' Addition Model, the resulting distances are averaged over the number of +#' comparisons. \item Combination Model - Combine and average the genome +#' addition and loss models. } As mentioned above, the infinite model is +#' biased, but it is not nearly as computationally intensive as either of the +#' other models. The reason for this is that both of the addition and loss +#' models requires replacement of alleles and recalculation of Bruvo's +#' distance. The number of replacements required is equal to the multiset +#' coefficient: \eqn{\left({n \choose k}\right) == {(n+k-1) \choose +#' k}}{choose(n+k-1, k)} where \emph{n} is the number of potential +#' replacements and \emph{k} is the number of alleles to be replaced. So, for +#' the example given above, The genome addition model would require +#' \eqn{\left({2 \choose 2}\right) = 3}{choose(2+2-1, 2) == 3} calculations of +#' Bruvo's distance, whereas the genome loss model would require \eqn{\left({4 +#' \choose 2}\right) = 10}{choose(4+2-1, 2) == 10} calculations. +#' +#' To reduce the number of calculations and assumptions otherwise, Bruvo's +#' distance will be calculated using the largest observed ploidy in pairwise +#' comparisons. This means that when comparing [69,70,71,0] and [59,60,0,0], +#' they will be treated as triploids. +#' #' @export #' @author Zhian N. Kamvar -#' -#' @references -#' Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and Hinrich Schulenburg. A -#' simple method for the calculation of microsatellite genotype distances irrespective -#' of ploidy level. Molecular Ecology, 13(7):2101-2106, 2004. -#' +#' +#' @references Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and +#' Hinrich Schulenburg. A simple method for the calculation of microsatellite +#' genotype distances irrespective of ploidy level. Molecular Ecology, +#' 13(7):2101-2106, 2004. +#' +#' @seealso \code{\link{bruvo.boot}}, \code{\link{bruvo.msn}} +#' #' @examples #' # Please note that the data presented is assuming that the nancycat dataset #' # contains all dinucleotide repeats, it most likely is not an accurate #' # representation of the data. -#' +#' #' # Load the nancycats dataset and construct the repeat vector. #' data(nancycats) #' ssr <- rep(2, 9) #' #' # Analyze the 1st population in nancycats -#' +#' #' bruvo.dist(popsub(nancycats, 1), replen = ssr) -#' +#' #' # View each population as a heatmap. #' \dontrun{ #' sapply(nancycats$pop.names, function(x) @@ -98,23 +150,22 @@ #' } #==============================================================================# #' @useDynLib poppr -bruvo.dist <- function(pop, replen=c(1)){ +bruvo.dist <- function(pop, replen = 1, add = TRUE, loss = TRUE){ # This attempts to make sure the data is true microsatellite data. It will # reject snp and aflp data. if (pop@type != "codom" | all(is.na(unlist(lapply(pop@all.names, as.numeric))))){ - stop("\nThis dataset does not appear to be microsatellite data. Bruvo's Distance can only be applied for true microsatellites.") + stop(non_ssr_data_warning()) } # Bruvo's distance depends on the knowledge of the repeat length. If the user # does not provide the repeat length, it can be estimated by the smallest # repeat difference greater than 1. This is not a preferred method. if (length(replen) != length(pop@loc.names)){ replen <- vapply(pop@all.names, function(x) guesslengths(as.numeric(x)), 1) - warning("\n\nRepeat length vector for loci is not equal to the number of loci represented.\nEstimating repeat lengths from data:\n", immediate.=TRUE) - cat(replen,"\n\n") + warning(repeat_length_warning(replen), immediate. = TRUE) } bruvomat <- new('bruvomat', pop, replen) funk_call <- match.call() - dist.mat <- bruvos_distance(bruvomat, funk_call = funk_call) + dist.mat <- bruvos_distance(bruvomat, funk_call = funk_call, add, loss) return(dist.mat) } @@ -122,90 +173,91 @@ bruvo.dist <- function(pop, replen=c(1)){ #==============================================================================# # #' Create a tree using Bruvo's Distance with non-parametric bootstrapping. -#' +#' #' @param pop a \code{\link{genind}} object -#' +#' #' @param replen a \code{vector} of \code{integers} indicating the length of the -#' nucleotide repeats for each microsatellite locus. -#' -#' @param sample an \code{integer} indicated the number of bootstrap replicates -#' desired. -#' -#' @param tree choose between "nj" for neighbor-joining and "upgma" for a upgma -#' tree to be produced. -#' -#' @param showtree \code{logical} if \code{TRUE}, a tree will be plotted with -#' nodelabels. -#' -#' @param cutoff \code{integer} the cutoff value for bootstrap node label values -#' (between 0 and 100). -#' -#' @param quiet \code{logical} defaults to \code{FALSE}. If \code{TRUE}, a -#' progress bar and messages will be supressed. -#' -#' @param ... any argument to be passed on to \code{\link{boot.phylo}}. eg. -#' \code{quiet = TRUE}. -#' -#' @return a tree with nodelables -#' -#' @seealso \code{\link{nancycats}}, \code{\link{upgma}}, \code{\link{nj}}, -#' \code{\link{boot.phylo}}, \code{\link{nodelabels}}, \code{\link{na.replace}}, -#' \code{\link{missingno}}. -#' -#' @note This function calculates bruvo's distance for non-special cases (ie. -#' the ploidy and all alleles are known). Currently there is no way to import -#' polyploid partial heterozygote data into adegenet. For Bruvo's Distance -#' concerning special cases, see the package \code{polysat}. -#' Missing data is ignored, but be sure that missing data is NOT set to 0 in the -#' genind object. This is not easy to detect and will result in an error. Please -#' use any other method in \code{\link{na.replace}} or \code{\link{missingno}}. -#' -#' If the user does not provide a vector of appropriate length for \code{replen} -#' , it will be estimated by taking the minimum difference among represented -#' alleles at each locus. IT IS NOT RECOMMENDED TO RELY ON THIS ESTIMATION. -#' +#' nucleotide repeats for each microsatellite locus. +#' +#' @param add if \code{TRUE}, genotypes with zero values will be treated under +#' the genome addition model presented in Bruvo et al. 2004. +#' +#' @param loss if \code{TRUE}, genotypes with zero values will be treated under +#' the genome loss model presented in Bruvo et al. 2004. +#' +#' @param sample an \code{integer} indicated the number of bootstrap replicates +#' desired. +#' +#' @param tree choose between "nj" for neighbor-joining and "upgma" for a upgma +#' tree to be produced. +#' +#' @param showtree \code{logical} if \code{TRUE}, a tree will be plotted with +#' nodelabels. +#' +#' @param cutoff \code{integer} the cutoff value for bootstrap node label values +#' (between 0 and 100). +#' +#' @param quiet \code{logical} defaults to \code{FALSE}. If \code{TRUE}, a +#' progress bar and messages will be suppressed. +#' +#' @param ... any argument to be passed on to \code{\link{boot.phylo}}. eg. +#' \code{quiet = TRUE}. +#' +#' @return a tree of class phylo with nodelables +#' +#' @seealso \code{\link{bruvo.dist}}, \code{\link{nancycats}}, +#' \code{\link{upgma}}, \code{\link{nj}}, \code{\link{boot.phylo}}, +#' \code{\link{nodelabels}}, \code{\link{na.replace}}, +#' \code{\link{missingno}}. +#' +#' @note \strong{Please refer to the documentation for bruvo.dist for details on +#' the algorithm.} If the user does not provide a vector of appropriate length +#' for \code{replen} , it will be estimated by taking the minimum difference +#' among represented alleles at each locus. IT IS NOT RECOMMENDED TO RELY ON +#' THIS ESTIMATION. +#' #' @export -#' @author Javier F. Tabima, Zhian N. Kamvar -#' -#' @references -#' Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and Hinrich Schulenburg. A -#' simple method for the calculation of microsatellite genotype distances irrespective -#' of ploidy level. Molecular Ecology, 13(7):2101-2106, 2004. -#' +#' @author Zhian N. Kamvar, Javier F. Tabima +#' +#' @references Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and +#' Hinrich Schulenburg. A simple method for the calculation of microsatellite +#' genotype distances irrespective of ploidy level. Molecular Ecology, +#' 13(7):2101-2106, 2004. +#' #' @examples #' # Please note that the data presented is assuming that the nancycat dataset #' # contains all dinucleotide repeats, it most likely is not an accurate #' # representation of the data. -#' +#' #' # Load the nancycats dataset and construct the repeat vector. #' data(nancycats) #' ssr <- rep(2, 9) #' #' # Analyze the 1st population in nancycats -#' +#' #' bruvo.boot(popsub(nancycats, 1), replen = ssr) -#' +#' #==============================================================================# -#' @importFrom phangorn upgma midpoint +#' @importFrom phangorn upgma midpoint #' @importFrom ape nodelabels nj boot.phylo plot.phylo axisPhylo ladderize #' @importFrom ape add.scale.bar nodelabels tiplabels # / \ # |=(o)=| # \ / -bruvo.boot <- function(pop, replen = c(1), sample = 100, tree = "upgma", - showtree = TRUE, cutoff = NULL, quiet = FALSE, ...) { +bruvo.boot <- function(pop, replen = 1, add = TRUE, loss = TRUE, sample = 100, + tree = "upgma", showtree = TRUE, cutoff = NULL, + quiet = FALSE, ...){ # This attempts to make sure the data is true microsatellite data. It will # reject snp and aflp data. if (pop@type != "codom" | all(is.na(unlist(lapply(pop@all.names, as.numeric))))){ - stop("\nThis dataset does not appear to be microsatellite data. Bruvo's Distance can only be applied for true microsatellites.") + stop(non_ssr_data_warning()) } # Bruvo's distance depends on the knowledge of the repeat length. If the user # does not provide the repeat length, it can be estimated by the smallest # repeat difference greater than 1. This is not a preferred method. if (length(replen) != length(pop@loc.names)){ replen <- vapply(pop@all.names, function(x) guesslengths(as.numeric(x)), 1) - warning("\n\nRepeat length vector for loci is not equal to the number of loci represented.\nEstimating repeat lengths from data:\n", immediate.=TRUE) - cat(replen,"\n\n") + warning(repeat_length_warning(replen), immediate. = TRUE) } bootgen <- new('bruvomat', pop, replen) # Steps: Create initial tree and then use boot.phylo to perform bootstrap @@ -217,9 +269,9 @@ bruvo.boot <- function(pop, replen = c(1), sample = 100, tree = "upgma", root <- FALSE newfunk <- match.fun(nj) } - tre <- newfunk(bruvos_distance(bootgen)) + tre <- newfunk(bruvos_distance(bootgen, funk_call = match.call(), add = add, loss = loss)) if (any (tre$edge.length < 0)){ - warning("Some branch lengths of the tree are negative. Normalizing branches according to Kuhner and Felsenstein (1994)", immediate.=TRUE) + warning(negative_branch_warning(), immediate.=TRUE) tre <- fix_negative_branch(tre) } if (quiet == FALSE){ @@ -228,7 +280,7 @@ bruvo.boot <- function(pop, replen = c(1), sample = 100, tree = "upgma", cat(" the progress bar is full)\n\n") } bootfun <- function(x){ - return(newfunk(bruvos_distance(x))) + return(newfunk(bruvos_distance(x, funk_call = match.call(), add = add, loss = loss))) } bp <- boot.phylo(tre, bootgen, FUN = bootfun, B = sample, quiet = quiet, rooted = root, ...) @@ -236,95 +288,108 @@ bruvo.boot <- function(pop, replen = c(1), sample = 100, tree = "upgma", if (!is.null(cutoff)){ if (cutoff < 1 | cutoff > 100){ cat("Cutoff value must be between 0 and 100.\n") - cutoff <- as.numeric(readline(prompt = "Choose a new cutoff value between 0 and 100:\n")) + prompt_msg <- "Choose a new cutoff value between 0 and 100:\n" + cutoff <- as.numeric(readline(prompt = prompt_msg)) } tre$node.labels[tre$node.labels < cutoff] <- NA } tre$tip.label <- pop@ind.names - if(showtree == TRUE){ - plot(tre, show.node.label=TRUE) - } - if(tree=="upgma"){ - axisPhylo(3) - } else if (tree == "nj"){ - # I have tried different positions of the scale bar and have failed. - add.scale.bar(lwd = 5) + if (showtree == TRUE){ + poppr.plot.phylo(tre, tree) } return(tre) } #==============================================================================# # -#' Create minimum spanning network of selected populations using Brvuo's +#' Create minimum spanning network of selected populations using Bruvo's #' distance. -#' +#' #' @param pop a \code{\link{genind}} object -#' +#' #' @param replen a \code{vector} of \code{integers} indicating the length of the -#' nucleotide repeats for each microsatellite locus. -#' -#' @param palette a \code{function} defining the color palette to be used to -#' color the populations on the graph. It defaults to \code{\link{topo.colors}}, -#' but you can easily create new schemes by using \code{\link{colorRampPalette}} -#' (see examples for details) -#' -#' @param sublist a \code{vector} of population names or indexes that the user -#' wishes to keep. Default to "ALL". -#' +#' nucleotide repeats for each microsatellite locus. +#' +#' @param add if \code{TRUE}, genotypes with zero values will be treated under +#' the genome addition model presented in Bruvo et al. 2004. +#' +#' @param loss if \code{TRUE}, genotypes with zero values will be treated under +#' the genome loss model presented in Bruvo et al. 2004. +#' +#' @param palette a \code{function} defining the color palette to be used to +#' color the populations on the graph. It defaults to +#' \code{\link{topo.colors}}, but you can easily create new schemes by using +#' \code{\link{colorRampPalette}} (see examples for details) +#' +#' @param sublist a \code{vector} of population names or indexes that the user +#' wishes to keep. Default to "ALL". +#' #' @param blacklist a \code{vector} of population names or indexes that the user -#' wishes to discard. Default to \code{NULL} -#' +#' wishes to discard. Default to \code{NULL} +#' #' @param vertex.label a \code{vector} of characters to label each vertex. There -#' are two defaults: \code{"MLG"} will label the nodes with the multilocus genotype -#' from the original data set and \code{"inds"} will label the nodes with the -#' representative individual names. -#' +#' are two defaults: \code{"MLG"} will label the nodes with the multilocus +#' genotype from the original data set and \code{"inds"} will label the nodes +#' with the representative individual names. +#' #' @param gscale "grey scale". If this is \code{TRUE}, this will scale the color -#' of the edges proportional to Bruvo's distance, with the lines becoming darker -#' for more related nodes. See \code{\link{greycurve}} for details. -#' +#' of the edges proportional to Bruvo's distance, with the lines becoming +#' darker for more related nodes. See \code{\link{greycurve}} for details. +#' #' @param glim "grey limit". Two numbers between zero and one. They determine -#' the upper and lower limits for the \code{\link{gray}} function. Default is 0 -#' (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details. -#' -#' @param gadj "grey adjust". a positive \code{integer} greater than zero that -#' will serve as the exponent to the edge weight to scale the grey value to -#' represent that weight. See \code{\link{greycurve}} for details. -#' -#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale -#' will be weighted to emphasize the differences between closely related nodes. -#' If it is 2, the grey scale will be weighted to emphasize the differences -#' between more distantly related nodes. See \code{\link{greycurve}} for details. -#' -#' @param wscale "width scale". If this is \code{TRUE}, the edge widths will be -#' scaled proportional to the inverse of Bruvo's distance , with the lines -#' becoming thicker for more related nodes. -#' +#' the upper and lower limits for the \code{\link{gray}} function. Default is +#' 0 (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details. +#' +#' @param gadj "grey adjust". a positive \code{integer} greater than zero that +#' will serve as the exponent to the edge weight to scale the grey value to +#' represent that weight. See \code{\link{greycurve}} for details. +#' +#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale +#' will be weighted to emphasize the differences between closely related +#' nodes. If it is 2, the grey scale will be weighted to emphasize the +#' differences between more distantly related nodes. See +#' \code{\link{greycurve}} for details. +#' +#' @param wscale "width scale". If this is \code{TRUE}, the edge widths will be +#' scaled proportional to Bruvo's distance, with the lines becoming thicker +#' for more related nodes. +#' +#' @param showplot logical. If \code{TRUE}, the graph will be plotted. If +#' \code{FALSE}, it will simply be returned. +#' #' @param ... any other arguments that could go into plot.igraph -#' -#' @return -#' \item{graph}{a minimum spanning network with nodes corresponding to MLGs within -#' the data set. Colors of the nodes represent population membership. Width and -#' color of the edges represent distance.} -#' \item{populations}{a vector of the population names corresponding to the -#' vertex colors} -#' \item{colors}{a vector of the hexadecimal representations of the colors used -#' in the vertex colors} -#' -#' @note The edges of these graphs may cross each other if the graph becomes too -#' large. -#' -#' @seealso \code{\link{nancycats}}, \code{\link{upgma}}, \code{\link{nj}}, -#' \code{\link{boot.phylo}}, \code{\link{nodelabels}}, \code{\link{na.replace}}, -#' \code{\link{missingno}}, \code{\link{bruvo.boot}}, \code{\link{greycurve}}. -#' +#' +#' @return \item{graph}{a minimum spanning network with nodes corresponding to +#' MLGs within the data set. Colors of the nodes represent population +#' membership. Width and color of the edges represent distance.} +#' \item{populations}{a vector of the population names corresponding to the +#' vertex colors} \item{colors}{a vector of the hexadecimal representations of +#' the colors used in the vertex colors} +#' +#' @note \itemize{ \item \strong{Please see the documentation for +#' \code{\link{bruvo.dist}} for details on the algorithm}. \item The edges of +#' these graphs may cross each other if the graph becomes too large. \item The +#' nodes in the graph represent multilocus genotypes. The colors of the nodes +#' are representative of population membership. It is not uncommon to see +#' different populations containing the same multilocus genotype.} +#' +#' @details The minimum spanning network generated by this function is generated +#' via igraph's \code{\link[igraph]{minimum.spanning.tree}}. The resultant +#' graph produced can be plotted using igraph functions, or the entire object +#' can be plotted using the function \code{\link{plot_poppr_msn}}, which will +#' give the user a scale bar and the option to layout your data. +#' +#' @seealso \code{\link{bruvo.dist}}, \code{\link{nancycats}}, +#' \code{\link{plot_poppr_msn}}, \code{\link[igraph]{minimum.spanning.tree}} +#' \code{\link{bruvo.boot}}, \code{\link{greycurve}} +#' #' @export -#' @author Javier F. Tabima, Zhian N. Kamvar -#' -#' @references -#' Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and Hinrich Schulenburg. A -#' simple method for the calculation of microsatellite genotype distances irrespective -#' of ploidy level. Molecular Ecology, 13(7):2101-2106, 2004. -#' +#' @author Zhian N. Kamvar, Javier F. Tabima +#' @aliases msn.bruvo +#' @references Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and +#' Hinrich Schulenburg. A simple method for the calculation of microsatellite +#' genotype distances irrespective of ploidy level. Molecular Ecology, +#' 13(7):2101-2106, 2004. +#' #' @examples #' #' # Load the data set. @@ -340,27 +405,27 @@ bruvo.boot <- function(pop, replen = c(1), sample = 100, tree = "upgma", #' #' # View custom colors. Here, we use black and orange. #' bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -#' palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +#' palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, #' vertex.label.dist=0.4) #' #' # View with darker shades of grey (setting the upper limit to 1/2 black 1/2 white). #' bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -#' palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +#' palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, #' vertex.label.dist=0.4, glim=c(0, 0.5)) #' #' # View with no grey scaling. #' bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -#' palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +#' palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, #' vertex.label.dist=0.4, gscale=FALSE) #' #' # View with no line widths. #' bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -#' palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +#' palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, #' vertex.label.dist=0.4, wscale=FALSE) #' #' # View with no scaling at all. #' bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -#' palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +#' palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, #' vertex.label.dist=0.4, vscale=FALSE, gscale=FALSE) #' #' # View the whole population, but without labels. @@ -368,15 +433,16 @@ bruvo.boot <- function(pop, replen = c(1), sample = 100, tree = "upgma", #' } #==============================================================================# #' @importFrom igraph graph.adjacency plot.igraph V E minimum.spanning.tree V<- E<- print.igraph -bruvo.msn <- function (pop, replen = c(1), palette = topo.colors, +bruvo.msn <- function (pop, replen = 1, add = TRUE, loss = TRUE, palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0,0.8), gadj = 3, gweight = 1, - wscale = TRUE, ...){ + wscale = TRUE, showplot = TRUE, ...){ gadj <- ifelse(gweight == 1, gadj, -gadj) - # Storing the MLG vector into the genind object - pop$other$mlg.vec <- mlg.vector(pop) - + if (!is.genclone(pop)){ + # Storing the MLG vector into the genind object + pop$other$mlg.vec <- mlg.vector(pop) + } if (is.null(pop(pop)) | length(pop@pop.names) == 1){ return(singlepop_msn(pop, vertex.label, replen = replen, gscale = gscale, glim = glim, gadj = gadj, wscale = wscale, @@ -388,18 +454,31 @@ bruvo.msn <- function (pop, replen = c(1), palette = topo.colors, if (is.null(pop(pop)) | length(pop@pop.names) == 1){ return(singlepop_msn(pop, vertex.label, replen = replen, gscale = gscale, glim = glim, gadj = gadj, wscale = wscale, - palette = palette)) + palette = palette, showplot = showplot, ...)) } # Obtaining population information for all MLGs - mlg.cp <- mlg.crosspop(pop, mlgsub=1:mlg(pop, quiet=TRUE), quiet=TRUE) - names(mlg.cp) <- paste0("MLG.", sort(unique(pop$other$mlg.vec))) cpop <- pop[.clonecorrector(pop), ] + if (is.genclone(pop)){ + subs <- sort(unique(pop@mlg)) + } else { + subs <- 1:mlg(pop, quiet = TRUE) + } + mlg.cp <- mlg.crosspop(pop, mlgsub = subs, quiet=TRUE) + if (is.genclone(pop)){ + mlgs <- pop@mlg + cmlg <- cpop@mlg + } else { + mlgs <- pop$other$mlg.vec + cmlg <- cpop$other$mlg.vec + } + names(mlg.cp) <- paste0("MLG.", sort(unique(mlgs))) + # This will determine the size of the nodes based on the number of individuals # in the MLG. Subsetting by the MLG vector of the clone corrected set will # give us the numbers and the population information in the correct order. # Note: rank is used to correctly subset the data - mlg.number <- table(pop$other$mlg.vec)[rank(cpop$other$mlg.vec)] - mlg.cp <- mlg.cp[rank(cpop$other$mlg.vec)] + mlg.number <- table(mlgs)[rank(cmlg)] + mlg.cp <- mlg.cp[rank(cmlg)] bclone <- bruvo.dist(cpop, replen=replen) ###### Create a graph ####### @@ -408,49 +487,27 @@ bruvo.msn <- function (pop, replen = c(1), palette = topo.colors, if (!is.na(vertex.label[1]) & length(vertex.label) == 1){ if (toupper(vertex.label) == "MLG"){ - vertex.label <- paste0("MLG.", cpop$other$mlg.vec) + vertex.label <- paste0("MLG.", cmlg) } else if (toupper(vertex.label) == "INDS"){ vertex.label <- cpop$ind.names } } ###### Color schemes ####### - # The pallete is determined by what the user types in the argument. It can be + # The palette is determined by what the user types in the argument. It can be # rainbow, topo.colors, heat.colors ...etc. palette <- match.fun(palette) color <- palette(length(pop@pop.names)) - if(gscale == TRUE){ - E(mst)$color <- gray(adjustcurve(E(mst)$weight, glim=glim, correction=gadj, - show=FALSE)) - } else { - E(mst)$color <- rep("black", length(E(mst)$weight)) - } - - edgewidth <- 2 - if (wscale == TRUE){ - edgewidth <- 1/(E(mst)$weight) - if (any(E(mst)$weight < 0.08)){ - edgewidth <- 1/(E(mst)$weight + 0.08) - } - } + mst <- update_edge_scales(mst, wscale, gscale, glim, gadj) + # This creates a list of colors corresponding to populations. mlg.color <- lapply(mlg.cp, function(x) color[pop@pop.names %in% names(x)]) - # def.par <- par(no.readonly = TRUE) - # layout(matrix(1:3,ncol=3), width = c(1,3,1)) - # gcol <- gray(adjustcurve(sort(E(mst)$weight), glim=glim, correction=gadj, - # show=FALSE)) - # plot(c(0,2),c(0,1),type = 'n', axes = F,xlab = '', ylab = '', main = 'GREY') - - plot.igraph(mst, edge.width = edgewidth, edge.color = E(mst)$color, - vertex.size = mlg.number*3, vertex.shape = "pie", vertex.pie = mlg.cp, - vertex.pie.color = mlg.color, vertex.label = vertex.label, ...) - legend(-1.55, 1, bty = "n", cex = 0.75, legend = pop$pop.names, - title = "Populations", fill = color, border = NULL) - # legend_image <- as.raster(matrix(gcol, ncol=1)) - # plot(c(0,2),c(0,1),type = 'n', axes = F,xlab = '', ylab = '', main = 'GREY') - # text(x= 1.5, y = seq(0,1,l=5), labels = round(quantile(E(mst)$weight), 3)) - # rasterImage(legend_image, 0, 0, 1,1) - # par(def.par) - E(mst)$width <- edgewidth + if (showplot){ + plot.igraph(mst, edge.width = E(mst)$width, edge.color = E(mst)$color, + vertex.size = mlg.number*3, vertex.shape = "pie", vertex.pie = mlg.cp, + vertex.pie.color = mlg.color, vertex.label = vertex.label, ...) + legend(-1.55, 1, bty = "n", cex = 0.75, legend = pop$pop.names, + title = "Populations", fill = color, border = NULL) + } V(mst)$size <- mlg.number V(mst)$shape <- "pie" V(mst)$pie <- mlg.cp diff --git a/R/bruvoclass.r b/R/bruvoclass.r deleted file mode 100644 index 0050703d..00000000 --- a/R/bruvoclass.r +++ /dev/null @@ -1,265 +0,0 @@ -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -# -# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate -# students at Oregon State University; and Dr. Nik Grünwald, an employee of -# USDA-ARS. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for educational, research and non-profit purposes, without fee, -# and without a written agreement is hereby granted, provided that the statement -# above is incorporated into the material, giving appropriate attribution to the -# authors. -# -# Permission to incorporate this software into commercial products may be -# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for -# Commercialization and Corporate Development. -# -# The software program and documentation are supplied "as is", without any -# accompanying services from the USDA or the University. USDA ARS or the -# University do not warrant that the operation of the program will be -# uninterrupted or error-free. The end-user understands that the program was -# developed for research purposes and is advised not to rely exclusively on the -# program for any reason. -# -# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY -# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING -# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, -# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF -# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY -# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY -# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" -# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE -# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -#==============================================================================# -# #==============================================================================# -# #~ Bootgen object -# #~ -# #~ An internal object used for bootstrapping. Not intended for user interaction. -# #~ -# #~ @section Extends: -# #~ Class \code{"\linkS4class{genind}"}, directly. -# #~ -# #~ @name bootgen-class -# #~ @rdname bootgen-class -# #~ @export -# #~ @slot replen repeat length of microsatellite loci -# #~ @author Zhian Kamvar -# #~ @import methods -# #==============================================================================# -# setClass("bootgen", -# contains = c("genind"), -# representation = representation(replen = "numeric"), -# ) -# -# #==============================================================================# -# #~ Methods used for the bootgen object. -# #~ -# #~ This is not designed for user interaction. -# #~ -# #~ @rdname bootgen-methods -# #~ @param x a \code{"\linkS4class{bootgen}"} object -# #~ @param i vector of numerics indicating number of individuals desired -# #~ @param j a vector of numerics corresponding to the loci desired. -# #~ @param ... unused. -# #~ @param drop set to \code{FALSE} -# #==============================================================================# -# setMethod( -# f = "[", -# signature(x = "bootgen"), -# definition = function(x, i, j, ..., drop = FALSE){ -# if (missing(i)) i <- TRUE -# if (missing(j)) j <- TRUE -# -# ## Information Gathering -# j_length <- 1:length(j) -# if (length(j) > nLoc(x) | any(j > nLoc(x))){ -# stop('subscript out of bounds') -# } -# loci <- levels(x@loc.fac) -# # numbers of alleles per locus -# locnall <- x@loc.nall[j] -# names(locnall) <- names(x@loc.nall[j_length]) -# # names of all the alleles. Length of each list element will be equal to locnall -# allnames <- x@all.names[j] -# names(allnames) <- names(x@all.names)[j_length] -# -# ## Subsetting table columns -# indices <- unlist(lapply(loci[j], function(locus){res <- which(x@loc.fac %in% locus)})) -# locnames <- rep(names(allnames), locnall) -# tabnames <- paste(locnames, unlist(allnames), sep = ".") -# if (!is.null(pop(x))){ -# res <- truenames(x)$tab[i, indices, drop = drop] -# } else { -# res <- truenames(x)[i, indices, drop = drop] -# } -# colnames(res) <- tabnames -# -# ## Resetting all factors that need to be set. -# x@tab <- res -# x@loc.fac <- factor(rep(names(locnall), locnall)) -# x@loc.names <- names(locnall) -# x@loc.nall <- locnall -# x@all.names <- allnames -# x@replen <- x@replen[j] -# return(x) -# } -# ) -# -# #==============================================================================# -# #~ @rdname bootgen-methods -# #==============================================================================# -# setMethod( -# f = "dim", -# signature(x = "bootgen"), -# definition = function(x){ -# return(c(nInd(x), nLoc(x))) -# } -# ) -# -# -# #==============================================================================# -# #~ @rdname bootgen-methods -# #~ @param .Object a character, "bootgen" -# #~ @param gen \code{"\linkS4class{genind}"} object -# #~ @param replen a vector of numbers indicating the repeat length for each -# #~ microsatellite locus. -# #==============================================================================# -# setMethod( -# f = "initialize", -# signature = "bootgen", -# definition = function(.Object, gen, replen){ -# if (missing(gen)) gen <- new("genind") -# if (missing(replen)){ -# replen <- vapply(gen@all.names, function(y) guesslengths(as.numeric(y)), 1) -# } -# lapply(names(gen), function(y) slot(.Object, y) <<- slot(gen, y)) -# slot(.Object, "replen") <- replen -# return(.Object) -# }) -# -# - -#==============================================================================# -#' bruvomat object -#' -#' An internal object used for bruvo's distance. -#' Not intended for user interaction. -#' -#' -#' @name bruvomat-class -#' @rdname bruvomat-class -#' @export -#' @slot mat a matrix of genotypes with one allele per locus. Number of rows will -#' be equal to (ploidy)*(number of loci) -#' @slot replen repeat length of microsatellite loci -#' @slot ploidy the ploidy of the data set -#' @slot ind.names names of individuals in matrix rows. -#' @author Zhian Kamvar -#' @import methods -#==============================================================================# -setClass( - Class = "bruvomat", - representation = representation( - mat = "matrix", - replen = "numeric", - ploidy = "numeric", - ind.names = "character" - ), - prototype = prototype( - mat = matrix(ncol = 0, nrow = 0), - replen = 0, - ploidy = 2, - ind.names = "none" - ) -) - -#==============================================================================# -#' @rdname bruvomat-methods -#' @param .Object a character, "bruvomat" -#' @param gen \code{"\linkS4class{genind}"} object -#' @param replen a vector of numbers indicating the repeat length for each -#' microsatellite locus. -#==============================================================================# -setMethod( - f = "initialize", - signature = "bruvomat", - definition = function(.Object, gen, replen){ - if (missing(gen)) gen <- new("genind") - if (missing(replen)){ - replen <- vapply(gen@all.names, function(y) guesslengths(as.numeric(y)), 1) - } - ploid <- ploidy(gen) - # This controlls for the user correcting missing data using "mean". - if (any(!gen@tab %in% c((0:ploid)/ploid, NA))){ - gen@tab[!gen@tab %in% c((0:ploid)/ploid, NA)] <- NA - } - # This will check for data that has missing scored as "zero". - popcols <- ploid*nLoc(gen) - if (!any(is.na(gen@tab)) & any(rowSums(gen@tab, na.rm=TRUE) < nLoc(gen))){ - mat1 <- as.matrix.data.frame(genind2df(gen, sep="/", usepop=FALSE)) - mat1[mat1 %in% c("", NA)] <- paste(rep(0, ploid), collapse="/") - mat2 <- apply(mat1, 1, strsplit, "/") - mat3 <- apply(as.matrix(t(sapply(mat2, unlist))), 2, as.numeric) - vec1 <- suppressWarnings(as.numeric(unlist(mat3))) - pop <- matrix(vec1, nrow=nInd(gen), ncol=popcols) - } else { - popdf <- genind2df(gen, oneColPerAll=TRUE, usepop=FALSE) - mat1 <- as.matrix.data.frame(popdf) - pop <- suppressWarnings(matrix(as.numeric(mat1), ncol=popcols)) - } - slot(.Object, "mat") <- pop - slot(.Object, "replen") <- replen - slot(.Object, "ploidy") <- ploid - slot(.Object, "ind.names") <- indNames(gen) - return(.Object) - } -) - -#==============================================================================# -#' @rdname bruvomat-methods -#==============================================================================# -setMethod( - f = "dim", - signature(x = "bruvomat"), - definition = function(x){ - return(c(nrow(x@mat), ncol(x@mat)/x@ploidy)) - } -) - -#==============================================================================# -#' Methods used for the bruvomat object. -#' -#' This is not designed for user interaction. -#' -#' @rdname bruvomat-methods -#' @param x a \code{"\linkS4class{bruvomat}"} object -#' @param i vector of numerics indicating number of individuals desired -#' @param j a vector of numerics corresponding to the loci desired. -#' @param ... unused. -#' @param drop set to \code{FALSE} -#==============================================================================# -setMethod( - f = "[", - signature(x = "bruvomat"), - definition = function(x, i, j, ..., drop = FALSE){ - if (missing(i)) i <- TRUE - if (missing(j)) j <- TRUE - x@replen <- x@replen[j] - x@ind.names <- x@ind.names[i] - cols <- rep(1:ncol(x), each = x@ploidy) - replacement <- vapply(j, function(ind) which(cols == ind), 1:x@ploidy) - x@mat <- x@mat[i, as.vector(replacement), drop = FALSE] - return(x) - } -) \ No newline at end of file diff --git a/R/classes.r b/R/classes.r new file mode 100644 index 00000000..1eab9d7c --- /dev/null +++ b/R/classes.r @@ -0,0 +1,224 @@ +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# +# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate +# students at Oregon State University; and Dr. Nik Grünwald, an employee of +# USDA-ARS. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for educational, research and non-profit purposes, without fee, +# and without a written agreement is hereby granted, provided that the statement +# above is incorporated into the material, giving appropriate attribution to the +# authors. +# +# Permission to incorporate this software into commercial products may be +# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for +# Commercialization and Corporate Development. +# +# The software program and documentation are supplied "as is", without any +# accompanying services from the USDA or the University. USDA ARS or the +# University do not warrant that the operation of the program will be +# uninterrupted or error-free. The end-user understands that the program was +# developed for research purposes and is advised not to rely exclusively on the +# program for any reason. +# +# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY +# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY +# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#==============================================================================# + +#==============================================================================# +#' Genclone class +#' +#' Genclone is an S4 class that extends the \code{\linkS4class{genind}} +#' from the \pkg{\link{adegenet}} package. It will have all of the same +#' attributes as the \code{\linkS4class{genind}}, but it will contain two +#' extra slots that will help retain information about population hierarchies +#' and multilocus genotypes. +#' +#' @section Extends: +#' Class \code{"\linkS4class{genind}"}, directly. +#' +#' @details The genclone class will allow for more optimized methods of clone +#' correcting and analyzing data over multiple levels of population hierarchy. +#' +#' Previously, for hierarchical analysis to work in a \code{\link{genind}} +#' object, the user had to place a data frame in the \code{\link{other}} slot of +#' the object. The suggested name of the data frame was +#' \code{population_hierarchy}, and this was used to be able to store the +#' hierarchical information inside the object so that the user did not have to +#' keep track of that information. This method worked, but it became apparent +#' that it was a bit confusing to the user as the method for changing the +#' population of an object became: +#' +#' \code{pop(object) <- other(object)$population_hierarchy$population_name} +#' +#' That is a lot to keep track of. The new \strong{\code{hierarchy}} slot will +#' allow the user to change the population factor with one function and a formula: +#' +#' \code{setpop(object) <- ~Population/Subpopulation} +#' +#' making this become slightly more intuitive and tractable. +#' +#' Previously for \linkS4class{genind} objects, multilocus genotypes were not +#' retained after a data set was subset by population. The new +#' \strong{\code{mlg}} slot allows us to assign the multilocus genotypes and +#' retain that information no matter how we subset the data set. +#' +#' @name genclone-class +#' @rdname genclone-class +#' @aliases genclone +#' @export +#' @slot mlg a vector representing multilocus genotypes for the data set. +#' @slot hierarchy a data frame containing hierarchical levels. +#' @author Zhian N. Kamvar +#' @seealso \code{\link{as.genclone}} \code{\link{sethierarchy}} \code{\link{setpop}} +#' \code{\linkS4class{genind}} +#' @import methods +#==============================================================================# +setClass("genclone", + contains = "genind", + representation = representation(mlg = "numeric", + hierarchy = "data.frame"), +) + +# valid.genclone <- function(object){ +# slots <- slotNames(object) +# if (any(!c("mlg", "hierarchy") %in% slots)){ +# return(FALSE) +# } +# inds <- length(object@ind.names) +# mlgs <- length(object@ind.names) +# hier <- length(object@hierarchy) +# hierobs <- nrow(object@hierarchy) +# if (mlgs != inds){ +# cat("Multilocus genotypes do not match the number of observations") +# return(FALSE) +# } +# if (hier > 0 & hierobs != inds){ +# cat("Hierarchy does not match the number of observations") +# return(FALSE) +# } +# return(TRUE) +# } +# +# setValidity("genclone", valid.genclone) + +#==============================================================================# +#~ mlg object +#~ +#~ An internal object used for multilocus genotype definition. +#~ Not intended for user interaction. Ideally what this will do is create the +#~ multilocus genotype table from scratch with the lowest level of the +#~ population hierarchy and then subset it with higher levels when requested. +#~ +#~ +#~ @name mlg-class +#~ @rdname mlg-class +#~ @export +#~ @slot table a matrix representing the multilocus genotypes of the smallest +#~ hierarchy. Defaults to NULL +#~ @slot mlg a single integer with the number of multilocus genotypes in the +#~ dataset +#~ @slot vec a vector defining the multilocus genotypes +#~ @author Zhian N. Kamvar +#~ @import methods +#==============================================================================# +subset_mlgtable <- function(tab, hierarchy, df){ + if (nrow(df) != sum(tab)){ + stop("Number of rows in the data frame must equal the sum of the table.") + } + form <- as.formula(paste0("~", paste(names(df), collapse = "/"))) + def_hier <- all.vars(hierarchy) + def_hier <- def_hier[length(def_hier)] + + newdf <- make_ade_df(form, df)[[def_hier]] + + hier_levs <- levels(newdf[def_hier]) + ncols <- ncol(tab) + newmat <- vapply(hier_levs, function(x) colSums(tab[newdf == x, , drop = FALSE]), numeric(ncols)) + dimnames(newmat) <- list(colnames(tab), hier_levs) + return(t(newmat)) +} + + +#==============================================================================# +#' bruvomat object +#' +#' An internal object used for bruvo's distance. +#' Not intended for user interaction. +#' +#' +#' @name bruvomat-class +#' @rdname bruvomat-class +#' @export +#' @slot mat a matrix of genotypes with one allele per locus. Number of rows will +#' be equal to (ploidy)*(number of loci) +#' @slot replen repeat length of microsatellite loci +#' @slot ploidy the ploidy of the data set +#' @slot ind.names names of individuals in matrix rows. +#' @keywords internal +#' @author Zhian N. Kamvar +#' @import methods +#==============================================================================# +setClass( + Class = "bruvomat", + representation = representation( + mat = "matrix", + replen = "numeric", + ploidy = "numeric", + ind.names = "character" + ), + prototype = prototype( + mat = matrix(ncol = 0, nrow = 0), + replen = 0, + ploidy = 2, + ind.names = "none" + ) +) + +#==============================================================================# +#' Bootgen object +#' +#' An internal object used for bootstrapping. Not intended for user interaction. +#' +#' @section Extends: +#' Virtual Class \code{"\linkS4class{gen}"}. +#' +#' @name bootgen-class +#' @rdname bootgen-class +#' @export +#' @slot type a character denoting Codominant ("codom") or Dominant data ("P/A") +#' @slot ploidy an integer denoting the ploidy of the data set. (>=1) +#' @slot alllist a list with numeric vectors, each representing a different +#' locus where each element in the vector represents the index for a specific +#' allele. +#' @slot names a vector containing names of the observed samples. +#' @keywords internal +#' @author Zhian N. Kamvar +#' @import methods +#==============================================================================# +setClass("bootgen", + contains = c("gen"), + representation = representation( + type = "character", + ploidy = "integer", + names = "vector", + alllist = "list"), +) \ No newline at end of file diff --git a/R/data_subset.r b/R/data_subset.r index f6c4d6ed..52953c82 100644 --- a/R/data_subset.r +++ b/R/data_subset.r @@ -42,114 +42,134 @@ #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #==============================================================================# -# -# The clone correct function will need a parameter for the lowest population -# level in order to keep at least one individual represented in each population. -# It takes a popper object and will return a poppr object. -# -#' Remove potential bias caused by cloned genotypes in genind object. +#' Remove potential bias caused by cloned genotypes in genind or genclone +#' object. #' -#' This function removes any duplicated multi locus genotypes from any specified +#' This function removes any duplicated multilocus genotypes from any specified #' population hierarchy. -#' -#' @param pop a \code{\link{genind}} object -#' -#' @param hier a \code{numeric or character list}. This is the list of vectors -#' within a data frame (specified in \code{dfname}) in the 'other' slot of the -#' \code{\link{genind}} object. The list should indicate the population -#' hierarchy to be used for clone correction. -#' -#' @param dfname a \code{character string}. This is the name of the data frame -#' or list containing the vectors of the population hierarchy within the -#' \code{other} slot of the \code{\link{genind}} object. -#' -#' @param combine \code{logical}. When set to TRUE, the heirarchy will be -#' combined to create a new population for the genind object. -#' -#' @param keep \code{integer}. When \code{combine} is set to \code{FALSE}, you -#' can use this flag to choose the levels of your population hierarchy. For -#' example: if your clone correction hierarchy is set to "Pop", "Subpop", and -#' "Year", and you want to analyze your populations with respect to year, you -#' can set \code{keep = c(1,3)}. -#' -#' @return a clone corrected \code{\link{genind}} object. #' -#' @note -#' This function will clone correct to the population level indicated in -#' the \code{pop} slot of the \code{\link{genind}} object if there is no data -#' frame specified in dfname. If there is no population structure and there is -#' no specified data frame, it will clone correct the entire -#' \code{\link{genind}} object. -#' -#' +#' @param pop a \code{\link{genind}} object +#' +#' @param hier a hierarchical formula or numeric vector. In a +#' \code{\linkS4class{genclone}} object, this will define the columns of the +#' data frame in the hierarchy slot to use. In a \code{\linkS4class{genind}} +#' object, the data frame must exist within the \code{\link[adegenet]{other}} +#' slot and the user must define the name of the data frame with the parameter +#' \code{dfname} +#' +#' @param dfname a \code{character string}. \strong{Only for genind objects} +#' This is the name of the data frame or list containing the vectors of the +#' population hierarchy within the \code{other} slot of the +#' \code{\link{genind}} object. +#' +#' @param combine \code{logical}. When set to TRUE, the heirarchy will be +#' combined to create a new population for the clone corrected genind or +#' genclone object. +#' +#' @param keep \code{integer}. When \code{combine} is set to \code{FALSE}, you +#' can use this flag to choose the levels of your population hierarchy. For +#' example: if your clone correction hierarchy is set to "Pop", "Subpop", and +#' "Year", and you want to analyze your populations with respect to year, you +#' can set \code{keep = c(1,3)}. +#' +#' @return a clone corrected \code{\linkS4class{genclone}} or +#' \code{\linkS4class{genind}} object. +#' +#' @details This function will clone correct based on the hierarchical level +#' provided. To clone correct indiscriminantly of hierarchical structure, set +#' \code{hier = NA}. It is recommended to use this function with +#' \code{\linkS4class{genclone}} objects as they have a specific slot for +#' hierarchies. If you wish to use this function on a +#' \code{\linkS4class{genind}} object, see below. +#' +#' @note \subsection{For genind objects}{ \code{\linkS4class{genind}} objects do +#' not have a specific slot for hierarchies and thus require the user to +#' specfy the hierarchical levels in a data frame within the +#' \code{\link[adegenet]{other}} slot. If there is no data frame indicating +#' population hierarchy, then clone correction will occur on the population +#' factor that is set in the \code{\link[adegenet]{pop}} slot.} +#' +#' #' @export #' @author Zhian N. Kamvar #' @examples #' # LOAD A. euteiches data set #' data(Aeut) #' +#' # Redefine it as a genclone object +#' Aeut <- as.genclone(Aeut, hier = other(Aeut)$population_hierarchy[-1]) +#' #' # Check the number of multilocus genotypes #' mlg(Aeut) #' Aeut$pop.names #' #' # Clone correct at the population level. -#' Aeut.pop <- clonecorrect(Aeut, hier="Pop") +#' Aeut.pop <- clonecorrect(Aeut, hier= ~Pop) #' mlg(Aeut.pop) #' Aeut.pop$pop.names #' #' \dontrun{ #' # Clone correct at the subpopulation level with respect to population and #' # combine. -#' Aeut.subpop <- clonecorrect(Aeut, hier=c("Pop", "Subpop"), combine=TRUE) +#' Aeut.subpop <- clonecorrect(Aeut, hier=~Pop/Subpop, combine=TRUE) #' mlg(Aeut.subpop) #' Aeut.subpop$pop.names #' #' # Do the same, but set to the population level. -#' Aeut.subpop2 <- clonecorrect(Aeut, hier=c("Pop", "Subpop"), keep=1) +#' Aeut.subpop2 <- clonecorrect(Aeut, hier=~Pop/Subpop, keep=1) #' mlg(Aeut.subpop2) #' Aeut.subpop2$pop.names #' #' # LOAD H3N2 dataset #' data(H3N2) -#' +#' #' # Extract only the individuals located in China -#' country <- clonecorrect(H3N2, hier=c("country"), dfname="x") -#' +#' country <- clonecorrect(H3N2, hier= ~country, dfname="x") +#' #' # How many isolates did we have from China before clone correction? #' length(which(other(H3N2)$x$country=="China")) # 155 -#' +#' #' # How many unique isolates from China after clone correction? #' length(which(other(country)$x$country=="China")) # 79 #' #' # Something a little more complicated. (This could take a few minutes on #' # slower computers) -#' +#' #' # setting the hierarchy to be Country > Year > Month -#' c.y.m <- clonecorrect(H3N2, hier=c("year","month","country"), dfname="x") -#' +#' c.y.m <- clonecorrect(H3N2, hier= ~year/month/country, dfname="x") +#' #' # How many isolates in the original data set? #' length(other(H3N2)$x$country) # 1903 -#' +#' #' # How many after we clone corrected for country, year, and month? #' length(other(c.y.m)$x$country) # 1190 #' } #==============================================================================# -clonecorrect <- function(pop, hier=c(1), dfname="population_hierarchy", +clonecorrect <- function(pop, hier=1, dfname="population_hierarchy", combine = FALSE, keep = 1){ clonecall <- match.call()$pop if(!is.genind(pop)){ stop(paste(paste(substitute(pop), collapse=""), "is not a genind object.\n")) } - + if (is.language(hier)){ + hierformula <- hier + hier <- all.vars(hier) + } popcall <- pop@call if (is.na(hier[1])){ return(pop[.clonecorrector(pop), ]) } - - # Checks for data frame in the @other slot. If it's not there, this loop is - # initiated. - if(is.null(other(pop)[[dfname]])){ + if (is.genclone(pop)){ + if (is.numeric(hier)){ + hier <- names(gethierarchy(pop))[hier] + hierformula <- as.formula(paste0("~", paste(hier, collapse = "/"))) + } + if (!all(hier %in% names(gethierarchy(pop)))){ + stop(hier_incompatible_warning(hier, gethierarchy(pop))) + } + setpop(pop) <- hierformula + } else if (is.null(other(pop)[[dfname]])) { if(length(hier) == 1 & hier[1] == 1){ if(length(levels(pop(pop))) == 1 | is.null(pop(pop))){ pop <- pop[.clonecorrector(pop), ] @@ -175,7 +195,9 @@ clonecorrect <- function(pop, hier=c(1), dfname="population_hierarchy", } # Combining the population factor by the hierarchy - pop <- splitcombine(pop, method=2, dfname=dfname, hier=hier) + if(!is.genclone(pop)){ + pop <- splitcombine(pop, method=2, dfname=dfname, hier=hier) + } cpop <- length(pop$pop.names) # Steps for correction: @@ -196,13 +218,19 @@ clonecorrect <- function(pop, hier=c(1), dfname="population_hierarchy", # When the combine flag is not true, the default is to keep the first level # of the hierarchy. The keep flag is a numeric vector corresponding to the # hier flag indicating which levels the user wants to keep. - if(length(keep) > 1){ - pop <- splitcombine(pop, hier=hier[keep], method=2, dfname=dfname) - } - else{ - pop(pop) <- pop$other[[dfname]][[hier[keep]]] + if (is.genclone(pop)){ + hier <- hier[keep] + newformula <- as.formula(paste0("~", paste(hier, collapse = "/"))) + setpop(pop) <- newformula + } else { + if(length(keep) > 1){ + pop <- splitcombine(pop, hier=hier[keep], method=2, dfname=dfname) + } + else{ + pop(pop) <- pop$other[[dfname]][[hier[keep]]] + } + names(pop$pop.names) <- levels(pop$pop) } - names(pop$pop.names) <- levels(pop$pop) } pop@call <- popcall return(pop) @@ -210,18 +238,12 @@ clonecorrect <- function(pop, hier=c(1), dfname="population_hierarchy", #==============================================================================# -# subset a population with a combination of sublists and blacklists. Either one -# is optional, and the default is to do nothing. The structure will allow the -# user to select a range of populations and exclude a small number of them -# without having to use the total. -# eg pop <- pop.subset(pop, sublist=1:50, blacklist=c(17, 33)) -# -#' Subset a \code{\link{genind}} object by population +#' Subset a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object by population #' #' Create a new dataset with specified populations or exclude specified #' populations from the dataset. #' -#' @param pop a \code{\link{genind}} object. +#' @param gid a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object. #' #' @param sublist a \code{vector} of population names or indexes that the user #' wishes to keep. Default to "ALL". @@ -237,7 +259,7 @@ clonecorrect <- function(pop, hier=c(1), dfname="population_hierarchy", #' from the population. #' #' @return A \code{genind} object or a matrix. -#' +#' @author Zhian N. Kamvar #' @examples #' # Load the dataset microbov. #' data(microbov) @@ -258,33 +280,33 @@ clonecorrect <- function(pop, hier=c(1), dfname="population_hierarchy", #' @export #==============================================================================# -popsub <- function(pop, sublist="ALL", blacklist=NULL, mat=NULL, drop=TRUE){ +popsub <- function(gid, sublist="ALL", blacklist=NULL, mat=NULL, drop=TRUE){ - if (!is.genind(pop)){ - stop("pop.subset requires a genind object\n") + if (!is.genind(gid)){ + stop("popsub requires a genind object\n") } - if (is.null(pop(pop))){ + if (is.null(pop(gid))){ if(sublist[1] != "ALL") warning("No population structure. Subsetting not taking place.") - return(pop) + return(gid) } - if(toupper(sublist[1]) == "ALL"){ + orig_list <- sublist + popnames <- gid@pop.names + if (toupper(sublist[1]) == "ALL"){ if (is.null(blacklist)){ - return(pop) - } - else { + return(gid) + } else { # filling the sublist with all of the population names. - sublist <- pop@pop.names + sublist <- popnames } } # Checking if there are names for the population names. # If there are none, it will give them names. - if (is.null(names(pop@pop.names))){ - if (length(pop@pop.names) == length(levels(pop@pop))){ - names(pop@pop.names) <- levels(pop@pop) - } - else{ + if (is.null(names(popnames))){ + if (length(popnames) == length(levels(gid@pop))){ + names(popnames) <- levels(gid@pop) + } else { stop("Population names do not match population factors.") } } @@ -293,94 +315,90 @@ popsub <- function(pop, sublist="ALL", blacklist=NULL, mat=NULL, drop=TRUE){ if (!is.null(blacklist)){ # If both the sublist and blacklist are numeric or character. - if(is.numeric(sublist) & is.numeric(blacklist) | class(sublist) == class(blacklist)){ + if (is.numeric(sublist) & is.numeric(blacklist) | class(sublist) == class(blacklist)){ sublist <- sublist[!sublist %in% blacklist] - } - - # if the sublist is numeric and blacklist is a character. eg s=1:10, b="USA" - else if(is.numeric(sublist) & class(blacklist) == "character"){ - sublist <- sublist[sublist %in% which(!pop@pop.names %in% blacklist)] - } - else{ - + } else if (is.numeric(sublist) & class(blacklist) == "character"){ + # if the sublist is numeric and blacklist is a character. eg s=1:10, b="USA" + sublist <- sublist[sublist %in% which(!popnames %in% blacklist)] + } else { # no sublist specified. Ideal situation - if(all(pop@pop.names %in% sublist)){ + if(all(popnames %in% sublist)){ sublist <- sublist[-blacklist] - } - - # weird situation where the user will specify a certain sublist, yet index - # the blacklist numerically. Interpreted as an index of populations in the - # whole data set as opposed to the sublist. - else{ + } else { + # weird situation where the user will specify a certain sublist, yet + # index the blacklist numerically. Interpreted as an index of + # populations in the whole data set as opposed to the sublist. warning("Blacklist is numeric. Interpreting blacklist as the index of the population in the total data set.") - sublist <- sublist[!sublist %in% pop@pop.names[blacklist]] + sublist <- sublist[!sublist %in% popnames[blacklist]] } } } - if(!is.null(mat)){ + if (!is.null(mat)){ mat <- mat[sublist, , drop=FALSE] return(mat[, which(colSums(mat) > 0), drop=FALSE]) - } - else{ + } else { # subsetting the population. - if (is.numeric(sublist)) - sublist <- names(pop@pop.names[sublist]) - else - sublist <- names(pop@pop.names[pop@pop.names %in% sublist]) - sublist <- (1:length(pop@pop))[pop@pop %in% sublist] - if(is.na(sublist[1])){ - warning("All items present in Sublist are also present in the Blacklist.\nSubsetting not taking place.") - return(pop) + if (is.numeric(sublist)){ + sublist <- popnames[sublist] + } else { + sublist <- popnames[popnames %in% sublist] } - pop <- pop[sublist, ,drop=drop] - pop@call <- match.call() - return(pop) + sublist <- pop(gid) %in% sublist + if (!any(sublist)){ + if (!is.numeric(orig_list) & !any(gid@pop.names %in% orig_list)){ + stop(unmatched_pops_warning(gid@pop.names, orig_list)) + } else { + nothing_warn <- paste("Nothing present in the sublist.\n", + "Perhaps the sublist and blacklist arguments have", + "duplicate entries?\n", + "Subsetting not taking place.") + warning(nothing_warn) + return(gid) + } + } + gid <- gid[sublist, , drop = drop] + gid@call <- match.call() + return(gid) } } #==============================================================================# -# missigno simply applies one of four methods to deal with missing data. -# default is to remove missing loci. -#' How to deal with missing data in a genind object. -#' -#' missingno gives the user four options to deal with missing data. -#' -#' @param pop a \code{\link{genind}} object. -#' -#' @param type a \code{character} string: can be "zero", "mean", "loci", or "geno" -#' (see \code{Details} for definitions).] -#' -#' @param cutoff \code{numeric}. A number from 0 to 1 indicating the allowable -#' rate of missing data in either genotypes or loci. This will be ignored for -#' \code{type} values of \code{"mean"} or \code{"zero"}. -#' -#' @param quiet if \code{TRUE}, it will print to the screen the action performed. -#' -#' @section Details: The default way that functions in \code{poppr} deal with -#' missing data is to simply ignore it. These methods provide a way to deal with -#' systematic missing data and to give a wrapper for \code{adegenet}'s \code{ -#' \link{na.replace}} function. ALL OF THESE ARE TO BE USED WITH CAUTION. -#' -#' \strong{\code{"loci"}} - removes all loci containing missing data in the entire data -#' set. -#' -#' \strong{\code{"geno"}} - removes any genotypes/isolates/individuals with missing data. -#' -#' \strong{\code{"mean"}} - replaces all NA's with the mean of the alleles for the entire -#' data set. -#' -#' \strong{\code{"zero"}} or \strong{\code{"0"}} - replaces all NA's with "0". -#' Introduces more diversity. -#' -#' @return a \code{\link{genind}} object. -#' -#' @note -#' \emph{"wild missingno appeared!"} -#' -#' @seealso \code{\link{na.replace}}, \code{\link{poppr}} -#' -#' @export -#' @author Zhian N. Kamvar +#'How to deal with missing data in a genind object. +#' +#'missingno gives the user four options to deal with missing data. +#' +#'@param pop a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. +#' +#'@param type a \code{character} string: can be "ignore", "zero", "mean", +#' "loci", or "geno" (see \code{Details} for definitions). +#' +#'@param cutoff \code{numeric}. A number from 0 to 1 indicating the allowable +#' rate of missing data in either genotypes or loci. This will be ignored for +#' \code{type} values of \code{"mean"} or \code{"zero"}. +#' +#'@param quiet if \code{TRUE}, it will print to the screen the action performed. +#' +#'@details These methods provide a way to deal with systematic missing data and +#' to give a wrapper for \code{adegenet}'s \code{ \link{na.replace}} function. +#' ALL OF THESE ARE TO BE USED WITH CAUTION. +#' +#' \subsection{Treatment types}{ \itemize{ \item{\code{"ignore"} - does not +#' remove or replace missing data.} \item{\code{"loci"} - removes all loci +#' containing missing data in the entire data set. } \item{\code{"genotype"} - +#' removes any genotypes/isolates/individuals with missing data.} +#' \item{\code{"mean"} - replaces all NA's with the mean of the alleles for the +#' entire data set.} \item{\code{"zero"} or \code{"0"} - replaces all NA's with +#' "0". Introduces more diversity.}}} +#'@return a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object. +#' +#'@note \emph{"wild missingno appeared!"} +#' +#'@seealso \code{\link{na.replace}}, \code{\link{poppr}}, +#' \code{\link{poppr.amova}}, \code{\link{nei.dist}}, \code{\link{aboot}} +#' +#'@export +#'@author Zhian N. Kamvar #' @examples #' #' data(nancycats) @@ -423,49 +441,78 @@ popsub <- function(pop, sublist="ALL", blacklist=NULL, mat=NULL, drop=TRUE){ missingno <- function(pop, type = "loci", cutoff = 0.05, quiet=FALSE){ if(sum(is.na(pop@tab)) > 0){ # removes any loci (columns) with missing values. - if (toupper(type)=="LOCI"){ - naloci <- percent_missing(pop, type=type, cutoff=cutoff) + MISSINGOPTS <- c("loci", "genotypes", "mean", "zero", "0", "ignore") + type <- match.arg(tolower(type), MISSINGOPTS) + if (type == "ignore"){ + return(pop) + } + navals <- percent_missing(pop, type = type, cutoff = cutoff) + if (type == "loci"){ if(quiet != TRUE){ - if(all(naloci < 0)){ - remloc <- pop@loc.names[which(cumsum(pop@loc.nall) %in% -naloci)] + # if(all(naloci < 0)){ + # remloc <- pop@loc.names[which(cumsum(pop@loc.nall) %in% -naloci)] + # cat("\n Found", sum(is.na(pop@tab)),"missing values.") + # loci <- paste(length(remloc), ifelse(length(remloc) == 1, "locus", "loci")) + # cat("\n",loci,"contained missing values greater than",paste(cutoff*100,"%.",sep="")) + # cat("\n Removing",loci,":", remloc,"\n", fill = 80) + # } + # else{ + # cat("\n No loci with missing values above",paste(cutoff*100,"%",sep=""),"found.\n") + # } + if (length(navals) == ncol(pop@tab)){ + cat("\n No loci with missing values above", + paste0(cutoff*100,"%"),"found.\n") + } else { + remloc <- pop@loc.names[!cumsum(pop@loc.nall) %in% navals] cat("\n Found", sum(is.na(pop@tab)),"missing values.") - loci <- paste(length(remloc), ifelse(length(remloc) == 1, "locus", "loci")) - cat("\n",loci,"contained missing values greater than",paste(cutoff*100,"%.",sep="")) - cat("\n Removing",loci,":", remloc,"\n", fill = 80) - } - else{ - cat("\n No loci with missing values above",paste(cutoff*100,"%",sep=""),"found.\n") + loci <- paste(length(remloc), ifelse(length(remloc) == 1, "locus", + "loci")) + cat("\n", loci, "contained missing values greater than", + paste0(cutoff*100,"%.")) + cat("\n Removing", loci, ":", remloc,"\n", fill = 80) } } - pop <- pop[, naloci] + pop <- pop[, navals] } # removes any genotypes (rows) with missing values. - else if (!is.na(grep("GEN", toupper(type), value=TRUE)[1])){ - nageno <- percent_missing(pop, type=type, cutoff=cutoff) + else if (type == "genotypes"){ + # nageno <- percent_missing(pop, type=type, cutoff=cutoff) if(quiet != TRUE){ - if(all(nageno < 0)){ - remgeno <- pop@ind.names[-nageno] - cat("\n Found", sum(is.na(pop@tab)),"missing values.") - genotypes <- paste(length(remgeno), ifelse(length(remgeno) == 1, - "genotype", "genotypes")) - cat("\n",genotypes,"contained missing values greater than", - paste(cutoff*100,"%.",sep="")) - cat("\n Removing",genotypes,":",remgeno,"\n", fill = 80) - } - else{ + # if(all(nageno < 0)){ + # remgeno <- pop@ind.names[-nageno] + # cat("\n Found", sum(is.na(pop@tab)),"missing values.") + # genotypes <- paste(length(remgeno), ifelse(length(remgeno) == 1, + # "genotype", "genotypes")) + # cat("\n",genotypes,"contained missing values greater than", + # paste(cutoff*100,"%.",sep="")) + # cat("\n Removing",genotypes,":",remgeno,"\n", fill = 80) + # } + # else{ + # cat("\n No genotypes with missing values above", + # paste(cutoff*100,"%",sep=""),"found.\n") + # } + if (length(navals) == nInd(pop)){ cat("\n No genotypes with missing values above", - paste(cutoff*100,"%",sep=""),"found.\n") + paste0(cutoff*100, "%"),"found.\n") + } else { + remgeno <- indNames(pop)[-navals] + cat("\n Found", sum(is.na(pop@tab)),"missing values.") + geno <- paste(length(remgeno), ifelse(length(remgeno) == 1, + "genotype", "genotypes")) + cat("\n", geno, "contained missing values greater than", + paste0(cutoff*100,"%.")) + cat("\n Removing", geno, ":", remgeno,"\n", fill = 80) } } - pop <- pop[nageno, ] + pop <- pop[navals, ] } # changes all NA's to the mean of the column. NOT RECOMMENDED - else if (toupper(type)=="MEAN"){ - pop <- na.replace(pop,"mean", quiet=quiet) + else if (type == "mean"){ + pop <- na.replace(pop, "mean", quiet=quiet) } # changes all NA's to 0. NOT RECOMMENDED. INTRODUCES MORE DIVERSITY. - else if (toupper(type) %in% c("ZERO","0")){ - pop <- na.replace(pop,"0", quiet=quiet) + else if (type %in% c("zero","0")){ + pop <- na.replace(pop, "0", quiet=quiet) } } else{ @@ -477,7 +524,7 @@ missingno <- function(pop, type = "loci", cutoff = 0.05, quiet=FALSE){ } #==============================================================================# -#' Split a or combine items within a data frame in \code{\link{genind}} objects. +#' Split a or combine items within a data frame in \code{\link{genind}} objects (DEPRECATED). #' #' Often, one way a lot of file formats fail is that they do not allow multiple #' population hierarchies. This can be circumvented, however, by coding all of @@ -510,12 +557,18 @@ missingno <- function(pop, type = "loci", cutoff = 0.05, quiet=FALSE){ #' @return a \code{\link{genind}} object with a modified data frame in the #' \code{\link{other}} slot. #' -#' @note The separator field is sensitive to regular expressions. If you do not -#' know what those are, please use the default underscore to separate your -#' populations. Use \code{fixed = TRUE} to ignore regular expressions. -#' If you do not set the \code{hier} flag for the split method, your new data -#' frame will have the names "comb", "h1", "h2" and so on; for the combine -#' method, your data frame will return the first column of your data frame. +#' @note +#' This function has been deprecated and replaced by functions like +#' \code{\link{splithierarchy}}. Please consider using the +#' \code{\linkS4class{genclone}} object for storing hierarchies. +#' +#' The separator +#' field is sensitive to regular expressions. If you do not know what those are, +#' please use the default underscore to separate your populations. Use \code{fixed +#' = TRUE} to ignore regular expressions. If you do not set the \code{hier} flag +#' for the split method, your new data frame will have the names "comb", "h1", "h2" +#' and so on; for the combine method, your data frame will return the first column +#' of your data frame. #' #' @export #' @author Zhian N. Kamvar @@ -552,7 +605,7 @@ missingno <- function(pop, type = "loci", cutoff = 0.05, quiet=FALSE){ #' H.comb <- splitcombine(H.comb, method=1, dfname="year_country", hier=c("year", "country")) #' } #==============================================================================# -splitcombine <- function(pop, method=1, dfname="population_hierarchy", sep="_", hier=c(1), setpopulation=TRUE, fixed=TRUE){ +splitcombine <- function(pop, method=1, dfname="population_hierarchy", sep="_", hier=1, setpopulation=TRUE, fixed=TRUE){ if (!is.genind(pop)){ stop(paste(paste(substitute(pop), collapse=""), "is not a genind object.\n")) } @@ -652,54 +705,61 @@ splitcombine <- function(pop, method=1, dfname="population_hierarchy", sep="_", #' Remove all non-phylogentically informative loci #' #' This function will facilitate in removing phylogenetically uninformative loci -#' from a \code{\link{genind}} object. The user can specify what is meant by -#' phylogenetically uninformative with a specification of the cutoff percentage. -#' Any loci under the cutoff will be removed. For convenience's sake, the -#' default cutoff is set to 2 individuals. -#' -#' @param pop a \code{\link{genind}} object. -#' -#' @param cutoff \code{numeric}. This is a number from 0 to 1 representing the -#' minimum percentage of differentiating individuals. Defaults is 2 individuals. -#' -#' @param quiet \code{logical}. When \code{quiet = TRUE}, messages indicating -#' the loci removed will be printed to screen. When \code{quiet = FALSE}, -#' nothing will be printed to screen. -#' +#' from a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object. +#' The user can specify what is meant by phylogenetically uninformative with a +#' specification of the cutoff percentage. Any loci under the cutoff will be +#' removed. For convenience's sake, the default cutoff is set to 2 individuals. +#' +#' @param pop a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. +#' +#' @param cutoff \code{numeric}. This is a number from 0 to 1 representing the +#' minimum percentage of differentiating individuals. Defaults is 2 +#' individuals. +#' +#' @param quiet \code{logical}. When \code{quiet = TRUE}, messages indicating +#' the loci removed will be printed to screen. When \code{quiet = FALSE}, +#' nothing will be printed to screen. +#' #' @return A \code{genind} object with user-defined informative loci. -#' -#' @note This will have a few side effects that affect certain analyses. First, -#' the number of multilocus genotypes might be reduced due to the reduced number -#' of markers. Second, if you plan on using this data for analysis of the index -#' of association, be sure to use the standardized version (rbarD) that corrects -#' for the number of observed loci. -#' +#' +#' @details This function works by analyzing the genotypes at each locus. This +#' has the effect that if a locus has fixed heterozygotes for two alleles, it +#' will be removed as all individuals are invariant. +#' +#' @note This will have a few side effects that affect certain analyses. First, +#' the number of multilocus genotypes might be reduced due to the reduced +#' number of markers. Second, if you plan on using this data for analysis of +#' the index of association, be sure to use the standardized version (rbarD) +#' that corrects for the number of observed loci. +#' +#' @author Zhian N. Kamvar #' @examples #' # Load the data set H3N2 #' data(H3N2) #' pop(H3N2) <- H3N2$other$x$country #' Nepal <- popsub(H3N2, "Nepal") -#' +#' #' # Using the default 2 individuals. #' N.inform <- informloci(Nepal) -#' +#' #' # 5 individuals. #' N.informfive <- informloci(Nepal, cutoff = 5/nInd(Nepal)) -#' +#' #' # 10 individuals. Too many. Gives warning. #' N.informten <- informloci(Nepal, cutoff = 10/nInd(Nepal)) -#' +#' #' # Decimate (10%) #' N.informdecimated <- informloci(Nepal, cutoff = 0.1) #' @export #==============================================================================# - +#' @importFrom pegas as.loci informloci <- function(pop, cutoff = 2/nInd(pop), quiet = FALSE){ - if(!is.genind(pop)){ + if (!is.genind(pop)){ stop("This function only works on genind objects.") } MLG <- mlg(pop, quiet = TRUE) - if(MLG < 3){ + if (MLG < 3){ if(!isTRUE(quiet)){ cat("Not enough multilocus genotypes to be meaningful.\n") } @@ -707,10 +767,10 @@ informloci <- function(pop, cutoff = 2/nInd(pop), quiet = FALSE){ } cutoff <- ifelse(cutoff > 0.5, 1 - cutoff, cutoff) min_ind = round(cutoff*nInd(pop)) - if(!isTRUE(quiet)){ + if (!isTRUE(quiet)){ cat("cutoff value:", cutoff*100, "percent (",min_ind,"individuals ).\n") } - if(pop@type == "PA"){ + if (pop@type == "PA"){ # cutoff applies to too many or too few typed individuals in AFLP cases. locivals <- apply(pop@tab, 2, sum) %in% min_ind:(nInd(pop) - min_ind) if(!isTRUE(quiet)){ @@ -725,26 +785,22 @@ informloci <- function(pop, cutoff = 2/nInd(pop), quiet = FALSE){ } } return(pop[, locivals]) - } - else{ + } else { # as.loci will put the population factor first when creating the data frame. - if(is.null(pop@pop)){ + if (is.null(pop@pop)){ locivals <- apply(as.loci(pop), 2, test_table, min_ind, nInd(pop)) - } - else{ + } else { locivals <- apply(as.loci(pop)[-1], 2, test_table, min_ind, nInd(pop)) } - if(!isTRUE(quiet)){ - if(all(locivals == TRUE)){ + if (!isTRUE(quiet)){ + if (all(locivals == TRUE)){ cat("No sites found with fewer than", min_ind, "different individuals.\n", fill = 80) - } - else if(sum(locivals) < 2){ + } else if (sum(locivals) < 2){ cat("Fewer than 2 loci found informative. Perhaps you should choose a", "lower cutoff value?\nReturning with no changes.\n") return(pop) - } - else{ + } else { cat(sum(!locivals), "uninformative", ifelse(sum(!locivals) > 1, "loci", "locus"), "found:", pop@loc.names[!locivals],"\n", fill = 80) @@ -754,4 +810,136 @@ informloci <- function(pop, cutoff = 2/nInd(pop), quiet = FALSE){ } } +#==============================================================================# +#' Recode polyploid microsatellite data for use in frequency based statistics. +#' +#' As the genind object requires ploidy to be consistent across loci, a +#' workaround to importing polyploid data was to code missing alleles as "0" +#' (for microsatellite data sets). The advantage of this is that users would be +#' able to calculate Bruvo's distance, the index of association, and genotypic +#' diversity statistics. The tradeoff was the fact that this broke all other +#' analyses as they relied on allele frequencies and the missing alleles are +#' treated as extra alleles. This function removes those alleles and returns a +#' \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object where +#' allele frequencies are coded based on the number of alleles observed at a +#' single locus per individual. See the examples for more details. +#' +#' @param poly a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object that has a ploidy of >2 +#' @param newploidy an \code{integer}. This gives the user the option to reset +#' the ploidy of the data set. It's default is set to the ploidy of the +#' incoming data set. +#' +#' @return a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. +#' +#' @details The genind object has two caveats that make it difficult to work +#' with polyploid data sets: \enumerate{\item ploidy must be constant +#' throughout the data set \item missing data is treated as "all-or-none"} In +#' an ideal world, polyploid genotypes would be just as unambigouous as +#' diploid or haploid genotypes. Unfortunately, the world we live in is far +#' from ideal and a genotype of AB in a tetraploid organism could be AAAB, +#' AABB, or ABBB. In order to get polyploid data in to \pkg{adegenet} or +#' \pkg{poppr}, we must code all loci to have the same number of allelic +#' states as the ploidy or largest observed heterozygote (if ploidy is +#' unknown). The way to do this is to insert zeroes to pad the alleles. So, to +#' import two genotypes of: +#' \tabular{rrrr}{ +#' NA \tab 20 \tab 23 \tab 24\cr +#' 20 \tab 24 \tab 26 \tab 43 +#' } +#' they should be coded as: +#' \tabular{rrrr}{ +#' 0 \tab 20 \tab 23 \tab 24\cr +#' 20 \tab 24 \tab 26 \tab 43 +#' } +#' This zero is treated as an extra allele and is represented in the genind object as so: +#' \tabular{rrrrrr}{ +#' \strong{0} \tab \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr +#' 0.25 \tab 0.25 \tab 0.25 \tab 0.25 \tab 0.00 \tab 0.00\cr +#' 0.00 \tab 0.25 \tab 0.00 \tab 0.25 \tab 0.25 \tab 0.25 +#' } +#' +#' A homozygote would have the \strong{0} column at a value of 0.75. This +#' function remidies this problem by removing the zero column and rescaling the allele +#' frequencies to those observed. The above table would become: +#' \tabular{rrrrr}{ +#' \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr +#' 0.333 \tab 0.333 \tab 0.333 \tab 0.00 \tab 0.00\cr +#' 0.25 \tab 0.00 \tab 0.25 \tab 0.25 \tab 0.25 +#' } +#' +#' With this, the user is able to calculate frequency based statistics on the +#' data set. +#' +#' @note This is an approximation, and a bad one at that. \pkg{Poppr} was not +#' originally intended for polyploids, but with the inclusion of Bruvo's +#' distance, it only made sense to attempt something beyond single use. +#' +#' \strong{Do not use recoded data with Bruvo's distance or the index of association.} +#' +#' @author Zhian N. Kamvar +#' @export +#' @examples +#' data(Pinf) +#' iPinf <- recode_polyploids(Pinf) +#' +#' # Obtaining basic summaries. Note the heterozygosity measures. +#' summary(Pinf) +#' summary(iPinf) +#' +#' \dontrun{ +#' library(ape) +#' +#' # Removing missing data. +#' Pinf <- missingno(Pinf, "geno", cutoff = 0) +#' iPinf <- recode_polyploids(Pinf) +#' +#' # Calculating Rogers' distance. +#' rog <- rogers.dist(Pinf) +#' irog <- rogers.dist(iPinf) +#' +#' # We will now plot neighbor joining trees. Note the decreased distance in the +#' # original data. +#' plot(nj(rog), type = "unrooted", show.tip.label = FALSE) +#' add.scale.bar(lcol = "red") +#' plot(nj(irog), type = "unrooted", show.tip.label = FALSE) +#' add.scale.bar(lcol = "red") +#' } +#==============================================================================# +recode_polyploids <- function(poly, newploidy = poly@ploidy){ + if (!is.genind(poly)){ + stop("input must be a genind object.") + } else if (!test_zeroes(poly)){ + warning("Input is not a polyploid data set, returning original.") + return(poly) + } + if (!is.null(pop(poly))){ + MAT <- truenames(poly)$tab + } else { + MAT <- truenames(poly) + } + fac <- poly@loc.fac + zerocols <- as.numeric(unlist(poly@all.names)) == 0 #!duplicated(fac) + newfac <- fac[!zerocols] + loci <- lapply(split(MAT, fac[col(MAT)]), matrix, nrow = nInd(poly)) + loci <- lapply(1:length(loci), function(x){ + locus <- loci[[x]] + alleles <- as.numeric(poly@all.names[[x]]) + return(locus[, alleles > 0]) + }) + loci <- lapply(loci, function(mat) t(apply(mat, 1, function(x) x/sum(x)))) + newMAT <- matrix(nrow = nInd(poly), ncol = length(newfac)) + newMAT[] <- unlist(loci) + colnames(newMAT) <- colnames(MAT)[!zerocols] + rownames(newMAT) <- rownames(MAT) + newgen <- genind(newMAT, pop = pop(poly), ploidy = newploidy, type = poly@type) + newgen@other <- poly@other + if (is.genclone(poly)){ + newgen <- new('genclone', newgen, poly@hierarchy, poly@mlg) + } + return(newgen) +} + + diff --git a/R/distances.r b/R/distances.r index 8a3d1e93..a5f9edc5 100644 --- a/R/distances.r +++ b/R/distances.r @@ -42,103 +42,459 @@ #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #==============================================================================# -#' Display a greyscale gradient adjusted to specific parameters -#' -#' This function has one purpose. It is for deciding the appropriate scaling for -#' a grey palette to be used for edge weights of a minimum spanning network. -#' -#' @param glim "grey limit". Two numbers between zero and one. They determine -#' the upper and lower limits for the \code{\link{gray}} function. Default is 0 -#' (black) and 0.8 (20\% black). -#' -#' @param gadj "grey adjust". a positive \code{integer} greater than zero that -#' will serve as the exponent to the edge weight to scale the grey value to -#' represent that weight. -#' -#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale -#' will be weighted to emphasize the differences between closely related nodes. -#' If it is 2, the grey scale will be weighted to emphasize the differences -#' between more distantly related nodes. +#' Calculate a distance matrix based on relative dissimilarity #' -#' @return A plot displaying a grey gradient from 0.001 to 1 with minimum and -#' maximum values displayed as yellow lines, and an equation for the correction -#' displayed in red. +#' diss.dist uses the same discreet dissimilarity matrix utilized by the index +#' of association (see \code{\link{ia}} for details). By default, it returns a +#' distance reflecting the number of allelic differences between two +#' individuals. When \code{percent = TRUE}, it returns a ratio of the number of +#' observed differences by the number of possible differences. Eg. two +#' individuals who share half of the same alleles will have a distance of 0.5. +#' This function can analyze distances for any marker system. #' +#' @param x a \code{\link{genind}} object. +#' +#' @param percent \code{logical}. Should the distance be represented as a +#' percent? If set to \code{FALSE} (default), the distance will be reflected +#' as the number of alleles differing between to individuals. When set to +#' \code{TRUE}, These will be divided by the ploidy multiplied by the number +#' of loci. +#' +#' @param mat \code{logical}. Return a matrix object. Default set to +#' \code{FALSE}, returning a dist object. \code{TRUE} returns a matrix object. +#' +#' @return Pairwise distances between individuals present in the genind object. #' @author Zhian N. Kamvar -#' +#' +#' @note When \code{percent = TRUE}, this is exactly the same as +#' \code{\link{provesti.dist}}, except that it performs better for large +#' numbers of individuals (n > 125) at the cost of avaliable memory. +#' #' @examples -#' # Normal grey curve with an adjustment of 3, an upper limit of 0.8, and -#' # weighted towards smaller values. -#' greycurve() -#' \dontrun{ -#' # 1:1 relationship grey curve. -#' greycurve(gadj=1, glim=1:0) #' -#' # Grey curve weighted towards larger values. -#' greycurve(gweight=2) -#' -#' # Same as the first, but the limit is 1. -#' greycurve(glim=1:0) +#' # A simple example. Let's analyze the mean distance among populations of A. +#' # euteiches. #' -#' # Setting the lower limit to 0.1 and weighting towards larger values. -#' greycurve(glim=c(0.1,0.8), gweight=2) +#' data(Aeut) +#' mean(diss.dist(popsub(Aeut, 1))) +#' \dontrun{ +#' mean(diss.dist(popsub(Aeut, 2))) +#' mean(diss.dist(Aeut)) #' } #' @export #==============================================================================# -greycurve <- function(glim = c(0,0.8), gadj = 3, gweight = 1){ - gadj <- ifelse(gweight == 1, gadj, -gadj) - adjustcurve(seq(0.001, 1, 0.001), glim, correction=gadj, show=TRUE) + +diss.dist <- function(x, percent=FALSE, mat=FALSE){ + stopifnot(is.genind(x)) + ploid <- ploidy(x) + ind.names <- x@ind.names + inds <- nrow(x@tab) + np <- choose(inds, 2) + dist.mat <- matrix(data = 0, nrow = inds, ncol = inds) + numLoci <- length(x@loc.names) + type <- x@type + if (type == "PA"){ + dist_by_locus <- matrix(.Call("pairdiffs", x@tab)) + ploid <- 1 + } else { + x <- seploc(x) + dist_by_locus <- vapply(x, function(x) .Call("pairdiffs", x@tab)*(ploid/2), + numeric(np)) + } + dist.mat[lower.tri(dist.mat)] <- rowSums(dist_by_locus) + colnames(dist.mat) <- ind.names + rownames(dist.mat) <- ind.names + if (percent){ + dist.mat <- dist.mat/(numLoci*ploid) + } + dist.mat <- as.dist(dist.mat) + if (mat == TRUE){ + dist.mat <- as.matrix(dist.mat) + } + return(dist.mat) } + #==============================================================================# -#' Calculate a distance matrix based on relative dissimilarity +# Calculating Nei's distance for a genind object. Lifted from dist.genpop with +# modifications. +#' Calculate Genetic Distance for a genind or genclone object. #' -#' diss.dist uses the same discreet dissimilarity matrix utilized by the index -#' of association (see \code{\link{ia}} for details). It returns a distance -#' reflecting a ratio of the number of observed differences by the number of -#' possible differences. Eg. two individuals who share half of the same alleles -#' will have a distance of 0.5. This function can analyze distances for any -#' marker system. +#' These functions are modified from the function \link[adegenet]{dist.genpop} to +#' be applicable for distances between individuals. #' -#' @param pop a \code{\link{genind}} object. +#' @param x a \linkS4class{genind}, \linkS4class{genclone}, or matrix object. +#' +#' @param warning If \code{TRUE}, a warning will be printed if any infinite +#' values are detected and replaced. If \code{FALSE}, these values will be +#' replaced without warning. See Details below. +#' +#' @return an object of class dist with the same number of observations as the +#' number of individuals in your data. +#' +#' @details It is important to be careful with the interpretation of these +#' distances as they were originally intended for calculation of +#' between-population distance. As Nei's distance is the negative log of 0:1, +#' this means that it is very possible to obtain distances of infinity. When +#' this happens, infinite values are corrected to be 10 * max(D) where D is +#' the distance matrix without infinite values. +#' +#' @note Provesti's distance is identical to \code{\link{diss.dist}}, except +#' that \code{\link{diss.dist}} is optimized for a larger number of +#' individuals (n > 125) at the cost of required memory. +#' +#' These distances were adapted from the \pkg{adegenet} function +#' \code{\link{dist.genpop}} to work with \code{\linkS4class{genind}} objects. #' -#' @return A distance object. -#' @author Zhian N. Kamvar -#' -#' @examples +#' @seealso \code{\link{aboot}} \code{\link{diss.dist}} \code{\link{poppr.amova}} +#' @rdname genetic_distance +#' @author Zhian N. Kamvar (poppr adaptation) +#' Thibaut Jombart (adegenet adaptation) +#' Daniel Chessel (ade4) +#' +#' @keywords nei rogers rodgers reynolds coancestry edwards angular provesti +#' @references +#' Nei, M. (1972) Genetic distances between populations. American Naturalist, +#' 106, 283–292. #' -#' # A simple example. Let's analyze the mean distance among populations of A. -#' # euteiches. +#' Nei M. (1978) Estimation of average heterozygosity and genetic +#' distance from a small number of individuals. Genetics, 23, 341–369. #' -#' data(Aeut) -#' mean(diss.dist(popsub(Aeut, 1))) -#' \dontrun{ -#' mean(diss.dist(popsub(Aeut, 2))) -#' mean(diss.dist(Aeut)) -#' } +#' Avise, J. C. (1994) Molecular markers, natural history and evolution. Chapman & Hall, +#' London. +#' +#' Edwards, A.W.F. (1971) Distance between populations on the basis of gene +#' frequencies. Biometrics, 27, 873–881. +#' +#' Cavalli-Sforza L.L. and Edwards A.W.F. +#' (1967) Phylogenetic analysis: models and estimation procedures. Evolution, +#' 32, 550–570. +#' +#' Hartl, D.L. and Clark, A.G. (1989) Principles of population +#' genetics. Sinauer Associates, Sunderland, Massachussetts (p. 303). +#' +#' Reynolds, J. B., B. S. Weir, and C. C. Cockerham. (1983) Estimation of the +#' coancestry coefficient: basis for a short-term genetic distance. Genetics, +#' 105, 767–779. +#' +#' Rogers, J.S. (1972) Measures of genetic similarity and genetic distances. +#' Studies in Genetics, Univ. Texas Publ., 7213, 145–153. +#' +#' Avise, J. C. (1994) +#' Molecular markers, natural history and evolution. Chapman & Hall, London. +#' +#' Prevosti A. (1974) La distancia genetica entre poblaciones. Miscellanea +#' Alcobe, 68, 109–118. +#' +#' Prevosti A., Oca\~na J. and Alonso G. (1975) Distances +#' between populations of Drosophila subobscura, based on chromosome +#' arrangements frequencies. Theoretical and Applied Genetics, 45, 231–241. +#' +#' For more information on dissimilarity indexes: +#' +#' Gower J. and Legendre P. (1986) +#' Metric and Euclidean properties of dissimilarity coefficients. Journal of +#' Classification, 3, 5–48 +#' +#' Legendre P. and Legendre L. (1998) Numerical Ecology, +#' Elsevier Science B.V. 20, pp274–288. #' @export +#' @examples +#' +#' data(nancycats) +#' nan9 <- popsub(nancycats, 9) +#' neinan <- nei.dist(nan9) +#' ednan <- edwards.dist(nan9) +#' rodnan <- rogers.dist(nan9) +#' reynan <- reynolds.dist(nan9) +#' pronan <- provesti.dist(nan9) +#' #==============================================================================# +nei.dist <- function(x, warning = TRUE){ + if (is(x, "gen")) + MAT <- get_gen_mat(x) + else if (length(dim(x)) == 2) + MAT <- x + else + stop("Object must be a matrix or genind object") + IDMAT <- MAT %*% t(MAT) + vec <- sqrt(diag(IDMAT)) + IDMAT <- IDMAT/vec[col(IDMAT)] + IDMAT <- IDMAT/vec[row(IDMAT)] + D <- -log(IDMAT) + if (any(D %in% Inf)){ + D <- infinite_vals_replacement(D, warning) + } + D <- as.dist(D) + labs <- get_gen_dist_labs(x) + D <- make_attributes(D, length(labs), labs, "Nei", match.call()) + return(D) +} -diss.dist <- function(pop){ - ploid <- ploidy(pop) - ind.names <- pop@ind.names - inds <- nInd(pop) - np <- choose(inds, 2) - dist.vec <- matrix(data = 0, nrow=inds, ncol=inds) - if(pop@type == "PA"){ - dist.vec[lower.tri(dist.vec)] <- .Call("pairdiffs",pop@tab) +# modified from adegenet dist.genpop + +#' @rdname genetic_distance +#' @export +edwards.dist <- function(x){ + if (is(x, "gen")){ + MAT <- get_gen_mat(x) + nloc <- length(x@loc.names) + } else if (length(dim(x)) == 2){ + MAT <- x + nloc <- ncol(x) + } else{ + stop("Object must be a matrix or genind object") + } + MAT <- sqrt(MAT) + D <- MAT%*%t(MAT) + D <- 1 - D/nloc # Negative number are generated here. + diag(D) <- 0 + D <- sqrt(D) + D <- as.dist(D) + labs <- get_gen_dist_labs(x) + D <- make_attributes(D, length(labs), labs, "Edwards", match.call()) + return(D) +} + + +#' @rdname genetic_distance +#' @export +rogers.dist <- function(x){ + if (is(x, "gen")){ + if (is.genind(x) && x@type == "PA"){ + MAT <- x@tab + nloc <- length(x@loc.names) + loc.fac <- factor(x@loc.names, levels = x@loc.names) + nlig <- nrow(x@tab) + } else { + MAT <- get_gen_mat(x) + nloc <- length(x@loc.names) + loc.fac <- x@loc.fac + nlig <- nrow(x@tab) + } + } + else if (length(dim(x)) == 2){ + MAT <- x + nloc <- ncol(x) + loc.fac <- factor(colnames(x), levels = colnames(x)) + nlig <- nrow(x) + } + else{ + stop("Object must be a matrix or genind object") + } + # kX is a list of K=nloc matrices + kX <- lapply(split(MAT, loc.fac[col(MAT)]), matrix, nrow = nlig) + D <- matrix(0, nlig, nlig) + for(i in 1:length(kX)){ + D <- D + dcano(kX[[i]]) + } + D <- D/length(kX) + D <- as.dist(D) + labs <- get_gen_dist_labs(x) + D <- make_attributes(D, nlig, labs, "Rogers", match.call()) + return(D) +} + +#' @rdname genetic_distance +#' @export +reynolds.dist <- function(x){ + if (is(x, "gen")){ + MAT <- get_gen_mat(x) + nloc <- length(x@loc.names) + } + else if (length(dim(x)) == 2){ + MAT <- x + nloc <- ncol(x) } else{ - pop <- seploc(pop) - numLoci <- length(pop) - temp.d.vector <- matrix(nrow = np, ncol = numLoci, data = as.numeric(NA)) - temp.d.vector <- vapply(pop, function(x) .Call("pairdiffs",x@tab)*(ploid/2), - temp.d.vector[, 1]) - dist.vec[lower.tri(dist.vec)] <- rowSums(temp.d.vector) - } - colnames(dist.vec) <- ind.names - rownames(dist.vec) <- ind.names - loci <- ifelse(is.list(pop), length(pop), nLoc(pop)) - return(as.dist(dist.vec/(loci*ploid))) + stop("Object must be a matrix or genind object") + } + denomi <- MAT %*% t(MAT) + vec <- apply(MAT, 1, function(x) sum(x*x)) + D <- -2*denomi + vec[col(denomi)] + vec[row(denomi)] + diag(D) <- 0 + denomi <- 2*nloc - 2*denomi + diag(denomi) <- 1 + D <- D/denomi + D <- sqrt(D) + D <- as.dist(D) + labs <- get_gen_dist_labs(x) + D <- make_attributes(D, length(labs), labs, "Reynolds", + match.call()) + return(D) } +#' @rdname genetic_distance +#' @export +provesti.dist <- function(x){ + if (is(x, "gen")){ + MAT <- get_gen_mat(x) + nlig <- nrow(x@tab) + nloc <- length(x@loc.names) + ploid <- x@ploidy + } else if (length(dim(x)) == 2){ + MAT <- x + nlig <- nrow(x) + nloc <- ncol(x) + } else { + stop("Object must be a matrix or genind object") + } + w0 <- 1:(nlig-1) + loca <- function(k, nlig, MAT, nLoc){ + w1 <- (k+1):nlig + resloc <- vapply(w1, function(y) sum(abs(MAT[k, ] - MAT[y, ]), na.rm = TRUE), numeric(1)) + if (x@type == "codom"){ + # This only applies to codominant data because dominant data can only take + # on a single state. Dividing by two indicates that the observations can + # occupy co-occurring states. + resloc <- resloc/2 + } + return(resloc/nloc) + } + + d <- unlist(lapply(w0, loca, nlig, MAT, nLoc)) + + labs <- get_gen_dist_labs(x) + d <- make_attributes(d, nlig, labs, "Provesti", match.call()) + + # resmat <- matrix(numeric(0), nlig, nlig) + # resmat[lower.tri(resmat)] <- d + # d <- as.dist(resmat) + # attr(d, "Labels") <- labs + return(d) +} + + +#==============================================================================# +#' Calculate a dendrogram with bootstrap support using any distance applicable +#' to genind or genclone objects. +#' +#' @param x a \linkS4class{genind}, \linkS4class{genclone}, or matrix object. +#' +#' @param tree one of "upgma" (Default) or "nj" defining the type of dendrogram +#' to be produced, UPGMA or Neighbor-Joining. +#' +#' @param distance a character or function defining the distance to be applied +#' to x. Defaults to \code{\link{nei.dist}}. +#' +#' @param sample An integer representing the number of bootstrap replicates +#' Default is 100. +#' +#' @param cutoff An integer from 0 to 100 setting the cutoff value to return the +#' bootstrap values on the nodes. Default is 0. +#' +#' @param showtree If \code{TRUE} (Default), a dendrogram will be plotted. If +#' \code{FALSE}, nothing will be plotted. +#' +#' @param missing any method to be used by \code{\link{missingno}}: "mean" +#' (default), "zero", "loci", "genotype", or "ignore". +#' +#' @param quiet if \code{FALSE} (Default), a progress bar will be printed to +#' screen. +#' +#' @param ... any parameters to be passed off to the distance method. +#' +#' @return an object of class \code{\link[ape]{phylo}}. +#' +#' @details This function utilizes an internal class called +#' \code{\linkS4class{bootgen}} that allows bootstrapping of objects that +#' inherit the genind class. This is necessary due to the fact that columns in +#' the genind matrix are defined as alleles and are thus interrelated. This +#' function will specifically bootstrap loci so that results are biologically +#' relevant. With this function, the user can also define a custom distance to +#' be performed on the genind or genclone object. +#' +#' @note \code{\link{provesti.dist}} and \code{\link{diss.dist}} are exactly the +#' same, but \code{\link{diss.dist}} scales better for large numbers of +#' individuals (n > 125) at the cost of required memory. \subsection{missing +#' data}{Missing data is not allowed by many of the distances. Thus, one of +#' the first steps of this function is to treat missing data by setting it to +#' the average allele frequency in the data set. If you are using a distance +#' that can handle missing data (Provesti's distance), you can set +#' \code{missing = "ignore"} to allow the distance function to handle any +#' missing data. See \code{\link{missingno}} for details on missing +#' data.}\subsection{Bruvo's Distance}{While calculation of Bruvo's distance +#' is possible with this function, it is optimized in the function +#' \code{\link{bruvo.boot}}.} +#' +#' @seealso \code{\link{nei.dist}} \code{\link{edwards.dist}} +#' \code{\link{rogers.dist}} \code{\link{reynolds.dist}} +#' \code{\link{provesti.dist}} \code{\link{diss.dist}} +#' \code{\link{bruvo.boot}} \code{\link[ape]{boot.phylo}} +#' \code{\link[adegenet]{dist.genpop}} \code{\link{dist}} +#' +#' @export +#' @keywords bootstrap +#' @aliases bootstrap +#' @examples +#' +#' data(nancycats) +#' nan9 <- popsub(nancycats, 9) +#' +#' set.seed(9999) +#' # Generate a tree using nei's distance +#' neinan <- aboot(nan9, dist = nei.dist) +#' +#' set.seed(9999) +#' # Generate a tree using custom distance +#' bindist <- function(x) dist(x$tab, method = "binary") +#' binnan <- aboot(nan9, dist = bindist) +#' +#' \dontrun{ +#' # AFLP data +#' data(Aeut) +#' +#' # Nei's distance +#' anei <- aboot(Aeut, dist = nei.dist, sample = 1000, cutoff = 50) +#' +#' # Rogers' distance +#' arog <- aboot(Aeut, dist = rogers.dist, sample = 1000, cutoff = 50) +#' +#' # This can also be run on genpop objects +#' Aeut.gc <- as.genclone(Aeut, hierarchy=other(Aeut)$population_hierarchy[-1]) +#' setpop(Aeut.gc) <- ~Pop/Subpop +#' Aeut.pop <- genind2genpop(Aeut.gc) +#' set.seed(5000) +#' aboot(Aeut.pop) # compare to Grunwald et al. 2006 +#' +#' } +#==============================================================================# +aboot <- function(x, tree = "upgma", distance = "nei.dist", sample = 100, + cutoff = 0, showtree = TRUE, missing = "mean", quiet = FALSE, + ...){ + if (is.genind(x)){ + x <- missingno(x, missing, cutoff = 0) + } + if (x@type == "PA"){ + xboot <- x@tab + colnames(xboot) <- locNames(x) + if (is.genpop(x)){ + rownames(xboot) <- x@pop.names + } else { + rownames(xboot) <- indNames(x) + } + } else { + xboot <- new("bootgen", x) + } + ARGS <- c("nj", "upgma") + treearg <- match.arg(tree, ARGS) + treefunk <- tree_generator(treearg, distance, ...) + xtree <- treefunk(xboot) + if (any(xtree$edge.len < 0)){ + xtree <- fix_negative_branch(xtree) + } + root <- ifelse(treearg == "nj", FALSE, TRUE) + nodelabs <- boot.phylo(xtree, xboot, treefunk, B = sample, rooted = root, quiet = quiet) + nodelabs <- (nodelabs/sample)*100 + nodelabs <- ifelse(nodelabs >= cutoff, nodelabs, NA) + if (is.genind(x)){ + xtree$tip.label <- indNames(x) + } else { + xtree$tip.label <- x@pop.names + } + xtree$node.label <- nodelabs + if (showtree){ + poppr.plot.phylo(xtree, tree) + } + return(xtree) +} \ No newline at end of file diff --git a/R/file_handling.r b/R/file_handling.r index c1751499..6d646af9 100644 --- a/R/file_handling.r +++ b/R/file_handling.r @@ -56,10 +56,10 @@ #' \code{\link{poppr.all}}. #' #' @param pattern a \code{\link{regex}} pattern for use while -#' \code{multFile==TRUE}. +#' \code{multi == TRUE}. This will grab all files matching this pattern. #' -#' @param combine \code{logical}. When this is set to \code{TRUE}, the -#' \code{\$files} vector will have the path appended to them. When it is set to +#' @param combine \code{logical}. When this is set to \code{TRUE} (default), the +#' \code{$files} vector will have the path appended to them. When it is set to #' \code{FALSE}, it will have the basename. #' #' @return \item{path}{a character string of the absolute path to the @@ -73,10 +73,8 @@ #' #' x <- getfile() #' poppr(x$files) -#' -#' #' -#' y <- getfile(multFile=TRUE, pattern="^.+?dat$") +#' y <- getfile(multi=TRUE, pattern="^.+?dat$") #' #useful for reading in multiple FSTAT formatted files. #' #' yfiles <- poppr.all(y$files) @@ -143,47 +141,86 @@ getfile <- function(multi=FALSE, pattern=NULL, combine=TRUE){ } #==============================================================================# #' Importing data from genalex formatted *.csv files. -#' -#' read.genalex will read in a genalex-formatted file that has been exported in -#' a comma separated format and will parse most types of genalex data. The -#' output is a \code{\link{genind}} object. -#' -#' @param genalex a *.csv file exported from genalex -#' -#' @param ploidy indicate the ploidy of the dataset -#' -#' @param geo indicates the presence of geographic data in the file. #' -#' @param region indicates the presence of regional data in the file. +#' read.genalex will read in a genalex-formatted file that has been exported in +#' a comma separated format and will parse most types of genalex data. The +#' output is a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. #' -#' @return A \code{\link{genind}} object. -#' -#' @note This function cannot handle raw allele frequency data. -#' -#' The resulting genind object will have a data frame in the \code{other} slot called -#' population_hierarchy. This will contain a column for your population data and -#' a column for your Regional data if you have set the flag. -#' -#' If there is geographic data, it will be included in a data frame called xy in -#' the \code{other} slot. -#' -#' In the case that there are duplicated names within the file, this function -#' will assume separate individuals and rename each one to a sequence of -#' integers from 1 to the number of individuals. A vector of the original names -#' will be saved in the \code{other} slot under \code{original_names}. -#' -#' @seealso \code{\link{clonecorrect}}, \code{\link{genind}} -#' +#' @param genalex a *.csv file exported from genalex +#' +#' @param ploidy indicate the ploidy of the dataset +#' +#' @param geo indicates the presence of geographic data in the file. This data +#' will be included in a data frame labeled \code{xy} in the +#' \code{\link{other}} slot. +#' +#' @param region indicates the presence of regional data in the file. +#' +#' @param genclone when \code{TRUE} (default), the output will be a +#' \code{\linkS4class{genclone}} object. When \code{FALSE}, the output will be +#' a \code{\linkS4class{genind}} object +#' +#' @param sep A character specifying the column separator of the data. Defaults +#' to ",". +#' +#' @return A \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. +#' +#' @note This function cannot handle raw allele frequency data. +#' +#' In the case that there are duplicated names within the file, this function +#' will assume separate individuals and rename each one to a sequence of +#' integers from 1 to the number of individuals. A vector of the original +#' names will be saved in the \code{other} slot under \code{original_names}. +#' +#' +#' @details \subsection{if \code{genclone = FALSE}}{ The resulting genind object +#' will have a data frame in the \code{other} slot called +#' \code{population_hierarchy}. This will contain a column for your population +#' data and a column for your Regional data if you have set the flag.} +#' +#' \subsection{if \code{genclone = TRUE}}{ The resulting genclone object will +#' have a single hierarchical level defined in the hierarchy slot. This will +#' be called "Pop" and will reflect the population factor defined in the +#' genalex input. If \code{region = TRUE}, a second column will be inserted +#' and labeled "Region". If you have more than two hierarchical levels within +#' your data set, you should run the command \code{\link{splithierarchy}} on +#' your data set to define the unique hierarchical levels. } +#' +#' \subsection{FOR POLYPLOID (> 2n) DATA SETS}{ Adegenet's genind object has +#' an all-or-none approach to missing data. If a sample has missing data at a +#' particular locus, then the entire locus is considered missing. This works +#' for diploids and haploids where allelic dosage is unambiguous. For +#' polyploids this poses a problem as much of the data set would be +#' transformed into missing data. With this function, I have created a +#' workaround. +#' +#' When importing polyploid data sets, missing data is scored as "0" and kept +#' within the genind object as an extra allele. This will break most analyses +#' relying on allele frequencies*. All of the functions in poppr will work +#' properly with these data sets as multilocus genotype analysis is agnostic +#' of ploidy and we have written both Bruvo's distance and the index of +#' association in such a way as to be able to handle polyploids presented in +#' this manner. +#' +#' * To restore functionality of analyses relying on allele frequencies, use +#' the \code{\link{recode_polyploids}} function.} +#' +#' +#' @seealso \code{\link{clonecorrect}}, \code{\linkS4class{genclone}}, +#' \code{\linkS4class{genind}}, \code{\link{recode_polyploids}} +#' #' @export #' @author Zhian N. Kamvar #' @examples #' #' \dontrun{ #' Aeut <- read.genalex(system.file("files/rootrot.csv", package="poppr")) -#' +#' #' genalex2 <- read.genalex("genalex2.csv", geo=TRUE) #' # A genalex file with geographic coordinate data. -#' +#' #' genalex3 <- read.genalex("genalex3.csv", region=TRUE) #' # A genalex file with regional information. #' @@ -192,122 +229,92 @@ getfile <- function(multi=FALSE, pattern=NULL, combine=TRUE){ #' } #==============================================================================# -read.genalex <- function(genalex, ploidy=2, geo=FALSE, region=FALSE){ +read.genalex <- function(genalex, ploidy=2, geo=FALSE, region=FALSE, + genclone = TRUE, sep = ","){ # The first two lines from a genalex file contain all of the information about # the structure of the file (except for ploidy and geographic info) - gencall <- match.call() - all.info <- strsplit(readLines(genalex, n=2), ",") - if (any(all.info[[1]]=="")){ - num.info <- as.numeric(all.info[[1]][-which(all.info[[1]]=="")]) - } - else { - num.info <- as.numeric(all.info[[1]]) - } - if (any(all.info[[2]]=="")){ - pop.info <- all.info[[2]][c(-1,-2,-3,-which(all.info[[2]]==""))] - } - else { - pop.info <- all.info[[2]][c(-1,-2,-3)] - } - - # glob.info is for the number of loci, individuals, and populations. - glob.info <- num.info[1:3] - if(any(is.na(glob.info))){ - stop("Something is wrong with your csv file. Perhaps it is not comma delimited?\n") - } - # cat("Global Information:",glob.info,"\n") - # cat("Populations:",pop.info,"\n") - # cat("num.info:",num.info,"\n") - - gena <- read.csv(genalex, header=TRUE, skip=2, check.names = FALSE) + gencall <- match.call() + + all.info <- strsplit(readLines(genalex, n = 2), sep) + gena <- read.table(genalex, sep = sep, header = TRUE, skip = 2, + stringsAsFactors = FALSE, check.names = FALSE) + num.info <- as.numeric(all.info[[1]]) + pop.info <- all.info[[2]][-c(1:3)] + num.info <- num.info[!is.na(num.info)] + pop.info <- pop.info[!pop.info %in% c("", NA)] + nloci <- num.info[1] + ninds <- num.info[2] + npops <- num.info[3] # Removing all null columns - if(!is.na(which(is.na(gena[1, ]))[1])){ - gena <- gena[, -which(is.na(gena[1, ]))] + if (any(is.na(gena[1, ]))){ + gena <- gena[, !is.na(gena[1, ])] } #----------------------------------------------------------------------------# # Checking for extra information such as Regions or XY coordinates #----------------------------------------------------------------------------# - # Creating vectors that correspond to the different information fields. - # If the regions are true, then the length of the pop.info should be equal to - # the number of populations "npop"(glob.info[3]) plus the number of regions - # which is the npop+4th entry in the vector. - # Note that this strategy will only work if the name of the first region does - # not match any of the populations. + # Creating vectors that correspond to the different information fields. If the + # regions are true, then the length of the pop.info should be equal to the + # number of populations "npop"(npops) plus the number of regions which is the + # npop+4th entry in the vector. Note that this strategy will only work if the + # name of the first region does not match any of the populations. - if (region==TRUE & length(pop.info) == glob.info[3] + num.info[glob.info[3]+4]){ + clm <- ncol(gena) + + if (region == TRUE & length(pop.info) == npops + num.info[npops + 4]){ # Info for the number of columns the loci can take on. - loci.adj <- c(glob.info[1], glob.info[1]*ploidy) + loci.adj <- c(nloci, nloci*ploidy) - # First question, do you have two or four extra columns? Two extra would - # indicate no geographic data. Four extra would indicate geographic data. + # First question, do you have two or four extra columns? Two extra would + # indicate no geographic data. Four extra would indicate geographic data. # Both of these indicate that, while a regional specification exists, a # column indicating the regions was not specified, so it needs to be created - if(((ncol(gena) %in% (loci.adj + 4)) & (geo == TRUE)) | (ncol(gena) %in% (loci.adj + 2))){ + if (((clm %in% (loci.adj + 4)) & (geo == TRUE)) | (clm %in% (loci.adj + 2))){ + pop.vec <- gena[, 2] ind.vec <- gena[, 1] - xy <- gena[, c((ncol(gena)-1), ncol(gena))] - - # Get the indices for the regions - region.inds <- ((glob.info[3]+5):length(num.info)) - # Get the number of individuals per region - reg.inds <- num.info[region.inds] - # Get the names of the regions - reg.names <- all.info[[2]][region.inds] - # Paste them all into a single vector. - reg.vec <- rep(reg.names, reg.inds) - if(geo == TRUE){ - geoinds <- c((ncol(gena)-1), ncol(gena)) - xy <- gena[, geoinds] - gena <- gena[, -geoinds] - } - else{ + xy <- gena[, c((clm-1), clm)] + region.inds <- ((npops + 5):length(num.info)) # Indices for the regions + reg.inds <- num.info[region.inds] # Number of individuals per region + reg.names <- all.info[[2]][region.inds] # Names of the regions + reg.vec <- rep(reg.names, reg.inds) # Paste into a single vector + if (geo == TRUE){ + geoinds <- c((clm - 1), clm) + xy <- gena[, geoinds] + gena <- gena[, -geoinds] + } else { xy <- NULL } - gena <- gena[, c(-1,-2)] - } - # - # The Regions are specified in one of the first two columns. - # - else{ - pop.vec <- ifelse(any(gena[, 1] == pop.info[1]), 1, 2) - reg.vec <- ifelse(pop.vec == 2, 1, 2) + gena <- gena[, c(-1, -2)] + + } else { + + pop.vec <- ifelse(any(gena[, 1] == pop.info[1]), 1, 2) + reg.vec <- ifelse(pop.vec == 2, 1, 2) orig.ind.vec <- NULL - # Regional Vector - reg.vec <- gena[, reg.vec] - # Population Vector - pop.vec <- gena[, pop.vec] - if(geo == TRUE){ - geoinds <- c((ncol(gena)-1), ncol(gena)) - xy <- gena[, geoinds] - gena <- gena[, -geoinds] - } - else{ + reg.vec <- gena[, reg.vec] # Regional Vector + pop.vec <- gena[, pop.vec] # Population Vector + if (geo == TRUE){ + geoinds <- c((clm-1), clm) + xy <- gena[, geoinds] + gena <- gena[, -geoinds] + } else { xy <- NULL } - # Individual Vector - ind.vec <- gena[, ncol(gena)] - # removing the non-genotypic columns from the data frame - gena <- gena[, c(-1,-2,-ncol(gena))] + ind.vec <- gena[, clm] # Individual Vector + gena <- gena[, c(-1,-2,-clm)] # removing the non-genotypic columns } - } - - # - # There are no Regions specified, but there are geographic coordinates - # - else if (geo == TRUE & length(pop.info) == glob.info[3]){ + } else if (geo == TRUE & length(pop.info) == npops){ + # There are no Regions specified, but there are geographic coordinates reg.vec <- NULL pop.vec <- gena[, 2] ind.vec <- gena[, 1] - xy <- gena[, c((ncol(gena)-1), ncol(gena))] - gena <- gena[, c(-1,-2,-(ncol(gena)-1),-ncol(gena))] - } - # - # There are no Regions or geographic coordinates - # - else{ + xy <- gena[, c((clm-1), clm)] + gena <- gena[, c(-1,-2,-(clm-1),-clm)] + } else { + # There are no Regions or geographic coordinates reg.vec <- NULL pop.vec <- gena[, 2] ind.vec <- gena[, 1] @@ -320,105 +327,103 @@ read.genalex <- function(genalex, ploidy=2, geo=FALSE, region=FALSE){ # reconstruct the matrix in a way that adegenet likes it. #----------------------------------------------------------------------------# - clm <- ncol(gena) + clm <- ncol(gena) gena.mat <- as.matrix(gena) - # Checking for diploid data. - if (glob.info[1] == clm/2){ + # Checking for greater than haploid data. + if (nloci == clm/ploidy & ploidy > 1){ # Missing data in genalex is coded as "0" for non-presence/absence data. # this converts it to "NA" for adegenet. - if(any(gena.mat =="0")){ - #gena.mat <- as.matrix(gena) + if(any(gena.mat == "0") & ploidy < 3){ gena[gena.mat == "0"] <- NA - #gena <- as.data.frame(gena.mat) } - type <- 'codom' - loci <- which((1:clm)%%2==1) + type <- 'codom' + loci <- which((1:clm) %% ploidy == 1) gena2 <- gena[, loci] - lapply(loci, function(x) gena2[, ((x-1)/2)+1] <<- - paste(gena[, x],"/",gena[, x+1], sep="")) - #res <- list(Gena=gena2, Glob.info=glob.info, Ploid=ploidy) - res.gid <- df2genind(gena2, sep="/", ind.names=ind.vec, pop=pop.vec, - ploidy=ploidy, type=type) - } - # Checking for AFLP data. - else if (glob.info[1] == clm & all(gena.mat %in% as.integer(-1:1))) { + lapply(loci, function(x) gena2[, ((x-1)/ploidy)+1] <<- + pop_combiner(gena, hier = x:(x+ploidy-1), sep = "/")) + res.gid <- df2genind(gena2, sep="/", ind.names = ind.vec, pop = pop.vec, + ploidy = ploidy, type = type) + } else if (nloci == clm & all(gena.mat %in% as.integer(-1:1))) { + # Checking for AFLP data. # Missing data in genalex is coded as "-1" for presence/absence data. # this converts it to "NA" for adegenet. if(any(gena.mat == -1L)){ - #gena.mat <- as.matrix(gena) gena[gena.mat == -1L] <- NA - #gena <- as.data.frame(gena.mat) } type <- 'PA' - #res <- list(Gena=gena, Glob.info=glob.info, Ploid=ploidy) - res.gid <- df2genind(gena, ind.names=ind.vec, pop=pop.vec, - ploidy=ploidy, type=type) - } - # Checking for haploid microsattellite data or SNP data - else if (glob.info[1] == clm & !all(gena.mat %in% as.integer(-1:1))) { + res.gid <- df2genind(gena, ind.names = ind.vec, pop = pop.vec, + ploidy = ploidy, type = type) + } else if (nloci == clm & !all(gena.mat %in% as.integer(-1:1))) { + # Checking for haploid microsattellite data or SNP data if(any(gena.mat == "0")){ - #gena.mat <- as.matrix(gena) gena[gena.mat == "0"] <- NA - #gena <- as.data.frame(gena.mat) } - type <- 'codom' - #res <- list(Gena=gena, Glob.info=glob.info, Ploid=1) - res.gid <- df2genind(gena, ind.names=ind.vec, pop=pop.vec, - ploidy= 1, type=type) - } - else { + type <- 'codom' + res.gid <- df2genind(gena, ind.names = ind.vec, pop = pop.vec, + ploidy = 1, type = type) + } else { stop("Something went wrong. Check your geo and region flags to make sure they are set correctly. Otherwise, the problem may lie within the data structure itself.") } if (any(duplicated(ind.vec))){ # ensuring that all names are unique - res.gid@ind.names <- paste("ind",1:length(ind.vec)) + res.gid@ind.names <- paste("ind", 1:length(ind.vec)) res.gid@other[["original_names"]] <- ind.vec } res.gid@other[["population_hierarchy"]] <- as.data.frame(list(Pop=pop.vec)) res.gid@call <- gencall - res.gid@call[2] <- basename(genalex) - if(region==TRUE){ + + # Keep the name if it's a URL + if (length(grep("://", genalex)) < 1){ + res.gid@call[2] <- basename(genalex) + } + if (region){ res.gid@other[["population_hierarchy"]]$Region <- reg.vec - #return(res.gid) } - if(geo==TRUE){ + if (geo){ res.gid@other[["xy"]] <- xy - #return(res.gid) + } + if (genclone){ + res.gid <- as.genclone(res.gid) } return(res.gid) } #==============================================================================# #' Exporting data from genind objects to genalex formatted *.csv files. -#' -#' genind2genalex will export a genind object to a *.csv file formatted for use -#' in genalex. -#' -#' @param pop a \code{\link{genind}} object. #' -#' @param filename a string indicating the name and/or path of the file you wish -#' to create. -#' -#' @param quiet \code{logical} If \code{FALSE} a message will be printed to the -#' screen. +#' genind2genalex will export a genclone or genind object to a *.csv file +#' formatted for use in genalex. #' +#' @param pop a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. +#' +#' @param filename a string indicating the name and/or path of the file you wish +#' to create. +#' +#' @param quiet \code{logical} If \code{FALSE} a message will be printed to the +#' screen. +#' #' @param geo \code{logical} Default is \code{FALSE}. If it is set to -#' \code{TRUE}, the resulting file will have two columns for geographic data. -#' -#' @param geodf \code{character} Since the \code{other} slot in the adegenet object -#' can contain many different items, you must specify the name of the data frame -#' in the \code{other} slot containing your geographic coordinates. It defaults to -#' "xy". -#' -#' @note If you enter a file name that exists, that file will be overwritten. -#' If your data set lacks a population structure, it will be coded in the new -#' file as a single population lableled "Pop". Likewise, if you don't have any -#' labels for your individuals, they will be labeled as "ind1" through -#' "ind\emph{N}", with \emph{N} being the size of your population. -#' -#' @seealso \code{\link{clonecorrect}}, \code{\link{genind}} -#' +#' \code{TRUE}, the resulting file will have two columns for geographic data. +#' +#' @param geodf \code{character} Since the \code{other} slot in the adegenet +#' object can contain many different items, you must specify the name of the +#' data frame in the \code{other} slot containing your geographic coordinates. +#' It defaults to "xy". +#' +#' @param sep a character specifying what character to use to separate columns. +#' Defaults to ",". +#' +#' @note If you enter a file name that exists, that file will be overwritten. If +#' your data set lacks a population structure, it will be coded in the new +#' file as a single population lableled "Pop". Likewise, if you don't have any +#' labels for your individuals, they will be labeled as "ind1" through +#' "ind\emph{N}", with \emph{N} being the size of your population. +#' +#' @seealso \code{\link{clonecorrect}}, \code{\linkS4class{genclone}} or +#' \code{\linkS4class{genind}} +#' #' @export #' @author Zhian N. Kamvar #' @examples @@ -427,9 +432,11 @@ read.genalex <- function(genalex, ploidy=2, geo=FALSE, region=FALSE){ #' genind2genalex(nancycats, "~/Documents/nancycats.csv", geo=TRUE) #' } #==============================================================================# -genind2genalex <- function(pop, filename="genalex.csv", quiet=FALSE, geo=FALSE, geodf="xy"){ - if(!is.genind(pop)) stop("A genind object is needed.") - if(is.null(pop@pop)){ +genind2genalex <- function(pop, filename = "genalex.csv", quiet = FALSE, + geo = FALSE, geodf = "xy", sep = ","){ + if (!is.genind(pop)) stop("A genind object is needed.") + if (nchar(sep) != 1) stop("sep must be one byte/character (eg. \",\")") + if (is.null(pop@pop)){ pop(pop) <- rep("Pop", nInd(pop)) } popcall <- match.call() @@ -465,6 +472,7 @@ genind2genalex <- function(pop, filename="genalex.csv", quiet=FALSE, geo=FALSE, thirdline <- c(thirdline, rep("", lenfac)) } infolines <- rbind(topline, secondline, thirdline) + # converting to a data frame if(any(!pop@tab %in% c(0, ((1:ploid)/ploid), 1, NA))){ @@ -480,7 +488,7 @@ genind2genalex <- function(pop, filename="genalex.csv", quiet=FALSE, geo=FALSE, df <- cbind(pop@ind.names, df) # setting the NA replacement. This doesn't work too well. replacement <- ifelse(pop@type =="PA","-1","0") - if(!quiet) cat("Writing the table to",filename,"... ") + if(!quiet) cat("Writing the table to", filename, "... ") if(geo == TRUE & !is.null(pop$other[[geodf]])){ replacemat <- matrix("", 3, 3) @@ -495,16 +503,22 @@ genind2genalex <- function(pop, filename="genalex.csv", quiet=FALSE, geo=FALSE, } else if (geo == TRUE){ popcall <- popcall[2] - warning(paste("There is no data frame or matrix in ", - paste(substitute(popcall), collapse=""), - "@other called ",geodf, - ".\nThe xy coordinates will not be represented in the resulting file.", sep="")) + warning(paste0("There is no data frame or matrix in ", + paste0(substitute(popcall)), "@other called ", geodf, + ".\nThe xy coordinates will not be represented in the", + " resulting file.")) } df[df == "NA" | is.na(df)] <- replacement - write.table(infolines, file=filename, quote=FALSE, row.names=FALSE, - col.names=FALSE, sep=",") - write.table(df, file=filename, quote=TRUE, na=replacement, append=TRUE, - row.names=FALSE, col.names=FALSE, sep=",") + + if (ncol(infolines) > ncol(df)){ + lendiff <- ncol(infolines) - ncol(df) + padding <- matrix("", nrow = nInd(pop), ncol = lendiff) + df <- cbind(df, padding) + } + write.table(infolines, file = filename, quote = FALSE, row.names = FALSE, + col.names = FALSE, sep = sep) + write.table(df, file = filename, quote = TRUE, na = replacement, append = TRUE, + row.names = FALSE, col.names = FALSE, sep = sep) if(!quiet) cat("Done.\n") } diff --git a/R/internal.r b/R/internal.r index 29d2a10c..46aaa2c9 100644 --- a/R/internal.r +++ b/R/internal.r @@ -47,16 +47,16 @@ #' @name Aeut #' @docType data #' @usage data(Aeut) -#' @description The Aeut dataset consists of 187 isolates of the Oomycete root -#' rot pathogen, \emph{Aphanomyces euteiches} collected from two different -#' fields in NW Oregon and W Washington, USA. -#' @format a \code{\link{genind}} object with two popualations containing a -#' data frame in the \code{other} slot called \code{population_hierarchy}. -#' This data frame gives indices of the populations and subpopulations for the -#' data set. -#' @references Grunwald, NJ and Hoheisel, G.-A. 2006. Hierarchical Analysis -#' of Diversity, Selfing, and Genetic Differentiation in Populations of the -#' Oomycete \emph{Aphanomyces euteiches}. Phytopathology 96:1134-1141 +#' @description The Aeut dataset consists of 187 isolates of the Oomycete root +#' rot pathogen, \emph{Aphanomyces euteiches} collected from two different +#' fields in NW Oregon and W Washington, USA. +#' @format a \code{\link{genind}} object with two populations containing a data +#' frame in the \code{other} slot called \code{population_hierarchy}. This +#' data frame gives indices of the populations and subpopulations for the data +#' set. +#' @references Grunwald, NJ and Hoheisel, G.A. 2006. Hierarchical Analysis of +#' Diversity, Selfing, and Genetic Differentiation in Populations of the +#' Oomycete \emph{Aphanomyces euteiches}. Phytopathology 96:1134-1141 #==============================================================================# NULL #==============================================================================# @@ -67,14 +67,62 @@ NULL #' @docType data #' @usage data(partial_clone) #' @description These data were simulated using SimuPOP version 1.0.8 with -#' 99.9\% clonal reproduction over 10,000 generations. Populations were assigned -#' post-hoc and are simply present for the purposes of demonstrating a minimum -#' spanning network with Bruvo's distance. +#' 99.9\% clonal reproduction over 10,000 generations. Populations were +#' assigned post-hoc and are simply present for the purposes of demonstrating +#' a minimum spanning network with Bruvo's distance. #' @format a \code{\link{genind}} object with 50 individuals, 10 loci, and four -#' popualations. +#' populations. #' @references Bo Peng and Christopher Amos (2008) Forward-time simulations of -#' nonrandom mating populations using simuPOP. \emph{bioinformatics}, 24 (11): -#' 1408-1409. +#' nonrandom mating populations using simuPOP. \emph{bioinformatics}, 24 (11): +#' 1408-1409. +#==============================================================================# +NULL +#==============================================================================# +#' Phytophthora infestans data from Mexico and South America. +#' +#' @name Pinf +#' @docType data +#' @usage data(Pinf) +#' @description The Pinf data set contains 86 isolates genotyped over 11 +#' microsatellite loci collected from Mexico, Peru, Columbia, and Ecuador. +#' This is a subset of the data used for the reference below. +#' @format a \code{\linkS4class{genclone}} object with 2 hierarchical levels +#' called "Continent" and "Country" that contain 2 and 4 populations, +#' respectively. +#' @references Goss, Erica M., Javier F. Tabima, David EL Cooke, Silvia +#' Restrepo, William E. Fry, Gregory A. Forbes, Valerie J. Fieland, Martha +#' Cardenas, and Niklaus J. Grünwald. "The Irish potato famine pathogen +#' \emph{Phytophthora infestans} originated in central Mexico rather than the Andes." +#' Proceedings of the National Academy of Sciences 111:8791–8796. +#==============================================================================# +NULL +#==============================================================================# +#' Peach brown rot pathogen \emph{Monilinia fructicola} +#' +#' @name monpop +#' @docType data +#' @usage data(monpop) +#' @description This is microsatellite data for a population of the haploid +#' plant pathogen \emph{Monilinia fructicola} that causes disease within peach +#' tree canopies (Everhart & Scherm, 2014). Entire populations within trees +#' were sampled across 3 years (2009, 2010, and 2011) in a total of four +#' trees, where one tree was sampled in all three years, for a total of 6 +#' within-tree populations. Within each year, samples in the spring were taken +#' from affected blossoms (termed “BB” for blossom blight) and in late summer +#' from affected fruits (termed “FR” for fruit rot). There are a total of 694 +#' isolates with 65 to 173 isolates within each canopy population that were +#' characterized using a set of 13 microsatellite markers. +#' @format a \code{\linkS4class{genclone}} object with 3 hierarchical levels +#' coded into one population factor. These are named "Tree", "Year", and +#' "Symptom" +#' @references SE Everhart, H Scherm, (2014) Fine-scale genetic structure of +#' \emph{Monilinia fructicola} during brown rot epidemics within individual peach +#' tree canopies. Phytopathology, submitted +#' @examples +#' data(monpop) +#' splithierarchy(monpop) <- ~Tree/Year/Symptom +#' setpop(monpop) <- ~Symptom/Year +#' monpop #==============================================================================# NULL #==============================================================================# @@ -128,39 +176,39 @@ extract.info <- function(x) { # Internal functions utilizing this function: # # new.poppr (in testing) #==============================================================================# -.file.type <- function(pop, quiet=TRUE, missing="ignore", cutoff=0.05, keep=1, +process_file <- function(input, quiet=TRUE, missing="ignore", cutoff=0.05, keep=1, clonecorrect=FALSE, hier=c(1), dfname="hier"){ - if (!is.genind(pop)){ - x <- pop + if (!is.genind(input)){ + x <- input if (toupper(.readExt(x)) == "CSV"){ - try(pop <- read.genalex(x), silent=quiet) - try(pop <- read.genalex(x, region=TRUE), silent=quiet) - try(pop <- read.genalex(x, geo=TRUE), silent=quiet) - try(pop <- read.genalex(x, geo=TRUE, region=TRUE), silent=quiet) + try(input <- read.genalex(x), silent=quiet) + try(input <- read.genalex(x, region=TRUE), silent=quiet) + try(input <- read.genalex(x, geo=TRUE), silent=quiet) + try(input <- read.genalex(x, geo=TRUE, region=TRUE), silent=quiet) } else { - try(pop <- import2genind(x, quiet=quiet), silent=quiet) + try(input <- import2genind(x, quiet=quiet), silent=quiet) } - stopifnot(is.genind(pop)) - pop@call[2] <- x - popcall <- pop@call - pop <- missingno(pop, type=missing, cutoff=cutoff, quiet=quiet) - pop@call <- popcall + stopifnot(is.genind(input)) + input@call[2] <- x + popcall <- input@call + input <- missingno(input, type=missing, cutoff=cutoff, quiet=quiet) + input@call <- popcall if (clonecorrect == TRUE){ - poplist <- clonecorrect(pop, hier=hier, dfname=dfname, keep=keep) - pop <- poplist - pop@call <- popcall + poplist <- clonecorrect(input, hier=hier, dfname=dfname, keep=keep) + input <- poplist + input@call <- popcall } - } else if (is.genind(pop)) { - x <- as.character(pop@call)[2] - popcall <- pop@call - pop <- missingno(pop, type=missing, cutoff=cutoff, quiet=quiet) + } else if (is.genind(input)) { + x <- as.character(match.call()[2]) + popcall <- input@call + input <- missingno(input, type=missing, cutoff=cutoff, quiet=quiet) if (clonecorrect == TRUE){ - poplist <- clonecorrect(pop, hier=hier, dfname=dfname, keep=keep) - pop <- poplist - pop@call <- popcall + poplist <- clonecorrect(input, hier=hier, dfname=dfname, keep=keep) + input <- poplist + input@call <- popcall } } - return(list(X=x, GENIND=pop)) + return(list(X=x, GENIND=input)) } #==============================================================================# @@ -176,10 +224,15 @@ extract.info <- function(x) { #==============================================================================# .clonecorrector <- function(x){ - res <- -which(duplicated(x@tab[, 1:ncol(x@tab)])) + if (is.genclone(x)){ + is_duplicated <- duplicated(x@mlg) + } else { + is_duplicated <- duplicated(x@tab[, 1:ncol(x@tab)]) + } + res <- -which(is_duplicated) # conditional for the case that all individuals are unique. if(is.na(res[1])){ - res <- which(!duplicated(x@tab[, 1:ncol(x@tab)])) + res <- which(!is_duplicated) } return(res) } @@ -193,6 +246,8 @@ extract.info <- function(x) { # # Internal functions utilizing this function: # # percent_missing +# +# DEPRECATED #==============================================================================# geno.na <- function(pop){ @@ -212,6 +267,8 @@ geno.na <- function(pop){ # # Internal functions utilizing this function: # # percent_missing +# +# DEPRECATED #==============================================================================# loci.na <- function(pop) { @@ -236,28 +293,15 @@ loci.na <- function(pop) { percent_missing <- function(pop, type="loci", cutoff=0.05){ if (toupper(type) == "LOCI"){ - misslist <- loci.na(pop) - if(all(misslist > 0)){ - return(misslist) - } - poplen <- nInd(pop) - filter <- vapply(-misslist, function(x) - length(which(is.na(pop@tab[, x])))/poplen, 1) > cutoff - if (is.na(filter[1])){ - filter <- 1:length(misslist) - } + missing_loci <- 1 - propTyped(pop, "loc") + names(missing_loci) <- levels(pop@loc.fac) + missing_loci <- missing_loci[missing_loci > cutoff] + misslist <- 1:ncol(pop@tab) + filter <- !pop@loc.fac %in% names(missing_loci) } else { - misslist <- geno.na(pop) - if(all(misslist > 0)){ - return(misslist) - } - poplen <- nLoc(pop) - filter <- vapply(-misslist, function(x) - length(unique( pop@loc.fac[which(is.na(pop@tab[x, ]))] )) / poplen, 1) > cutoff - } - if (all(filter %in% FALSE)){ - filter <- 1:length(misslist) - misslist <- 1:length(misslist) + missing_geno <- 1 - propTyped(pop, "ind") + misslist <- 1:nInd(pop) + filter <- missing_geno <= cutoff } return(misslist[filter]) } @@ -289,6 +333,7 @@ round.poppr <- function(x){ # # Internal functions utilizing this function: # # .ia +# DEPRECATED #==============================================================================# ia.pval <- function(index="index", sampled, observed){ @@ -438,45 +483,55 @@ sub_index <- function(pop, sublist="ALL", blacklist=NULL){ # Internal functions utilizing this function: # # none #==============================================================================# - -mlg.matrix <- function(pop){ - - # getting the genotype counts - countvec2 <- mlg.vector(pop) - - if(!is.null(pop@pop)){ +mlg.matrix <- function(x){ + if (is.genclone(x)){ + mlgvec <- x@mlg + } else { + mlgvec <- mlg.vector(x) + } + mlgs <- length(unique(mlgvec)) + if (!is.null(pop(x))){ + mlg.mat <- table(pop(x), mlgvec) + } else { + mlg.mat <- matrix(table(mlgvec), nrow = 1) + rownames(mlg.mat) <- "Total" + } + names(attr(mlg.mat, "dimnames")) <- NULL + if (is.null(colnames(mlg.mat))){ + colnames(mlg.mat) <- 1:mlgs + } + colnames(mlg.mat) <- paste("MLG", colnames(mlg.mat), sep=".") + return(mlg.mat) +} +#==============================================================================# +# DEPRECATED +#==============================================================================# +old.mlg.matrix <- function(x){ + mlgvec <- mlg.vector(x) + mlgs <- length(unique(mlgvec)) + if (!is.null(x@pop)){ # creating a new population matrix. Rows are the population indicator and # columns are the genotype indicator. - mlg.mat <- matrix(ncol=length(unique(countvec2)),nrow=length(levels(pop@pop)), - data=0) + mlg.mat <- matrix(ncol=mlgs, nrow=length(levels(x@pop)), data=0L) # populating (no, pun intended.) the matrix with genotype counts. - - lapply(levels(pop@pop),function(z){ + lapply(levels(x@pop),function(z){ # This first part gets the index for the row names. - count <- as.numeric(paste(unlist - (strsplit(z,""))[2:nchar(z)], - collapse="")) - sapply(countvec2[which(pop@pop==z)], + count <- as.numeric(substr(z, 2, nchar(z))) + sapply(mlgvec[which(x@pop==z)], function(a) mlg.mat[count, a] <<- - mlg.mat[count, a] + 1) + mlg.mat[count, a] + 1L) }) - rownames(mlg.mat) <- pop@pop.names - } - else{ - + rownames(mlg.mat) <-x@pop.names + } else { # if there are no populations to speak of. - - mlg.mat <- t(as.matrix( - vector(length=length(unique(countvec2)), mode="numeric"))) - sapply(countvec2, function(a) mlg.mat[a] <<- mlg.mat[a] + 1) + mlg.mat <- t(as.matrix(vector(length=mlgs, mode="numeric"))) + sapply(mlgvec, function(a) mlg.mat[a] <<- mlg.mat[a] + 1) rownames(mlg.mat) <- "Total" } - - colnames(mlg.mat) <- paste("MLG",seq(ncol(mlg.mat)), sep=".") + colnames(mlg.mat) <- paste("MLG", 1:mlgs, sep=".") return(mlg.mat) } - #==============================================================================# # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # # @@ -507,13 +562,13 @@ mlg.matrix <- function(pop){ #''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''# # Starting the actual calculations. #,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,# - V <- .PA.pairwise.differences(pop,numLoci,np, missing=missing) + V <- .PA.pairwise.differences(pop,numLoci, np, missing=missing) # First, set the variance of D varD <- ((sum(V$D.vector^2)-((sum(V$D.vector))^2)/np))/np # Next is to create a vector containing all of the variances of d (there # will be one for each locus) vard.vector <- ((V$d2.vector-((V$d.vector^2)/np))/np) - vardpair.vector <- .Call("pairwise_covar", vard.vector) + vardpair.vector <- .Call("pairwise_covar", vard.vector, PACKAGE = "poppr") # The sum of the variances necessary for the calculation of Ia is calculated sigVarj <- sum(vard.vector) rm(vard.vector) @@ -656,9 +711,8 @@ final <- function(Iout, result){ Iout <- NULL idx <- as.data.frame(list(Index=names(IarD))) samp <- .sampling(popx, sample, missing, quiet=quiet, type=type, method=method) - samp2 <- rbind(samp, IarD) - p.val <- ia.pval(index="Ia", samp2, IarD[1]) - p.val[2] <- ia.pval(index="rbarD", samp2, IarD[2]) + p.val <- sum(IarD[1] <= c(samp$Ia, IarD[1]))/(sample + 1)#ia.pval(index="Ia", samp2, IarD[1]) + p.val[2] <- sum(IarD[2] <= c(samp$rbarD, IarD[2]))/(sample + 1)#ia.pval(index="rbarD", samp2, IarD[2]) if(hist == TRUE){ poppr.plot(samp, observed=IarD, pop=namelist$population, file=namelist$File, pval=p.val, N=nrow(pop@tab)) @@ -689,7 +743,7 @@ final <- function(Iout, result){ # isolates. # # -# DEPRECIATED +# DEPRECATED #==============================================================================# .pairwise.differences <- function(pop,numLoci,np, missing){ temp.d.vector <- matrix(nrow=np, ncol=numLoci, data=as.numeric(NA)) @@ -712,7 +766,7 @@ final <- function(Iout, result){ # one allele. In that case, the resulting pairwise differences will all be zero. # # -# DEPRECIATED +# DEPRECATED #==============================================================================# pairwisematrix <- function(pop, np){ temp.d.vector <- vector(mode="numeric", length=np) @@ -725,7 +779,7 @@ pairwisematrix <- function(pop, np){ # https://stat.ethz.ch/pipermail/r-help/2004-August/055324.html # # -# DEPRECIATED +# DEPRECATED #==============================================================================# .pairwise.diffs <- function(x){ stopifnot(is.matrix(x)) @@ -744,7 +798,7 @@ pairwisematrix <- function(pop, np){ # caluclated. # # -# DEPRECIATED +# DEPRECATED #==============================================================================# .pairwise.variances <- function(vard.vector, pair.alleles){ # Here the roots of the products of the variances are being produced and @@ -776,7 +830,7 @@ pairwisematrix <- function(pop, np){ V <- pair_diffs(pop, numLoci, np) varD <- ((sum(V$D.vector^2) - ((sum(V$D.vector))^2)/np))/np vard.vector <- ((V$d2.vector - ((V$d.vector^2)/np))/np) - vardpair.vector <- .Call("pairwise_covar", vard.vector) + vardpair.vector <- .Call("pairwise_covar", vard.vector, PACKAGE = "poppr") sigVarj <- sum(vard.vector) rm(vard.vector) Ia <- (varD/sigVarj) - 1 @@ -799,7 +853,7 @@ pair_diffs <- function(pop, numLoci, np) { ploid <- ploidy(pop[[1]]) temp.d.vector <- matrix(nrow = np, ncol = numLoci, data = as.numeric(NA)) - temp.d.vector <- vapply(pop, function(x) .Call("pairdiffs", x@tab)*(ploid/2), + temp.d.vector <- vapply(pop, function(x) .Call("pairdiffs", x@tab, PACKAGE = "poppr")*(ploid/2), temp.d.vector[, 1]) d.vector <- colSums(temp.d.vector) d2.vector <- colSums(temp.d.vector^2) @@ -808,7 +862,7 @@ pair_diffs <- function(pop, numLoci, np) } #==============================================================================# -# Internal counter...probably depreciated. +# Internal counter...probably DEPRECATED. #==============================================================================# .new_counter <- function() { i <- 0 @@ -828,9 +882,9 @@ pair_diffs <- function(pop, numLoci, np) # Internal functions utilizing this function: # # none # -# DEPRECIATED +# DEPRECATED #==============================================================================# -phylo.bruvo.dist <- function(ssr.matrix, replen=c(2), ploid=2){ +phylo.bruvo.dist <- function(ssr.matrix, replen=c(2), ploid=2, add = TRUE, loss = TRUE){ # Preceeding functions should take care of this: # ssr.matrix <- genind2df(pop, sep="/", usepop=FALSE) # ssr.matrix[is.na(ssr.matrix)] <- paste(rep(0, ploid), collapse="/") @@ -842,8 +896,9 @@ phylo.bruvo.dist <- function(ssr.matrix, replen=c(2), ploid=2){ ssr.matrix <- apply(as.matrix(t(sapply(ssr.matrix, unlist))), 2, as.numeric) # Dividing each column by the repeat length and changing the values to integers. ssr.matrix <- apply(ssr.matrix / rep(replen, each=ploid*nrow(ssr.matrix)), 2, round) - perms <- .Call("permuto", ploid) - distmat <- .Call("bruvo_distance", ssr.matrix, perms, ploid) + ssr.matrix <- apply(ssr.matrix, 2, as.integer) + perms <- .Call("permuto", ploid, PACKAGE = "poppr") + distmat <- .Call("bruvo_distance", ssr.matrix, perms, ploid, add, loss, PACKAGE = "poppr") distmat[distmat == 100] <- NA avg.dist.vec <- apply(distmat, 1, mean, na.rm=TRUE) # presenting the information in a lower triangle distance matrix. @@ -854,6 +909,68 @@ phylo.bruvo.dist <- function(ssr.matrix, replen=c(2), ploid=2){ return(dist.mat) } +#==============================================================================# +# This will transform the data to be in the range of [0, 1] +# +# Public functions utilizing this function: +# poppr.msn +# +# Internal functions utilizing this function: +# # adjustcurve +#==============================================================================# + +rerange <- function(x){ + minx <- min(x, na.rm = TRUE) + maxx <- max(x, na.rm = TRUE) + if (minx < 0) + x <- x + abs(minx) + maxx <- maxx + abs(minx) + if (maxx > 1) + x <- x/maxx + return(x) +} + +#==============================================================================# +# This will scale the edge widths +# +# Public functions utilizing this function: +# none +# +# Internal functions utilizing this function: +# # update_edge_scales +#==============================================================================# + +make_edge_width <- function(mst){ + edgewidth <- rerange(E(mst)$weight) + if (any(edgewidth < 0.08)){ + edgewidth <- edgewidth + 0.08 + } + return(1/edgewidth) +} + +#==============================================================================# +# This will scale the edge widths and edge color for a graph +# +# Public functions utilizing this function: +# poppr.msn bruvo.msn plot_poppr_msn +# +# Internal functions utilizing this function: +# # singlepop_msn +#==============================================================================# +update_edge_scales <- function(mst, wscale = TRUE, gscale = TRUE, glim, gadj){ + if(gscale == TRUE){ + E(mst)$color <- gray(adjustcurve(E(mst)$weight, glim=glim, correction=gadj, + show=FALSE)) + } else { + E(mst)$color <- rep("black", length(E(mst)$weight)) + } + E(mst)$width <- 2 + if (wscale==TRUE){ + E(mst)$width <- make_edge_width(mst) + } + return(mst) +} + #==============================================================================# # This will adjust the grey scale with respect to the edge weights for igraph. # This is needed because the length of the edges do not correspond to weights. @@ -870,8 +987,10 @@ phylo.bruvo.dist <- function(ssr.matrix, replen=c(2), ploid=2){ # #==============================================================================# -adjustcurve <- function(weights, glim = c(0,0.8), correction = 3, show=FALSE){ +adjustcurve <- function(weights, glim = c(0,0.8), correction = 3, show=FALSE, + scalebar = FALSE, smooth = TRUE){ w <- weights + w <- rerange(w) maxg <- max(glim) ming <- 1-(min(glim)/maxg) if (correction < 0){ @@ -881,28 +1000,54 @@ adjustcurve <- function(weights, glim = c(0,0.8), correction = 3, show=FALSE){ adj <- (1 - (((1-w)^abs(correction))/(1/ming)) ) adj <- adj / (1/maxg) } - if (show == FALSE){ + if (!show){ return(adj) - } else { - cols <- grey(sort(adj)) - hist(w, col=cols, border=NA, breaks=w, ylim=0:1, xlab="Observed Value", - ylab="Grey Adjusted", - main=paste("Grey adjustment\n min:", min(glim), "max:", max(glim), - "adjust:",abs(correction))) - points(x=w, y=adj, col=grey(rev(adj)), pch=20) + } else if (!scalebar){ + with_quantiles <- sort(weights) + wq_raster <- t(as.raster(as.matrix(gray(sort(adj)), nrow = 1))) + xlims <- c(min(weights), max(weights)) + plot(xlims, 0:1, type = "n", ylim = 0:1, xlim = xlims, xlab = "", ylab = "") + rasterImage(wq_raster, xlims[1], 0, xlims[2], 1) + points(x = sort(weights), y = sort(adj), col=grey(rev(sort(adj))), pch=20) + title(xlab="Observed Value", ylab="Grey Adjusted", + main=paste("Grey adjustment\n min:", + min(glim), + "max:", max(glim), + "adjust:",abs(correction))) if (correction < 0){ text(bquote(frac(bgroup("(",frac(scriptstyle(x)^.(abs(correction)), .(ming)^-1),")") + .(1-ming), .(maxg)^-1)) , - x=0.25,y=0.75, col="red") + x = min(weights) + (0.25*max(weights)), y=0.75, col="red") } else { text(bquote(frac(1-bgroup("(",frac((1-scriptstyle(x))^.(abs(correction)), .(ming)^-1),")"), .(maxg)^-1)) , - x=0.15,y=0.75, col="red") + x= min(weights) + (0.15*max(weights)), y=0.75, col="red") } - lines(x=0:1, y=c(min(glim),min(glim)), col="yellow") - lines(x=0:1, y=c(max(glim),max(glim)), col="yellow") + lines(x=xlims, y=c(min(glim),min(glim)), col="yellow") + lines(x=xlims, y=c(max(glim),max(glim)), col="yellow") + } else { + with_quantiles <- sort(weights) + wq_raster <- t(as.raster(as.matrix(gray(sort(adj)), nrow = 1))) + no_quantiles <- seq(min(weights), max(weights), length = 1000) + nq_raster <- adjustcurve(no_quantiles, glim, correction, show = FALSE) + nq_raster <- t(as.raster(as.matrix(gray(nq_raster), nrow = 1))) + layout(matrix(1:2, nrow = 2)) + plot.new() + rasterImage(wq_raster, 0, 0.5, 1, 1) + polygon(c(0, 1, 1), c(0.5, 0.5, 0.8), col = "white", border = "white", lwd = 2) + axis(3, at = c(0, 0.25, 0.5, 0.75, 1), labels = round(quantile(with_quantiles), 3)) + text(0.5, 0, labels = "Quantiles From Data", font = 2, cex = 1.5, adj = c(0.5, 0)) + plot.new() + rasterImage(nq_raster, 0, 0.5, 1, 1) + polygon(c(0, 1, 1), c(0.5, 0.5, 0.8), col = "white", border = "white", lwd = 2) + axis(3, at = c(0, 0.25, 0.5, 0.75, 1), labels = round(quantile(no_quantiles), 3)) + text(0.5, 0, labels = "Quantiles From Smoothing", font = 2, cex = 1.5, adj = c(0.5, 0)) + # Return top level plot to defau lts. + layout(matrix(c(1), ncol=1, byrow=T)) + par(mar=c(5,4,4,2) + 0.1) # number of lines of margin specified. + par(oma=c(0,0,0,0)) # Figure margins } } @@ -987,10 +1132,18 @@ fix_negative_branch <- function(tre){ singlepop_msn <- function(pop, vertex.label, replen = NULL, distmat = NULL, gscale = TRUE, - glim = c(0, 0.8), gadj = 3, wscale = TRUE, palette = topo.colors, ...){ + glim = c(0, 0.8), gadj = 3, wscale = TRUE, palette = topo.colors, showplot = TRUE, ...){ # First, clone correct and get the number of individuals per MLG in order. cpop <- pop[.clonecorrector(pop), ] - mlg.number <- table(pop$other$mlg.vec)[rank(cpop$other$mlg.vec)] + if (is.genclone(pop)){ + mlgs <- pop$mlg + cmlg <- cpop$mlg + mlg.number <- table(mlgs)[rank(cmlg)] + } else { + mlgs <- pop$other$mlg.vec + cmlg <- cpop$other$mlg.vec + mlg.number <- table(mlgs)[rank(cmlg)] + } # Calculate distance matrix if not supplied (Bruvo's distance) if (is.null(distmat) & !is.null(replen)){ @@ -1004,41 +1157,26 @@ singlepop_msn <- function(pop, vertex.label, replen = NULL, distmat = NULL, gsca # Create the vertex labels if (!is.na(vertex.label[1]) & length(vertex.label) == 1){ if (toupper(vertex.label) == "MLG"){ - vertex.label <- paste0("MLG.", cpop$other$mlg.vec) + vertex.label <- paste0("MLG.", cmlg) } else if(toupper(vertex.label) == "INDS") { vertex.label <- cpop$ind.names } } - - # Adjust the color of the edges. - if (gscale == TRUE){ - E(mst)$color <- gray(adjustcurve(E(mst)$weight, glim=glim, correction=gadj, - show=FALSE)) - } else { - E(mst)$color <- rep("black", length(E(mst)$weight)) - } - - # Adjust the widths of the edges - edgewidth <- 2 - if (wscale == TRUE){ - edgewidth <- 1/(E(mst)$weight) - if (any(E(mst)$weight < 0.08)){ - edgewidth <- 1/(E(mst)$weight + 0.08) - } - } - + mst <- update_edge_scales(mst, wscale, gscale, glim, gadj) populations <- ifelse(is.null(pop(pop)), NA, pop$pop.names) # Plot everything - plot.igraph(mst, edge.width = edgewidth, edge.color = E(mst)$color, - vertex.label = vertex.label, vertex.size = mlg.number*3, - vertex.color = palette(1), ...) - legend(-1.55,1,bty = "n", cex = 0.75, - legend = populations, title = "Populations", fill = palette(1), - border = NULL) + if (showplot){ + plot.igraph(mst, edge.width = E(mst)$width, edge.color = E(mst)$color, + vertex.label = vertex.label, vertex.size = mlg.number*3, + vertex.color = palette(1), ...) + legend(-1.55,1,bty = "n", cex = 0.75, + legend = populations, title = "Populations", fill = palette(1), + border = NULL) + } # Save variables and return plot. - E(mst)$width <- edgewidth + # E(mst)$width <- E(mst)$width V(mst)$size <- mlg.number V(mst)$color <- palette(1) V(mst)$label <- vertex.label @@ -1055,7 +1193,7 @@ singlepop_msn <- function(pop, vertex.label, replen = NULL, distmat = NULL, gsca # # singlepop_msn #==============================================================================# -bruvos_distance <- function(bruvomat, funk_call = match.call()){ +bruvos_distance <- function(bruvomat, funk_call = match.call(), add = TRUE, loss = TRUE){ x <- bruvomat@mat ploid <- bruvomat@ploidy replen <- bruvomat@replen @@ -1064,9 +1202,9 @@ bruvos_distance <- function(bruvomat, funk_call = match.call()){ x <- x / rep(replen, each=ploid*nrow(x)) x <- matrix(as.integer(round(x)), ncol=ncol(x)) # Getting the permutation vector. - perms <- .Call("permuto", ploid) + perms <- .Call("permuto", ploid, PACKAGE = "poppr") # Calculating bruvo's distance over each locus. - distmat <- .Call("bruvo_distance", x, perms, ploid) + distmat <- .Call("bruvo_distance", x, perms, ploid, add, loss, PACKAGE = "poppr") # If there are missing values, the distance returns 100, which means that the # comparison is not made. These are changed to NA. distmat[distmat == 100] <- NA @@ -1076,9 +1214,605 @@ bruvos_distance <- function(bruvomat, funk_call = match.call()){ dist.mat <- matrix(ncol=nrow(x), nrow=nrow(x)) dist.mat[which(lower.tri(dist.mat)==TRUE)] <- avg.dist.vec dist.mat <- as.dist(dist.mat) - attr(dist.mat, "labels") <- bruvomat@ind.names + attr(dist.mat, "Labels") <- bruvomat@ind.names attr(dist.mat, "method") <- "Bruvo" attr(dist.mat, "call") <- funk_call return(dist.mat) } +#==============================================================================# +# A function for creating a population hierarchy using a formula and data frame +# +# hier = a nested formula such as ~ A/B/C where C is nested within B, which is +# nested within A. +# +# df = a data frame containing columns corresponding to the variables in hier. +# +# example: +# df <- data.frame(list(a = letters, b = LETTERS, c = 1:26)) +# newdf <- make_hierarchy(~ a/b/c, df) +# df[names(newdf)] <- newdf # Add new columns. +# +# Public functions utilizing this function: +# +# # poppr.amova, setpop, gethierarchy +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +make_hierarchy <- function(hier, df, expand_label = FALSE){ + newlevs <- attr(terms(hier), "term.labels") + levs <- all.vars(hier) + if (length(levs) > 1){ + newlevs <- gsub(":", "_", newlevs) + } + if (!all(levs %in% names(df))){ + stop(hier_incompatible_warning(levs, df)) + } + newdf <- df[levs[1]] + if (!expand_label){ + newlevs <- levs + } + lapply(1:length(levs), function(x) newdf[[newlevs[x]]] <<- as.factor(pop_combiner(df, levs[1:x]))) + return(newdf) +} + +#==============================================================================# +# Function for creating the structure data frame needed for ade4's AMOVA +# implementation. +# +# Public functions utilizing this function: +# +# # poppr.amova +# +# Internal functions utilizing this function: +# # none +#==============================================================================# + +make_ade_df <- function(hier, df, expanded = FALSE){ + if (expanded){ + levs <- attr(terms(hier), "term.labels") + } else { + levs <- all.vars(hier) + } + if(length(levs) <= 1){ + # stop("Only one level present") + return(NULL) + } + levs <- gsub(":", "_", levs) + if(!all(levs %in% names(df))){ + stop(hier_incompatible_warning(levs, df)) + } + smallest <- df[[levs[length(levs)]]] + smallinds <- !duplicated(smallest) + newdf <- df[smallinds, ] + newdf <- newdf[-length(levs)] + if (length(newdf) > 1){ + factlist <- lapply(newdf, function(x) factor(x, unique(x))) + } else { + factlist <- list(factor(newdf[[1]], unique(newdf[[1]]))) + names(factlist) <- names(newdf) + } + return(rev(data.frame(factlist))) +} + +# Function for determining if a genind object has any heterozygous sites. +# Public functions utilizing this function: +# +# # none...yet +# +# Internal functions utilizing this function: +# # none + +check_Hs <- function(x){ + res <- any(x@tab > 0 & x@tab < 1, na.rm = TRUE) + return(res) +} + +## Obtains specific haplotypes from a genind object. +## +## Arguments: +## inds - indexes of the first and last characters defining the haplotype +## x - the loci object +## loci_cols - the columns defining the loci +## ind_names - the sample names +## hap_fac - a vector defining the haplotype. +## +## Since the inds and hap_fac args are not intuitive, I should demonstrate. +## Let's say we have a two individuals with two loci: +## 1 2 +## A: 10/20 30/40 +## B: 15/15 25/10 +## +## They have the haplotypes of +## A1: 10 30 +## A2: 20 40 +## B1: 15 25 +## B2: 15 10 +## +## Each genotype at a locus is 5 characters long. To get the haplotypes, we need +## to avoid the separator. +## inds in this case will be c(1, 2) for the first haplotype and c(4, 5) for the +## second haplotype and hap_fac will be c(1,1,1,2,2,2) for both. +## +# Public functions utilizing this function: +# # none +# +# Internal functions utilizing this function: +# # separate_haplotypes + +hap2genind <- function(inds, x, loci_cols, ind_names, hap_fac){ + new_ind_names <- paste(ind_names, hap_fac[inds[2]], sep = ".") + new_pop <- rep(hap_fac[inds[2]], nrow(x)) + x2 <- lapply(x[loci_cols], substr, inds[1], inds[2]) + x2 <- data.frame(x2, stringsAsFactors = F) + x2 <- df2genind(x2, ploidy = 1, pop=new_pop, ind.names=new_ind_names) + return(x2) +} + +## Secondary function. Transforms the genind object into a loci object and +## collects information necessary to collect individual haplotypes. +# Public functions utilizing this function: +# # none +# +# Internal functions utilizing this function: +# # pool_haplotypes +separate_haplotypes <- function(x){ + ploidy <- ploidy(x) + allele_list <- unlist(lapply(x@all.names, nchar)) + allele_length <- sum(allele_list)/length(allele_list) + if (!all(allele_list == allele_length)){ + stop("not all alleles are of equal length.") + } + inds <- indNames(x) + x.loc <- as.loci(x) + loci_cols <- attr(x.loc, "locicol") + pop_col <- which(names(x.loc) == "population") + geno_length <- allele_length*ploidy + ploidy - 1 + sep <- which(1:geno_length %% (allele_length + 1) == 0) + sep <- c(0, sep, geno_length + 1) + hap_fac <- rep(1:ploidy, each = allele_length + 1) + allele_inds <- lapply(1:ploidy, function(i) c(sep[i] + 1, sep[i + 1] - 1)) + haplist <- lapply(allele_inds, hap2genind, x.loc, loci_cols, inds, hap_fac) + return(haplist) +} + +#==============================================================================# +# Haplotype pooling. +# The following functions are necessary to account for within sample variation. +# They will separate the haplotypes of a genind object and repool them so that +# there are n*k individuals in the new data set where n is the number of +# individuals and k is the ploidy. +# Public functions utilizing this function: +# # poppr.amova +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +## Main Function. Lengthens the population hierarchy as well. +pool_haplotypes <- function(x, dfname = "population_hierarchy"){ + ploidy <- ploidy(x) + df <- other(x)[[dfname]] + df$Individual <- indNames(x) + df <- df[rep(1:nrow(df), ploidy), ] + newx <- repool(separate_haplotypes(x)) + pop(newx) <- df$Individual + other(newx)[[dfname]] <- df + return(newx) +} + +#==============================================================================# +# The function locus_table_pegas is the internal workhorse. It will process a +# summary.loci object into a nice table utilizing the various diversity indices +# provided by vegan. Note that it has a catch which will remove all allele names +# that are any amount of zeroes and nothing else. The reason for this being that +# these alleles actually represent missing data. +# The wrapper for this is locus_table, which will take in a genind object and +# send it into locus_table_pegas. +# Note: lev argument has only the options of "allele" or "genotype" +# Public functions utilizing this function: +# # locus_table +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +locus_table_pegas <- function(x, index = "simpson", lev = "allele", type = "codom"){ + unique_types <- x[[lev]] + # Removing any zero-typed alleles that would be present with polyploids. + zero_names <- grep("^0+?$", names(unique_types)) + if (length(zero_names) > 0 & type == "codom"){ + unique_types <- unique_types[-zero_names] + } + + N <- length(unique_types) + H <- vegan::diversity(unique_types) + G <- vegan::diversity(unique_types, "inv") + Simp <- vegan::diversity(unique_types, "simp") + nei <- (N/(N-1)) * Simp + + if (index == "simpson"){ + idx <- Simp + names(idx) <- "1-D" + } else if (index == "shannon"){ + idx <- H + names(idx) <- "H" + } else { + idx <- G + names(idx) <- "G" + } + + E.5 <- (G - 1)/(exp(H) - 1) + names(N) <- lev + return(c(N, idx, Hexp = nei, Evenness = E.5)) +} + +#==============================================================================# +# Function to plot phylo objects the way I want to. +# +# Public functions utilizing this function: +# bruvo.boot +# +# Private functions utilizing this function: +# # nei.boot any.boot +#==============================================================================# +poppr.plot.phylo <- function(tree, type = "nj"){ + ARGS <- c("nj", "upgma") + type <- match.arg(type, ARGS) + barlen <- min(median(tree$edge.length), 0.1) + if (barlen < 0.1) barlen <- 0.01 + if (type == "nj"){ + tree <- ladderize(tree) + } + plot.phylo(tree, cex = 0.8, font = 2, adj = 0, xpd = TRUE, + label.offset = 0.0125) + nodelabels(tree$node.label, adj = c(1.3, -0.5), frame = "n", cex = 0.8, + font = 3, xpd = TRUE) + if (type == "nj"){ + add.scale.bar(lwd = 5, length = barlen) + } else { + axisPhylo(3) + } +} + + +#==============================================================================# +# Function to do something with Rodger's distance +# +# Public functions utilizing this function: +# rodger.dist +# +# Private functions utilizing this function: +# # none +#==============================================================================# +# From adegenet dist.genpop +dcano <- function(mat) { + daux <- mat%*%t(mat) + vec <- diag(daux) + daux <- -2*daux + vec[col(daux)] + vec[row(daux)] + diag(daux) <- 0 + # if (any(daux == Inf)){ + # daux <- infinite_vals_replacement(daux, warning) + # } + daux <- sqrt(daux*0.5) + return(daux) +} + +#==============================================================================# +# tabulate the amount of missing data per locus. +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # percent_missing +#==============================================================================# + +number_missing_locus <- function(x, divisor){ + missing_result <- colSums(1 - propTyped(x, by = "both")) + return(missing_result/divisor) +} + +#==============================================================================# +# tabulate the amount of missing data per genotype. +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # percent_missing +#==============================================================================# + +number_missing_geno <- function(x, divisor){ + missing_result <- rowSums(1 - propTyped(x, by = "both")) + return(missing_result/divisor) +} + +#==============================================================================# +# Replace infinite values with the maximum finite value of a distance matrix. +# +# Public functions utilizing this function: +# nei.dist +# +# Private functions utilizing this function: +# # none +#==============================================================================# + +infinite_vals_replacement <- function(D, warning){ + if (warning){ + warning("Infinite values detected.") + } + maxval <- max(D[!D == Inf]) + D[D == Inf] <- maxval*10 + return(D) +} + +#==============================================================================# +# Given a tree function and a distance function, this will generate an +# automatic tree generating function. This is useful for functions such as +# boot.phylo. +# +# Public functions utilizing this function: +# anyboot +# +# Private functions utilizing this function: +# # none +#==============================================================================# +tree_generator <- function(tree, distance, quiet = TRUE, ...){ + TREEFUNK <- match.fun(tree) + DISTFUNK <- match.fun(distance) + distargs <- formals(distance) + otherargs <- list(...) + #print(otherargs) + matchargs <- names(distargs)[names(distargs) %in% names(otherargs)] + distargs[matchargs] <- unlist(otherargs[matchargs]) + #print(distargs) + if (!quiet) cat("\nTREE....... ", tree,"\nDISTANCE... ", distance) + treedist <- function(x){ + distargs[[1]] <- x + #print(distargs) + TREEFUNK(do.call(DISTFUNK, distargs)) + } + return(treedist) +} + +#==============================================================================# +# This will retrieve a genetic matrix based on genpop status or not. +# +# Public functions utilizing this function: +# *.dist +# +# Private functions utilizing this function: +# # none +#==============================================================================# +get_gen_mat <- function(x){ + if (is.genpop(x) && x@type == "codom"){ + MAT <- makefreq(x, missing = "mean", quiet = TRUE)$tab + } else { + MAT <- x@tab + } + return(MAT) +} + +#==============================================================================# +# This will retrieve the labels for the distance matrix from "gen" objects +# +# Public functions utilizing this function: +# *.dist +# +# Private functions utilizing this function: +# # none +#==============================================================================# +get_gen_dist_labs <- function(x){ + if (is.genind(x)){ + labs <- indNames(x) + } else if (is.genpop(x)){ + labs <- x@pop.names + } else if (is(x, "bootgen")){ + labs <- names(x) + } else { + labs <- rownames(x) + } + return(labs) +} + +#==============================================================================# +# This will give attributes to genetic distance matrices. +# +# Public functions utilizing this function: +# *.dist +# +# Private functions utilizing this function: +# # none +#==============================================================================# +make_attributes <- function(d, nlig, labs, method, matched_call){ + attr(d, "Size") <- nlig + attr(d, "Labels") <- labs + attr(d, "Diag") <- FALSE + attr(d, "Upper") <- FALSE + attr(d, "method") <- method + attr(d, "call") <- matched_call + class(d) <- "dist" + return(d) +} + +#==============================================================================# +# Function used to update colors in poppr msn +# +# Public functions utilizing this function: +# plot_poppr_msn +# +# Private functions utilizing this function: +# # none +#==============================================================================# +update_poppr_graph <- function(graphlist, PALETTE){ + PALETTE <- match.fun(PALETTE) + lookup <- data.frame(old = graphlist$colors, + update = PALETTE(length(graphlist$colors)), + stringsAsFactors = FALSE) + if (nrow(lookup) > 1){ + colorlist <- V(graphlist$graph)$pie.color + V(graphlist$graph)$pie.color <- lapply(colorlist, update_colors, lookup) + } else { + colorlist <- V(graphlist$graph)$color + V(graphlist$graph)$color <- rep(PALETTE(1), length(colorlist)) + } + graphlist$colors <- lookup[[2]] + return(graphlist) +} + +#==============================================================================# +# Function used to update colors in poppr msn +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # update_poppr_graph +#==============================================================================# +update_colors <- function(colorvec, lookup){ + x <- vapply(1:length(colorvec), update_single_color, "a", lookup, colorvec) + return(x) +} +#==============================================================================# +# Function used to update colors in poppr msn +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # update_colors +#==============================================================================# +update_single_color <- function(x, lookup, colorvec){ + update <- lookup[[2]] + original <- lookup[[1]] + return(update[original %in% colorvec[x]]) +} + +#==============================================================================# +# Function used to obtain information about local ploidy in a genind data set +# for polyploids. When polyploids are imported, the entire data set is coded in +# such a way that the whole data set is considered to be the ploidy of the higest +# observed datum. The missing data are coded as "0". This function will simply +# take the difference between the maximum ploidy and the number of zeroes present. +# +# Public functions utilizing this function: +# info_table +# +# Private functions utilizing this function: +# # none +#==============================================================================# +get_local_ploidy <- function(x){ + ploidy <- x@ploidy + stopifnot(ploidy > 2) + stopifnot(test_zeroes(x)) + zerocol <- which(as.numeric(x@all.names[[1]]) == 0) + locs <- names(x@loc.names) + locmat <- vapply(1:nLoc(x), function(z) as.integer(round(ploidy - x[loc = locs[z]]@tab[, zerocol]*ploidy)), + integer(nInd(x))) + return(locmat) +} +#==============================================================================# +# Test if a data set is comprised of microsatellites. +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # test_zeroes +#==============================================================================# +test_microsat <- function(x){ + allnames <- unlist(lapply(x@all.names, as.numeric)) + if (any(is.na(allnames))){ + return(FALSE) + } else { + return(TRUE) + } +} +#==============================================================================# +# Test if a polyploid microsatellite data set represent missing data as "0" +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # get_local_ploidy +#==============================================================================# +test_zeroes <- function(x){ + if (test_microsat(x)){ + allnames <- unlist(lapply(x@all.names, as.numeric)) + if (any(allnames == 0) & x@ploidy > 2){ + return(TRUE) + } + } + return(FALSE) +} + +#==============================================================================# +# Internal plotting function for mlg.table +# +# Public functions utilizing this function: +# none +# +# Private functions utilizing this function: +# # print_mlg_barplot +#==============================================================================# +mlg_barplot <- function(mlgt){ + + # create a data frame that ggplot2 can read. + mlgt.df <- as.data.frame(list(MLG = colnames(mlgt), + count = as.vector(mlgt)), + stringsAsFactors = FALSE) + + # Organize the data frame by count in descending order. + rearranged <- order(mlgt.df$count, decreasing = TRUE) + mlgt.df <- mlgt.df[rearranged, ] + mlgt.df[["MLG"]] <- factor(mlgt.df[["MLG"]], unique(mlgt.df[["MLG"]])) + + # plot it + return(ggplot(mlgt.df, aes_string(x = "MLG", y = "count")) + + geom_bar(aes_string(fill = "count"), position="identity", stat = "identity")) +} + +#==============================================================================# +# Internal plotting function for mlg.table +# +# Public functions utilizing this function: +# mlg.table +# +# Private functions utilizing this function: +# # none +#==============================================================================# + +print_mlg_barplot <- function(n, mlgtab, quiet=quiet) { + if(!quiet) cat("|",n,"\n") + + # Gather all nonzero values + mlgt <- mlgtab[n, mlgtab[n, ] > 0, drop=FALSE] + + # controlling for the situation where the population size is 1. + if (sum(mlgtab[n, ]) > 1){ + print(mlg_barplot(mlgt) + + theme_classic() %+replace% + theme(axis.text.x=element_text(size=10, angle=-45, hjust=0, vjust=1)) + + labs(title=paste("Population:", n, "\nN =", sum(mlgtab[n, ]), + "MLG =", length(mlgt)))) + } +} + +#==============================================================================# +# Internal function for resampling loci for genotype accumulation curve. +# +# Public functions utilizing this function: +# genotype_curve +# +# Private functions utilizing this function: +# # none +#==============================================================================# +get_sample_mlg <- function(size, samp, nloci, gen, progbar){ + if (!is.null(progbar)){ + setTxtProgressBar(progbar, size/(nloci-1)) + } + out <- vapply(1:samp, function(x) nrow(unique(sample(gen, size))), integer(1)) + return(out) +} + diff --git a/R/messages.r b/R/messages.r new file mode 100644 index 00000000..16f2107b --- /dev/null +++ b/R/messages.r @@ -0,0 +1,177 @@ +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# +# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate +# students at Oregon State University; and Dr. Nik Grünwald, an employee of +# USDA-ARS. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for educational, research and non-profit purposes, without fee, +# and without a written agreement is hereby granted, provided that the statement +# above is incorporated into the material, giving appropriate attribution to the +# authors. +# +# Permission to incorporate this software into commercial products may be +# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for +# Commercialization and Corporate Development. +# +# The software program and documentation are supplied "as is", without any +# accompanying services from the USDA or the University. USDA ARS or the +# University do not warrant that the operation of the program will be +# uninterrupted or error-free. The end-user understands that the program was +# developed for research purposes and is advised not to rely exclusively on the +# program for any reason. +# +# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY +# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY +# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#==============================================================================# + +#==============================================================================# +# Message to print after running the poppr function +# +# Public functions utilizing this function: +# # poppr poppr.all +# +# Internal functions utilizing this function: +# # none +#==============================================================================# + +poppr_message <- function(){ + cat("-----------------------------------------------------------------------|\n") + cat("Pop = Population name (Total == Pooled)\n") + cat("N = Census population size\n") + cat("MLG = Number of unique multilocus genotypes (MLG) observed\n") + cat("eMLG = Number of expected MLG based on rarefaction at smallest N >= 10\n") + cat("SE = Standard error of rarefaction analysis\n") + cat("H = Shannon-Wiener Index of MLG diversity\n") + cat("G = Stoddart and Taylor's Index of MLG diversity\n") + cat("Hexp = Nei's 1978 genotypic diversity (Expected Heterozygosity)\n") + cat("E.5 = Evenness\n") + cat("Ia = Index of association\n") + cat("rbarD = Standardized index of association\n") + cat("-----------------------------------------------------------------------|\n") +} + +#==============================================================================# +# A function that will quit the function if a level in the hierarchy is not +# present in the given data frame. +# +# Public functions utilizing this function: +# # setpop gethierarchy poppr.amova +# +# Internal functions utilizing this function: +# # make_hierarchy make_ade_df +#==============================================================================# +hier_incompatible_warning <- function(levs, df){ + msg <- paste("One or more levels in the given hierarchy is not present", + "in the data frame.", + "\nHierarchy:\t", paste(levs, collapse = ", "), "\nData:\t\t", + paste(names(df), collapse = ", ")) + return(msg) +} + +#==============================================================================# +# Warning message for when a distance matrix is non-euclidean and the user +# did not specify an appropriate correction. +# +# Public functions utilizing this function: +# # poppr.amova +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +not_euclid_msg <- function(correction){ + msg <- paste0("\nThe distance matrix generated is non-euclidean and a ", + "correction is needed.\n", + "You supplied: correction = '", correction, "'\nPlease change", + " it to one of the following:\n", + "\t'cailliez'\t'quasieuclid'\t'lingoes'") + return(msg) +} + +#==============================================================================# +# Warning message for the function popsub. +# Public functions utilizing this function: +# +# # popsub +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +unmatched_pops_warning <- function(pops, sublist){ + msg <- paste("The sublist provided does not match any of the populations:\n", + "\tsublist.......", sublist, "\n", + "\tPopulations...", paste(pops, collapse = " ")) + return(msg) +} + +#==============================================================================# +# Warning messages for Bruvo's distance calculation. +# Public functions utilizing this function: +# +# # bruvo.dist bruvo.boot bruvo.msn +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +repeat_length_warning <- function(replen){ + msg <- paste("\n\nRepeat length vector for loci is not equal to the number", + "of loci represented.\nEstimating repeat lengths from data:\n", + paste0("c(", paste(replen, collapse = ", "),")"), "\n\n") + return(msg) +} + +non_ssr_data_warning <- function(){ + msg <- paste("\nThis dataset does not appear to be microsatellite data.", + "Bruvo's Distance can only be applied for true microsatellites.") + return(msg) +} + +#==============================================================================# +# Warning message for Neighbor-Joining trees. +# Public functions utilizing this function: +# +# # aboot bruvo.boot +# +# Internal functions utilizing this function: +# # none +#==============================================================================# +negative_branch_warning <- function(){ + msg <- paste("Some branch lengths of the tree are negative.", + "Normalizing branches according to Kuhner and Felsenstein", + "(1994)") + return(msg) +} + +#==============================================================================# +# Warning message for mlg.crosspop with the flag mlgsub. +# Public functions utilizing this function: +# +# # mlg.crosspop +# +# Internal functions utilizing this function: +# # none +#==============================================================================# + +mlg_sub_warning <- function(mlgs){ + msg <- paste0("The following multilocus genotypes are not defined in this ", + "dataset: ", paste(mlgs, collapse = ", ")) + return(msg) +} diff --git a/R/methods.r b/R/methods.r new file mode 100644 index 00000000..85f57eb8 --- /dev/null +++ b/R/methods.r @@ -0,0 +1,1010 @@ +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# +# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate +# students at Oregon State University; and Dr. Nik Grünwald, an employee of +# USDA-ARS. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for educational, research and non-profit purposes, without fee, +# and without a written agreement is hereby granted, provided that the statement +# above is incorporated into the material, giving appropriate attribution to the +# authors. +# +# Permission to incorporate this software into commercial products may be +# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for +# Commercialization and Corporate Development. +# +# The software program and documentation are supplied "as is", without any +# accompanying services from the USDA or the University. USDA ARS or the +# University do not warrant that the operation of the program will be +# uninterrupted or error-free. The end-user understands that the program was +# developed for research purposes and is advised not to rely exclusively on the +# program for any reason. +# +# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY +# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY +# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#==============================================================================# + +################################################################################ +#------------------------------------------------------------------------------# +# BOOTGEN METHODS +#------------------------------------------------------------------------------# +################################################################################ + +#==============================================================================# +#' Methods used for the bootgen object. +#' +#' This is not designed for user interaction. +#' +#' @rdname bootgen-methods +#' @param x a \code{"\linkS4class{bootgen}"} object +#' @param i vector of numerics indicating number of individuals desired +#' @param j a vector of numerics corresponding to the loci desired. +#' @param ... unused. +#' @param drop set to \code{FALSE} +#' @keywords internal +#' @author Zhian N. Kamvar +#==============================================================================# +setMethod( + f = "[", + signature(x = "bootgen"), + definition = function(x, i, j, ..., drop = FALSE){ + if (missing(i)) i <- TRUE + if (missing(j)) j <- TRUE + loc <- dim(x)[2] + if (length(j) > loc | any(j > loc)){ + stop('subscript out of bounds') + } + + # Taking Names + locnall <- x@loc.nall[j] + allnames <- x@all.names[j] + names(allnames) <- names(x@all.names)[1:length(j)] + names(locnall) <- names(allnames) + alllist <- slot(x, "alllist")[j] + # Shuffling + # matcols <- apply(getinds(x@loc.nall), 1, function(ind) ind[1]:ind[2])[j] + # matcols <- .Call("expand_indices", cumsum(x@loc.nall), nLoc(x))[j] + indices <- unlist(alllist) + locnames <- rep(names(allnames), locnall) + tabnames <- paste(locnames, unlist(allnames), sep = ".") + res <- slot(x, "tab")[i, indices, drop = drop] + colnames(res) <- tabnames + + ## Resetting all factors that need to be set. + slot(x, "tab") <- res + slot(x, "loc.fac") <- factor(locnames, names(allnames)) + slot(x, "loc.names") <- names(allnames) + slot(x, "loc.nall") <- locnall + slot(x, "all.names") <- allnames + slot(x, "alllist") <- alllist + slot(x, "names") <- slot(x, "names")[i] + return(x) + } +) + +#==============================================================================# +#' @rdname bootgen-methods +#==============================================================================# +setMethod( + f = "dim", + signature(x = "bootgen"), + definition = function(x){ + return(c(length(slot(x, "names")), length(slot(x, "loc.names")))) + } +) + +#==============================================================================# +#' @rdname bootgen-methods +#==============================================================================# +setMethod( + f = "$", + signature(x = "bootgen"), + definition = function(x, name){ + return(slot(x, name)) + } +) + +#==============================================================================# +#' @rdname bootgen-methods +#' @param .Object a character, "bootgen" +#' @param gen a genind, genclone, or genpop object +#==============================================================================# +setMethod( + f = "initialize", + signature = "bootgen", + definition = function(.Object, gen){ + if (missing(gen)){ + stop("gen must be specified.") + } + if (is.genind(gen)){ + objnames <- slot(gen, "ind.names") + tab <- slot(gen, "tab") + } else if (is.genpop(gen)){ + objnames <- slot(gen, "pop.names") + tab <- makefreq(gen, missing = "mean", quiet = TRUE)$tab + } else { + stop("gen must be a valid gen object.") + } + num_alleles <- slot(gen, "loc.nall") + num_loci <- length(num_alleles) + slot(.Object, "tab") <- tab + slot(.Object, "loc.fac") <- slot(gen, "loc.fac") + slot(.Object, "loc.names") <- slot(gen, "loc.names") + slot(.Object, "loc.nall") <- num_alleles + slot(.Object, "all.names") <- slot(gen, "all.names") + slot(.Object, "alllist") <- .Call("expand_indices", cumsum(num_alleles), num_loci, PACKAGE = "poppr") + slot(.Object, "names") <- objnames + slot(.Object, "type") <- slot(gen, "type") + slot(.Object, "ploidy") <- as.integer(slot(gen, "ploidy")) + return(.Object) + }) + +################################################################################ +#------------------------------------------------------------------------------# +# BRUVOMAT METHODS +#------------------------------------------------------------------------------# +################################################################################ + +#==============================================================================# +#' @rdname bruvomat-methods +#' @param .Object a character, "bruvomat" +#' @param gen \code{"\linkS4class{genind}"} object +#' @param replen a vector of numbers indicating the repeat length for each +#' microsatellite locus. +#' @keywords internal +#' @author Zhian N. Kamvar +#==============================================================================# +setMethod( + f = "initialize", + signature = "bruvomat", + definition = function(.Object, gen, replen){ + if (missing(gen)) gen <- new("genind") + if (missing(replen)){ + replen <- vapply(gen@all.names, function(y) guesslengths(as.numeric(y)), 1) + } + ploid <- ploidy(gen) + # This controlls for the user correcting missing data using "mean". + if (any(!gen@tab %in% c((0:ploid)/ploid, NA))){ + gen@tab[!gen@tab %in% c((0:ploid)/ploid, NA)] <- NA + } + # This will check for data that has missing scored as "zero". + popcols <- ploid*nLoc(gen) + if (!any(is.na(gen@tab)) & any(rowSums(gen@tab, na.rm=TRUE) < nLoc(gen))){ + mat1 <- as.matrix.data.frame(genind2df(gen, sep="/", usepop=FALSE)) + mat1[mat1 %in% c("", NA)] <- paste(rep(0, ploid), collapse="/") + mat2 <- apply(mat1, 1, strsplit, "/") + mat3 <- apply(as.matrix(t(sapply(mat2, unlist))), 2, as.numeric) + vec1 <- suppressWarnings(as.numeric(unlist(mat3))) + pop <- matrix(vec1, nrow=nInd(gen), ncol=popcols) + } else { + popdf <- genind2df(gen, oneColPerAll=TRUE, usepop=FALSE) + mat1 <- as.matrix.data.frame(popdf) + pop <- suppressWarnings(matrix(as.numeric(mat1), ncol=popcols)) + } + slot(.Object, "mat") <- pop + slot(.Object, "replen") <- replen + slot(.Object, "ploidy") <- ploid + slot(.Object, "ind.names") <- indNames(gen) + return(.Object) + } +) + +#==============================================================================# +#' @rdname bruvomat-methods +#' @keywords internal +#==============================================================================# +setMethod( + f = "dim", + signature(x = "bruvomat"), + definition = function(x){ + return(c(nrow(x@mat), ncol(x@mat)/x@ploidy)) + } +) + +#==============================================================================# +#' Methods used for the bruvomat object. +#' +#' This is not designed for user interaction. +#' +#' @rdname bruvomat-methods +#' @param x a \code{"\linkS4class{bruvomat}"} object +#' @param i vector of numerics indicating number of individuals desired +#' @param j a vector of numerics corresponding to the loci desired. +#' @param ... unused. +#' @keywords internal +#' @param drop set to \code{FALSE} +#==============================================================================# +setMethod( + f = "[", + signature(x = "bruvomat"), + definition = function(x, i, j, ..., drop = FALSE){ + if (missing(i)) i <- TRUE + if (missing(j)) j <- TRUE + x@replen <- x@replen[j] + x@ind.names <- x@ind.names[i] + cols <- rep(1:ncol(x), each = x@ploidy) + replacement <- vapply(j, function(ind) which(cols == ind), 1:x@ploidy) + x@mat <- x@mat[i, as.vector(replacement), drop = FALSE] + return(x) + } +) + +################################################################################ +#------------------------------------------------------------------------------# +# GENCLONE METHODS +#------------------------------------------------------------------------------# +################################################################################ +#==============================================================================# +#' Check for validity of a genclone object +#' +#' @note a \linkS4class{genclone} object will always be a valid +#' \linkS4class{genind} object. +#' +#' @export +#' @rdname is.genclone +#' @param x a genclone object +#' @author Zhian N. Kamvar +#' @examples +#' data(nancycats) +#' nanclone <- as.genclone(nancycats) +#' is.genclone(nanclone) +#==============================================================================# +is.genclone <- function(x){ + res <- (is(x, "genclone")) + return(res) +} + +#==============================================================================# +#' Methods used for the genclone object +#' +#' Default methods for subsetting genclone objects. +#' +#' @rdname genclone-method +#' @param x a genclone object +#' @param i vector of numerics indicating number of individuals desired +#' @param j a vector of numerics corresponding to the loci desired. +#' @param ... passed on to the \code{\linkS4class{genind}} object. +#' @param drop set to \code{FALSE} +#' @param loc passed on to \code{\linkS4class{genind}} object. +#' @param treatOther passed on to \code{\linkS4class{genind}} object. +#' @param quiet passed on to \code{\linkS4class{genind}} object. +#' @author Zhian N. Kamvar +#==============================================================================# +setMethod( + f = "[", + signature(x = "genclone", i = "ANY", j = "ANY", drop = "ANY"), + definition = function(x, i, j, ..., loc=NULL, treatOther=TRUE, quiet=TRUE, drop = FALSE){ + if (missing(i)) i <- TRUE + if (missing(j)) j <- TRUE + mlg <- slot(x, "mlg")[i] + hierarchy <- slot(x, "hierarchy")[i, , drop = FALSE] + ## The following is lifted directly from the adegenet source code as + ## callNextMethod() was throwing the error: + ## + ## Error in callNextMethod() : bad object found as method (class "function") + ## + ## The reason for this is the fact that the `genind` function calls + ## `new("genind")` within the function. Because of this, using + ## `callNextMethod()` returns a genind object instead of modifying the + ## genclone object as it should. + pop <- NULL + if (is.null(x@pop)) { + tab <- truenames(x) + } else { + temp <- truenames(x) + tab <- temp$tab + pop <- temp$pop + pop <- factor(pop[i]) + } + nrowx <- nrow(x@tab) + old.other <- other(x) + + ## handle loc argument + if(!is.null(loc)){ + loc <- as.character(loc) + temp <- !loc %in% x@loc.fac + if (any(temp)) { # si mauvais loci + noloc <- paste(loc[temp], collapse = " ") + warning(paste("the following specified loci do not exist:", noloc)) + } + j <- x$loc.fac %in% loc + } # end loc argument + + prevcall <- x@call + tab <- tab[i, j, ..., drop=FALSE] + + if(drop){ + allNb <- apply(tab, 2, sum, na.rm=TRUE) # allele absolute frequencies + toKeep <- (allNb > 1e-10) + tab <- tab[ , toKeep, drop=FALSE] + } + + res <- genind(tab, pop=pop, prevcall=prevcall, ploidy=x@ploidy, type=x@type) + res <- new("genclone", res, hierarchy, mlg) + + ## handle 'other' slot + nOther <- length(x@other) + namesOther <- names(x@other) + counter <- 0 + if (treatOther){ + f1 <- function(obj, n = nrowx){ + counter <<- counter + 1 + if (!is.null(dim(obj)) && nrow(obj) == n){ # if the element is a matrix-like obj + obj <- obj[i , , drop=FALSE] + } else if (length(obj) == n){ # if the element is not a matrix but has a length == n + obj <- obj[i] + if (is.factor(obj)){ + obj <- factor(obj) + } + } else { + if (!quiet){ + warning(paste("cannot treat the object", namesOther[counter])) + } + } + return(obj) + } # end f1 + + res@other <- lapply(x@other, f1) # treat all elements + } else { + res@other <- old.other + } # end treatOther + + return(res) + } +) + +#==============================================================================# +#' @rdname genclone-method +#' @param .Object a character, "genclone" +#' @param gen \code{"\linkS4class{genind}"} object +#' @param hierarchy a data frame where each row i represents the different +#' population assignments of individual i in the data set. If this is empty, the +#' hierarchy will be created from the population factor. +#' @param mlg a vector where each element assigns the multilocus genotype of +#' that individual in the data set. +#' @keywords internal +#==============================================================================# +setMethod( + f = "initialize", + signature("genclone"), + definition = function(.Object, gen, hierarchy, mlg){ + if (missing(gen)){ + gen <- new("genind") + if (missing(mlg)) mlg <- 0 + } else { + if (missing(mlg)) mlg <- mlg.vector(gen) + } + if (missing(hierarchy)){ + if (is.null(pop(gen))){ + hierarchy <- data.frame() + } else { + hierarchy <- data.frame(Pop = pop(gen)) + } + } else { + hierarchy <- data.frame(lapply(hierarchy, function(f) factor(f, unique(f)))) + } + + # No 'initialize' method for genind objects... + lapply(names(gen), function(y) slot(.Object, y) <<- slot(gen, y)) + slot(.Object, "mlg") <- mlg + slot(.Object, "hierarchy") <- hierarchy + return(.Object) + } +) + +#==============================================================================# +#' @rdname genclone-method +#' @param object a genclone object +#==============================================================================# +setMethod( + f = "show", + signature("genclone"), + definition = function(object){ + ploid <- c("ha", "di", "tri", "tetra", "penta", "hexa", "hepta", "octa", + "nona", "deca", "hendeca", "dodeca") + ploid <- paste0(ploid[object@ploidy], "ploid") + nind <- nInd(object) + type <- ifelse(object@type == "PA", "dominant", "codominant") + nmlg <- length(unique(object@mlg)) + nloc <- nLoc(object) + npop <- ifelse(is.null(object@pop), 0, length(object@pop.names)) + hier <- length(object@hierarchy) + chars <- nchar(c(nmlg, nind, nloc, hier, npop)) + ltab <- max(chars) - chars + ltab <- vapply(ltab, function(x) substr(" ", 1, x+1), character(1)) + pops <- object@pop.names + poplen <- length(pops) + if (poplen > 7) + pops <- c(pops[1:3], "...", pops[(poplen-2):poplen]) + hiernames <- names(object@hierarchy) + hierlen <- length(hiernames) + if (hierlen > 7) + hiernames <- c(hiernames[1:3], "...", hiernames[(hierlen-2):hierlen]) + cat("\nThis is a genclone object\n") + cat("-------------------------\n") + cat("Genotype information:\n\n", + ltab[1], nmlg, "multilocus genotypes\n", + ltab[2], nind, ploid, "individuals\n", + ltab[3], nloc, type, "loci\n\n" + ) + pophier <- ifelse(hier > 1, "levels -", "level -") + if (hier == 0) pophier <- "levels." + popdef <- ifelse(npop > 0, "defined -", "defined.") + cat("Population information:\n\n") + cat("", ltab[4], hier, "hierarchical", pophier, hiernames, fill = TRUE) + cat("", ltab[5], npop, "populations", popdef, pops, fill = TRUE) + + }) + +#==============================================================================# +#' @rdname genclone-method +#' @export +#' @param x a genclone object +#' @param fullnames \code{logical}. If \code{TRUE}, then the full names of the +#' populations will be printed. If \code{FALSE}, then only the first and last +#' three population names are displayed. +#==============================================================================# +setMethod( + f = "print", + signature("genclone"), + definition = function(x, ...){ + ploid <- c("ha", "di", "tri", "tetra", "penta", "hexa", "hepta", "octa", + "nona", "deca", "hendeca", "dodeca") + ploid <- paste0(ploid[x@ploidy], "ploid") + nind <- nInd(x) + type <- ifelse(x@type == "PA", "dominant", "codominant") + nmlg <- length(unique(x@mlg)) + nloc <- nLoc(x) + npop <- ifelse(is.null(x@pop), 0, length(x@pop.names)) + hier <- length(x@hierarchy) + chars <- nchar(c(nmlg, nind, nloc, hier, npop)) + ltab <- max(chars) - chars + ltab <- vapply(ltab, function(x) substr(" ", 1, x + 1), character(1)) + pops <- x@pop.names + hiernames <- names(x@hierarchy) + cat("\nThis is a genclone object\n") + cat("-------------------------\n") + cat("Genotype information:\n\n", + ltab[1], nmlg, "multilocus genotypes\n", + ltab[2], nind, ploid, "individuals\n", + ltab[3], nloc, type, "loci\n\n" + ) + pophier <- ifelse(hier > 1, "levels -", "level -") + if (hier == 0) pophier <- "levels." + popdef <- ifelse(npop > 0, "defined -", "defined.") + cat("Population information:\n\n") + cat("", ltab[4], hier, "hierarchical", pophier, hiernames, fill = TRUE) + cat("", ltab[5], npop, "populations", popdef, pops, fill = TRUE) + }) + +#==============================================================================# +#' Create a genclone object from a genind object. +#' +#' Wrapper for genclone initializer. +#' +#' @export +#' @rdname coercion-methods +#' @aliases as.genclone,genind-method +#' @param x a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} +#' object +#' @param hierarchy a data frame representing the population hierarchy. +#' @docType methods +#' +#' @note The hierarchy must have the same number of rows as the number of +#' observations in the genind object. If no hierarchy is defined, the function +#' will search for a data frame in the \code{\link{other}} slot called +#' "population_hierarchy" and set that as the hieararchy. If none is defined, +#' the population will be set as the hierarchy under the label "Pop". Use the +#' function \code{\link{splithierarchy}} to split up any population +#' hierarchies that might be combined in the population factor. +#' +#' @seealso \code{\link{splithierarchy}}, \code{\linkS4class{genclone}}, +#' \code{\link{read.genalex}} +#' @author Zhian N. Kamvar +#' @examples +#' data(Aeut) +#' Aeut +#' Aeut.gc <- as.genclone(Aeut) +#' Aeut.gc +#' Aeut.gc <- as.genclone(Aeut, other(Aeut)$population_hierarchy[-1]) +#' Aeut.gc +#==============================================================================# +as.genclone <- function(x, hierarchy = NULL){ + standardGeneric("as.genclone") +} + +#' @export +setGeneric("as.genclone") + + +setMethod( + f = "as.genclone", + signature(x = "genind"), + definition = function(x, hierarchy){ + if (missing(hierarchy)){ + if ("population_hierarchy" %in% names(other(x))){ + hierarchy <- other(x)[["population_hierarchy"]] + newgenclone <- new("genclone", x, hierarchy) + } else { + newgenclone <- new("genclone", x) + } + } else { + newgenclone <- new("genclone", x, hierarchy) + } + return(newgenclone) + }) + +#==============================================================================# +# Seploc method for genclone objects. +#==============================================================================# +setMethod( + f = "seploc", + signature(x = "genclone"), + definition = function(x, ...){ + mlg <- x@mlg + hierarchy <- x@hierarchy + listx <- callNextMethod() + if (is.genind(listx[[1]])){ + listx <- lapply(listx, function(gid) new("genclone", gid, hierarchy, mlg)) + } + return(listx) + }) +#==============================================================================# +#' Access and manipulate the population hierarchy for genclone objects. +#' +#' The following methods allow the user to quickly change the hierarchy or +#' population of a genclone object. +#' +#' @export +#' @rdname hierarchy-methods +#' @aliases gethierarchy,genclone-method +#' @param x a genclone object +#' @param formula a nested formula indicating the order of the population +#' hierarchy. +#' @param combine if \code{TRUE}, the levels will be combined according to the +#' formula argument. If it is \code{FALSE}, the levels will not be combined. +#' @docType methods +#==============================================================================# +gethierarchy <- function(x, formula = NULL, combine = TRUE){ + standardGeneric("gethierarchy") +} + +#' @export +setGeneric("gethierarchy") + +setMethod( + f = "gethierarchy", + signature(x = "genclone"), + definition = function(x, formula = NULL, combine = TRUE){ + if (is.null(formula)) return(x@hierarchy) + vars <- all.vars(formula) + if (any(!vars %in% names(x@hierarchy))){ + stop(hier_incompatible_warning(vars, x@hierarchy)) + } + if (combine){ + hier <- make_hierarchy(formula, x@hierarchy) + } else { + hier <- x@hierarchy[all.vars(formula)] + } + invisible(return(hier)) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases sethierarchy<-,genclone-method +#' @param value a data frame OR vector OR formula (see details). +#' @docType methods +#' +#' @details \subsection{Function Specifics}{ \itemize{ \item +#' \strong{gethierarchy()} - This will retrieve the data from the +#' \emph{hierarchy} slot in the \linkS4class{genclone} object. You have the +#' option to choose specific heirarchical levels using a formula (see below) and +#' you can choose to combine the hierarchical levels (default) \item +#' \strong{sethierarchy()} - Set or reset the hierarchical levels in your +#' \linkS4class{genclone} object. \item \strong{namehierarchy()} - Rename the +#' hierarchical levels. \item \strong{splithierarchy()} - This is conceptually +#' similar to the default method of \code{\link{splitcombine}}. It is often +#' difficult to import files with several levels of hierarchy as most data +#' formats do not allow unlimited population levels. This is circumvented by +#' collapsing all hierarchical levels into a single population factor with a +#' common separator for each observation. This function will then split those +#' hierarchies for you, but it works best on a hierarchy that only has a single +#' column in it. See the rootrot example below. \item \strong{addhierarchy()} - +#' Add levels to your population hierarchy. If you have extra hierarchical +#' levels you want to add to your population hierarchy, you can use this method +#' to do so. You can input a data frame or a vector, but if you put in a vector, +#' you have the option to name it (if you are using the functional version and +#' not the assignment version). }} +#' +#' \subsection{Argument Specifics}{ +#' +#' These functions allow the user to seamlessly assign the hierarchical levels +#' of their \code{\linkS4class{genclone}} object. Note that there are two ways +#' of performing all methods (except for \code{gethierarchy()}). They +#' essentially do the same thing except that the assignment method (the one with +#' the "\code{<-}") will modify the object in place whereas the non-assignment +#' method will not modify the original object. Due to convention, everything +#' right of the assignment is termed \code{value}. To avoid confusion, here is a +#' guide to the inputs: \itemize{ \item \strong{sethierarchy()} This will be a +#' \code{\link{data.frame}} that defines the hierarchy for each individual in +#' the rows. \item \strong{namehierarchy()} This will be either a +#' \code{\link{vector}} or a \code{\link{formula}} that will define the names. +#' \item \strong{splithierarchy()} This will be a \code{\link{formula}} argument +#' with the same number of levels as the hierarchy you wish to split. \item +#' \strong{addhierarchy()} This will be a \code{\link{vector}} or +#' \code{\link{data.frame}} with the same length as the number of individuals in +#' your data. }} +#' +#' \subsection{Details on Formulas}{ +#' +#' The preferred use of these functions is with a \code{\link{formula}} object. +#' Specifically, a hierarchical formula argument is used to assign the levels of +#' the hierarchy. An example of a hierarchical formula would be:\cr +#' \code{~Country/City/Neighborhood}\cr or \cr \code{~Country + Country:City + +#' Country:City:Neighborhood}\cr of course, the first method is slightly easier +#' to read. It is important to use hiearchical formulas when specifying +#' hierarchies as other types of formulas (eg. +#' \code{~Country*City*Neighborhood}) might give spurious results.} +#' +#' @seealso \code{\link{setpop}} \code{\link{genclone}} +#' \code{\link{as.genclone}} +#' +#' @author Zhian N. Kamvar +#' @examples +#' # let's look at the microbov data set: +#' data(microbov) +#' microgc <- as.genclone(microbov) +#' microgc +#' +#' # We see that we have three vectors of different names here. +#' ?microbov +#' # These are Country, Breed, and Species +#' names(other(microgc)) +#' +#' # Let's set the hierarchy +#' sethierarchy(microgc) <- data.frame(other(microgc)) +#' microgc +#' +#' # And change the names so we know what they are +#' namehierarchy(microgc) <- ~Country/Breed/Species +#' +#' # let's see what the hierarchy looks like by Species and Breed: +#' head(gethierarchy(microgc, ~Breed/Species)) +#' +#' \dontrun{ +#' # Load our data set and convert it to a genclone object. +#' Aeut.gc <- read.genalex(system.file("files/rootrot.csv", package = "poppr")) +#' +#' # we can see the hierarchy is set to Population_Subpopulation. +#' head(gethierarchy(Aeut.gc)) +#' +#' # We can use splithierarchy() to split them. +#' splithierarchy(Aeut.gc) <- ~Pop/Subpop +#' Aeut.gc +#' head(gethierarchy(Aeut.gc)) +#' +#' # We can also use gethierarchy to combine the hierarchy. +#' head(gethierarchy(Aeut.gc, ~Pop/Subpop)) +#' +#' # We can also give it a more descriptive name. +#' namehierarchy(Aeut.gc) <- ~Population/Subpopulation +#' Aeut.gc +#' Aeut.gc <- namehierarchy(Aeut.gc, ~Pop/Subpop) +#' Aeut.gc +#' } +#==============================================================================# +sethierarchy <- function(x, value){ + standardGeneric("sethierarchy") +} + +#' @export +setGeneric("sethierarchy") + +setMethod( + f = "sethierarchy", + signature(x = "genclone"), + definition = function(x, value){ + if (!inherits(value, "data.frame")){ + stop(paste(substitute(value), "is not a data frame")) + } + if (nrow(value) != nInd(x)){ + stop("Number of rows in data frame not equal to number of individuals in object.") + } + value <- data.frame(lapply(value, function(f) factor(f, unique(f)))) + x@hierarchy <- value + return(x) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases sethierarchy,genclone-method +#' @docType methods +#==============================================================================# +"sethierarchy<-" <- function(x, value){ + standardGeneric("sethierarchy<-") +} + +#' @export +setGeneric("sethierarchy<-") + +setMethod( + f = "sethierarchy<-", + signature(x = "genclone"), + definition = function(x, value){ + return(sethierarchy(x, value)) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases namehierarchy,genclone-method +#' @docType methods +#==============================================================================# +namehierarchy <- function(x, value){ + standardGeneric("namehierarchy") +} + +#' @export +setGeneric("namehierarchy") + +setMethod( + f = "namehierarchy", + signature(x = "genclone"), + definition = function(x, value){ + if (is.language(value)){ + value <- all.vars(value) + } + if (!is.vector(value) | length(value) != length(x@hierarchy)){ + stop(paste("Hierarchy, needs a vector argument of length", length(x@hierarchy))) + } + names(x@hierarchy) <- value + return(x) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases namehierarchy<-,genclone-method +#' @docType methods +#==============================================================================# +"namehierarchy<-" <- function(x, value){ + standardGeneric("namehierarchy<-") +} + +#' @export +setGeneric("namehierarchy<-") + +setMethod( + f = "namehierarchy<-", + signature(x = "genclone"), + definition = function(x, value){ + return(namehierarchy(x, value)) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases splithierarchy,genclone-method +#' @docType methods +#' @param sep a \code{character} indicating the character used to separate +#' hierarchical levels. This defaults to "_". +#' @importFrom reshape2 colsplit +#==============================================================================# +splithierarchy <- function(x, value, sep = "_"){ + standardGeneric("splithierarchy") +} + +#' @export +setGeneric("splithierarchy") + +setMethod( + f = "splithierarchy", + signature(x = "genclone"), + definition = function(x, value, sep = "_"){ + if (is.language(value)){ + # valterms <- attr(terms(value), "term.labels") + # valterms <- valterms[length(valterms)] + # valterms <- gsub(":", sep, valterms) + value <- all.vars(value) + } else { + stop("value must be a formula.") + } + if (length(value) < 1){ + stop("value must have more than one hierarchical level.") + } + hierarchy <- x@hierarchy + if (length(hierarchy) > 1){ + warning("Hierarchy must be length 1. Taking the first column.") + hierarchy <- hierarchy[1] + } + seps <- gregexpr(sep, hierarchy[[1]]) + sepmatch <- vapply(seps, function(val) all(as.integer(val) > 0), logical(1)) + seps <- vapply(seps, length, numeric(1)) + all_seps_match <- all(sepmatch) + given_seps <- length(value) - 1 + if (!all_seps_match | all(seps != given_seps)){ + seps <- ifelse(all_seps_match, seps[1], 0) + 1 + msg1 <- paste("\n Data has", seps, ifelse(seps == 1, "level", "levels"), + "of hierarchy with the separator", sep, ".") + msg2 <- paste("Here is the fist column of the data:", hierarchy[1, ]) + stop(paste(msg1, "\n ", msg2)) + } + x@hierarchy <- colsplit(as.character(hierarchy[[1]]), pattern = sep, value) + x@hierarchy <- data.frame(lapply(x@hierarchy, function(f) factor(f, levels = unique(f)))) + # names(hierarchy) <- value + # x@hierarchy <- hierarchy + return(x) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases splithierarchy<-,genclone-method +#' @docType methods +#==============================================================================# +"splithierarchy<-" <- function(x, value){ + standardGeneric("splithierarchy<-") +} + +#' @export +setGeneric("splithierarchy<-") + +setMethod( + f = "splithierarchy<-", + signature(x = "genclone"), + definition = function(x, value){ + return(splithierarchy(x, value)) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases addhierarchy,genclone-method +#' @param name an optional name argument for use with addhierarchy if supplying +#' a vector. Defaults to "NEW". +#' @docType methods +#==============================================================================# +addhierarchy <- function(x, value, name = "NEW"){ + standardGeneric("addhierarchy") +} + +#' @export +setGeneric("addhierarchy") + +setMethod( + f = "addhierarchy", + signature(x = "genclone"), + definition = function(x, value, name = "NEW"){ + + hierarchy <- x@hierarchy + if ((is.vector(value) | is.factor(value)) & length(value) == nrow(hierarchy)){ + value <- factor(value, levels = unique(value)) + NEW <- data.frame(value) + names(NEW) <- name + hierarchy <- cbind(hierarchy, NEW) + } else if (is.data.frame(value) && nrow(value) == nrow(hierarchy)){ + value <- data.frame(lapply(value, function(f) factor(f, unique(f)))) + hierarchy <- cbind(hierarchy, value) + } else { + stop("value must be a vector or data frame.") + } + x@hierarchy <- hierarchy + return(x) + }) + +#==============================================================================# +#' @export +#' @rdname hierarchy-methods +#' @aliases addhierarchy<-,genclone-method +#' @docType methods +#==============================================================================# +"addhierarchy<-" <- function(x, value){ + standardGeneric("addhierarchy<-") +} + +#' @export +setGeneric("addhierarchy<-") + +setMethod( + f = "addhierarchy<-", + signature(x = "genclone"), + definition = function(x, value){ + return(addhierarchy(x, value)) + }) + + +#==============================================================================# +#' Manipulate the population factor of genclone objects. +#' +#' The following methods allow the user to quickly change the population of a +#' genclone object. +#' +#' @export +#' @rdname population-methods +#' @param x a genclone object +#' @param formula a nested formula indicating the order of the population +#' hierarchy. +#' @param value same as formula +#' @aliases setpop,genclone-method +#' @docType methods +#' @author Zhian N. Kamvar +#' @examples +#' +#' data(Aeut) +#' Aeut.gc <- as.genclone(Aeut) +#' +#' # Notice that there are two hierarchical levels, Pop and Subpop +#' Aeut.gc +#' +#' # Currently set on just Pop +#' head(pop(Aeut.gc)) +#' +#' # setting the hierarchy to both Pop and Subpop +#' setpop(Aeut.gc) <- ~Pop/Subpop +#' head(pop(Aeut.gc)) +#' +#' \dontrun{ +#' +#' # Can be used to create objects as well. +#' Aeut.old <- setpop(Aeut.gc, ~Pop) +#' head(pop(Aeut.old)) +#' } +#==============================================================================# +setpop <- function(x, formula = NULL) standardGeneric("setpop") + +#' @export +setGeneric("setpop") + +setMethod( + f = "setpop", + signature(x = "genclone"), + definition = function(x, formula = NULL){ + if (is.null(formula) | !is.language(formula)){ + stop(paste(substitute(formula), "must be a valid formula object.")) + } + vars <- all.vars(formula) + if (!all(vars %in% names(x@hierarchy))){ + stop(hier_incompatible_warning(vars, x@hierarchy)) + } + pop(x) <- make_hierarchy(formula, x@hierarchy)[[length(vars)]] + return(x) + }) + +#==============================================================================# +#' @export +#' @rdname population-methods +#' @aliases setpop<-,genclone-method +#' @docType methods +#==============================================================================# +"setpop<-" <- function(x, value) standardGeneric("setpop<-") + +#' @export +setGeneric("setpop<-") + +setMethod( + f = "setpop<-", + signature(x = "genclone"), + definition = function(x, value){ + return(setpop(x, value)) + }) diff --git a/R/mlg.r b/R/mlg.r index 642c844e..e86ea0db 100755 --- a/R/mlg.r +++ b/R/mlg.r @@ -45,7 +45,7 @@ #' #' @name mlg #' -#' @param pop a \code{\link{genind}} object. +#' @param pop a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} object. #' #' @param sublist a \code{vector} of population names or indices that the user #' wishes to keep. Default to "ALL". @@ -73,11 +73,69 @@ #' #' @param total \code{logical} If \code{TRUE}, a row containing the sum of all #' represented MLGs is appended to the matrix produced by mlg.table. +#' +#' @return +#' \subsection{mlg}{ +#' an integer describing the number of multilocus genotypes observed. +#' } +#' \subsection{mlg.table}{ +#' a matrix with columns indicating unique multilocus genotypes and rows +#' indicating populations. +#' } +#' \subsection{mlg.vector}{ +#' a numeric vector naming the multilocus genotype of each individual in +#' the dataset. +#' } +#' \subsection{mlg.crosspop}{ +#' \itemize{ +#' \item{default}{ a \code{list} where each element contains a named integer vector representing the number of individuals represented from each population in that MLG} +#' \item{\code{indexreturn = TRUE}}{ a \code{vector} of integers defining the multilocus genotypes that have individuals crossing populations} +#' \item{\code{df = TRUE}}{ A long form data frame with the columns: MLG, Population, Count. Useful for graphing with ggplot2} +#' } +#' } +#' \subsection{mlg.id}{ +#' a list of multilocus genotypes with the associated individual names per MLG. +#' } #' -#' @seealso \code{\link{diversity}} \code{\link{popsub}} +#' @seealso \code{\link[vegan]{diversity}} \code{\link{popsub}} #' @author Zhian N. Kamvar #' @examples #' +#' # Load the data set +#' data(Aeut) +#' +#' # Investigate the number of multilocus genotypes. +#' amlg <- mlg(Aeut) +#' amlg # 119 +#' +#' # show the multilocus genotype vector +#' avec <- mlg.vector(Aeut) +#' avec +#' +#' # Get a table +#' atab <- mlg.table(Aeut, bar = FALSE) +#' atab +#' +#' # See where multilocus genotypes cross populations +#' acrs <- mlg.crosspop(Aeut) # MLG.59: (2 inds) Athena Mt. Vernon +#' +#' # See which individuals belong to each MLG +#' aid <- mlg.id(Aeut) +#' aid["59"] # individuals 159 and 57 +#' +#' \dontrun{ +#' +#' # A simple example. 10 individuals, 5 genotypes. +#' mat1 <- matrix(ncol=5, 25:1) +#' mat1 <- rbind(mat1, mat1) +#' mat <- matrix(nrow=10, ncol=5, paste(mat1,mat1,sep="/")) +#' mat.gid <- df2genind(mat, sep="/") +#' mlg(mat.gid) +#' mlg.vector(mat.gid) +#' mlg.table(mat.gid) +#' +#' # Now for a more complicated example. +#' # Data set of 1903 samples of the H3N2 flu virus genotyped at 125 SNP loci. #' data(H3N2) #' mlg(H3N2, quiet=FALSE) #' @@ -90,7 +148,6 @@ #' # Show which genotypes exist accross populations in the entire dataset. #' res <- mlg.crosspop(H3N2, quiet=FALSE) #' -#' \dontrun{ #' # Let's say we want to visualize the multilocus genotype distribution for the #' # USA and Russia #' mlg.table(H3N2, sublist=c("USA", "Russia"), bar=TRUE) @@ -110,178 +167,98 @@ #' # analyze only the MLGs that are duplicated across populations. #' new.H <- H3N2[H.vec %in% inds, ] #' -#' # A simple example. 10 individuals, 5 genotypes. -#' mat1 <- matrix(ncol=5, 25:1) -#' mat1 <- rbind(mat1, mat1) -#' mat <- matrix(nrow=10, ncol=5, paste(mat1,mat1,sep="/")) -#' mat.gid <- df2genind(mat, sep="/") -#' mlg(mat.gid) -#' mlg.vector(mat.gid) -#' mlg.table(mat.gid) #' } NULL #==============================================================================# #' @rdname mlg -# Multi Locus Genotype -# -# Count the number of unique multilocus genotypes found within a genind object. -# -# @param x a \code{\link{genind}} object. -# -# @param quiet default \code{TRUE}. If set to \code{FALSE}, it will display the -# number of individuals and MLG on the output device. #' -#' @return an integer of the number of multilocus genotypes within the sample. #' #' @export #==============================================================================# mlg <- function(pop, quiet=FALSE){ - if(!is.genind(pop)){ - stop("x is not a genind object") - } - if(nrow(pop@tab)==1){ - derp <- 1 + if (!is.genind(pop)){ + stop(paste(substitute(pop), "is not a genind object")) } - else { - derp <- nrow(unique(pop@tab[,1:ncol(pop@tab)])) + if (is.genclone(pop)){ + out <- length(unique(pop@mlg)) + } else { + if(nrow(pop@tab) == 1){ + out <- 1 + } + else { + out <- nrow(unique(pop@tab[, 1:ncol(pop@tab)])) + } } if(quiet!=TRUE){ cat("#############################\n") - cat("# Number of Individuals: "); cat(length(pop@ind.names),"\n") - cat("# Number of MLG: "); cat(derp,"\n") + cat("# Number of Individuals: ", nInd(pop), "\n") + cat("# Number of MLG: ", out, "\n") cat("#############################\n") } - return(derp) + return(out) } #==============================================================================# -#' @rdname mlg -# -#' @return a matrix with columns indicating unique multilocus genotypes and rows -#' indicating populations. +#' @rdname mlg #' #' @note The resulting matrix of \code{mlg.table} can be used for analysis with #' the \code{\link{vegan}} package. -#' The names of the multilocus genotypes represented will be those from -#' the entire dataset. If you wish to view those relative to a subsetted -#' dataset, you can use \code{mlg.bar(popsub(pop, ...))}. #' #' @export -# -# #==============================================================================# mlg.table <- function(pop, sublist="ALL", blacklist=NULL, mlgsub=NULL, bar=TRUE, total=FALSE, quiet=FALSE){ - if(!is.genind(pop)){ + if (!is.genind(pop)){ stop("This function requires a genind object.") } mlgtab <- mlg.matrix(pop) - if(!is.null(mlgsub)){ + if (!is.null(mlgsub)){ mlgtab <- mlgtab[, mlgsub] - mlgtab <- mlgtab[which(rowSums(mlgtab) > 0), ] + mlgtab <- mlgtab[which(rowSums(mlgtab) > 0L), ] pop <- popsub(pop, sublist=rownames(mlgtab)) } - if(sublist[1] != "ALL" | !is.null(blacklist)){ + if (sublist[1] != "ALL" | !is.null(blacklist)){ pop <- popsub(pop, sublist, blacklist) mlgtab <- mlgtab[unlist(vapply(pop@pop.names, - function(x) which(rownames(mlgtab)==x), 1)), , drop=FALSE] + function(x) which(rownames(mlgtab) == x), 1)), , drop=FALSE] + rows <- rownames(mlgtab) } - if(total==TRUE & (nrow(mlgtab) > 1 | !is.null(nrow(mlgtab)) )){ + if (total==TRUE & (nrow(mlgtab) > 1 | !is.null(nrow(mlgtab)) )){ mlgtab <- rbind(mlgtab, colSums(mlgtab)) rownames(mlgtab)[nrow(mlgtab)] <- "Total" } - #````````````````````````````````````````````````````````````````````````````# - # Dealing with the visualizations. - if(bar){ - # if(!require(ggplot2)){ - # warning("ggplot2 must be installed to visualize the MLG distributions.") - # mlgtab <- mlgtab[, which(colSums(mlgtab) > 0)] - # return(mlgtab) - # } - - # Function for setting up and organizing data frame to produce ggplot2 graphs - plot1 <- function(mlgt){ - - # create a data frame that ggplot2 can read. - mlgt.df <- as.data.frame(list(MLG = rep(colnames(mlgt), mlgt), - count = rep(mlgt, mlgt))) - - # Organize the data frame by count in descending order. - mlgt.df[["MLG"]] <- reorder(mlgt.df[["MLG"]], -mlgt.df[["count"]]) - - # plot it - return(ggplot(mlgt.df, aes_string(x = "MLG")) + geom_bar(aes_string(fill = "count"), position="identity")) - #theme(axis.text.x=element_text(size = 10, angle=-45, hjust=0))) - } + # Dealing with the visualizations. + if (bar){ # If there is a population structure if(!is.null(pop@pop.names)){ popnames <- pop@pop.names - if(total & nrow(mlgtab) > 1) - popnames[length(popnames)+1] <- "Total" - - # Function for printing plots with population structures one by one. - printplot <- function(n, quiet=quiet) { - if(!quiet) cat("|",n,"\n") - - # Gather all nonzero values - mlgt <- mlgtab[n, mlgtab[n, ] > 0, drop=FALSE] - - # controlling for the situation where the population size is 1. - if (sum(mlgtab[n, ]) > 1){ - print(plot1(mlgt) + - theme_classic() %+replace% - theme(axis.text.x=element_text(size = 10, angle=-45, hjust=0, vjust=1)) + - labs(title=paste("Population:",n,"\nN =",sum(mlgtab[n, ]), - "MLG =",length(mlgt)))) - } + if(total & nrow(mlgtab) > 1){ + popnames[length(popnames) + 1] <- "Total" } - # Apply this over all populations. - invisible(lapply(popnames, printplot, quiet=quiet)) - } - - # If there is no population structure detected. - else { - print(plot1(mlgtab) + - theme_classic() %+replace% - theme(axis.text.x=element_text(size = 10, angle=-45, hjust=0, - vjust=1)) + - labs(title= paste("File:",as.character(pop@call[2]), - "\nN =",sum(mlgtab),"MLG =",length(mlgtab)) - )) + invisible(lapply(popnames, print_mlg_barplot, mlgtab, quiet=quiet)) + } else { + print(mlg_barplot(mlgtab) + + theme_classic() %+replace% + theme(axis.text.x=element_text(size=10, angle=-45, hjust=0, vjust=1)) + + labs(title = paste("File:", as.character(pop@call[2]), "\nN =", + sum(mlgtab), "MLG =", length(mlgtab)))) } } - #,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,# - mlgtab <- mlgtab[, which(colSums(mlgtab) > 0)] return(mlgtab) } #==============================================================================# #' @rdname mlg -# Multilocus Genotype Vector -# -# Create a vector of multilocus genotype indecies. -# -# @param x a \code{\link{genind}} object. -# -#' @return a numeric vector naming the multilocus genotype of each individual in -#' the dataset. -#' -#' @note The numbers of \code{mlg.vector} will not match up with the sequence of -#' new genotypes found because sorting takes place within the algorithm before -#' the genotypes are called so that the number of comparisons is \eqn{n-1} -#' instead of \eqn{\frac{n(n-1)}{2}}. -#' +#' +#' @note mlg.vector will recalculate the mlg vector for +#' \code{\linkS4class{genind}} objects and will return the contents of the mlg +#' slot in \code{\linkS4class{genclone}} objects. This means that MLGs will be +#' different for subsetted \code{\linkS4class{genind}} objects. +#' #' @export -# @examples -# mat1 <- matrix(ncol=5, 25:1) -# mat1 <- rbind(mat1, mat1) -# mat <- matrix(nrow=10, ncol=5, paste(mat1,mat1,sep="/")) -# mat.gid <- df2genind(mat, sep="/") -# mlg.vector(mat.gid) -# mlg.table(mat.gid) #==============================================================================# mlg.vector <- function(pop){ @@ -297,13 +274,15 @@ mlg.vector <- function(pop){ # Step 4: evaluate strings in sorted vector and increment to the respective # # index vector each time a unique string occurs. # Step 4: Rearrange index vector with the indices from the original vector. - + if (is.genclone(pop)){ + return(pop@mlg) + } xtab <- pop@tab # concatenating each genotype into one long string. xsort <- vapply(seq(nrow(xtab)),function(x) paste(xtab[x, ]*pop@ploidy, - collapse=""), "string") + collapse = ""), "string") # creating a new vector to store the counts of unique genotypes. - countvec <- vector(length=length(xsort), mode="numeric") + countvec <- vector(length = length(xsort), mode = "integer") # sorting the genotypes ($x) and preserving the index ($xi). xsorted <- sort(xsort, index.return=TRUE) @@ -314,17 +293,16 @@ mlg.vector <- function(pop){ # object to modify. In this case it is countvec, which was declared above. f1 <- function(num, comp){ - if(num-1 == 0){ - countvec[num] <<- 1 + if (num - 1 == 0){ + countvec[num] <<- 1L } # These have the exact same strings, thus they are the same MLG. Perpetuate # The MLG index. - else if(comp[num] == comp[num-1]){ - countvec[num] <<- countvec[num-1] - } + else if (comp[num] == comp[num - 1]){ + countvec[num] <<- countvec[num - 1] + } else { # These have differnt strings, increment the MLG index by one. - else{ - countvec[num] <<- countvec[num-1] + 1 + countvec[num] <<- countvec[num - 1] + 1L } } @@ -340,67 +318,73 @@ mlg.vector <- function(pop){ } #==============================================================================# -#' @rdname mlg -# Multilocus Genotypes Across Populations -# -# Show which multilocus genotypes exist accross populations. -# -# @param pop a \code{\link{genind}} object. -# -#' @return a \code{list} containing vectors of population names for each MLG. +#' @rdname mlg #' #' @export #==============================================================================# mlg.crosspop <- function(pop, sublist="ALL", blacklist=NULL, mlgsub=NULL, indexreturn=FALSE, df=FALSE, quiet=FALSE){ - if(length(sublist) == 1 & sublist[1] != "ALL" | is.null(pop(pop))){ + if (length(sublist) == 1 & sublist[1] != "ALL" | is.null(pop(pop))){ cat("Multiple populations are needed for this analysis.\n") return(0) } - vec <- mlg.vector(pop) + if (is.genclone(pop)){ + vec <- pop@mlg + } else { + vec <- mlg.vector(pop) + } subind <- sub_index(pop, sublist, blacklist) - vec <- vec[subind] + vec <- vec[subind] mlgtab <- mlg.matrix(pop) - if(!is.null(mlgsub)){ - mlgtab <- mlgtab[, mlgsub] - mlgs <- 1:ncol(mlgtab) + if (!is.null(mlgsub)){ + mlgsubnames <- paste("MLG", mlgsub, sep = ".") + matches <- mlgsubnames %in% colnames(mlgtab) + if (!all(matches)){ + rejects <- mlgsub[!matches] + mlgsubnames <- mlgsubnames[matches] + warning(mlg_sub_warning(rejects)) + } + mlgtab <- mlgtab[, mlgsubnames, drop = FALSE] + mlgs <- 1:ncol(mlgtab) names(mlgs) <- colnames(mlgtab) - } - else{ - if(sublist[1] != "ALL" | !is.null(blacklist)){ - pop <- popsub(pop, sublist, blacklist) + } else { + if (sublist[1] != "ALL" | !is.null(blacklist)){ + pop <- popsub(pop, sublist, blacklist) mlgtab <- mlgtab[unlist(vapply(pop@pop.names, - function(x) which(rownames(mlgtab)==x), 1)), , drop=FALSE] + function(x) which(rownames(mlgtab) == x), 1)), , drop=FALSE] } #mlgtab <- mlgtab[, which(colSums(mlgtab) > 0)] - mlgs <- unlist(strsplit(names(which(colSums(ifelse(mlgtab==0, 0, 1)) > 1)), - "\\.")) - mlgs <- as.numeric(mlgs[!mlgs %in% "MLG"]) - if(length(mlgs) == 0){ + # mlgs <- unlist(strsplit(names(which(colSums(ifelse(mlgtab == 0L, 0L, 1L)) > 1)), + # "\\.")) + # mlgs <- as.numeric(mlgs[!mlgs %in% "MLG"]) + mlgs <- colSums(ifelse(mlgtab == 0L, 0L, 1L)) > 1 + if (sum(mlgs) == 0){ cat("No multilocus genotypes were detected across populations\n") return(0) } - names(mlgs) <- paste("MLG", mlgs, sep=".") - if(indexreturn){ - return(mlgs) + #names(mlgs) <- paste("MLG", mlgs, sep=".") + if (indexreturn){ + mlgout <- unlist(strsplit(names(mlgs[mlgs]), "\\.")) + mlgout <- as.numeric(mlgout[!mlgout %in% "MLG"]) + return(mlgout) } } popop <- function(x, quiet=TRUE){ - popnames <- mlgtab[mlgtab[, x] > 0, x] - if(length(popnames) == 1){ - names(popnames) <- rownames(mlgtab[mlgtab[, x] > 0, x, drop=FALSE]) + popnames <- mlgtab[mlgtab[, x] > 0L, x] + if (length(popnames) == 1){ + names(popnames) <- rownames(mlgtab[mlgtab[, x] > 0L, x, drop=FALSE]) } - if(!quiet) + if (!quiet) cat(paste(x, ":", sep=""),paste("(",sum(popnames)," inds)", sep=""), names(popnames), fill=80) return(popnames) } # Removing any populations that are not represented by the MLGs. - mlgtab <- mlgtab[rowSums(mlgtab[, mlgs, drop=FALSE]) > 0, mlgs, drop=FALSE] + mlgtab <- mlgtab[rowSums(mlgtab[, mlgs, drop=FALSE]) > 0L, mlgs, drop=FALSE] # Compiling the list. mlg.dup <- lapply(colnames(mlgtab), popop, quiet=quiet) names(mlg.dup) <- colnames(mlgtab) - if(df == TRUE){ + if (df == TRUE){ mlg.dup <- as.data.frame(list(MLG = rep(names(mlg.dup), sapply(mlg.dup, length)), Population = unlist(lapply(mlg.dup, names)), Count = unlist(mlg.dup))) @@ -408,3 +392,20 @@ mlg.crosspop <- function(pop, sublist="ALL", blacklist=NULL, mlgsub=NULL, indexr } return(mlg.dup) } + + + +#==============================================================================# +#' @rdname mlg +#' @export +#==============================================================================# + + +mlg.id <- function (pop){ + if (!is.genind(pop)){ + stop(paste(substitute(pop), "is not a genind or genclone object")) + } + ctab <- table(pop$ind.names, mlg.vector(pop)) + m.g <- apply(ctab, MARGIN = 2, FUN = function (y) names(y[y > 0])) + return(m.g) +} diff --git a/R/poppr.R b/R/poppr.R new file mode 100644 index 00000000..ec60bdbe --- /dev/null +++ b/R/poppr.R @@ -0,0 +1,112 @@ +#==============================================================================# +#' The \pkg{poppr} R package. +#' +#' @description \pkg{Poppr} provides tools for population genetic analysis that +#' include genotypic diveristy measures, genetic distances with bootstrap +#' support, native organization and handling of population hierarchies, and +#' clone correction. +#' +#' To cite \pkg{poppr}, please use \code{citation("poppr")}. When referring to +#' \pkg{poppr} in your manuscript, please use lower case unless it occurs at the +#' beginning of a sentence. +#' +#' @details This package relies on the \pkg{\link[adegenet]{adegenet}} package. +#' It was built around the \code{\linkS4class{genind}} object, which stores +#' genetic information in a table of allele frequencies. \pkg{Poppr} has +#' extended this object into a new object called +#' \code{\linkS4class{genclone}}. This object tracks clones and organizes +#' different population hierarchical levels. +#' +#' \subsection{Documentation}{ Documentation is available for any function by +#' typing \code{?function_name} in the R console. Essential functions for +#' manipulating data are detailed within the \emph{Data import and +#' manipulation} vignette (\code{vignette("poppr_manual", package = "poppr")}) +#' and details on algorithms used in \pkg{poppr} are within the +#' \emph{Algorithms and equations} vignette (\code{vignette("algo", package = +#' "poppr")}). Examples of analyses are available in a primer written by +#' Niklaus J. Grünwald, Zhian N. Kamvar, and Sydney E. Everhart at +#' \url{http://grunwaldlab.github.io/Population_Genetics_in_R}.} +#' +#' \subsection{Getting help}{ If you have a specific question or issue with +#' \pkg{poppr}, feel free to contribute to the google group at +#' \url{https://groups.google.com/forum/#!forum/poppr}. If you find a bug and +#' are a github user, you can submit bug reports at +#' \url{https://github.com/grunwaldlab/poppr/issues}. Otherwise, leave a +#' message on the groups.} +#' +#' Below are descriptions and links to functions found in \pkg{poppr}. Be +#' aware that all functions in \pkg{\link[adegenet]{adegenet}} are also +#' available. +#' +#' @section Data import/export: +#' \itemize{ +#' \item \code{\link{getfile}} - Provides a quick GUI to grab files for import +#' \item \code{\link{read.genalex}} - Read GenAlEx formatted csv files to a genind object +#' \item \code{\link{genind2genalex}} - Converts genind objects to GenAlEx formatted csv files +#' } +#' @section Data manipulation: +#' \itemize{ +#' \item \code{\link{as.genclone}} - Converts genind objects to genclone objects +#' \item \code{\link{setpop}} - Set the population using defined hierarchies +#' \item \code{\link{splithierarchy}} - Split a concatenated hierarchy imported as a population +#' \item \code{\link{sethierarchy}} - Define a population hierarchy of a genclone object +#' \item \code{\link{gethierarchy}} - Extract the hierarchy data frame +#' \item \code{\link{addhierarchy}} - Add a vector or data frame to an existing hierarchy +#' \item \code{\link{namehierarchy}} - Rename a population hierarchy +#' \item \code{\link{missingno}} - Handles missing data +#' \item \code{\link{clonecorrect}} - Clone censors at a specified population hierarchy +#' \item \code{\link{informloci}} - Detects and removes phylogenetically uninformative loci +#' \item \code{\link{popsub}} - Subsets genind objects by population +#' \item \code{\link{shufflepop}} - Shuffles genotypes at each locus using four different shuffling algorithms +#' \item \code{\link{recode_polyploids}} - recode polyploid data sets with missing alleles imported as "0" +#' } +#' @section Genetic distances: +#' \itemize{ +#' \item \code{\link{bruvo.dist}} - Bruvo’s distance +#' \item \code{\link{diss.dist}} - Absolute genetic distance (see provesti.dist) +#' \item \code{\link{nei.dist}} - Nei’s 1978 genetic distance +#' \item \code{\link{rogers.dist}} - Rogers’ euclidean distance +#' \item \code{\link{reynolds.dist}} - Reynolds’ coancestry distance +#' \item \code{\link{edwards.dist}} - Edwards’ angular distance +#' \item \code{\link{provesti.dist}} - Provesti’s absolute genetic distance +#' } +#' @section Bootstrapping: +#' \itemize{ +#' \item \code{\link{aboot}} - Creates a bootstrapped dendrogram for any distance measure +#' \item \code{\link{bruvo.boot}} - Produces dendrograms with bootstrap support based on Bruvo’s distance +#' } +#' @section Analysis: +#' \itemize{ +#' \item \code{\link{poppr.amova}} - Analysis of Molecular Variance (as implemented in ade4) +#' \item \code{\link{ia}} - Calculates the index of association +#' \item \code{\link{mlg}} - Calculates the number of multilocus genotypes +#' \item \code{\link{mlg.crosspop}} - Finds all multilocus genotypes that cross populations +#' \item \code{\link{mlg.table}} - Returns a table of populations by multilocus genotypes +#' \item \code{\link{mlg.vector}} - Returns a vector of a numeric multilocus genotype assignment for each individual +#' \item \code{\link{mlg.id}} - Finds all individuals associated with a single multilocus genotype +#' \item \code{\link{poppr}} - Returns a diversity table by population +#' \item \code{\link{poppr.all}} - Returns a diversity table by population for all compatible files specified +#' \item \code{\link{private_alleles}} - Tabulates the occurences of alleles that only occur in one population. +#' \item \code{\link{locus_table}} - Creates a table of summary statistics per locus. +#' } +#' @section Visulalization: +#' \itemize{ +#' \item \code{\link{plot_poppr_msn}} - Plots minimum spanning networks produced in poppr with scale bar and legend +#' \item \code{\link{greycurve}} - Helper to determine the appropriate parameters for adjusting the grey level for msn functions +#' \item \code{\link{bruvo.msn}} - Produces minimum spanning networks based off Bruvo’s distance colored by population +#' \item \code{\link{poppr.msn}} - Produces a minimum spanning network for any pairwise distance matrix related to the data +#' \item \code{\link{info_table}} - Creates a heatmap representing missing data or observed ploidy +#' \item \code{\link{genotype_curve}} - Creates a series of boxplots to demonstrate how many markers are needed to represent the diversity of your data. +#' } +#' +#' @author Zhian N. Kamvar, Javier F. Tabima, Niklaus J. Grünwald +#' +#' Maintainer: Zhian N. Kamvar +#' +#' @references Kamvar ZN, Tabima JF, Grünwald NJ. (2014) Poppr: an R package for +#' genetic analysis of populations with clonal, partially clonal, and/or sexual +#' reproduction. PeerJ 2:e281 \url{http://dx.doi.org/10.7717/peerj.281} +#' @name poppr-package +#' @docType package +#==============================================================================# +NULL \ No newline at end of file diff --git a/R/print_methods.r b/R/print_methods.r new file mode 100644 index 00000000..d91607a9 --- /dev/null +++ b/R/print_methods.r @@ -0,0 +1,90 @@ +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# +# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate +# students at Oregon State University; and Dr. Nik Grünwald, an employee of +# USDA-ARS. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for educational, research and non-profit purposes, without fee, +# and without a written agreement is hereby granted, provided that the statement +# above is incorporated into the material, giving appropriate attribution to the +# authors. +# +# Permission to incorporate this software into commercial products may be +# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for +# Commercialization and Corporate Development. +# +# The software program and documentation are supplied "as is", without any +# accompanying services from the USDA or the University. USDA ARS or the +# University do not warrant that the operation of the program will be +# uninterrupted or error-free. The end-user understands that the program was +# developed for research purposes and is advised not to rely exclusively on the +# program for any reason. +# +# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY +# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY +# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#==============================================================================# + +#' @method print ialist +#' @export +print.ialist <- function(x, ...){ + cat("Index\n") + print(x$index) + cat("Samples\n") + if (nrow(x$samples) > 13){ + print(head(x$samples)) + cat("...\n") + print(tail(x$samples)) + } else { + print(x$samples) + } +} + +#' @method print amova +#' @export +print.amova <- function(x, full = FALSE, ...) +{ + if (full == TRUE) + print(x) + else print(x[-((length(x) - 2):length(x))]) +} + +#' @method print popprtable +#' @export +print.popprtable <- function(x, ...){ + call <- list(...) + if (length(call > 0) && names(call) %in% "digits"){ + print.data.frame(x, ...) + } else { + print.data.frame(x, digits = 3, ...) + } +} + +#' @method print locustable +#' @export +print.locustable <- function(x, ...){ + call <- list(...) + if (length(call > 0) && names(call) %in% "digits"){ + print.table(x, ...) + } else { + print.table(x, digits = 2, zero.print = ".", ...) + } +} \ No newline at end of file diff --git a/R/sample_schemes.r b/R/sample_schemes.r index 14484bb7..a33e86c0 100755 --- a/R/sample_schemes.r +++ b/R/sample_schemes.r @@ -42,46 +42,45 @@ #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #==============================================================================# -#' Shuffle individuals in a \code{\link{genind}} object independently over each -#' locus. -#' -#' @param pop a \code{\link{genind}} object -#' +#' Shuffle individuals in a \code{\linkS4class{genclone}} or +#' \code{\linkS4class{genind}} object independently over each locus. +#' +#' @param pop a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object +#' #' @param method an integer between 1 and 4. See details below. -#' -#' @return a \code{\link{genind}} object shuffled by a specified method -#' -#' @section Details: This function will shuffle each locus in the data set independently -#' of one another, rendering them essentially unlinked. The following methods -#' are available to shuffle your data: -#' \enumerate{ -#' \item \strong{Permute Alleles} This will redistribute all alleles in the -#' sample throughout the locus. Missing data is fixed in place. This maintains -#' allelic structure, but heterozygosity is variable. -#' \item \strong{Parametric Bootstrap} This will redistribute available alleles -#' within the locus based on their allelic frequencies. This means that both the -#' allelic state and heterozygosity will vary. The resulting data set will not -#' have missing data. -#' \item \strong{Non-Parametric Bootstrap} This will shuffle the -#' allelic state for each individual. Missing data is fixed in place. -#' \item \strong{Multilocus Style Permutation} This will shuffle the genotypes -#' at each locus, maintaining the heterozygosity and allelic structure. -#' } -#' -#' +#' +#' @return a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object +#' shuffled by a specified method +#' +#' @details This function will shuffle each locus in the data set independently +#' of one another, rendering them essentially unlinked. The following methods +#' are available to shuffle your data: \enumerate{ \item \strong{Permute +#' Alleles} This will redistribute all alleles in the sample throughout the +#' locus. Missing data is fixed in place. This maintains allelic structure, +#' but heterozygosity is variable. \item \strong{Parametric Bootstrap} This +#' will redistribute available alleles within the locus based on their allelic +#' frequencies. This means that both the allelic state and heterozygosity will +#' vary. The resulting data set will not have missing data. \item +#' \strong{Non-Parametric Bootstrap} This will shuffle the allelic state for +#' each individual. Missing data is fixed in place. \item \strong{Multilocus +#' Style Permutation} This will shuffle the genotypes at each locus, +#' maintaining the heterozygosity and allelic structure. } +#' +#' #' @export #' @author Zhian N. Kamvar -#' -#' @references Paul-Michael Agapow and Austin Burt. 2001. Indices of multilocus -#' linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102 -#' +#' +#' @references Paul-Michael Agapow and Austin Burt. 2001. Indices of multilocus +#' linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102 +#' #' @examples #' # load the microbov dataset #' data(microbov) #' # Let's look at a single population for now. Howsabout Zebu #' Zebu <- popsub(microbov, "Zebu") #' summary(Zebu) -#' +#' #' # Take note of the Number of alleles per population and the Observed #' # heterozygosity as we go through each method. #' @@ -90,7 +89,7 @@ #' \dontrun{ #' # Parametric Bootstrap: do not maintain allelic state or heterozygosity #' summary(shufflepop(Zebu, method=2)) -#' +#' #' # Non-Parametric Bootstrap: do not maintain allelic state or heterozygosity. #' summary(shufflepop(Zebu, method=3)) #' @@ -163,14 +162,29 @@ shufflefunk <- function(pop, FUN, sample=1, method=1, ...){ ) ) if(!quiet) progbar <- txtProgressBar(style = 3) - for (c in 1:iterations){ - IarD <- .Ia.Rd(.all.shuffler(pop, type, method=method), missing=missing) + # ploid <- ploidy(pop[[1]]) + # tz <- test_zeroes(pop[[1]]) + # if (ploid > 2 & tz){ + # zcol <- which(as.numeric(pop[[1]]@all.names[[1]]) == 0) + # ploidvec <- vapply(pop, get_local_ploidy, integer(nInd(pop[[1]]))) + # ploidind <- TRUE + # } else { + # ploidind <- FALSE + # } + for (c in 1:iterations){ + # if (ploidind){ + # IarD <- .Ia.Rd(new.all.shuff(pop, type, method, ploidvec = ploidvec, + # zerocol = zcol), missing = missing) + # } else { + IarD <- .Ia.Rd(.all.shuffler(pop, type, method=method), missing=missing) + # } sample.data$Ia[c] <- IarD[1] sample.data$rbarD[c] <- IarD[2] if (!quiet){ setTxtProgressBar(progbar, c/iterations) } } + if(!quiet) close(progbar) return(sample.data) } @@ -260,7 +274,7 @@ shufflefunk <- function(pop, FUN, sample=1, method=1, ...){ # # weights is a corresponding vector giving the allelic frequency for each allele # at that locus in that population. The sum of the frequencies should be 1. -# DEPRECIATED (replaced with multinomial distribution) +# DEPRECATED (replaced with multinomial distribution) #==============================================================================# .diploid.shuff <- function(vec, weights){ # Dealing with missing values is probably not necessary for a parametric @@ -296,9 +310,9 @@ shufflefunk <- function(pop, FUN, sample=1, method=1, ...){ #==============================================================================# .permut.shuff <- function(mat, ploidy = 2){ - bucket <- colSums(mat, na.rm = TRUE)*ploidy + bucket <- round(colSums(mat, na.rm = TRUE)*ploidy) bucketlist <- as.integer(sample(rep(1:length(bucket), bucket))) - mat <- .Call("permute_shuff", mat, bucketlist - 1, 1/ploidy, ploidy) + mat <- .Call("permute_shuff", mat, bucketlist - 1, 1/ploidy, ploidy, PACKAGE = "poppr") return(mat) } diff --git a/R/sandbox.r b/R/sandbox.r index 79890aa7..ee297855 100755 --- a/R/sandbox.r +++ b/R/sandbox.r @@ -42,7 +42,176 @@ #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# Keeping this for compatability +new.read.genalex <- function(genalex, ploidy=2, geo=FALSE, region=FALSE){ + return(read.genalex(genalex, ploidy, geo, region)) +} + + +read_allele_columns <- function(x, ploidy = 2){ + clm <- ncol(x) + if (length(ploidy) == 1){ + if (clm %% ploidy == 0){ + ploidy <- rep(ploidy, clm/ploidy) + } else { + stop("ploidy specified does not match number of columns.") + } + } + loci <- first_allele_col(ploidy) + x2 <- x[, loci] + lapply(1:length(loci), function(a) x2[, a] <<- + pop_combiner(x, hier = loci[a]:(loci[a]+ploidy[a]-1), sep = "/")) + + x2 <- data.frame(lapply(x2, add_zeroes, ploidy = max(ploidy))) + x2[apply(x2, 2, function(na) grepl("NA", na))] <- NA + return(x2) +} + +first_allele_col <- function(ploidies) cumsum(ploidies) - ploidies + 1 + +# Function that will add extra zeroes onto genotypes that are deficient in the +# right number of alleles. +add_zeroes <- function(x, ploidy = 3){ + extras <- ploidy - vapply(strsplit(x, "/"), length, 69) + vapply(1:length(extras), zero_adder, character(1), extras, x) +} + +zero_adder <- function(index, extras, df){ + if(extras[index] > 0){ + return(paste(c(rep(0, extras[index]), df[index]), collapse = "/")) + } else { + return(df[index]) + } +} +#==============================================================================# +# Create a population hierarchy in a genind object if one is not there. +#==============================================================================# +genind_hierarchy <- function(x, df = NULL, dfname = "population_hierarchy"){ + if (is.null(df)){ + df <- data.frame(Pop = pop(x)) + } + other(x)[[dfname]] <- df + return(x) +} + +gen2polysat <- function(gen, newploidy = gen@ploidy){ + if (!require(polysat)){ + stop("User needs polysat installed") + } + gen <- recode_polyploids(gen, newploidy) + gendf <- genind2df(gen, sep = "/", usepop = FALSE) + gendf <- lapply(gendf, strsplit, "/") + gendf <- lapply(gendf, lapply, as.numeric) + ambig <- new("genambig", samples = indNames(gen), loci = locNames(gen)) + lapply(names(gendf), function(x) Genotypes(ambig, loci = x) <<- gendf[[x]]) + return(ambig) +} + + +new_graph_pops <- function(graph, dat, color){ + cmlg <- unique(mlg.vector(dat)) + mlg.cp <- mlg.crosspop(dat, mlgsub = 1:length(cmlg), quiet = TRUE) + mlg.cp <- mlg.cp[rank(cmlg)] + mlg.color <- lapply(mlg.cp, function(x) color[dat@pop.names %in% names(x)]) + V(graph)$pie <- mlg.cp + V(graph)$pie.color <- mlg.color + return(graph) +} + + + +#==============================================================================# +# An attempt at making the shuffling schemes handle polyploid data better. +#==============================================================================# + +new.permut.shuff <- function(mat, ploidy = 2L, ploidvec = NULL, zerocol = 1L){ + bucket <- round(colSums(mat, na.rm = TRUE)*ploidy)[-zerocol] + bucketlen <- (1:ncol(mat))[-zerocol] + bucketlist <- as.integer(sample(rep(bucketlen, bucket))) + nas <- !is.na(ploidvec) + ploidvec[nas] <- sample(ploidvec[nas]) + mat <- .Call("new_permute_shuff", mat, bucketlist - 1L, 1/ploidy, ploidy, + ploidvec, zerocol - 1L, PACKAGE = "poppr") + # mat <- list(mat = mat, bucket = bucket, bucketlist = bucketlist - 1L, + # allele = 1/ploidy, ploidy = ploidy, ploidvec = ploidvec, + # zerocol = zerocol - 1L) + return(mat) +} + + +new.all.shuff <- function(pop, type=type, method=1, ploidvec= NULL, zerocol = 1L){ + METHODS = c("permute alleles", "parametric bootstrap", + "non-parametric bootstrap", "multilocus") + if(type=="PA"){ + if(method == 1 | method == 4){ + pop@tab <- vapply(1:ncol(pop@tab), + function(x) sample(pop@tab[, x]), pop@tab[, 1]) + } + else if(method == 2){ + paramboot <- function(x){ + one <- mean(pop@tab[, x], na.rm=TRUE) + zero <- 1-one + return(sample(c(1,0), length(pop@tab[, x]), prob=c(one, zero), replace=TRUE)) + } + pop@tab <- vapply(1:ncol(pop@tab), paramboot, pop@tab[, 1]) + } + else if(method == 3){ + pop@tab <- vapply(1:ncol(pop@tab), + function(x) sample(pop@tab[, x], replace=TRUE), pop@tab[, 1]) + } + } + else { + pop <- lapply(1:length(pop), function(x){ + new.locus.shuffler(pop[[x]], method = method, ploidvec[, x], zerocol) + }) + } + return(pop) +} + +new.locus.shuffler <- function(pop, method=1, ploidvec = NULL, zerocol = 1L){ + ploid <- ploidy(pop) + if (method == 1){ + pop@tab <- new.permut.shuff(pop@tab, ploidy = ploid, ploidvec, zerocol) + } else if (method == 2){ + + uploids <- unique(ploidvec) + weights <- colMeans(pop@tab[, -zerocol, drop = FALSE], na.rm = TRUE) + + if (length(uploids) > 1){ + ploidprobs <- table(ploidvec) + newploids <- sample(unique(ploidvec), length(ploidvec), + prob = ploidprobs/sum(ploidprobs), replace = TRUE) + pop@tab[, zerocol] <- newploids/ploid + lapply(unique(newploids), function(x){ + pop@tab[newploids == x, -zerocol] <<- t(rmultinom(sum(newploids == x), + size = x, prob = weights))/ploid + + }) + + } else { + pop@tab[, -zerocol] <- t(rmultinom(length(ploidvec), size = uploids, + prob = weights))/ploid + } + } else if (method == 3){ + uploids <- unique(ploidvec) + if (length(uploids) > 1){ + newploids <- sample(unique(ploidvec), length(ploidvec), replace = TRUE) + pop@tab[, zerocol] <- newploids/ploid + lapply(unique(newploids), function(x){ + pop@tab[newploids == x, -zerocol] <<- t(rmultinom(sum(newploids == x), + size = x, prob = rep(1, ploid)))/ploid + + }) + } else { + pop@tab[, -zerocol] <- t(rmultinom(length(ploidvec), size = uploids, + prob = rep(1, ploid)))/ploid + } + } else if (method == 4){ + pop@tab <- pop@tab[sample(nInd(pop)), ] + } + return(pop) +} pair_ia <- function(pop){ @@ -51,8 +220,7 @@ pair_ia <- function(pop){ loci_pairs <- combn(pop@loc.names, 2) pair_ia_vector <- apply(loci_pairs, 2, function(x) .Ia.Rd(pop_loci[x])) colnames(pair_ia_vector) <- apply(loci_pairs, 2, paste, collapse = ":") - } - else{ + } else { loci_pairs <- combn(1:nLoc(pop), 2) pair_ia_vector <- apply(loci_pairs, 2, function(x) .PA.Ia.Rd(pop[, x], missing = "ignore")) colnames(pair_ia_vector) <- apply(combn(pop@loc.names, 2), 2, paste, collapse = ":") @@ -61,13 +229,14 @@ pair_ia <- function(pop){ return(pair_ia_vector) } + poppr_pair_ia <- function(pop){ if(is.null(pop(pop))){ return(pair_ia(pop)) } - pops <- seppop(pop, drop = FALSE) + pops <- seppop(pop, drop = FALSE) loci_pairs <- choose(nLoc(pop), 2) - res_mat <- matrix(0.5, 2, loci_pairs) + res_mat <- matrix(0.5, 2, loci_pairs) pops_array <- vapply(pops, pair_ia, res_mat) return(pops_array) } @@ -75,59 +244,22 @@ poppr_pair_ia <- function(pop){ -testing_funk <- function(){ - cat("This test worked...maybe.\n") -} -.new.sampling <- function(pop,iterations,quiet="noisy",missing="ignore",type=type, method=1){ - METHODS = c("multilocus", "permute alleles", "parametric bootstrap", - "non-parametric bootstrap") - if(!is.list(pop)){ - if(type=="PA"){ - .Ia.Rd <- .PA.Ia.Rd - } - } - sample.data <- NULL - for (c in 1:iterations){ - IarD <- .Ia.Rd(total.shuffler(pop, method=method), missing=missing) - sample.data <- rbind(sample.data, as.data.frame(list( - Ia=IarD[1], - rbarD=IarD[2] - ))) - if (quiet != TRUE){ - if(quiet == "noisy"){ - cat("Sample: ",c,"\n") - cat("Index of Association: ", IarD[1],"\n") - cat("Standardized Index of Association (rbarD): ", IarD[2],"\n") - } - else{ - if(c%%50 != 0){ - cat(".") - if (c == iterations){ - cat("\n") - } -# else if( c == 666 ){ -# cat("\\m/") -# } - } - else{ - cat(".") - cat("\n") - } - } - } - } - return(sample.data) + + +testing_funk <- function(){ + cat("This test worked...maybe.\n") } + new.poppr <- function(pop,total=TRUE,sublist=c("ALL"),blacklist=c(NULL), sample=0, method=1,missing="ignore", quiet=FALSE,clonecorrect=FALSE,hier=c(1),dfname="hier", hist=TRUE, minsamp=10){ METHODS = c("multilocus", "permute alleles", "parametric bootstrap", "non-parametric bootstrap") - x <- .file.type(pop, missing=missing, clonecorrect=clonecorrect, hier=hier, + x <- process_file(pop, missing=missing, clonecorrect=clonecorrect, hier=hier, dfname=dfname, quiet=TRUE) # The namelist will contain information such as the filename and population # names so that they can easily be ported around. @@ -305,8 +437,7 @@ new.poppr.all <- function(filelist, ...) { return(IarD) } } - - IarD <- .Ia.Rd(popx, missing) + IarD <- .Ia.Rd(popx, missing) # data vomit options. if (!quiet){ cat("|", namelist$population ,"\n") @@ -344,73 +475,272 @@ new.poppr.all <- function(filelist, ...) { return(final(Iout, result)) } +.new.Ia.Rd <- function (pop, missing = NULL) +{ + vard.vector <- NULL + numLoci <- length(pop) + numIsolates <- length(pop[[1]]@ind.names) + np <- choose(numIsolates, 2) + if (np < 2) { + return(as.numeric(c(NaN, NaN))) + } + V <- new.pair_diffs(pop, numLoci, np) + varD <- ((sum(V$D.vector^2) - ((sum(V$D.vector))^2)/np))/np + vard.vector <- ((V$d2.vector - ((V$d.vector^2)/np))/np) + vardpair.vector <- .Call("pairwise_covar", vard.vector, PACKAGE = "poppr") + sigVarj <- sum(vard.vector) + rm(vard.vector) + Ia <- (varD/sigVarj) - 1 + rbarD <- (varD - sigVarj)/(2 * sum(vardpair.vector)) + return(c(Ia, rbarD)) +} +#==============================================================================# +# This creates a pairwise difference matrix via the C function pairdiffs in +# src/poppr_distance.c +# +# Public functions utilizing this function: +# # none +# +# Internal functions utilizing this function: +# # .Ia.Rd +# +#==============================================================================# +new.pair_diffs <- function(pop, numLoci, np) +{ + ploid <- ploidy(pop[[1]]) + temp.d.vector <- matrix(nrow = np, ncol = numLoci, data = as.numeric(NA)) + temp.d.vector <- vapply(pop, function(x) .Call("pairdiffs", x@tab, PACKAGE = "poppr")*(ploid/2), numeric(np)) + return(temp.d.vector) + d.vector <- colSums(temp.d.vector) + d2.vector <- colSums(temp.d.vector^2) + D.vector <- rowSums(temp.d.vector) + return(list(d.vector = d.vector, d2.vector = d2.vector, D.vector = D.vector)) +} getinds <- function(x){ # x is a vector indicating the size of loci in a data set (must be continuous). # The sum of x is equal to the number of observations. - to <- cumsum(x) + indices <- matrix(ncol = 2, nrow = length(x)) + to <- cumsum(x) from <- c(1, to[-length(to)]+1) - indices <- rbind(from, to) - rownames(indices) <- NULL + indices[] <- as.integer(c(from, to)) return(indices) } -total.shuffler <- function(pop, method){ - return(lapply(pop, pop.sampler, method)) +#==============================================================================# +# bootjack is a function that will calculate values of the index of association +# after sampling with or without replacement. The purpose of this function is +# to help determine distributions of I_A under non-random mating by creating +# exact copies of samples by sampling with replacement and also to derive a +# distribution of the data by sampling a subset of the data (63% by default) +# without replacement. +# +# Since the data itself is not being changed, we can use the distances observed. +# These distances are calculated per-locus and presented in a matrix (V) with the +# number of columns equal to the number of loci and the number of rows equal to +# choose(N, 2) where N is the number of samples in the data set. Sampling has +# been optimized with this strategy by first creating a lower triangle distance +# matrix containing indices from 1 to choose(N, 2). This is then converted to a +# square matrix (mat) and subset after sampling with or without replacement. The +# remaining indices in the distance matrix is used to subset the distance-per- +# locus matrix (V). Calculations are then performed on this matrix. It must be +# noted that for sampling with replacement, duplicated indices must be +# supplemented with rows of zeroes to indicate no distance. +#==============================================================================# +bootjack <- function(gid, iterations = 999, half = NULL, progbar = NULL){ + if (!is.list(gid)){ + N <- nInd(gid) + numLoci <- nLoc(gid) + } else { + N <- nInd(gid[[1]]) + numLoci <- length(gid) + } + # Step 1: make a distance matrix defining the indices for the pairwise + # distance matrix of loci. + np <- choose(N, 2) + dis <- 1:np + dis <- make_attributes(dis, N, 1:N, "dist", call("dist")) + mat <- as.matrix(dis) + # Step 2: calculate the pairwise distances for each locus. + V <- jack.pair_diffs(gid, numLoci, np) + if (!is.null(half)){ + np <- choose(half, 2) + sample.data <- vapply(1:iterations, jackrun, numeric(2), V, mat, N, half, + np, progbar, iterations) + } else { + sample.data <- vapply(1:iterations, bootrun, numeric(2), V, mat, N, np, + progbar, iterations) + } + rownames(sample.data) <- c("Ia", "rbarD") + return(data.frame(t(sample.data))) } + +#------------------------------------------------------------------------------# +# Variables for bootrun and jackrun +# +# count = the current iteration +# V = the matrix of pairwise distances per locus +# mat = square matrix containing indices for V +# N = number of observations in data set +# half = the number of observations to be sampled without replacement +# np = choose(N, 2) +# progbar = a progress bar object +# iterations = the number of total samples +#------------------------------------------------------------------------------# +bootrun <- function(count, V, mat, N, np, progbar, iterations){ + if (!is.null(progbar)){ + setTxtProgressBar(progbar, count/iterations) + } + # Step 3: sample individuals with replacement and subset distance matrix + # from step 1. + inds <- sample(N, replace = TRUE) + newInds <- as.vector(as.dist(mat[inds, inds])) + # Step 4: Find the number of zeroes in from the distance matrix. This will be + # the number of duplicated genotypes. + zeroes <- length(inds == 0) + + # Step 5: subset the pairwise locus matrix. + newV <- V[newInds, ] + V <- list(d.vector = colSums(newV), + d2.vector = colSums(newV * newV), + D.vector = c(rowSums(newV), rep(0, zeroes)) + ) + # Step 6: Calculate the index of association. + return(jack.calc(V, np)) +} +jackrun <- function(count, V, mat, N, half, np, progbar, iterations){ + if (!is.null(progbar)){ + setTxtProgressBar(progbar, count/iterations) + } -pop.sampler <- function(pop, method=1){ - pop_names <- names(table(pop@pop)) - total_inds <- 1:length(pop@pop) - repop <- function(x, pop, pop_names, total_inds, method){ - pop_vec <- total_inds[pop@pop %in% pop_names[x]] - if(all(is.na(pop@tab[pop_vec, ]))){ - pop@tab[pop_vec, ] <<- pop@tab[pop_vec, ] - return(1) - } - cols <- which(colSums(pop[pop_vec, ]@tab, na.rm=TRUE) > 0) - newpop <- pop - if(method != 2){ - newpop@tab[pop_vec, -cols] <- 0 - } - newpop@tab[pop_vec, cols] <- .locus.shuffler(popsub(pop, x), method)@tab - if(method == 1){ - replacements <- is.na(rowSums(newpop@tab[pop_vec, cols])) - if(any(replacements == TRUE)){ - #cat(pop_vec,"\t",replacements,"\n") - newpop@tab[pop_vec[replacements], -cols] <- NA - } + inds <- sample(N, half) + newInds <- as.vector(as.dist(mat[inds, inds])) + + newV <- V[newInds, ] + V <- list(d.vector = colSums(newV), + d2.vector = colSums(newV * newV), + D.vector = rowSums(newV) + ) + return(jack.calc(V, np)) +} + +jack.calc <- function(V, np){ + varD <- ((sum(V$D.vector^2) - ((sum(V$D.vector))^2)/np))/np + vard.vector <- ((V$d2.vector - ((V$d.vector^2)/np))/np) + vardpair.vector <- .Call("pairwise_covar", vard.vector, PACKAGE = "poppr") + sigVarj <- sum(vard.vector) + rm(vard.vector) + Ia <- (varD/sigVarj) - 1 + rbarD <- (varD - sigVarj)/(2 * sum(vardpair.vector)) + return(c(Ia, rbarD)) +} + +jack.pair_diffs <- function(pop, numLoci, np){ + temp.d.vector <- matrix(nrow = np, ncol = numLoci, data = as.numeric(NA)) + if(!is.list(pop)){ + ploid <- 2 + temp.d.vector <- vapply(seq(numLoci), + function(x) as.vector(dist(pop@tab[,x])), + temp.d.vector[,1]) + # checking for missing data and imputing the comparison to zero. + if(any(is.na(temp.d.vector))){ + temp.d.vector[which(is.na(temp.d.vector))] <- 0 } - pop@tab[pop_vec, ] <<- newpop@tab[pop_vec, ] - return(0) + } + else { + ploid <- ploidy(pop[[1]]) + temp.d.vector <- vapply(pop, function(x) .Call("pairdiffs", x@tab, PACKAGE = "poppr")*(ploid/2), + temp.d.vector[, 1]) } - invisible(lapply(1:length(pop_names), repop, pop, pop_names, total_inds, method)) -# nas <- function(x, pop, pop_names, total_inds, method){ -# pop_vec <- total_inds[pop@pop %in% pop_names[x]] -# if(any(is.na(pop@tab[pop_vec, ]))){ -# newpop <- pop -# replacena <- function(locus, newpop, pop_vec){ -# locus <- newpop@loc.fac %in% locus -# truth <- is.na(rowSums(newpop@tab[pop_vec, locus])) -# if(any(truth == TRUE)){ -# #cat(pop_vec,"\n",as.numeric(locus),"\n", truth, "\n") -# newpop@tab[pop_vec[truth], locus] <<- NA -# } -# } -# invisible(lapply(names(pop@loc.names), replacena, newpop, pop_vec)) -# pop@tab[pop_vec, ] <<- newpop@tab[pop_vec, ] -# } -# } -# if(method == 1){ -# invisible(lapply(1:length(pop_names), nas, pop, pop_names, total_inds, method)) -# } - return(pop) + return(temp.d.vector) } +jackbootplot <- function(df, obs.df, index = c("rd", "ia", "both"), obsline = TRUE){ + ARGS <- c("rd", "ia", "both") + index <- match.arg(index, ARGS) + Indexfac <- c("I[A]","bar(r)[d]") + levels(df$variable) <- Indexfac + obs.df$variable <- Indexfac + + if (index == "rd"){ + df <- df[df$variable == "bar(r)[d]", ] + obs.df <- obs.df[obs.df$variable == "bar(r)[d]", ] + } else if (index == "ia"){ + df <- df[df$variable == "I[A]", ] + obs.df <- obs.df[obs.df$variable == "I[A]", ] + } + + distplot <- ggplot(df, aes_string(y = "value", x = "Distribution")) + + geom_violin(aes_string(fill = "Distribution")) + + geom_boxplot(alpha = 0.25, width = 0.125) + if (index == "both"){ + distplot <- distplot + facet_grid("variable ~ .", scales = "free", + labeller = label_parsed) + } + if (obsline){ + aesth <- aes_string(yintercept = "value", group = "variable") + distplot <- distplot + geom_hline(aesth, data = obs.df, linetype = 2) + } + return(distplot) +} + +#' @importFrom reshape2 melt + +jackbootcomp <- function(pop, sample = 999, quiet = FALSE, method = 1, + jack = 0.63, plotindex = "rd"){ + ARGS <- c("rd", "ia", "both") + plotindex <- match.arg(plotindex, ARGS) + + inds <- nInd(pop) + message("Null Distribution...") + + null <- ia(pop, sample = sample, valuereturn = TRUE, quiet = quiet, + hist = FALSE, method = method) + if(pop@type == "codom"){ + popx <- seploc(pop) + } else { + popx <- pop + } + + message("Alternative Distribution (Bootstrap)...") + if(!quiet){ + bootbar <- txtProgressBar(style = 3) + } else { + bootbar <- NULL + } + jackbar <- bootbar + + altboot <- bootjack(popx, sample, half = NULL, bootbar) + + message("Alternative Distribution (Jack Knife)...") + half <- round(inds*jack) + altjack <- bootjack(popx, sample, half, jackbar) + datlist <- list(null = null$sample, bootstrap = altboot, `jack knife` = altjack) + dat <- melt(datlist, measure.vars = c("Ia", "rbarD")) + names(dat)[3] <- "Distribution" + message("Creating Plots") + obs.df <- data.frame(list(variable = names(null$samples), + value = null$index[c(1,3)])) + distplot <- jackbootplot(dat, obs.df, plotindex) + + ggtitle(paste("Data:", deparse(substitute(pop)), "\n", inds, + "Individuals,", half, "Sampled for Jack knife") + ) + if (plotindex == "both"){ + distplot <- distplot + theme(strip.text = element_text(size = rel(2))) + } else if (plotindex == "rd"){ + distplot <- distplot + ylab(expression(bar(r)[d])) + } else { + distplot <- distplot + ylab(expression(I[A])) + } + return(list(observed = null$index, null_samples = null$samples, + alt_samples_boot = altboot, alt_samples_jack = altjack, + plot = distplot) + ) +} @@ -426,65 +756,7 @@ pop.sampler <- function(pop, method=1){ #################### Javier's Functions below ################################## ################################################################################ -genoid.bruvo.boot <- function(pop, replen=c(2), sample = 100, tree = "upgma", - showtree=TRUE, cutoff=NULL, quiet=FALSE, ...) { - # This attempts to make sure the data is true microsatellite data. It will - # reject snp and aflp data. - if(pop@type != "codom" | all(is.na(unlist(lapply(pop@all.names, as.numeric))))){ - stop("\nThis dataset does not appear to be microsatellite data. Bruvo's Distance can only be applied for true microsatellites.") - } - ploid <- ploidy(pop) - # Bruvo's distance depends on the knowledge of the repeat length. If the user - # does not provide the repeat length, it can be estimated by the smallest - # repeat difference greater than 1. This is not a preferred method. - if (length(replen) != length(pop@loc.names)){ - replen <- vapply(pop@all.names, function(x) guesslengths(as.numeric(x)), 1) - # replen <- rep(replen[1], numLoci) - warning("\n\nRepeat length vector for loci is not equal to the number of loci represented.\nEstimating repeat lengths from data:\n", immediate.=TRUE) - cat(replen,"\n\n") - } - # This controlls for the user correcting missing data using "mean". - if(any(!round(pop@tab,10) %in% c(0,((1:ploid)/ploid),1, NA))){ - pop@tab[!round(pop@tab,10) %in% c(0,((1:ploid)/ploid),1, NA)] <- NA - } - # Converting the genind object into a matrix with each allele separated by "/" - bar <- as.matrix(genind2df(pop, sep="/", usepop=FALSE)) - # The bruvo algorithm will ignore missing data, coded as 0. - bar[bar %in% c("", NA)] <- paste(rep(0, ploid), collapse="/") - # stopifnot(require(phangorn)) - # Steps: Create initial tree and then use boot.phylo to perform bootstrap - # analysis, and then place the support labels on the tree. - if(tree == "upgma"){ - newfunk <- match.fun(upgma) - } - else if(tree == "nj"){ - newfunk <- match.fun(nj) - } - tre <- newfunk(phylo.bruvo.dist(bar, replen=replen, ploid=ploid)) - if (any (tre$edge.length < 0)){ - tre$edge.length[tre$edge.length < 0] <- 0 - } - if(quiet == FALSE){ - cat("\nBootstrapping... (note: calculation of node labels can take a while even after the progress bar is full)\n\n") - } - bp <- boot.phylo(tre, bar, FUN = function (x) newfunk(phylo.bruvo.dist(x, replen = replen, ploid = ploid)), B = sample, quiet=quiet, ...) - tre$node.labels <- round(((bp / sample)*100)) - if (!is.null(cutoff)){ - if (cutoff < 1 | cutoff > 100){ - cat("Cutoff value must be between 0 and 100.\n") - cutoff<- as.numeric(readline(prompt = "Choose a new cutoff value between 0 and 100:\n")) - } - tre$node.labels[tre$node.labels < cutoff]<-NA - } - tre$tip.label <- pop@ind.names - if(showtree == TRUE){ - plot(tre, show.node.label=TRUE) - } - if(tree=="upgma"){ - axisPhylo(3) - } - return(tre) -} + javier<-function(x){ cat ("http://www.youtube.com/watch?v=1-ctsxVXvO0") diff --git a/R/visualizations.r b/R/visualizations.r index 927080ef..3fd448fb 100755 --- a/R/visualizations.r +++ b/R/visualizations.r @@ -168,7 +168,7 @@ poppr.plot <- function(sample, pval = c("0.05", "0.05"), pop="pop", data = infodata[infodata[["Index"]] == Indexfac[2], ], position="identity", binwidth=diff(range(infodata[infodata[["Index"]] == Indexfac[2], "Value"]))/30) + - geom_rug() + + geom_rug(alpha = 0.5) + #theme(legend.position = "none") + # The label for the observed line is a bit more difficult to code as # it has the ability to appear anywhere on the chart. Here, I'm # forcing it to flip to one side or the other based on which side of @@ -201,7 +201,7 @@ poppr.plot <- function(sample, pval = c("0.05", "0.05"), pop="pop", data = infodata[infodata[["Index"]] == Indexfac[2], ], position="identity", binwidth=diff(range(infodata[infodata[["Index"]] == Indexfac[2], "Value"]))/30) + - geom_rug() + + geom_rug(alpha = 0.5) + #theme(legend.position = "none") + # Positioning the observed line and labeling it. geom_vline(aes_string(xintercept = "Observed"), data = obsdata, color="blue", show_guide=TRUE, linetype="dashed") + @@ -238,65 +238,68 @@ poppr.plot <- function(sample, pval = c("0.05", "0.05"), pop="pop", #' matrix. #' #' @param pop a \code{\link{genind}} object -#' +#' #' @param distmat a distance matrix that has been derived from your data set. -#' -#' @param palette a \code{function} defining the color palette to be used to -#' color the populations on the graph. It defaults to \code{\link{topo.colors}}, -#' but you can easily create new schemes by using \code{\link{colorRampPalette}} -#' (see examples for details) -#' -#' @param sublist a \code{vector} of population names or indexes that the user -#' wishes to keep. Default to "ALL". -#' +#' +#' @param palette a \code{function} defining the color palette to be used to +#' color the populations on the graph. It defaults to +#' \code{\link{topo.colors}}, but you can easily create new schemes by using +#' \code{\link{colorRampPalette}} (see examples for details) +#' +#' @param sublist a \code{vector} of population names or indexes that the user +#' wishes to keep. Default to "ALL". +#' #' @param blacklist a \code{vector} of population names or indexes that the user -#' wishes to discard. Default to \code{NULL} -#' +#' wishes to discard. Default to \code{NULL} +#' #' @param vertex.label a \code{vector} of characters to label each vertex. There -#' are two defaults: \code{"MLG"} will label the nodes with the multilocus genotype -#' from the original data set and \code{"inds"} will label the nodes with the -#' representative individual names. -#' +#' are two defaults: \code{"MLG"} will label the nodes with the multilocus +#' genotype from the original data set and \code{"inds"} will label the nodes +#' with the representative individual names. +#' #' @param gscale "grey scale". If this is \code{TRUE}, this will scale the color -#' of the edges proportional to the observed distance, with the lines becoming -#' darker for more related nodes. See \code{\link{greycurve}} for details. -#' +#' of the edges proportional to the observed distance, with the lines becoming +#' darker for more related nodes. See \code{\link{greycurve}} for details. +#' #' @param glim "grey limit". Two numbers between zero and one. They determine -#' the upper and lower limits for the \code{\link{gray}} function. Default is 0 -#' (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details. -#' -#' @param gadj "grey adjust". a positive \code{integer} greater than zero that -#' will serve as the exponent to the edge weight to scale the grey value to -#' represent that weight. See \code{\link{greycurve}} for details. -#' -#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale -#' will be weighted to emphasize the differences between closely related nodes. -#' If it is 2, the grey scale will be weighted to emphasize the differences -#' between more distantly related nodes. See \code{\link{greycurve}} for details. -#' -#' @param wscale "width scale". If this is \code{TRUE}, the edge widths will be -#' scaled proportional to the inverse of the observed distance , with the lines -#' becoming thicker for more related nodes. -#' +#' the upper and lower limits for the \code{\link{gray}} function. Default is +#' 0 (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details. +#' +#' @param gadj "grey adjust". a positive \code{integer} greater than zero that +#' will serve as the exponent to the edge weight to scale the grey value to +#' represent that weight. See \code{\link{greycurve}} for details. +#' +#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale +#' will be weighted to emphasize the differences between closely related +#' nodes. If it is 2, the grey scale will be weighted to emphasize the +#' differences between more distantly related nodes. See +#' \code{\link{greycurve}} for details. +#' +#' @param wscale "width scale". If this is \code{TRUE}, the edge widths will be +#' scaled proportional to the inverse of the observed distance , with the +#' lines becoming thicker for more related nodes. +#' +#' @param showplot logical. If \code{TRUE}, the graph will be plotted. If +#' \code{FALSE}, it will simply be returned. +#' #' @param ... any other arguments that could go into plot.igraph -#' -#' @return -#' \item{graph}{a minimum spanning network with nodes corresponding to MLGs within -#' the data set. Colors of the nodes represent population membership. Width and -#' color of the edges represent distance.} -#' \item{populations}{a vector of the population names corresponding to the -#' vertex colors} -#' \item{colors}{a vector of the hexadecimal representations of the colors used -#' in the vertex colors} -#' +#' +#' @return \item{graph}{a minimum spanning network with nodes corresponding to +#' MLGs within the data set. Colors of the nodes represent population +#' membership. Width and color of the edges represent distance.} +#' \item{populations}{a vector of the population names corresponding to the +#' vertex colors} \item{colors}{a vector of the hexadecimal representations of +#' the colors used in the vertex colors} +#' #' @note The edges of these graphs may cross each other if the graph becomes too -#' large. -#' -#' @seealso \code{\link{nancycats}}, \code{\link{upgma}}, \code{\link{nj}}, -#' \code{\link{nodelabels}}, \code{\link{na.replace}}, \code{\link{missingno}}, -#' \code{\link{bruvo.msn}}, \code{\link{greycurve}}. +#' large. +#' +#' @seealso \code{\link{nancycats}}, \code{\link{upgma}}, \code{\link{nj}}, +#' \code{\link{nodelabels}}, \code{\link{na.replace}}, +#' \code{\link{missingno}}, \code{\link{bruvo.msn}}, \code{\link{greycurve}}. #' #' @export +#' @aliases msn.poppr #' @author Javier F. Tabima, Zhian N. Kamvar #' @examples #' @@ -334,11 +337,7 @@ poppr.plot <- function(sample, pval = c("0.05", "0.05"), pop="pop", poppr.msn <- function (pop, distmat, palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale=TRUE, glim = c(0,0.8), gadj = 3, gweight = 1, - wscale=TRUE, ...){ - # require(igraph) - # if(!require(igraph)){ - # stop("You must have the igraph library installed to use this function.\n") - # } + wscale=TRUE, showplot = TRUE, ...){ if (class(distmat) != "dist"){ if (is.matrix(distmat)){ if (any(nInd(pop) != dim(distmat))){ @@ -354,12 +353,21 @@ poppr.msn <- function (pop, distmat, palette = topo.colors, } gadj <- ifelse(gweight == 1, gadj, -gadj) # Storing the MLG vector into the genind object - pop$other$mlg.vec <- mlg.vector(pop) + if (!is.genclone(pop)){ + # Storing the MLG vector into the genind object + pop$other$mlg.vec <- mlg.vector(pop) + } bclone <- as.matrix(distmat) + # The clone correction of the matrix needs to be done at this step if there # is only one or no populations. if (is.null(pop(pop)) | length(pop@pop.names) == 1){ - bclone <- bclone[!duplicated(pop$other$mlg.vec), !duplicated(pop$other$mlg.vec)] + if (is.genclone(pop)){ + mlgs <- pop@mlg + } else { + mlgs <- pop$other$mlg.vec + } + bclone <- bclone[!duplicated(mlgs), !duplicated(mlgs)] return(singlepop_msn(pop, vertex.label, distmat = bclone, gscale = gscale, glim = glim, gadj = gadj, wscale = wscale, palette = palette)) @@ -368,28 +376,39 @@ poppr.msn <- function (pop, distmat, palette = topo.colors, if(sublist[1] != "ALL" | !is.null(blacklist)){ sublist_blacklist <- sub_index(pop, sublist, blacklist) bclone <- bclone[sublist_blacklist, sublist_blacklist] - pop <- popsub(pop, sublist, blacklist) + pop <- popsub(pop, sublist, blacklist) + } + cpop <- pop[.clonecorrector(pop), ] + if (is.genclone(pop)){ + mlgs <- pop@mlg + cmlg <- cpop@mlg + } else { + mlgs <- pop$other$mlg.vec + cmlg <- cpop$other$mlg.vec } - # This will clone correct the incoming matrix. - bclone <- bclone[!duplicated(pop$other$mlg.vec), !duplicated(pop$other$mlg.vec)] + bclone <- bclone[!duplicated(mlgs), !duplicated(mlgs)] if (is.null(pop(pop)) | length(pop@pop.names) == 1){ return(singlepop_msn(pop, vertex.label, distmat = bclone, gscale = gscale, glim = glim, gadj = gadj, wscale = wscale, - palette = palette)) + palette = palette, showplot = showplot, ...)) } # Obtaining population information for all MLGs - mlg.cp <- mlg.crosspop(pop, mlgsub=1:mlg(pop, quiet=TRUE), quiet=TRUE) - names(mlg.cp) <- paste0("MLG.", sort(unique(pop$other$mlg.vec))) - cpop <- pop[.clonecorrector(pop), ] - + if (is.genclone(pop)){ + subs <- sort(unique(mlgs)) + } else { + subs <- 1:mlg(pop, quiet = TRUE) + } + mlg.cp <- mlg.crosspop(pop, mlgsub = subs, quiet=TRUE) + + names(mlg.cp) <- paste0("MLG.", sort(unique(mlgs))) # This will determine the size of the nodes based on the number of individuals - # in the MLG. Subsetting by the MLG vector of the clone corrected set will + # in the MLG. Sub-setting by the MLG vector of the clone corrected set will # give us the numbers and the population information in the correct order. # Note: rank is used to correctly subset the data - mlg.number <- table(pop$other$mlg.vec)[rank(cpop$other$mlg.vec)] - mlg.cp <- mlg.cp[rank(cpop$other$mlg.vec)] + mlg.number <- table(mlgs)[rank(cmlg)] + mlg.cp <- mlg.cp[rank(cmlg)] rownames(bclone) <- cpop$pop colnames(bclone) <- cpop$pop @@ -398,7 +417,7 @@ poppr.msn <- function (pop, distmat, palette = topo.colors, if (!is.na(vertex.label[1]) & length(vertex.label) == 1){ if(toupper(vertex.label) == "MLG"){ - vertex.label <- paste("MLG.", cpop$other$mlg.vec, sep="") + vertex.label <- paste("MLG.", cmlg, sep="") } else if(toupper(vertex.label) == "INDS"){ vertex.label <- cpop$ind.names @@ -411,33 +430,17 @@ poppr.msn <- function (pop, distmat, palette = topo.colors, color <- palette(length(pop@pop.names)) ###### Edge adjustments ###### - # Grey Scale Adjustment weighting towards more diverse or similar populations. - if (gscale == TRUE){ - E(mst)$color <- gray(adjustcurve(E(mst)$weight, glim=glim, correction=gadj, - show=FALSE)) - } else { - E(mst)$color <- rep("black", length(E(mst)$weight)) - } - - # Width scale adjustment to avoid extremely large widths. - # by adding 0.08 to entries, the max width is 12.5 and the min is 0.9259259 - edgewidth <- 2 - if (wscale==TRUE){ - edgewidth <- 1/(E(mst)$weight) - if (any(E(mst)$weight < 0.08)){ - edgewidth <- 1/(E(mst)$weight + 0.08) - } - } + mst <- update_edge_scales(mst, wscale, gscale, glim, gadj) # This creates a list of colors corresponding to populations. mlg.color <- lapply(mlg.cp, function(x) color[pop@pop.names %in% names(x)]) - - plot.igraph(mst, edge.width = edgewidth, edge.color = E(mst)$color, - vertex.size = mlg.number*3, vertex.shape = "pie", vertex.pie = mlg.cp, - vertex.pie.color = mlg.color, vertex.label = vertex.label, ...) - legend(-1.55 ,1 ,bty = "n", cex = 0.75, legend = pop$pop.names, - title = "Populations", fill=color, border=NULL) - E(mst)$width <- edgewidth + if (showplot){ + plot.igraph(mst, edge.width = E(mst)$width, edge.color = E(mst)$color, + vertex.size = mlg.number*3, vertex.shape = "pie", vertex.pie = mlg.cp, + vertex.pie.color = mlg.color, vertex.label = vertex.label, ...) + legend(-1.55 ,1 ,bty = "n", cex = 0.75, legend = pop$pop.names, + title = "Populations", fill=color, border=NULL) + } V(mst)$size <- mlg.number V(mst)$shape <- "pie" V(mst)$pie <- mlg.cp @@ -445,3 +448,586 @@ poppr.msn <- function (pop, distmat, palette = topo.colors, V(mst)$label <- vertex.label return(list(graph = mst, populations = pop$pop.names, colors = color)) } + + +#==============================================================================# +#' Create a table summarizing missing data or ploidy information of a genind or +#' genclone object +#' +#' @param gen a \linkS4class{genind} or \linkS4class{genclone} object. +#' +#' @param type \code{character}. What information should be returned. Choices +#' are "missing" (Default) and "ploidy". See Description. +#' +#' @param percent \code{logical}. (ONLY FOR \code{type = 'missing'}) If +#' \code{TRUE} (default), table and plot will represent missing data as a +#' percentage of each cell. If \code{FALSE}, the table and plot will represent +#' missing data as raw counts. (See details) +#' +#' @param plot \code{logical}. If \code{TRUE}, a simple heatmap will be +#' produced. If \code{FALSE} (default), no heatmap will be produced. +#' +#' @param df \code{logical}. If \code{TRUE}, the data will be returned as a long +#' form data frame. If \code{FALSE} (default), a matrix with samples in rows +#' and loci in columns will be returned. +#' +#' @param returnplot \code{logical}. If \code{TRUE}, a list is returned with two +#' elements: \code{table} - the normal output and \code{plot} - the ggplot +#' object. If \code{FALSE}, the table is returned. +#' +#' @param low \code{character}. What color should represent no missing data or +#' lowest observed ploidy? (default: "blue") +#' +#' @param high \code{character}. What color should represent the highest amount +#' of missing data or observed ploidy? (default: "red") +#' +#' @param plotlab \code{logical}. (ONLY FOR \code{type = 'missing'}) If +#' \code{TRUE} (default), values of missing data greater than 0\% will be +#' plotted. If \code{FALSE}, the plot will appear unappended. +#' +#' @param scaled \code{logical}. (ONLY FOR \code{type = 'missing'}) This is for +#' when \code{percent = TRUE}. If \code{TRUE} (default), the color specified +#' in \code{high} will represent the highest observed value of missing data. +#' If \code{FALSE}, the color specified in \code{high} will represent 100\%. +#' +#' @return a matrix, data frame (\code{df = TRUE}), or a list (\code{returnplot +#' = TRUE}) representing missing data per population (\code{type = 'missing'}) +#' or ploidy per individual (\code{type = 'ploidy'}) in a \linkS4class{genind} +#' or \linkS4class{genclone} object. +#' +#' @details +#' Missing data is accounted for on a per-population level.\cr +#' Ploidy is accounted for on a per-individual level. +#' +#' \subsection{For type = 'missing'}{ +#' This data is potentially useful for identifying areas of systematic missing +#' data. There are a few caveats to be aware of. \itemize{ \item +#' \strong{Regarding counts of missing data}: Each count represents the number +#' of individuals with missing data at each locus. The last column, "mean" can +#' be thought of as the average number of individuals with missing data per +#' locus. \item \strong{Regarding percentage missing data}: This percentage is +#' \strong{relative to the population and locus}, not ot the enitre data set. +#' The last colum, "mean" represents the average percent of the population +#' with missing data per locus. }} +#' \subsection{For type = 'ploidy'}{ +#' This option is useful for data that has been imported with mixed ploidies. +#' It will summarize the relative levels of ploidy per individual per locus. +#' This is simply based off of observed alleles and does not provide any +#' further estimates.} +#' +#' @export +#' @keywords missing ploidy +#' @author Zhian N. Kamvar +#' @examples +#' data(nancycats) +#' nancy.miss <- info_table(nancycats, plot = TRUE, type = "missing") +#' data(Pinf) +#' Pinf.ploid <- info_table(Pinf, plot = TRUE, type = "ploidy") +#' +#==============================================================================# +info_table <- function(gen, type = c("missing", "ploidy"), percent = TRUE, plot = FALSE, + df = FALSE, returnplot = FALSE, low = "blue", + high = "red", plotlab = TRUE, scaled = TRUE){ + datalabel <- as.character(match.call()[2]) + ARGS <- c("missing", "ploidy") + type <- match.arg(type, ARGS) + + if (type == "missing"){ + + valname <- "Missing" + pops <- seppop(gen, drop = FALSE) + pops$Total <- gen + inds <- 1 + if (percent){ + inds <- c(table(pop(gen)), nInd(gen)) + } + data_table <- matrix(0, nrow = nLoc(gen) + 1, ncol = length(pops)) + data_table[1:nLoc(gen), ] <- vapply(pops, number_missing_locus, numeric(nLoc(gen)), 1) + data_table[-nrow(data_table), ] <- t(apply(data_table[-nrow(data_table), ], 1, "/", inds)) + data_table[nrow(data_table), ] <- colMeans(data_table[-nrow(data_table), ]) + rownames(data_table) <- c(gen@loc.names, "Mean") + colnames(data_table) <- names(pops) + dimnames(data_table) <- list(Locus = c(gen@loc.names, "Mean"), Population = names(pops)) + if (all(data_table == 0)){ + cat("No Missing Data Found!") + return(NULL) + } + if (plot){ + data_df <- melt(data_table, value.name = valname) + leg_title <- valname + data_df[1:2] <- data.frame(lapply(data_df[1:2], + function(x) factor(x, levels = unique(x)))) + plotdf <- textdf <- data_df + if (percent) { + plotdf$Missing <- round(plotdf$Missing*100, 2) + textdf$Missing <- paste(plotdf$Missing, "%") + miss <- "0 %" + title <- paste("Percent missing data per locus and population of", + datalabel) + leg_title <- paste("Percent", leg_title) + if(!scaled){ + lims <- c(0, 100) + } + } else { + textdf$Missing <- round(textdf$Missing, 2) + miss <- 0 + title <- paste("Missing data per locus and population of", + datalabel) + } + if (scaled | !percent){ + lims <- c(0, max(plotdf$Missing)) + } + linedata <- data.frame(list(yint = 1.5, xint = nrow(data_table) - 0.5)) + textdf$Missing <- ifelse(textdf$Missing == miss, "", textdf$Missing) + plotdf$Missing[plotdf$Locus == "Mean" & plotdf$Population == "Total"] <- NA + + outplot <- ggplot(plotdf, aes_string(x = "Locus", y = "Population")) + + geom_tile(aes_string(fill = valname)) + + labs(list(title = title, x = "Locus", y = "Population")) + + labs(fill = leg_title) + + scale_fill_gradient(low = low, high = high, na.value = "white", + limits = lims) + + geom_hline(aes_string(yintercept = "yint"), data = linedata) + + geom_vline(aes_string(xintercept = "xint"), data = linedata) + if (plotlab){ + outplot <- outplot + geom_text(aes_string(label = valname), + data = textdf) + } + outplot <- outplot + + theme_classic() + + scale_x_discrete(expand = c(0, -1)) + + scale_y_discrete(expand = c(0, -1), + limits = rev(unique(plotdf$Population))) + + theme(axis.text.x = element_text(size = 10, angle = -45, + hjust = 0, vjust = 1)) + print(outplot) + } + + } else if (type == "ploidy"){ + + valname <- "Observed_Ploidy" + if (gen@ploidy <= 2){ + warning("This function is meant for polyploid data.") + data_table <- matrix(gen@ploidy, nrow = nInd(gen), ncol = nLoc(gen)) + missing <- propTyped(gen, "both") == 0 + data_table[missing] <- NA + } else { + data_table <- get_local_ploidy(gen) + } + dimnames(data_table) <- list(Samples = indNames(gen), Loci = locNames(gen)) + if (plot){ + data_df <- melt(data_table, value.name = valname) + data_df[1:2] <- data.frame(lapply(data_df[1:2], + function(x) factor(x, levels = unique(x)))) + vars <- aes_string(x = "Loci", y = "Samples", fill = valname) + + mytheme <- theme_classic() + + theme(axis.text.x = element_text(size = 10, angle = -45, + hjust = 0, vjust = 1)) + + title <- paste("Observed ploidy of", datalabel) + outplot <- ggplot(data_df) + geom_tile(vars) + + scale_fill_gradient(low = low, high = high) + + scale_x_discrete(expand = c(0, -1)) + + scale_y_discrete(expand = c(0, -1), + limits = rev(unique(data_df$Samples))) + + labs(list(title = title, x = "Locus", y = "Sample", + fill = "Observed\nPloidy")) + + mytheme + + print(outplot) + } + } + if (df){ + if(!exists("data_df")){ + data_df <- melt(data_table, value.name = valname) + } + data_table <- data_df + } else { + if (type == "missing"){ + data_table <- t(data_table) + } + class(data_table) <- c("locustable", "matrix") + } + if (returnplot & exists("outplot")){ + data_table <- list(table = data_table, plot = outplot) + } + return(data_table) +} + +#==============================================================================# +#' Display a greyscale gradient adjusted to specific parameters +#' +#' This function has one purpose. It is for deciding the appropriate scaling for +#' a grey palette to be used for edge weights of a minimum spanning network. +#' +#' +#' @param data a sequence of numbers to be converted to greyscale. +#' +#' @param glim "grey limit". Two numbers between zero and one. They determine +#' the upper and lower limits for the \code{\link{gray}} function. Default is 0 +#' (black) and 0.8 (20\% black). +#' +#' @param gadj "grey adjust". a positive \code{integer} greater than zero that +#' will serve as the exponent to the edge weight to scale the grey value to +#' represent that weight. +#' +#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale +#' will be weighted to emphasize the differences between closely related nodes. +#' If it is 2, the grey scale will be weighted to emphasize the differences +#' between more distantly related nodes. +#' +#' @param scalebar When this is set to \code{TRUE}, two scalebars will be +#' plotted. The purpose of this is for adding a scale bar to minimum spanning +#' networks produced in earlier versions of poppr. +#' +#' @return A plot displaying a grey gradient from 0.001 to 1 with minimum and +#' maximum values displayed as yellow lines, and an equation for the correction +#' displayed in red. +#' +#' @author Zhian N. Kamvar +#' +#' @examples +#' # Normal grey curve with an adjustment of 3, an upper limit of 0.8, and +#' # weighted towards smaller values. +#' greycurve() +#' \dontrun{ +#' # 1:1 relationship grey curve. +#' greycurve(gadj=1, glim=1:0) +#' +#' # Grey curve weighted towards larger values. +#' greycurve(gweight=2) +#' +#' # Same as the first, but the limit is 1. +#' greycurve(glim=1:0) +#' +#' # Setting the lower limit to 0.1 and weighting towards larger values. +#' greycurve(glim=c(0.1,0.8), gweight=2) +#' } +#' @export +#==============================================================================# +greycurve <- function(data = seq(0, 1, length = 1000), glim = c(0,0.8), + gadj = 3, gweight = 1, scalebar = FALSE){ + gadj <- ifelse(gweight == 1, gadj, -gadj) + adjustcurve(data, glim, correction=gadj, show = TRUE, scalebar = scalebar) +} + + +#==============================================================================# +#' Plot minimum spanning networks produced in poppr. +#' +#' This function allows you to take the output of poppr.msn and bruvo.msn and +#' customize the plot by labeling groups of individuals, size of nodes, and +#' adjusting the palette and scale bar. +#' +#' @param x a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} +#' object from which \code{poppr_msn} was derived. +#' +#' @param poppr_msn a \code{list} produced from either \code{\link{poppr.msn}} +#' or \code{\link{bruvo.msn}}. This list should contain a graph, a vector of +#' population names and a vector of hexadecimal color definitions for each +#' popualtion. +#' +#' @inheritParams greycurve +#' +#' @inheritParams poppr.msn +#' +#' @param inds a character vector indicating which individual names to label +#' nodes with. See details. +#' +#' @param quantiles \code{logical}. When set to \code{TRUE} (default), the scale +#' bar will be composed of the quantiles from the observed edge weights. When +#' set to \code{FALSE}, the scale bar will be composed of a smooth gradient +#' from the minimum edge weight to the maximum edge weight. +#' +#' @param nodelab an \code{integer} specifying the smallest size of node to +#' label. See details. +#' +#' @param cutoff a number indicating the longest distance to display in your +#' graph. This is performed by removing edges with weights greater than this +#' number. +#' +#' @param palette a function or character corresponding to a specific palette +#' you want to use to delimit your populations. The default is whatever +#' palette was used to produce the original graph. +#' +#' @param layfun a function specifying the layout of nodes in your graph. It +#' defaults to \code{\link[igraph]{layout.auto}}. +#' +#' @param beforecut if \code{TRUE}, the layout of the graph will be computed +#' before any edges are removed with \code{cutoff}. If \code{FALSE} (Default), +#' the layout will be computed after any edges are removed. +#' +#' @param ... any other parameters to be passed on to +#' \code{\link[igraph]{plot.igraph}}. +#' +#' @details The previous incarnation of msn plotting in poppr simply plotted the +#' minimum spanning network with the legend of populations, but did not +#' provide a scale bar and it did not provide the user a simple way of +#' manipulating the layout or labels. This function allows the user to +#' manipulate many facets of graph creation, making the creation of minimum +#' spanning networks ever so slightly more user friendly. Note that this +#' function will only plot individual names, not MLG names since the naming +#' convention for those are arbitrary. +#' +#' This function must have both the source data and the output msn to work. +#' The source data must contain the same population structure as the graph. +#' Every other parameter has a default setting. +#' +#' \subsection{Parameter details}{ \itemize{ \item \code{inds} This will take +#' in the name of a query individual in your data set and will use that to +#' query any other individuals that share multilocus genotypes and label their +#' node on the graph. The default is to label all the nodes, but you can set +#' it to a name that doesn't exist to label none of the nodes. \item +#' \code{nodelab} If a node is not labeled by individual, this will label the +#' size of the nodes greater than or equal to this value. If you don't want to +#' label the size of the nodes, simply set this to a very high number. \item +#' \code{cutoff} This is useful for when you want to investigate groups of +#' multilocus genotypes separated by a specific distance or if you have two +#' distinct populations and you want to physically separate them in your +#' network. \item \code{beforecut} This is an indicator useful if you want to +#' maintain the same position of the nodes before and after removing edges +#' with the \code{cutoff} argument. This works best if you set a seed before +#' you run the function.}} +#' +#' @seealso \code{\link[igraph]{layout.auto}} \code{\link[igraph]{plot.igraph}} +#' \code{\link{poppr.msn}} \code{\link{bruvo.msn}} \code{\link{greycurve}} +#' \code{\link[igraph]{delete.edges}} \code{\link{palette}} +#' +#' @author Zhian N. Kamvar +#' @export +#' +#' @examples +#' # Using a data set of the Aphanomyces eutieches root rot pathogen. +#' data(Aeut) +#' adist <- diss.dist(Aeut, percent = TRUE) +#' amsn <- poppr.msn(Aeut, adist, showplot = FALSE) +#' +#' # Default +#' library(igraph) # To get all the layouts. +#' set.seed(500) +#' plot_poppr_msn(Aeut, amsn, gadj = 15, beforecut = TRUE) +#' +#' # Removing link between populations and labelling no individuals +#' set.seed(500) +#' plot_poppr_msn(Aeut, amsn, inds = "none", gadj = 15, beforecut = TRUE, cutoff = 0.2) +#' +#' # Labelling individual #57 because it is an MLG that crosses popualtions +#' # Showing clusters of MLGS with at most 5% variation +#' # Notice that the Mt. Vernon population appears to be more clonal +#' set.seed(50) +#' plot_poppr_msn(Aeut, amsn, gadj = 15, cutoff = 0.05, inds = "57") +#' +#' +#' \dontrun{ +#' data(partial_clone) +#' pcmsn <- bruvo.msn(partial_clone, replen = rep(1, 10)) +#' plot_poppr_msn(partial_clone, pcmsn, palette = rainbow, inds = "sim 20") +#' +#' # Something pretty +#' data(microbov) +#' mdist <- diss.dist(microbov, percent = TRUE) +#' micmsn <- poppr.msn(microbov, mdist, showplot = FALSE) +#' +#' plot_poppr_msn(microbov, micmsn, palette = "terrain.colors", inds = "n", +#' quantiles = FALSE) +#' plot_poppr_msn(microbov, micmsn, palette = "terrain.colors", inds = "n", +#' cutoff = 0.3, quantiles = FALSE) +#' } +#==============================================================================# +#' @importFrom igraph layout.auto delete.edges +plot_poppr_msn <- function(x, poppr_msn, gscale = TRUE, gadj = 3, glim = c(0, 0.8), + gweight = 1, wscale = TRUE, inds = "ALL", quantiles = TRUE, + nodelab = 2, cutoff = NULL, palette = NULL, + layfun = layout.auto, beforecut = FALSE, ...){ + if (!is.genind(x)){ + stop(paste(substitute(x), "is not a genind or genclone object.")) + } + if (!identical(names(poppr_msn), c("graph", "populations", "colors"))){ + stop("graph not compatible") + } + if (!is.null(palette)){ + poppr_msn <- update_poppr_graph(poppr_msn, palette) + } + # Making sure incoming data matches so that the individual names match. + x <- popsub(x, sublist = poppr_msn$populations) + + if (beforecut){ + LAYFUN <- match.fun(layfun) + lay <- LAYFUN(poppr_msn$graph) + } else { + lay <- match.fun(layfun) + } + # delete.edges <- match.fun(igraph::delete.edges) + if (!is.null(cutoff) && !is.na(cutoff)){ + if (all(cutoff < E(poppr_msn$graph)$weight)){ + msg <- paste0("Cutoff value (", cutoff, ") is below the minimum observed", + " distance. Edges will not be removed.") + warning(msg) + } else { + E_above_cutoff <- E(poppr_msn$graph)[E(poppr_msn$graph)$weight >= cutoff] + poppr_msn$graph <- delete.edges(poppr_msn$graph, E_above_cutoff) + } + } + # Adjusting color scales. This will replace any previous scaling contained in + # poppr_msn. + weights <- E(poppr_msn$graph)$weight + wmin <- min(weights) + wmax <- max(weights) + gadj <- ifelse(gweight == 1, gadj, -gadj) + poppr_msn$graph <- update_edge_scales(poppr_msn$graph, wscale, gscale, glim, gadj) + + # Highlighting only the names of the submitted genotypes and the matching + # isolates. + x.mlg <- mlg.vector(x) + labs <- unique(x.mlg) + # The labels in the graph are organized by MLG, so we will use that to extract + # the names we need. + if (length(inds) == 1 & toupper(inds[1]) == "ALL"){ + x.input <- unique(x.mlg) + } else { + x.input <- unique(x.mlg[x@ind.names %in% inds]) + } + # Combine all the names that match with each particular MLG in x.input. + combined_names <- vapply(x.input, function(mlgname) + paste(rev(x@ind.names[x.mlg == mlgname]), + collapse = "\n"), + character(1)) + # Remove labels that are not specified. + labs[which(!labs %in% x.input)] <- NA + labs[!is.na(labs)] <- combined_names + if (any(is.na(labs))){ + sizelabs <- V(poppr_msn$graph)$size + sizelabs <- ifelse(sizelabs >= nodelab, sizelabs, NA) + labs <- ifelse(is.na(labs), sizelabs, labs) + } + # Change the size of the vertices to a log scale. + vsize <- log(V(poppr_msn$graph)$size, base = 1.15) + 3 + + # Plotting parameters. + def.par <- par(no.readonly = TRUE) + # Setting up the matrix for plotting. One Vertical panel of width 1 and height + # 5 for the legend, one rectangular panel of width 4 and height 4.5 for the + # graph, and one horizontal panel of width 4 and height 0.5 for the greyscale. + layout(matrix(c(1,2,1,3), ncol = 2, byrow = TRUE), + widths = c(1, 4), heights= c(4.5, 0.5)) + # mar = bottom left top right + + ## LEGEND + par(mar = c(0, 0, 1, 0) + 0.5) + too_many_pops <- as.integer(ceiling(length(x$pop.names)/30)) + pops_correction <- ifelse(too_many_pops > 1, -1, 1) + yintersperse <- ifelse(too_many_pops > 1, 0.51, 0.62) + plot(c(0, 2), c(0, 1), type = 'n', axes = F, xlab = '', ylab = '', + main = 'POPULATION') + legend("topleft", bty = "n", cex = 1.2^pops_correction, + legend = poppr_msn$populations, fill = poppr_msn$color, border = NULL, + ncol = too_many_pops, x.intersp = 0.45, y.intersp = yintersperse) + + ## PLOT + par(mar = c(0,0,0,0)) + plot.igraph(poppr_msn$graph, vertex.label = labs, vertex.size = vsize, + layout = lay, ...) + + ## SCALE BAR + if (quantiles){ + scales <- sort(weights) + } else { + scales <- seq(wmin, wmax, l = 1000) + } + greyscales <- gray(adjustcurve(scales, show=FALSE, glim=glim, correction=gadj)) + legend_image <- as.raster(matrix(greyscales, nrow=1)) + par(mar = c(0, 1, 0, 1) + 0.5) + plot.new() + rasterImage(legend_image, 0, 0.5, 1, 1) + polygon(c(0, 1, 1), c(0.5, 0.5, 0.8), col = "white", border = "white", lwd = 2) + axis(3, at = c(0, 0.25, 0.5, 0.75, 1), labels = round(quantile(scales), 3)) + text(0.5, 0, labels = "DISTANCE", font = 2, cex = 1.5, adj = c(0.5, 0)) + + # Return top level plot to defaults. + layout(matrix(c(1), ncol=1, byrow=T)) + par(mar=c(5,4,4,2) + 0.1) # number of lines of margin specified. + par(oma=c(0,0,0,0)) # Figure margins +} + + +#==============================================================================# +#' Produce a genotype accumulation curve +#' +#' GA curves are useful for determinining the minimum number of loci necessary +#' to discriminate between individuals in a population. This function will +#' randomly sample loci without replacement and count the number of multilocus +#' genotypes observed. +#' +#' @param gen a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +#' object. +#' +#' @param sample an \code{integer} defining the number of times loci will be +#' resampled. +#' +#' @param quiet if \code{FALSE}, a progress bar will be displayed. If +#' \code{TRUE}, nothing is printed to screen as the function runs. +#' +#' @param thresh a number from 0 to 1. This will draw a line at this fraction of +#' multilocus genotypes. +#' +#' @return a matrix of integers showing the results of each randomization. +#' Columns represent the number of loci sampled and rows represent an +#' independent sample. +#' +#' @author Zhian N. Kamvar +#' @export +#' @examples +#' data(nancycats) +#' nan_geno <- genotype_curve(nancycats) +#' \dontrun{ +#' # With AFLP data, it is often necessary to include more markers for resolution +#' data(Aeut) +#' Ageno <- genotype_curve(Aeut) +#' +#' # Many microsatellite data sets have hypervariable markers +#' data(microbov) +#' mgeno <- geotype_curve(microbov) +#' +#' # This data set has been pre filtered +#' data(monpop) +#' mongeno <- genotype_curve(monpop)} +#==============================================================================# + +genotype_curve <- function(gen, sample = 100, quiet = FALSE, thresh = 0.9){ + datacall <- match.call() + if (!is.genind(gen)){ + stop(paste(datacall[2], "must be a genind object")) + } + genloc <- as.loci(gen) + if (!is.null(pop(gen))){ + genloc <- genloc[-1] + } + nloci <- nLoc(gen) + if (!quiet){ + cat("Calculating genotype accumulation for", nloci - 1, "loci...\n") + progbar <- txtProgressBar(style = 3) + } else { + progbar <- NULL + } + out <- vapply(1:(nloci-1), get_sample_mlg, integer(sample), sample, nloci, genloc, progbar) + colnames(out) <- 1:(nloci-1) + threshdf <- data.frame(x = mlg(gen, quiet = TRUE)*thresh) + outmelt <- melt(out, value.name = "MLG", varnames = c("sample", "NumLoci")) + aesthetics <- aes_string(x = "factor(NumLoci)", y = "MLG", group = "NumLoci") + outplot <- ggplot(outmelt) + geom_boxplot(aesthetics) + + labs(list(title = paste("Genotype accumulation curve for", datacall[2]), + y = "Number of multilocus genotypes", + x = "Number of loci sampled")) + if (!is.null(thresh)){ + outbreaks <- sort(c(seq(min(out), max(out), length.out = 5), threshdf$x)) + outplot <- outplot + geom_hline(aes_string(yintercept = "x"), + data = threshdf, color = "red", linetype = 2) + + annotate("text", x = 1, y = threshdf$x, vjust = -1, + label = paste0(thresh*100, "%"), + color = "red", hjust = 0) + + scale_y_continuous(breaks = outbreaks) + } + print(outplot) + return(out) +} diff --git a/R/zzz.r b/R/zzz.r new file mode 100644 index 00000000..4d0d2e49 --- /dev/null +++ b/R/zzz.r @@ -0,0 +1,59 @@ +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# +# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate +# students at Oregon State University; and Dr. Nik Grünwald, an employee of +# USDA-ARS. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for educational, research and non-profit purposes, without fee, +# and without a written agreement is hereby granted, provided that the statement +# above is incorporated into the material, giving appropriate attribution to the +# authors. +# +# Permission to incorporate this software into commercial products may be +# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for +# Commercialization and Corporate Development. +# +# The software program and documentation are supplied "as is", without any +# accompanying services from the USDA or the University. USDA ARS or the +# University do not warrant that the operation of the program will be +# uninterrupted or error-free. The end-user understands that the program was +# developed for research purposes and is advised not to rely exclusively on the +# program for any reason. +# +# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY +# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY +# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE +# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# + +.onAttach <- function(...) { + startupmsg <- paste("This is poppr version", utils::packageVersion("poppr")) + startupmsg <- paste0(startupmsg, ". To get started, type package?poppr") + packageStartupMessage(startupmsg) + if (!interactive() || stats::runif(1) > 0.1) return() + + tips <- c( + "\nNeed help? Try the poppr mailing list: http://groups.google.com/group/poppr.\n", + "\nUse suppressPackageStartupMessages(library(poppr)) to eliminate package startup messages.\n" + ) + + tip <- sample(tips, 1) + packageStartupMessage(tip) +} + diff --git a/README.md b/README.md index ddf6b2ad..8fb1a156 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ genind object and offers the following implementations: - convenient counting of multilocus genotypes and sub-setting of populations with multiple levels of hierarchy - define multilocus genotypes - calculation of indices of genotypic diversity, evenness, richness, and rarefaction -- drawing of dendrograms with bootstrap support for Bruvo's distance +- drawing of dendrograms with bootstrap support for genetic distances - drawing of minimum spanning networks for genetic distances - calculation of the index of association (![equation](http://latex.codecogs.com/gif.latex?I_A)) @@ -29,21 +29,13 @@ You can obtain citation information in R by typing: citation(package = "poppr") ``` -## Help - -Users who have any questions/comments/suggestions regarding any version of poppr (stable or development) should direct their comments to the [Poppr google group](http://groups.google.com/group/poppr) - ## Installation -[![Build Status](https://travis-ci.org/grunwaldlab/poppr.png?branch=master)](https://travis-ci.org/grunwaldlab/poppr?branch=master) - -If the image above says "Passing", then that means it should be safe to install with the latest version of R. If it does not say "Passing", I am probably trying to fix whatever problem is causing it as fast as I can. - ### From CRAN Binary versions for mac and windows are available for R ≥ 2.15.1 [**here**](http://cran.r-project.org/web/packages/poppr/index.html). -To install, make sure R is at least version 2.15.1 (the authors recommend ≥ 3.0), and in your console, type: +To install, make sure R is at least version 2.15.1 (the authors recommend ≥ 3.0), and in your console, type: ```s install.packages("poppr") @@ -55,13 +47,17 @@ If you want the absolute latest version of *poppr*, see about installing from gi ### Stable and Development versions +[![Build Status](https://travis-ci.org/grunwaldlab/poppr.png?branch=devel)](https://travis-ci.org/grunwaldlab/poppr?branch=devel) + +If the image above says "Passing", then that means it should be safe to install with the latest version of R. If it does not say "Passing", I am probably trying to fix whatever problem is causing it as fast as I can. + To install this package from github, make sure you have the following: - [Xcode](https://developer.apple.com/xcode/) (OSX) OR [Rtools](http://cran.r-project.org/bin/windows/Rtools/) (Windows) - [devtools](https://github.com/hadley/devtools) (to install, use: `install.packages("devtools")`) -For Linux users, make sure that the function `getOption("unzip")` returns `"unzip"` or `"internal"`. If it doesn't, then run options(unzip = "internal"). +For Linux users, make sure that the function `getOption("unzip")` returns `"unzip"` or `"internal"`. If it doesn't, then run `options(unzip = "internal")`. Now you can use the `install_github()` function: @@ -81,8 +77,20 @@ install_github(repo = "grunwaldlab/poppr", ref = "devel") library(poppr) ``` -You can view the manual by typing: `vignette("poppr_manual")` after installation or by clicking [**here**](http://grunwaldlab.cgrb.oregonstate.edu/sites/default/files/u5/poppr_manual.pdf) +## Help / Documentation + +### User Group + +Users who have any questions/comments/suggestions regarding any version of poppr (stable or development) should direct their comments to the [Poppr google group](http://groups.google.com/group/poppr) + +### Vignettes + +Two vignettes have been written for poppr: + +1. Data Import and Manipulation (`vignette("poppr_manual", package = "poppr")`) +2. Algorithms and Equation Utilized (`vignette("algo", package = "poppr")`) + +### Book/Primer + +In Spring of 2014, Dr. Niklaus J. Grünwald, Dr. Sydney E. Everhart and Zhian N. Kamvar wrote a primer for population genetic analysis in R located at http://grunwaldlab.github.io/Population_Genetics_in_R. -Be sure to visit our [Webpage](http://grunwaldlab.cgrb.oregonstate.edu/poppr-r-package-population-genetics) with FAQs and tutorials coming soon. - -Enjoy! diff --git a/data/Pinf.rda b/data/Pinf.rda new file mode 100644 index 00000000..cf86a078 Binary files /dev/null and b/data/Pinf.rda differ diff --git a/data/monpop.rda b/data/monpop.rda new file mode 100644 index 00000000..9a2828e0 Binary files /dev/null and b/data/monpop.rda differ diff --git a/inst/CITATION b/inst/CITATION index e4aba9e3..93d1071b 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -14,7 +14,7 @@ citEntry(entry = 'article', volume = "2", pages = "e281", keywords = "population genetics, clonality, genotypic diversity, index of association, Bruvo's distance, clone correction, minimum spanning networks, hierarchy, bootstrap, permutation", - key = "kamvar2013poppr", + key = "kamvar2014poppr", issn = "2167-8359", url = "http://dx.doi.org/10.7717/peerj.281", doi = "10.7717/peerj.281", diff --git a/inst/README b/inst/README new file mode 100644 index 00000000..9407f574 --- /dev/null +++ b/inst/README @@ -0,0 +1,6 @@ +# Poppr development version README + +**Use this development version at your own risk** + +This version contains experimental functions that are not intended for release +until further testing can be completed. diff --git a/man/Aeut.Rd b/man/Aeut.Rd index 9b21c909..adb65be7 100644 --- a/man/Aeut.Rd +++ b/man/Aeut.Rd @@ -1,24 +1,23 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{data} \name{Aeut} \alias{Aeut} \title{Oomycete root rot pathogen \emph{Aphanomyces euteiches} AFLP data} -\format{a \code{\link{genind}} object with two popualations containing a -data frame in the \code{other} slot called \code{population_hierarchy}. -This data frame gives indices of the populations and subpopulations for the -data set.} +\format{a \code{\link{genind}} object with two populations containing a data + frame in the \code{other} slot called \code{population_hierarchy}. This + data frame gives indices of the populations and subpopulations for the data + set.} \usage{ data(Aeut) } \description{ -The Aeut dataset consists of 187 isolates of the Oomycete -root rot pathogen, \emph{Aphanomyces euteiches} collected -from two different fields in NW Oregon and W Washington, -USA. +The Aeut dataset consists of 187 isolates of the Oomycete root + rot pathogen, \emph{Aphanomyces euteiches} collected from two different + fields in NW Oregon and W Washington, USA. } \references{ -Grunwald, NJ and Hoheisel, G.-A. 2006. Hierarchical -Analysis of Diversity, Selfing, and Genetic Differentiation -in Populations of the Oomycete \emph{Aphanomyces -euteiches}. Phytopathology 96:1134-1141 +Grunwald, NJ and Hoheisel, G.A. 2006. Hierarchical Analysis of + Diversity, Selfing, and Genetic Differentiation in Populations of the + Oomycete \emph{Aphanomyces euteiches}. Phytopathology 96:1134-1141 } diff --git a/man/Pinf.Rd b/man/Pinf.Rd new file mode 100644 index 00000000..a9d955f2 --- /dev/null +++ b/man/Pinf.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{data} +\name{Pinf} +\alias{Pinf} +\title{Phytophthora infestans data from Mexico and South America.} +\format{a \code{\linkS4class{genclone}} object with 2 hierarchical levels + called "Continent" and "Country" that contain 2 and 4 populations, + respectively.} +\usage{ +data(Pinf) +} +\description{ +The Pinf data set contains 86 isolates genotyped over 11 + microsatellite loci collected from Mexico, Peru, Columbia, and Ecuador. + This is a subset of the data used for the reference below. +} +\references{ +Goss, Erica M., Javier F. Tabima, David EL Cooke, Silvia + Restrepo, William E. Fry, Gregory A. Forbes, Valerie J. Fieland, Martha + Cardenas, and Niklaus J. Grünwald. "The Irish potato famine pathogen + \emph{Phytophthora infestans} originated in central Mexico rather than the Andes." + Proceedings of the National Academy of Sciences 111:8791–8796. +} + diff --git a/man/aboot.Rd b/man/aboot.Rd new file mode 100644 index 00000000..817e67f3 --- /dev/null +++ b/man/aboot.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{aboot} +\alias{aboot} +\alias{bootstrap} +\title{Calculate a dendrogram with bootstrap support using any distance applicable +to genind or genclone objects.} +\usage{ +aboot(x, tree = "upgma", distance = "nei.dist", sample = 100, + cutoff = 0, showtree = TRUE, missing = "mean", quiet = FALSE, ...) +} +\arguments{ +\item{x}{a \linkS4class{genind}, \linkS4class{genclone}, or matrix object.} + +\item{tree}{one of "upgma" (Default) or "nj" defining the type of dendrogram + to be produced, UPGMA or Neighbor-Joining.} + +\item{distance}{a character or function defining the distance to be applied + to x. Defaults to \code{\link{nei.dist}}.} + +\item{sample}{An integer representing the number of bootstrap replicates + Default is 100.} + +\item{cutoff}{An integer from 0 to 100 setting the cutoff value to return the + bootstrap values on the nodes. Default is 0.} + +\item{showtree}{If \code{TRUE} (Default), a dendrogram will be plotted. If + \code{FALSE}, nothing will be plotted.} + +\item{missing}{any method to be used by \code{\link{missingno}}: "mean" + (default), "zero", "loci", "genotype", or "ignore".} + +\item{quiet}{if \code{FALSE} (Default), a progress bar will be printed to + screen.} + +\item{...}{any parameters to be passed off to the distance method.} +} +\value{ +an object of class \code{\link[ape]{phylo}}. +} +\description{ +Calculate a dendrogram with bootstrap support using any distance applicable +to genind or genclone objects. +} +\details{ +This function utilizes an internal class called + \code{\linkS4class{bootgen}} that allows bootstrapping of objects that + inherit the genind class. This is necessary due to the fact that columns in + the genind matrix are defined as alleles and are thus interrelated. This + function will specifically bootstrap loci so that results are biologically + relevant. With this function, the user can also define a custom distance to + be performed on the genind or genclone object. +} +\note{ +\code{\link{provesti.dist}} and \code{\link{diss.dist}} are exactly the + same, but \code{\link{diss.dist}} scales better for large numbers of + individuals (n > 125) at the cost of required memory. \subsection{missing + data}{Missing data is not allowed by many of the distances. Thus, one of + the first steps of this function is to treat missing data by setting it to + the average allele frequency in the data set. If you are using a distance + that can handle missing data (Provesti's distance), you can set + \code{missing = "ignore"} to allow the distance function to handle any + missing data. See \code{\link{missingno}} for details on missing + data.}\subsection{Bruvo's Distance}{While calculation of Bruvo's distance + is possible with this function, it is optimized in the function + \code{\link{bruvo.boot}}.} +} +\examples{ +data(nancycats) +nan9 <- popsub(nancycats, 9) + +set.seed(9999) +# Generate a tree using nei's distance +neinan <- aboot(nan9, dist = nei.dist) + +set.seed(9999) +# Generate a tree using custom distance +bindist <- function(x) dist(x$tab, method = "binary") +binnan <- aboot(nan9, dist = bindist) + +\dontrun{ +# AFLP data +data(Aeut) + +# Nei's distance +anei <- aboot(Aeut, dist = nei.dist, sample = 1000, cutoff = 50) + +# Rogers' distance +arog <- aboot(Aeut, dist = rogers.dist, sample = 1000, cutoff = 50) + +# This can also be run on genpop objects +Aeut.gc <- as.genclone(Aeut, hierarchy=other(Aeut)$population_hierarchy[-1]) +setpop(Aeut.gc) <- ~Pop/Subpop +Aeut.pop <- genind2genpop(Aeut.gc) +set.seed(5000) +aboot(Aeut.pop) # compare to Grunwald et al. 2006 + +} +} +\seealso{ +\code{\link{nei.dist}} \code{\link{edwards.dist}} + \code{\link{rogers.dist}} \code{\link{reynolds.dist}} + \code{\link{provesti.dist}} \code{\link{diss.dist}} + \code{\link{bruvo.boot}} \code{\link[ape]{boot.phylo}} + \code{\link[adegenet]{dist.genpop}} \code{\link{dist}} +} +\keyword{bootstrap} + diff --git a/man/bootgen-class.Rd b/man/bootgen-class.Rd new file mode 100644 index 00000000..1c34ff94 --- /dev/null +++ b/man/bootgen-class.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{class} +\name{bootgen-class} +\alias{bootgen-class} +\title{Bootgen object} +\description{ +An internal object used for bootstrapping. Not intended for user interaction. +} +\section{Slots}{ + +\describe{ +\item{\code{type}}{a character denoting Codominant ("codom") or Dominant data ("P/A")} + +\item{\code{ploidy}}{an integer denoting the ploidy of the data set. (>=1)} + +\item{\code{alllist}}{a list with numeric vectors, each representing a different +locus where each element in the vector represents the index for a specific +allele.} + +\item{\code{names}}{a vector containing names of the observed samples.} +}} +\section{Extends}{ + +Virtual Class \code{"\linkS4class{gen}"}. +} +\author{ +Zhian N. Kamvar +} +\keyword{internal} + diff --git a/man/bootgen-methods.Rd b/man/bootgen-methods.Rd new file mode 100644 index 00000000..ad9edd3d --- /dev/null +++ b/man/bootgen-methods.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} +\name{[,bootgen,ANY,ANY,ANY-method} +\alias{$,bootgen-method} +\alias{[,bootgen,ANY,ANY,ANY-method} +\alias{dim,bootgen-method} +\alias{initialize,bootgen-method} +\title{Methods used for the bootgen object.} +\usage{ +\S4method{[}{bootgen,ANY,ANY,ANY}(x, i, j, ..., drop = FALSE) + +\S4method{dim}{bootgen}(x) + +\S4method{$}{bootgen}(x, name) + +\S4method{initialize}{bootgen}(.Object, gen) +} +\arguments{ +\item{x}{a \code{"\linkS4class{bootgen}"} object} + +\item{i}{vector of numerics indicating number of individuals desired} + +\item{j}{a vector of numerics corresponding to the loci desired.} + +\item{...}{unused.} + +\item{drop}{set to \code{FALSE}} + +\item{.Object}{a character, "bootgen"} + +\item{gen}{a genind, genclone, or genpop object} + +\item{name}{ + A literal character string or a \link{name} (possibly \link{backtick} + quoted). For extraction, this is normally (see under + \sQuote{Environments}) partially matched to the \code{\link{names}} + of the object. + } +} +\description{ +This is not designed for user interaction. +} +\author{ +Zhian N. Kamvar +} +\keyword{internal} + diff --git a/man/bruvo.boot.Rd b/man/bruvo.boot.Rd index b4458adb..05be4df4 100755 --- a/man/bruvo.boot.Rd +++ b/man/bruvo.boot.Rd @@ -1,58 +1,53 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{bruvo.boot} \alias{bruvo.boot} \title{Create a tree using Bruvo's Distance with non-parametric bootstrapping.} \usage{ -bruvo.boot(pop, replen = c(1), sample = 100, tree = "upgma", - showtree = TRUE, cutoff = NULL, quiet = FALSE, ...) +bruvo.boot(pop, replen = 1, add = TRUE, loss = TRUE, sample = 100, + tree = "upgma", showtree = TRUE, cutoff = NULL, quiet = FALSE, ...) } \arguments{ - \item{pop}{a \code{\link{genind}} object} +\item{pop}{a \code{\link{genind}} object} - \item{replen}{a \code{vector} of \code{integers} - indicating the length of the nucleotide repeats for each - microsatellite locus.} +\item{replen}{a \code{vector} of \code{integers} indicating the length of the + nucleotide repeats for each microsatellite locus.} - \item{sample}{an \code{integer} indicated the number of - bootstrap replicates desired.} +\item{add}{if \code{TRUE}, genotypes with zero values will be treated under + the genome addition model presented in Bruvo et al. 2004.} - \item{tree}{choose between "nj" for neighbor-joining and - "upgma" for a upgma tree to be produced.} +\item{loss}{if \code{TRUE}, genotypes with zero values will be treated under + the genome loss model presented in Bruvo et al. 2004.} - \item{showtree}{\code{logical} if \code{TRUE}, a tree - will be plotted with nodelabels.} +\item{sample}{an \code{integer} indicated the number of bootstrap replicates + desired.} - \item{cutoff}{\code{integer} the cutoff value for - bootstrap node label values (between 0 and 100).} +\item{tree}{choose between "nj" for neighbor-joining and "upgma" for a upgma + tree to be produced.} - \item{quiet}{\code{logical} defaults to \code{FALSE}. If - \code{TRUE}, a progress bar and messages will be - supressed.} +\item{showtree}{\code{logical} if \code{TRUE}, a tree will be plotted with + nodelabels.} - \item{...}{any argument to be passed on to - \code{\link{boot.phylo}}. eg. \code{quiet = TRUE}.} +\item{cutoff}{\code{integer} the cutoff value for bootstrap node label values + (between 0 and 100).} + +\item{quiet}{\code{logical} defaults to \code{FALSE}. If \code{TRUE}, a + progress bar and messages will be suppressed.} + +\item{...}{any argument to be passed on to \code{\link{boot.phylo}}. eg. + \code{quiet = TRUE}.} } \value{ -a tree with nodelables +a tree of class phylo with nodelables } \description{ -Create a tree using Bruvo's Distance with non-parametric -bootstrapping. +Create a tree using Bruvo's Distance with non-parametric bootstrapping. } \note{ -This function calculates bruvo's distance for non-special -cases (ie. the ploidy and all alleles are known). Currently -there is no way to import polyploid partial heterozygote -data into adegenet. For Bruvo's Distance concerning special -cases, see the package \code{polysat}. Missing data is -ignored, but be sure that missing data is NOT set to 0 in -the genind object. This is not easy to detect and will -result in an error. Please use any other method in -\code{\link{na.replace}} or \code{\link{missingno}}. - -If the user does not provide a vector of appropriate length -for \code{replen} , it will be estimated by taking the -minimum difference among represented alleles at each locus. -IT IS NOT RECOMMENDED TO RELY ON THIS ESTIMATION. +\strong{Please refer to the documentation for bruvo.dist for details on + the algorithm.} If the user does not provide a vector of appropriate length + for \code{replen} , it will be estimated by taking the minimum difference + among represented alleles at each locus. IT IS NOT RECOMMENDED TO RELY ON + THIS ESTIMATION. } \examples{ # Please note that the data presented is assuming that the nancycat dataset @@ -68,18 +63,18 @@ ssr <- rep(2, 9) bruvo.boot(popsub(nancycats, 1), replen = ssr) } \author{ -Javier F. Tabima, Zhian N. Kamvar +Zhian N. Kamvar, Javier F. Tabima } \references{ Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and -Hinrich Schulenburg. A simple method for the calculation of -microsatellite genotype distances irrespective of ploidy -level. Molecular Ecology, 13(7):2101-2106, 2004. +Hinrich Schulenburg. A simple method for the calculation of microsatellite +genotype distances irrespective of ploidy level. Molecular Ecology, +13(7):2101-2106, 2004. } \seealso{ -\code{\link{nancycats}}, \code{\link{upgma}}, -\code{\link{nj}}, \code{\link{boot.phylo}}, -\code{\link{nodelabels}}, \code{\link{na.replace}}, -\code{\link{missingno}}. +\code{\link{bruvo.dist}}, \code{\link{nancycats}}, + \code{\link{upgma}}, \code{\link{nj}}, \code{\link{boot.phylo}}, + \code{\link{nodelabels}}, \code{\link{na.replace}}, + \code{\link{missingno}}. } diff --git a/man/bruvo.dist.Rd b/man/bruvo.dist.Rd index 8d75065c..543e62b2 100755 --- a/man/bruvo.dist.Rd +++ b/man/bruvo.dist.Rd @@ -1,34 +1,82 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{bruvo.dist} \alias{bruvo.dist} -\title{Calculate the average Bruvo's Distance over all loci in a population.} +\title{Bruvo's distance for microsatellites} \usage{ -bruvo.dist(pop, replen = c(1)) +bruvo.dist(pop, replen = 1, add = TRUE, loss = TRUE) } \arguments{ - \item{pop}{a \code{\link{genind}} object} +\item{pop}{a \code{\link{genind}} object} - \item{replen}{a \code{vector} of \code{integers} - indicating the length of the nucleotide repeats for each - microsatellite locus.} +\item{replen}{a \code{vector} of \code{integers} indicating the length of the + nucleotide repeats for each microsatellite locus.} + +\item{add}{if \code{TRUE}, genotypes with zero values will be treated under + the genome addition model presented in Bruvo et al. 2004.} + +\item{loss}{if \code{TRUE}, genotypes with zero values will be treated under + the genome loss model presented in Bruvo et al. 2004.} } \value{ a \code{distance matrix} } \description{ -Calculate the average Bruvo's Distance over all loci in a -population. +Calculate the average Bruvo's distance over all loci in a population. +} +\details{ +Ploidy is irrelevant with respect to calculation of Bruvo's + distance. However, since it makes a comparison between all alleles at a + locus, it only makes sense that the two loci need to have the same ploidy + level. Unfortunately for polyploids, it's often difficult to fully separate + distinct alleles at each locus, so you end up with genotypes that appear to + have a lower ploidy level than the organism. + + To help deal with these situations, Bruvo has suggested three methods for + dealing with these differences in ploidy levels: \itemize{ \item Infinite + Model - The simplest way to deal with it is to count all missing alleles as + infinitely large so that the distance between it and anything else is 1. + Aside from this being computationally simple, it will tend to + \strong{inflate distances between individuals}. \item Genome Addition Model + - If it is suspected that the organism has gone through a recent genome + expansion, \strong{the missing alleles will be replace with all possible + combinations of the observed alleles in the shorter genotype}. For example, + if there is a genotype of [69, 70, 0, 0] where 0 is a missing allele, the + possible combinations are: [69, 70, 69, 69], [69, 70, 69, 70], and [69, 70, + 70, 70]. The resulting distances are then averaged over the number of + comparisons. \item Genome Loss Model - This is similar to the genome + addition model, except that it assumes that there was a recent genome + reduction event and uses \strong{the observed values in the full genotype + to fill the missing values in the short genotype}. As with the Genome + Addition Model, the resulting distances are averaged over the number of + comparisons. \item Combination Model - Combine and average the genome + addition and loss models. } As mentioned above, the infinite model is + biased, but it is not nearly as computationally intensive as either of the + other models. The reason for this is that both of the addition and loss + models requires replacement of alleles and recalculation of Bruvo's + distance. The number of replacements required is equal to the multiset + coefficient: \eqn{\left({n \choose k}\right) == {(n+k-1) \choose + k}}{choose(n+k-1, k)} where \emph{n} is the number of potential + replacements and \emph{k} is the number of alleles to be replaced. So, for + the example given above, The genome addition model would require + \eqn{\left({2 \choose 2}\right) = 3}{choose(2+2-1, 2) == 3} calculations of + Bruvo's distance, whereas the genome loss model would require \eqn{\left({4 + \choose 2}\right) = 10}{choose(4+2-1, 2) == 10} calculations. + + To reduce the number of calculations and assumptions otherwise, Bruvo's + distance will be calculated using the largest observed ploidy in pairwise + comparisons. This means that when comparing [69,70,71,0] and [59,60,0,0], + they will be treated as triploids. } \note{ -This function calculates bruvo's distance for non-special -cases (ie. the ploidy and all alleles are known). Currently -there is no way to import polyploid partial heterozygote -data into adegenet. For Bruvo's Distance concerning special -cases, see the package \code{polysat}. +The result of both \code{add = TRUE} and \code{loss = TRUE} is that the + distance is averaged over both values. If both are set to \code{FALSE}, + then the infinite alleles model is used. For genotypes with all missing + values, the result will be NA. -If the user does not provide a vector of appropriate length -for \code{replen} , it will be estimated by taking the -minimum difference among represented alleles at each locus. -IT IS NOT RECOMMENDED TO RELY ON THIS ESTIMATION. + If the user does not provide a vector of appropriate length for + \code{replen} , it will be estimated by taking the minimum difference among + represented alleles at each locus. IT IS NOT RECOMMENDED TO RELY ON THIS + ESTIMATION. } \examples{ # Please note that the data presented is assuming that the nancycat dataset @@ -54,11 +102,11 @@ Zhian N. Kamvar } \references{ Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and -Hinrich Schulenburg. A simple method for the calculation of -microsatellite genotype distances irrespective of ploidy -level. Molecular Ecology, 13(7):2101-2106, 2004. + Hinrich Schulenburg. A simple method for the calculation of microsatellite + genotype distances irrespective of ploidy level. Molecular Ecology, + 13(7):2101-2106, 2004. } \seealso{ -\code{\link{nancycats}} +\code{\link{bruvo.boot}}, \code{\link{bruvo.msn}} } diff --git a/man/bruvo.msn.Rd b/man/bruvo.msn.Rd index b6129684..aad121a3 100644 --- a/man/bruvo.msn.Rd +++ b/man/bruvo.msn.Rd @@ -1,85 +1,96 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{bruvo.msn} \alias{bruvo.msn} -\title{Create minimum spanning network of selected populations using Brvuo's +\alias{msn.bruvo} +\title{Create minimum spanning network of selected populations using Bruvo's distance.} \usage{ -bruvo.msn(pop, replen = c(1), palette = topo.colors, sublist = "All", - blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, - 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) +bruvo.msn(pop, replen = 1, add = TRUE, loss = TRUE, + palette = topo.colors, sublist = "All", blacklist = NULL, + vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8), gadj = 3, + gweight = 1, wscale = TRUE, showplot = TRUE, ...) } \arguments{ - \item{pop}{a \code{\link{genind}} object} - - \item{replen}{a \code{vector} of \code{integers} - indicating the length of the nucleotide repeats for each - microsatellite locus.} - - \item{palette}{a \code{function} defining the color - palette to be used to color the populations on the graph. - It defaults to \code{\link{topo.colors}}, but you can - easily create new schemes by using - \code{\link{colorRampPalette}} (see examples for - details)} - - \item{sublist}{a \code{vector} of population names or - indexes that the user wishes to keep. Default to "ALL".} - - \item{blacklist}{a \code{vector} of population names or - indexes that the user wishes to discard. Default to - \code{NULL}} - - \item{vertex.label}{a \code{vector} of characters to - label each vertex. There are two defaults: \code{"MLG"} - will label the nodes with the multilocus genotype from - the original data set and \code{"inds"} will label the - nodes with the representative individual names.} - - \item{gscale}{"grey scale". If this is \code{TRUE}, this - will scale the color of the edges proportional to Bruvo's - distance, with the lines becoming darker for more related - nodes. See \code{\link{greycurve}} for details.} - - \item{glim}{"grey limit". Two numbers between zero and - one. They determine the upper and lower limits for the - \code{\link{gray}} function. Default is 0 (black) and 0.8 - (20\% black). See \code{\link{greycurve}} for details.} - - \item{gadj}{"grey adjust". a positive \code{integer} - greater than zero that will serve as the exponent to the - edge weight to scale the grey value to represent that - weight. See \code{\link{greycurve}} for details.} - - \item{gweight}{"grey weight". an \code{integer}. If it's - 1, the grey scale will be weighted to emphasize the - differences between closely related nodes. If it is 2, - the grey scale will be weighted to emphasize the +\item{pop}{a \code{\link{genind}} object} + +\item{replen}{a \code{vector} of \code{integers} indicating the length of the + nucleotide repeats for each microsatellite locus.} + +\item{add}{if \code{TRUE}, genotypes with zero values will be treated under + the genome addition model presented in Bruvo et al. 2004.} + +\item{loss}{if \code{TRUE}, genotypes with zero values will be treated under + the genome loss model presented in Bruvo et al. 2004.} + +\item{palette}{a \code{function} defining the color palette to be used to + color the populations on the graph. It defaults to + \code{\link{topo.colors}}, but you can easily create new schemes by using + \code{\link{colorRampPalette}} (see examples for details)} + +\item{sublist}{a \code{vector} of population names or indexes that the user + wishes to keep. Default to "ALL".} + +\item{blacklist}{a \code{vector} of population names or indexes that the user + wishes to discard. Default to \code{NULL}} + +\item{vertex.label}{a \code{vector} of characters to label each vertex. There + are two defaults: \code{"MLG"} will label the nodes with the multilocus + genotype from the original data set and \code{"inds"} will label the nodes + with the representative individual names.} + +\item{gscale}{"grey scale". If this is \code{TRUE}, this will scale the color + of the edges proportional to Bruvo's distance, with the lines becoming + darker for more related nodes. See \code{\link{greycurve}} for details.} + +\item{glim}{"grey limit". Two numbers between zero and one. They determine + the upper and lower limits for the \code{\link{gray}} function. Default is + 0 (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details.} + +\item{gadj}{"grey adjust". a positive \code{integer} greater than zero that + will serve as the exponent to the edge weight to scale the grey value to + represent that weight. See \code{\link{greycurve}} for details.} + +\item{gweight}{"grey weight". an \code{integer}. If it's 1, the grey scale + will be weighted to emphasize the differences between closely related + nodes. If it is 2, the grey scale will be weighted to emphasize the differences between more distantly related nodes. See \code{\link{greycurve}} for details.} - \item{wscale}{"width scale". If this is \code{TRUE}, the - edge widths will be scaled proportional to the inverse of - Bruvo's distance , with the lines becoming thicker for - more related nodes.} +\item{wscale}{"width scale". If this is \code{TRUE}, the edge widths will be + scaled proportional to Bruvo's distance, with the lines becoming thicker + for more related nodes.} + +\item{showplot}{logical. If \code{TRUE}, the graph will be plotted. If + \code{FALSE}, it will simply be returned.} - \item{...}{any other arguments that could go into - plot.igraph} +\item{...}{any other arguments that could go into plot.igraph} } \value{ -\item{graph}{a minimum spanning network with nodes -corresponding to MLGs within the data set. Colors of the -nodes represent population membership. Width and color of -the edges represent distance.} \item{populations}{a vector -of the population names corresponding to the vertex colors} -\item{colors}{a vector of the hexadecimal representations -of the colors used in the vertex colors} +\item{graph}{a minimum spanning network with nodes corresponding to + MLGs within the data set. Colors of the nodes represent population + membership. Width and color of the edges represent distance.} + \item{populations}{a vector of the population names corresponding to the + vertex colors} \item{colors}{a vector of the hexadecimal representations of + the colors used in the vertex colors} } \description{ -Create minimum spanning network of selected populations -using Brvuo's distance. +Create minimum spanning network of selected populations using Bruvo's +distance. +} +\details{ +The minimum spanning network generated by this function is generated + via igraph's \code{\link[igraph]{minimum.spanning.tree}}. The resultant + graph produced can be plotted using igraph functions, or the entire object + can be plotted using the function \code{\link{plot_poppr_msn}}, which will + give the user a scale bar and the option to layout your data. } \note{ -The edges of these graphs may cross each other if the graph -becomes too large. +\itemize{ \item \strong{Please see the documentation for + \code{\link{bruvo.dist}} for details on the algorithm}. \item The edges of + these graphs may cross each other if the graph becomes too large. \item The + nodes in the graph represent multilocus genotypes. The colors of the nodes + are representative of population membership. It is not uncommon to see + different populations containing the same multilocus genotype.} } \examples{ # Load the data set. @@ -95,27 +106,27 @@ palette=heat.colors, vertex.label.cex=0.7, vertex.label.dist=0.4) # View custom colors. Here, we use black and orange. bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, vertex.label.dist=0.4) # View with darker shades of grey (setting the upper limit to 1/2 black 1/2 white). bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, vertex.label.dist=0.4, glim=c(0, 0.5)) # View with no grey scaling. bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, vertex.label.dist=0.4, gscale=FALSE) # View with no line widths. bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, vertex.label.dist=0.4, wscale=FALSE) # View with no scaling at all. bruvo.msn(nancycats, replen=rep(2, 9), sublist=8:9, vertex.label="inds", -palette = colorRampPalette(c("orange", "black"), vertex.label.cex=0.7, +palette = colorRampPalette(c("orange", "black")), vertex.label.cex=0.7, vertex.label.dist=0.4, vscale=FALSE, gscale=FALSE) # View the whole population, but without labels. @@ -123,19 +134,17 @@ bruvo.msn(nancycats, replen=rep(2, 9), vertex.label=NA) } } \author{ -Javier F. Tabima, Zhian N. Kamvar +Zhian N. Kamvar, Javier F. Tabima } \references{ Ruzica Bruvo, Nicolaas K. Michiels, Thomas G. D'Souza, and -Hinrich Schulenburg. A simple method for the calculation of -microsatellite genotype distances irrespective of ploidy -level. Molecular Ecology, 13(7):2101-2106, 2004. + Hinrich Schulenburg. A simple method for the calculation of microsatellite + genotype distances irrespective of ploidy level. Molecular Ecology, + 13(7):2101-2106, 2004. } \seealso{ -\code{\link{nancycats}}, \code{\link{upgma}}, -\code{\link{nj}}, \code{\link{boot.phylo}}, -\code{\link{nodelabels}}, \code{\link{na.replace}}, -\code{\link{missingno}}, \code{\link{bruvo.boot}}, -\code{\link{greycurve}}. +\code{\link{bruvo.dist}}, \code{\link{nancycats}}, + \code{\link{plot_poppr_msn}}, \code{\link[igraph]{minimum.spanning.tree}} + \code{\link{bruvo.boot}}, \code{\link{greycurve}} } diff --git a/man/bruvomat-class.Rd b/man/bruvomat-class.Rd index 4ef891a1..7e8e43f9 100644 --- a/man/bruvomat-class.Rd +++ b/man/bruvomat-class.Rd @@ -1,10 +1,11 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{class} \name{bruvomat-class} \alias{bruvomat-class} \title{bruvomat object} \description{ -An internal object used for bruvo's distance. Not intended -for user interaction. +An internal object used for bruvo's distance. +Not intended for user interaction. } \section{Slots}{ @@ -19,6 +20,7 @@ be equal to (ploidy)*(number of loci)} \item{\code{ind.names}}{names of individuals in matrix rows.} }} \author{ -Zhian Kamvar +Zhian N. Kamvar } +\keyword{internal} diff --git a/man/bruvomat-methods.Rd b/man/bruvomat-methods.Rd index 14578e73..d2512f5f 100644 --- a/man/bruvomat-methods.Rd +++ b/man/bruvomat-methods.Rd @@ -1,3 +1,4 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{methods} \name{initialize,bruvomat-method} \alias{[,bruvomat,ANY,ANY,ANY-method} @@ -12,26 +13,28 @@ \S4method{[}{bruvomat,ANY,ANY,ANY}(x, i, j, ..., drop = FALSE) } \arguments{ - \item{.Object}{a character, "bruvomat"} +\item{.Object}{a character, "bruvomat"} - \item{gen}{\code{"\linkS4class{genind}"} object} +\item{gen}{\code{"\linkS4class{genind}"} object} - \item{replen}{a vector of numbers indicating the repeat - length for each microsatellite locus.} +\item{replen}{a vector of numbers indicating the repeat length for each +microsatellite locus.} - \item{x}{a \code{"\linkS4class{bruvomat}"} object} +\item{x}{a \code{"\linkS4class{bruvomat}"} object} - \item{i}{vector of numerics indicating number of - individuals desired} +\item{i}{vector of numerics indicating number of individuals desired} - \item{j}{a vector of numerics corresponding to the loci - desired.} +\item{j}{a vector of numerics corresponding to the loci desired.} - \item{...}{unused.} +\item{...}{unused.} - \item{drop}{set to \code{FALSE}} +\item{drop}{set to \code{FALSE}} } \description{ This is not designed for user interaction. } +\author{ +Zhian N. Kamvar +} +\keyword{internal} diff --git a/man/clonecorrect.Rd b/man/clonecorrect.Rd index c52b9c40..5480ba69 100755 --- a/man/clonecorrect.Rd +++ b/man/clonecorrect.Rd @@ -1,72 +1,86 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{clonecorrect} \alias{clonecorrect} -\title{Remove potential bias caused by cloned genotypes in genind object.} +\title{Remove potential bias caused by cloned genotypes in genind or genclone +object.} \usage{ -clonecorrect(pop, hier = c(1), dfname = "population_hierarchy", +clonecorrect(pop, hier = 1, dfname = "population_hierarchy", combine = FALSE, keep = 1) } \arguments{ - \item{pop}{a \code{\link{genind}} object} +\item{pop}{a \code{\link{genind}} object} - \item{hier}{a \code{numeric or character list}. This is - the list of vectors within a data frame (specified in - \code{dfname}) in the 'other' slot of the - \code{\link{genind}} object. The list should indicate the - population hierarchy to be used for clone correction.} +\item{hier}{a hierarchical formula or numeric vector. In a + \code{\linkS4class{genclone}} object, this will define the columns of the + data frame in the hierarchy slot to use. In a \code{\linkS4class{genind}} + object, the data frame must exist within the \code{\link[adegenet]{other}} + slot and the user must define the name of the data frame with the parameter + \code{dfname}} - \item{dfname}{a \code{character string}. This is the name - of the data frame or list containing the vectors of the +\item{dfname}{a \code{character string}. \strong{Only for genind objects} + This is the name of the data frame or list containing the vectors of the population hierarchy within the \code{other} slot of the \code{\link{genind}} object.} - \item{combine}{\code{logical}. When set to TRUE, the - heirarchy will be combined to create a new population for - the genind object.} +\item{combine}{\code{logical}. When set to TRUE, the heirarchy will be + combined to create a new population for the clone corrected genind or + genclone object.} - \item{keep}{\code{integer}. When \code{combine} is set to - \code{FALSE}, you can use this flag to choose the levels - of your population hierarchy. For example: if your clone - correction hierarchy is set to "Pop", "Subpop", and - "Year", and you want to analyze your populations with - respect to year, you can set \code{keep = c(1,3)}.} +\item{keep}{\code{integer}. When \code{combine} is set to \code{FALSE}, you + can use this flag to choose the levels of your population hierarchy. For + example: if your clone correction hierarchy is set to "Pop", "Subpop", and + "Year", and you want to analyze your populations with respect to year, you + can set \code{keep = c(1,3)}.} } \value{ -a clone corrected \code{\link{genind}} object. +a clone corrected \code{\linkS4class{genclone}} or + \code{\linkS4class{genind}} object. } \description{ -This function removes any duplicated multi locus genotypes -from any specified population hierarchy. +This function removes any duplicated multilocus genotypes from any specified +population hierarchy. +} +\details{ +This function will clone correct based on the hierarchical level + provided. To clone correct indiscriminantly of hierarchical structure, set + \code{hier = NA}. It is recommended to use this function with + \code{\linkS4class{genclone}} objects as they have a specific slot for + hierarchies. If you wish to use this function on a + \code{\linkS4class{genind}} object, see below. } \note{ -This function will clone correct to the population level -indicated in the \code{pop} slot of the -\code{\link{genind}} object if there is no data frame -specified in dfname. If there is no population structure -and there is no specified data frame, it will clone correct -the entire \code{\link{genind}} object. +\subsection{For genind objects}{ \code{\linkS4class{genind}} objects do + not have a specific slot for hierarchies and thus require the user to + specfy the hierarchical levels in a data frame within the + \code{\link[adegenet]{other}} slot. If there is no data frame indicating + population hierarchy, then clone correction will occur on the population + factor that is set in the \code{\link[adegenet]{pop}} slot.} } \examples{ # LOAD A. euteiches data set data(Aeut) +# Redefine it as a genclone object +Aeut <- as.genclone(Aeut, hier = other(Aeut)$population_hierarchy[-1]) + # Check the number of multilocus genotypes mlg(Aeut) Aeut$pop.names # Clone correct at the population level. -Aeut.pop <- clonecorrect(Aeut, hier="Pop") +Aeut.pop <- clonecorrect(Aeut, hier= ~Pop) mlg(Aeut.pop) Aeut.pop$pop.names \dontrun{ # Clone correct at the subpopulation level with respect to population and # combine. -Aeut.subpop <- clonecorrect(Aeut, hier=c("Pop", "Subpop"), combine=TRUE) +Aeut.subpop <- clonecorrect(Aeut, hier=~Pop/Subpop, combine=TRUE) mlg(Aeut.subpop) Aeut.subpop$pop.names # Do the same, but set to the population level. -Aeut.subpop2 <- clonecorrect(Aeut, hier=c("Pop", "Subpop"), keep=1) +Aeut.subpop2 <- clonecorrect(Aeut, hier=~Pop/Subpop, keep=1) mlg(Aeut.subpop2) Aeut.subpop2$pop.names @@ -74,7 +88,7 @@ Aeut.subpop2$pop.names data(H3N2) # Extract only the individuals located in China -country <- clonecorrect(H3N2, hier=c("country"), dfname="x") +country <- clonecorrect(H3N2, hier= ~country, dfname="x") # How many isolates did we have from China before clone correction? length(which(other(H3N2)$x$country=="China")) # 155 @@ -86,7 +100,7 @@ length(which(other(country)$x$country=="China")) # 79 # slower computers) # setting the hierarchy to be Country > Year > Month -c.y.m <- clonecorrect(H3N2, hier=c("year","month","country"), dfname="x") +c.y.m <- clonecorrect(H3N2, hier= ~year/month/country, dfname="x") # How many isolates in the original data set? length(other(H3N2)$x$country) # 1903 diff --git a/man/coercion-methods.Rd b/man/coercion-methods.Rd new file mode 100644 index 00000000..a68a2521 --- /dev/null +++ b/man/coercion-methods.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} +\name{as.genclone} +\alias{as.genclone} +\alias{as.genclone,genind-method} +\title{Create a genclone object from a genind object.} +\usage{ +as.genclone(x, hierarchy = NULL) +} +\arguments{ +\item{x}{a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} +object} + +\item{hierarchy}{a data frame representing the population hierarchy.} +} +\description{ +Wrapper for genclone initializer. +} +\note{ +The hierarchy must have the same number of rows as the number of + observations in the genind object. If no hierarchy is defined, the function + will search for a data frame in the \code{\link{other}} slot called + "population_hierarchy" and set that as the hieararchy. If none is defined, + the population will be set as the hierarchy under the label "Pop". Use the + function \code{\link{splithierarchy}} to split up any population + hierarchies that might be combined in the population factor. +} +\examples{ +data(Aeut) +Aeut +Aeut.gc <- as.genclone(Aeut) +Aeut.gc +Aeut.gc <- as.genclone(Aeut, other(Aeut)$population_hierarchy[-1]) +Aeut.gc +} +\author{ +Zhian N. Kamvar +} +\seealso{ +\code{\link{splithierarchy}}, \code{\linkS4class{genclone}}, + \code{\link{read.genalex}} +} + diff --git a/man/diss.dist.Rd b/man/diss.dist.Rd index 8738ae09..6295efa5 100644 --- a/man/diss.dist.Rd +++ b/man/diss.dist.Rd @@ -1,23 +1,38 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{diss.dist} \alias{diss.dist} \title{Calculate a distance matrix based on relative dissimilarity} \usage{ -diss.dist(pop) +diss.dist(x, percent = FALSE, mat = FALSE) } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{x}{a \code{\link{genind}} object.} + +\item{percent}{\code{logical}. Should the distance be represented as a + percent? If set to \code{FALSE} (default), the distance will be reflected + as the number of alleles differing between to individuals. When set to + \code{TRUE}, These will be divided by the ploidy multiplied by the number + of loci.} + +\item{mat}{\code{logical}. Return a matrix object. Default set to + \code{FALSE}, returning a dist object. \code{TRUE} returns a matrix object.} } \value{ -A distance object. +Pairwise distances between individuals present in the genind object. } \description{ -diss.dist uses the same discreet dissimilarity matrix -utilized by the index of association (see \code{\link{ia}} -for details). It returns a distance reflecting a ratio of -the number of observed differences by the number of -possible differences. Eg. two individuals who share half of -the same alleles will have a distance of 0.5. This function -can analyze distances for any marker system. +diss.dist uses the same discreet dissimilarity matrix utilized by the index +of association (see \code{\link{ia}} for details). By default, it returns a +distance reflecting the number of allelic differences between two +individuals. When \code{percent = TRUE}, it returns a ratio of the number of +observed differences by the number of possible differences. Eg. two +individuals who share half of the same alleles will have a distance of 0.5. +This function can analyze distances for any marker system. +} +\note{ +When \code{percent = TRUE}, this is exactly the same as + \code{\link{provesti.dist}}, except that it performs better for large + numbers of individuals (n > 125) at the cost of avaliable memory. } \examples{ # A simple example. Let's analyze the mean distance among populations of A. diff --git a/man/genclone-class.Rd b/man/genclone-class.Rd new file mode 100644 index 00000000..31252681 --- /dev/null +++ b/man/genclone-class.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{class} +\name{genclone-class} +\alias{genclone} +\alias{genclone-class} +\title{Genclone class} +\description{ +Genclone is an S4 class that extends the \code{\linkS4class{genind}} +from the \pkg{\link{adegenet}} package. It will have all of the same +attributes as the \code{\linkS4class{genind}}, but it will contain two +extra slots that will help retain information about population hierarchies +and multilocus genotypes. +} +\details{ +The genclone class will allow for more optimized methods of clone +correcting and analyzing data over multiple levels of population hierarchy. + +Previously, for hierarchical analysis to work in a \code{\link{genind}} +object, the user had to place a data frame in the \code{\link{other}} slot of +the object. The suggested name of the data frame was +\code{population_hierarchy}, and this was used to be able to store the +hierarchical information inside the object so that the user did not have to +keep track of that information. This method worked, but it became apparent +that it was a bit confusing to the user as the method for changing the +population of an object became: + +\code{pop(object) <- other(object)$population_hierarchy$population_name} + +That is a lot to keep track of. The new \strong{\code{hierarchy}} slot will +allow the user to change the population factor with one function and a formula: + +\code{setpop(object) <- ~Population/Subpopulation} + +making this become slightly more intuitive and tractable. + +Previously for \linkS4class{genind} objects, multilocus genotypes were not +retained after a data set was subset by population. The new +\strong{\code{mlg}} slot allows us to assign the multilocus genotypes and +retain that information no matter how we subset the data set. +} +\section{Slots}{ + +\describe{ +\item{\code{mlg}}{a vector representing multilocus genotypes for the data set.} + +\item{\code{hierarchy}}{a data frame containing hierarchical levels.} +}} +\section{Extends}{ + +Class \code{"\linkS4class{genind}"}, directly. +} +\author{ +Zhian N. Kamvar +} +\seealso{ +\code{\link{as.genclone}} \code{\link{sethierarchy}} \code{\link{setpop}} +\code{\linkS4class{genind}} +} + diff --git a/man/genclone-method.Rd b/man/genclone-method.Rd new file mode 100644 index 00000000..fb03bcdd --- /dev/null +++ b/man/genclone-method.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} +\name{[,genclone,ANY,ANY,ANY-method} +\alias{[,genclone,ANY,ANY,ANY-method} +\alias{initialize,genclone-method} +\alias{print,genclone-method} +\alias{show,genclone-method} +\title{Methods used for the genclone object} +\usage{ +\S4method{[}{genclone,ANY,ANY,ANY}(x, i, j, ..., loc = NULL, + treatOther = TRUE, quiet = TRUE, drop = FALSE) + +\S4method{initialize}{genclone}(.Object, gen, hierarchy, mlg) + +\S4method{show}{genclone}(object) + +\S4method{print}{genclone}(x, ...) +} +\arguments{ +\item{x}{a genclone object} + +\item{i}{vector of numerics indicating number of individuals desired} + +\item{j}{a vector of numerics corresponding to the loci desired.} + +\item{...}{passed on to the \code{\linkS4class{genind}} object.} + +\item{drop}{set to \code{FALSE}} + +\item{loc}{passed on to \code{\linkS4class{genind}} object.} + +\item{treatOther}{passed on to \code{\linkS4class{genind}} object.} + +\item{quiet}{passed on to \code{\linkS4class{genind}} object.} + +\item{.Object}{a character, "genclone"} + +\item{gen}{\code{"\linkS4class{genind}"} object} + +\item{hierarchy}{a data frame where each row i represents the different +population assignments of individual i in the data set. If this is empty, the +hierarchy will be created from the population factor.} + +\item{mlg}{a vector where each element assigns the multilocus genotype of +that individual in the data set.} + +\item{object}{a genclone object} + +\item{x}{a genclone object} + +\item{fullnames}{\code{logical}. If \code{TRUE}, then the full names of the +populations will be printed. If \code{FALSE}, then only the first and last +three population names are displayed.} +} +\description{ +Default methods for subsetting genclone objects. +} +\author{ +Zhian N. Kamvar +} +\keyword{internal} + diff --git a/man/genetic_distance.Rd b/man/genetic_distance.Rd new file mode 100644 index 00000000..1ffe849b --- /dev/null +++ b/man/genetic_distance.Rd @@ -0,0 +1,122 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{nei.dist} +\alias{edwards.dist} +\alias{nei.dist} +\alias{provesti.dist} +\alias{reynolds.dist} +\alias{rogers.dist} +\title{Calculate Genetic Distance for a genind or genclone object.} +\usage{ +nei.dist(x, warning = TRUE) + +edwards.dist(x) + +rogers.dist(x) + +reynolds.dist(x) + +provesti.dist(x) +} +\arguments{ +\item{x}{a \linkS4class{genind}, \linkS4class{genclone}, or matrix object.} + +\item{warning}{If \code{TRUE}, a warning will be printed if any infinite + values are detected and replaced. If \code{FALSE}, these values will be + replaced without warning. See Details below.} +} +\value{ +an object of class dist with the same number of observations as the + number of individuals in your data. +} +\description{ +These functions are modified from the function \link[adegenet]{dist.genpop} to +be applicable for distances between individuals. +} +\details{ +It is important to be careful with the interpretation of these + distances as they were originally intended for calculation of + between-population distance. As Nei's distance is the negative log of 0:1, + this means that it is very possible to obtain distances of infinity. When + this happens, infinite values are corrected to be 10 * max(D) where D is + the distance matrix without infinite values. +} +\note{ +Provesti's distance is identical to \code{\link{diss.dist}}, except + that \code{\link{diss.dist}} is optimized for a larger number of + individuals (n > 125) at the cost of required memory. + + These distances were adapted from the \pkg{adegenet} function + \code{\link{dist.genpop}} to work with \code{\linkS4class{genind}} objects. +} +\examples{ +data(nancycats) +nan9 <- popsub(nancycats, 9) +neinan <- nei.dist(nan9) +ednan <- edwards.dist(nan9) +rodnan <- rogers.dist(nan9) +reynan <- reynolds.dist(nan9) +pronan <- provesti.dist(nan9) +} +\author{ +Zhian N. Kamvar (poppr adaptation) +Thibaut Jombart (adegenet adaptation) +Daniel Chessel (ade4) +} +\references{ +Nei, M. (1972) Genetic distances between populations. American Naturalist, +106, 283–292. + +Nei M. (1978) Estimation of average heterozygosity and genetic +distance from a small number of individuals. Genetics, 23, 341–369. + +Avise, J. C. (1994) Molecular markers, natural history and evolution. Chapman & Hall, +London. + +Edwards, A.W.F. (1971) Distance between populations on the basis of gene +frequencies. Biometrics, 27, 873–881. + +Cavalli-Sforza L.L. and Edwards A.W.F. +(1967) Phylogenetic analysis: models and estimation procedures. Evolution, +32, 550–570. + +Hartl, D.L. and Clark, A.G. (1989) Principles of population +genetics. Sinauer Associates, Sunderland, Massachussetts (p. 303). + +Reynolds, J. B., B. S. Weir, and C. C. Cockerham. (1983) Estimation of the +coancestry coefficient: basis for a short-term genetic distance. Genetics, +105, 767–779. + +Rogers, J.S. (1972) Measures of genetic similarity and genetic distances. +Studies in Genetics, Univ. Texas Publ., 7213, 145–153. + +Avise, J. C. (1994) +Molecular markers, natural history and evolution. Chapman & Hall, London. + +Prevosti A. (1974) La distancia genetica entre poblaciones. Miscellanea +Alcobe, 68, 109–118. + +Prevosti A., Oca\~na J. and Alonso G. (1975) Distances +between populations of Drosophila subobscura, based on chromosome +arrangements frequencies. Theoretical and Applied Genetics, 45, 231–241. + +For more information on dissimilarity indexes: + +Gower J. and Legendre P. (1986) +Metric and Euclidean properties of dissimilarity coefficients. Journal of +Classification, 3, 5–48 + +Legendre P. and Legendre L. (1998) Numerical Ecology, +Elsevier Science B.V. 20, pp274–288. +} +\seealso{ +\code{\link{aboot}} \code{\link{diss.dist}} \code{\link{poppr.amova}} +} +\keyword{angular} +\keyword{coancestry} +\keyword{edwards} +\keyword{nei} +\keyword{provesti} +\keyword{reynolds} +\keyword{rodgers} +\keyword{rogers} + diff --git a/man/genind2genalex.Rd b/man/genind2genalex.Rd index e74ab532..738af6a8 100755 --- a/man/genind2genalex.Rd +++ b/man/genind2genalex.Rd @@ -1,41 +1,42 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{genind2genalex} \alias{genind2genalex} \title{Exporting data from genind objects to genalex formatted *.csv files.} \usage{ genind2genalex(pop, filename = "genalex.csv", quiet = FALSE, geo = FALSE, - geodf = "xy") + geodf = "xy", sep = ",") } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{pop}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object.} - \item{filename}{a string indicating the name and/or path - of the file you wish to create.} +\item{filename}{a string indicating the name and/or path of the file you wish + to create.} - \item{quiet}{\code{logical} If \code{FALSE} a message - will be printed to the screen.} +\item{quiet}{\code{logical} If \code{FALSE} a message will be printed to the + screen.} - \item{geo}{\code{logical} Default is \code{FALSE}. If it - is set to \code{TRUE}, the resulting file will have two - columns for geographic data.} +\item{geo}{\code{logical} Default is \code{FALSE}. If it is set to + \code{TRUE}, the resulting file will have two columns for geographic data.} - \item{geodf}{\code{character} Since the \code{other} slot - in the adegenet object can contain many different items, - you must specify the name of the data frame in the - \code{other} slot containing your geographic coordinates. +\item{geodf}{\code{character} Since the \code{other} slot in the adegenet + object can contain many different items, you must specify the name of the + data frame in the \code{other} slot containing your geographic coordinates. It defaults to "xy".} + +\item{sep}{a character specifying what character to use to separate columns. + Defaults to ",".} } \description{ -genind2genalex will export a genind object to a *.csv file +genind2genalex will export a genclone or genind object to a *.csv file formatted for use in genalex. } \note{ -If you enter a file name that exists, that file will be -overwritten. If your data set lacks a population structure, -it will be coded in the new file as a single population -lableled "Pop". Likewise, if you don't have any labels for -your individuals, they will be labeled as "ind1" through -"ind\emph{N}", with \emph{N} being the size of your -population. +If you enter a file name that exists, that file will be overwritten. If + your data set lacks a population structure, it will be coded in the new + file as a single population lableled "Pop". Likewise, if you don't have any + labels for your individuals, they will be labeled as "ind1" through + "ind\emph{N}", with \emph{N} being the size of your population. } \examples{ \dontrun{ @@ -47,6 +48,7 @@ genind2genalex(nancycats, "~/Documents/nancycats.csv", geo=TRUE) Zhian N. Kamvar } \seealso{ -\code{\link{clonecorrect}}, \code{\link{genind}} +\code{\link{clonecorrect}}, \code{\linkS4class{genclone}} or + \code{\linkS4class{genind}} } diff --git a/man/genotype_curve.Rd b/man/genotype_curve.Rd new file mode 100644 index 00000000..0b6850db --- /dev/null +++ b/man/genotype_curve.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{genotype_curve} +\alias{genotype_curve} +\title{Produce a genotype accumulation curve} +\usage{ +genotype_curve(gen, sample = 100, quiet = FALSE, thresh = 0.9) +} +\arguments{ +\item{gen}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object.} + +\item{sample}{an \code{integer} defining the number of times loci will be + resampled.} + +\item{quiet}{if \code{FALSE}, a progress bar will be displayed. If + \code{TRUE}, nothing is printed to screen as the function runs.} + +\item{thresh}{a number from 0 to 1. This will draw a line at this fraction of + multilocus genotypes.} +} +\value{ +a matrix of integers showing the results of each randomization. + Columns represent the number of loci sampled and rows represent an + independent sample. +} +\description{ +GA curves are useful for determinining the minimum number of loci necessary +to discriminate between individuals in a population. This function will +randomly sample loci without replacement and count the number of multilocus +genotypes observed. +} +\examples{ +data(nancycats) +nan_geno <- genotype_curve(nancycats) +\dontrun{ +# With AFLP data, it is often necessary to include more markers for resolution +data(Aeut) +Ageno <- genotype_curve(Aeut) + +# Many microsatellite data sets have hypervariable markers +data(microbov) +mgeno <- geotype_curve(microbov) + +# This data set has been pre filtered +data(monpop) +mongeno <- genotype_curve(monpop)} +} +\author{ +Zhian N. Kamvar +} + diff --git a/man/getfile.Rd b/man/getfile.Rd index c90288ce..d2388378 100755 --- a/man/getfile.Rd +++ b/man/getfile.Rd @@ -1,3 +1,4 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{getfile} \alias{getfile} \title{Get a file name and path and store them in a list.} @@ -5,31 +6,28 @@ getfile(multi = FALSE, pattern = NULL, combine = TRUE) } \arguments{ - \item{multi}{this is an indicator to allow the user to - store the names of multiple files found in the directory. - This is useful in conjunction with - \code{\link{poppr.all}}.} +\item{multi}{this is an indicator to allow the user to store the names of +multiple files found in the directory. This is useful in conjunction with +\code{\link{poppr.all}}.} - \item{pattern}{a \code{\link{regex}} pattern for use - while \code{multFile==TRUE}.} +\item{pattern}{a \code{\link{regex}} pattern for use while +\code{multi == TRUE}. This will grab all files matching this pattern.} - \item{combine}{\code{logical}. When this is set to - \code{TRUE}, the \code{\$files} vector will have the path - appended to them. When it is set to \code{FALSE}, it will - have the basename.} +\item{combine}{\code{logical}. When this is set to \code{TRUE} (default), the +\code{$files} vector will have the path appended to them. When it is set to +\code{FALSE}, it will have the basename.} } \value{ \item{path}{a character string of the absolute path to the -chosen file or files} \item{files}{a character vector -containing the chosen file name or names.} +chosen file or files} +\item{files}{a character vector containing the chosen file +name or names.} } \description{ -getfile is a convenience function that serves as a wrapper -for the functions \code{\link{file.choose}, -\link{file.path},} and \code{\link{list.files}}. If the -user is working in a GUI environment, a window will pop up, -allowing the user to choose a specified file regardless of -path. +getfile is a convenience function that serves as a wrapper for the functions +\code{\link{file.choose}, \link{file.path},} and \code{\link{list.files}}. +If the user is working in a GUI environment, a window will pop up, allowing +the user to choose a specified file regardless of path. } \examples{ \dontrun{ @@ -37,9 +35,7 @@ path. x <- getfile() poppr(x$files) - - -y <- getfile(multFile=TRUE, pattern="^.+?dat$") +y <- getfile(multi=TRUE, pattern="^.+?dat$") #useful for reading in multiple FSTAT formatted files. yfiles <- poppr.all(y$files) diff --git a/man/greycurve.Rd b/man/greycurve.Rd index 6c10a50e..bba01f76 100644 --- a/man/greycurve.Rd +++ b/man/greycurve.Rd @@ -1,35 +1,39 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{greycurve} \alias{greycurve} \title{Display a greyscale gradient adjusted to specific parameters} \usage{ -greycurve(glim = c(0, 0.8), gadj = 3, gweight = 1) +greycurve(data = seq(0, 1, length = 1000), glim = c(0, 0.8), gadj = 3, + gweight = 1, scalebar = FALSE) } \arguments{ - \item{glim}{"grey limit". Two numbers between zero and - one. They determine the upper and lower limits for the - \code{\link{gray}} function. Default is 0 (black) and 0.8 - (20\% black).} - - \item{gadj}{"grey adjust". a positive \code{integer} - greater than zero that will serve as the exponent to the - edge weight to scale the grey value to represent that - weight.} - - \item{gweight}{"grey weight". an \code{integer}. If it's - 1, the grey scale will be weighted to emphasize the - differences between closely related nodes. If it is 2, - the grey scale will be weighted to emphasize the - differences between more distantly related nodes.} +\item{data}{a sequence of numbers to be converted to greyscale.} + +\item{glim}{"grey limit". Two numbers between zero and one. They determine +the upper and lower limits for the \code{\link{gray}} function. Default is 0 +(black) and 0.8 (20\% black).} + +\item{gadj}{"grey adjust". a positive \code{integer} greater than zero that +will serve as the exponent to the edge weight to scale the grey value to +represent that weight.} + +\item{gweight}{"grey weight". an \code{integer}. If it's 1, the grey scale +will be weighted to emphasize the differences between closely related nodes. +If it is 2, the grey scale will be weighted to emphasize the differences +between more distantly related nodes.} + +\item{scalebar}{When this is set to \code{TRUE}, two scalebars will be +plotted. The purpose of this is for adding a scale bar to minimum spanning +networks produced in earlier versions of poppr.} } \value{ -A plot displaying a grey gradient from 0.001 to 1 with -minimum and maximum values displayed as yellow lines, and -an equation for the correction displayed in red. +A plot displaying a grey gradient from 0.001 to 1 with minimum and +maximum values displayed as yellow lines, and an equation for the correction +displayed in red. } \description{ -This function has one purpose. It is for deciding the -appropriate scaling for a grey palette to be used for edge -weights of a minimum spanning network. +This function has one purpose. It is for deciding the appropriate scaling for +a grey palette to be used for edge weights of a minimum spanning network. } \examples{ # Normal grey curve with an adjustment of 3, an upper limit of 0.8, and diff --git a/man/hierarchy-methods.Rd b/man/hierarchy-methods.Rd new file mode 100644 index 00000000..4aca03dc --- /dev/null +++ b/man/hierarchy-methods.Rd @@ -0,0 +1,165 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} +\name{gethierarchy} +\alias{addhierarchy} +\alias{addhierarchy,genclone-method} +\alias{addhierarchy<-} +\alias{addhierarchy<-,genclone-method} +\alias{gethierarchy} +\alias{gethierarchy,genclone-method} +\alias{namehierarchy} +\alias{namehierarchy,genclone-method} +\alias{namehierarchy<-} +\alias{namehierarchy<-,genclone-method} +\alias{sethierarchy} +\alias{sethierarchy,genclone-method} +\alias{sethierarchy<-} +\alias{sethierarchy<-,genclone-method} +\alias{splithierarchy} +\alias{splithierarchy,genclone-method} +\alias{splithierarchy<-} +\alias{splithierarchy<-,genclone-method} +\title{Access and manipulate the population hierarchy for genclone objects.} +\usage{ +gethierarchy(x, formula = NULL, combine = TRUE) + +sethierarchy(x, value) + +sethierarchy(x) <- value + +namehierarchy(x, value) + +namehierarchy(x) <- value + +splithierarchy(x, value, sep = "_") + +splithierarchy(x) <- value + +addhierarchy(x, value, name = "NEW") + +addhierarchy(x) <- value +} +\arguments{ +\item{x}{a genclone object} + +\item{formula}{a nested formula indicating the order of the population +hierarchy.} + +\item{combine}{if \code{TRUE}, the levels will be combined according to the +formula argument. If it is \code{FALSE}, the levels will not be combined.} + +\item{value}{a data frame OR vector OR formula (see details).} + +\item{sep}{a \code{character} indicating the character used to separate +hierarchical levels. This defaults to "_".} + +\item{name}{an optional name argument for use with addhierarchy if supplying +a vector. Defaults to "NEW".} +} +\description{ +The following methods allow the user to quickly change the hierarchy or +population of a genclone object. +} +\details{ +\subsection{Function Specifics}{ \itemize{ \item +\strong{gethierarchy()} - This will retrieve the data from the +\emph{hierarchy} slot in the \linkS4class{genclone} object. You have the +option to choose specific heirarchical levels using a formula (see below) and +you can choose to combine the hierarchical levels (default) \item +\strong{sethierarchy()} - Set or reset the hierarchical levels in your +\linkS4class{genclone} object. \item \strong{namehierarchy()} - Rename the +hierarchical levels. \item \strong{splithierarchy()} - This is conceptually +similar to the default method of \code{\link{splitcombine}}. It is often +difficult to import files with several levels of hierarchy as most data +formats do not allow unlimited population levels. This is circumvented by +collapsing all hierarchical levels into a single population factor with a +common separator for each observation. This function will then split those +hierarchies for you, but it works best on a hierarchy that only has a single +column in it. See the rootrot example below. \item \strong{addhierarchy()} - +Add levels to your population hierarchy. If you have extra hierarchical +levels you want to add to your population hierarchy, you can use this method +to do so. You can input a data frame or a vector, but if you put in a vector, +you have the option to name it (if you are using the functional version and +not the assignment version). }} + +\subsection{Argument Specifics}{ + +These functions allow the user to seamlessly assign the hierarchical levels +of their \code{\linkS4class{genclone}} object. Note that there are two ways +of performing all methods (except for \code{gethierarchy()}). They +essentially do the same thing except that the assignment method (the one with +the "\code{<-}") will modify the object in place whereas the non-assignment +method will not modify the original object. Due to convention, everything +right of the assignment is termed \code{value}. To avoid confusion, here is a +guide to the inputs: \itemize{ \item \strong{sethierarchy()} This will be a +\code{\link{data.frame}} that defines the hierarchy for each individual in +the rows. \item \strong{namehierarchy()} This will be either a +\code{\link{vector}} or a \code{\link{formula}} that will define the names. +\item \strong{splithierarchy()} This will be a \code{\link{formula}} argument +with the same number of levels as the hierarchy you wish to split. \item +\strong{addhierarchy()} This will be a \code{\link{vector}} or +\code{\link{data.frame}} with the same length as the number of individuals in +your data. }} + +\subsection{Details on Formulas}{ + +The preferred use of these functions is with a \code{\link{formula}} object. +Specifically, a hierarchical formula argument is used to assign the levels of +the hierarchy. An example of a hierarchical formula would be:\cr +\code{~Country/City/Neighborhood}\cr or \cr \code{~Country + Country:City + +Country:City:Neighborhood}\cr of course, the first method is slightly easier +to read. It is important to use hiearchical formulas when specifying +hierarchies as other types of formulas (eg. +\code{~Country*City*Neighborhood}) might give spurious results.} +} +\examples{ +# let's look at the microbov data set: +data(microbov) +microgc <- as.genclone(microbov) +microgc + +# We see that we have three vectors of different names here. +?microbov +# These are Country, Breed, and Species +names(other(microgc)) + +# Let's set the hierarchy +sethierarchy(microgc) <- data.frame(other(microgc)) +microgc + +# And change the names so we know what they are +namehierarchy(microgc) <- ~Country/Breed/Species + +# let's see what the hierarchy looks like by Species and Breed: +head(gethierarchy(microgc, ~Breed/Species)) + +\dontrun{ +# Load our data set and convert it to a genclone object. +Aeut.gc <- read.genalex(system.file("files/rootrot.csv", package = "poppr")) + +# we can see the hierarchy is set to Population_Subpopulation. +head(gethierarchy(Aeut.gc)) + +# We can use splithierarchy() to split them. +splithierarchy(Aeut.gc) <- ~Pop/Subpop +Aeut.gc +head(gethierarchy(Aeut.gc)) + +# We can also use gethierarchy to combine the hierarchy. +head(gethierarchy(Aeut.gc, ~Pop/Subpop)) + +# We can also give it a more descriptive name. +namehierarchy(Aeut.gc) <- ~Population/Subpopulation +Aeut.gc +Aeut.gc <- namehierarchy(Aeut.gc, ~Pop/Subpop) +Aeut.gc +} +} +\author{ +Zhian N. Kamvar +} +\seealso{ +\code{\link{setpop}} \code{\link{genclone}} + \code{\link{as.genclone}} +} + diff --git a/man/ia.Rd b/man/ia.Rd index 2867b20b..0f94770c 100755 --- a/man/ia.Rd +++ b/man/ia.Rd @@ -1,65 +1,138 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{ia} \alias{ia} \title{Index of Association} \usage{ ia(pop, sample = 0, method = 1, quiet = FALSE, missing = "ignore", - hist = TRUE) + hist = TRUE, valuereturn = FALSE) } \arguments{ - \item{pop}{a \code{\link{genind}} object OR any fstat, - structure, genetix, genpop, or genalex formatted files.} +\item{pop}{a \code{\link{genind}} object OR any fstat, structure, genetix, + genpop, or genalex formatted files.} - \item{sample}{an integer indicating the number of - permutations desired (eg 999).} +\item{sample}{an integer indicating the number of permutations desired (eg + 999).} - \item{method}{an integer from 1 to 4 indicating the - sampling method desired. see \code{\link{shufflepop}} for - details.} +\item{method}{an integer from 1 to 4 indicating the sampling method desired. + see \code{\link{shufflepop}} for details.} - \item{quiet}{Should the function print anything to the - screen while it is performing calculations? +\item{quiet}{Should the function print anything to the screen while it is + performing calculations? \code{TRUE} prints nothing. - \code{FALSE} (defualt) will print the population name and - progress bar.} + \code{FALSE} (defualt) will print the population name and progress bar.} - \item{missing}{a character string. see - \code{\link{missingno}} for details.} +\item{missing}{a character string. see \code{\link{missingno}} for details.} - \item{hist}{\code{logical} if \code{TRUE}, a histogram - will be printed for each population if there is - sampling.} +\item{hist}{\code{logical} if \code{TRUE}, a histogram will be printed for + each population if there is sampling.} + +\item{valuereturn}{\code{logical} if \code{TRUE}, the index values from the + reshuffled data is returned. If \code{FALSE} (default), the index is + returned with associated p-values in a 4 element numeric vector.} } \value{ -\emph{If no sampling has occured:} - -A named number vector of length 2 giving the Index of -Association, "Ia"; and the Standardized Index of -Association, "rbarD" - -\emph{If there is sampling:} - -A a named number vector of length 4 with the following -values: \item{Ia}{numeric. The index of association.} -\item{p.Ia}{A number indicating the p-value resulting from -a one-sided permutation test based on the number of samples -indicated in the original call.} \item{rbarD}{numeric. The -standardized index of association.} \item{p.rD}{A factor -indicating the p-value resutling from a one-sided -permutation test based on the number of samples indicated -in the original call.} +\subsection{If no sampling has occured:}{ A named number vector of + length 2 giving the Index of Association, "Ia"; and the Standardized Index + of Association, "rbarD" } \subsection{If there is sampling:}{ A a named + number vector of length 4 with the following values: \itemize{\item{Ia - + }{numeric. The index of association.} \item{p.Ia - }{A number indicating + the p-value resulting from a one-sided permutation test based on the number + of samples indicated in the original call.} \item{rbarD - }{numeric. The + standardized index of association.} \item{p.rD - }{A factor indicating the + p-value resulting from a one-sided permutation test based on the number of + samples indicated in the original call.}} } \subsection{If there is + sampling and valureturn = TRUE}{ A list with the following + elements: \itemize{ \item{index}{The above vector} \item{samples}{A data + frame with s by 2 column data frame where s is the number of samples + defined. The columns are for the values of Ia and rbarD, respectively.}}} } \description{ -Calculate the Index of Association and Standardized Index -of Association. Obtain p-values from one-sided permutation -tests. +Calculate the Index of Association and Standardized Index of Association. +Obtain p-values from one-sided permutation tests. +} +\details{ +The index of association was originally developed by A.H.D. Brown + analyzing population structure of wheat (Brown, 1980). It has been widely + used as a tool to detect clonal reproduction within populations . + Populations whose members are undergoing sexual reproduction, whether it be + selfing or out-crossing, will produce gametes via meiosis, and thus have a + chance to shuffle alleles in the next generation. Populations whose members + are undergoing clonal reproduction, however, generally do so via mitosis. + This means that the most likely mechanism for a change in genotype is via + mutation. The rate of mutation varies from species to species, but it is + rarely sufficiently high to approximate a random shuffling of alleles. The + index of association is a calculation based on the ratio of the variance of + the raw number of differences between individuals and the sum of those + variances over each locus . You can also think of it as the observed + variance over the expected variance. If they are the same, then the index + is zero after subtracting one (from Maynard-Smith, 1993): \deqn{I_A = + \frac{V_O}{V_E}-1}{Ia = Vo/Ve} Since the distance is more or less a binary + distance, any sort of marker can be used for this analysis. In the + calculation, phase is not considered, and any difference increases the + distance between two individuals. Remember that each column represents a + different allele and that each entry in the table represents the fraction + of the genotype made up by that allele at that locus. Notice also that the + sum of the rows all equal one. Poppr uses this to calculate distances by + simply taking the sum of the absolute values of the differences between + rows. + + The calculation for the distance between two individuals at a single locus + with \emph{a} allelic states and a ploidy of \emph{k} is as follows (except + for Presence/Absence data): \deqn{ d = \displaystyle + \frac{k}{2}\sum_{i=1}^{a} \mid A_{i} - B_{i}\mid }{d(A,B) = (k/2)*sum(abs(Ai - Bi))} + To find the total number of differences + between two individuals over all loci, you just take \emph{d} over \emph{m} + loci, a value we'll call \emph{D}: + + \deqn{D = \displaystyle \sum_{i=1}^{m} d_i }{D = sum(di)} + + These values are calculated over all possible combinations of individuals + in the data set, \eqn{{n \choose 2}}{choose(n, 2)} after which you end up + with \eqn{{n \choose 2}\cdot{}m}{choose(n, 2) * m}. values of \emph{d} and + \eqn{{n \choose 2}}{choose(n, 2)} values of \emph{D}. Calculating the + observed variances is fairly straightforward (modified from Agapow and + Burt, 2001): + + \deqn{ V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - + \frac{(\displaystyle\sum_{i=1}^{n \choose 2} D_{i})^2}{{n \choose 2}}}{{n + \choose 2}}}{Vo = var(D)} + + Calculating the expected variance is the sum of each of the variances of + the individual loci. The calculation at a single locus, \emph{j} is the + same as the previous equation, substituting values of \emph{D} for + \emph{d}: + + \deqn{ var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - + \frac{(\displaystyle\sum_{i=1}^{n \choose 2} d_i)^2}{{n \choose 2}}}{{n + \choose 2}} }{Varj = var(dj)} + + The expected variance is then the sum of all the variances over all + \emph{m} loci: + + \deqn{ V_E = \displaystyle \sum_{j=1}^{m} var_j }{Ve = sum(var(dj))} + + Agapow and Burt showed that \eqn{I_A}{Ia} increases steadily with the + number of loci, so they came up with an approximation that is widely used, + \eqn{\bar r_d}{rbarD}. For the derivation, see the manual for + \emph{multilocus}. + + \deqn{ \bar{r_d} = \frac{V_O - V_E} {2\displaystyle + \sum_{j=1}^{m}\displaystyle \sum_{k \neq j}^{m}\sqrt{var_j\cdot{}var_k}} + }{rbarD = (Vo - Ve)/(2*sum(sum(sqrt(var(dj)*var(dk))))} } \examples{ data(nancycats) ia(nancycats) \dontrun{ +# Get the indices back and plot them using base R graphics: +nansamp <- ia(nancycats, sample = 999, valuereturn = TRUE) +layout(matrix(c(1,1,2,2,), 2, 2, byrow = TRUE)) +hist(nansamp$samples$Ia); abline(v = nansamp$index[1]) +hist(nansamp$samples$rbarD); abline(v = nansamp$index[3]) + # Get the index for each population. lapply(seppop(nancycats), ia) # With sampling @@ -71,20 +144,18 @@ Zhian N. Kamvar } \references{ Paul-Michael Agapow and Austin Burt. Indices of multilocus -linkage disequilibrium. \emph{Molecular Ecology Notes}, -1(1-2):101-102, 2001 + linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102, + 2001 -A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus -structure of natural populations of hordeum spontaneum. -\emph{Genetics}, 96(2):523-536, 1980. + A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus structure of natural + populations of \emph{Hordeum spontaneum}. \emph{Genetics}, 96(2):523-536, 1980. -J M Smith, N H Smith, M O'Rourke, and B G Spratt. How -clonal are bacteria? Proceedings of the National Academy of -Sciences, 90(10):4384-4388, 1993. + J M Smith, N H Smith, M O'Rourke, and B G Spratt. How clonal are bacteria? + Proceedings of the National Academy of Sciences, 90(10):4384-4388, 1993. } \seealso{ \code{\link{poppr}}, \code{\link{missingno}}, -\code{\link{import2genind}}, \code{\link{read.genalex}}, -\code{\link{clonecorrect}} + \code{\link{import2genind}}, \code{\link{read.genalex}}, + \code{\link{clonecorrect}} } diff --git a/man/info_table.Rd b/man/info_table.Rd new file mode 100644 index 00000000..c9865acc --- /dev/null +++ b/man/info_table.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{info_table} +\alias{info_table} +\title{Create a table summarizing missing data or ploidy information of a genind or +genclone object} +\usage{ +info_table(gen, type = c("missing", "ploidy"), percent = TRUE, + plot = FALSE, df = FALSE, returnplot = FALSE, low = "blue", + high = "red", plotlab = TRUE, scaled = TRUE) +} +\arguments{ +\item{gen}{a \linkS4class{genind} or \linkS4class{genclone} object.} + +\item{type}{\code{character}. What information should be returned. Choices + are "missing" (Default) and "ploidy". See Description.} + +\item{percent}{\code{logical}. (ONLY FOR \code{type = 'missing'}) If + \code{TRUE} (default), table and plot will represent missing data as a + percentage of each cell. If \code{FALSE}, the table and plot will represent + missing data as raw counts. (See details)} + +\item{plot}{\code{logical}. If \code{TRUE}, a simple heatmap will be + produced. If \code{FALSE} (default), no heatmap will be produced.} + +\item{df}{\code{logical}. If \code{TRUE}, the data will be returned as a long + form data frame. If \code{FALSE} (default), a matrix with samples in rows + and loci in columns will be returned.} + +\item{returnplot}{\code{logical}. If \code{TRUE}, a list is returned with two + elements: \code{table} - the normal output and \code{plot} - the ggplot + object. If \code{FALSE}, the table is returned.} + +\item{low}{\code{character}. What color should represent no missing data or + lowest observed ploidy? (default: "blue")} + +\item{high}{\code{character}. What color should represent the highest amount + of missing data or observed ploidy? (default: "red")} + +\item{plotlab}{\code{logical}. (ONLY FOR \code{type = 'missing'}) If + \code{TRUE} (default), values of missing data greater than 0\% will be + plotted. If \code{FALSE}, the plot will appear unappended.} + +\item{scaled}{\code{logical}. (ONLY FOR \code{type = 'missing'}) This is for + when \code{percent = TRUE}. If \code{TRUE} (default), the color specified + in \code{high} will represent the highest observed value of missing data. + If \code{FALSE}, the color specified in \code{high} will represent 100\%.} +} +\value{ +a matrix, data frame (\code{df = TRUE}), or a list (\code{returnplot + = TRUE}) representing missing data per population (\code{type = 'missing'}) + or ploidy per individual (\code{type = 'ploidy'}) in a \linkS4class{genind} + or \linkS4class{genclone} object. +} +\description{ +Create a table summarizing missing data or ploidy information of a genind or +genclone object +} +\details{ +Missing data is accounted for on a per-population level.\cr + Ploidy is accounted for on a per-individual level. + + \subsection{For type = 'missing'}{ + This data is potentially useful for identifying areas of systematic missing + data. There are a few caveats to be aware of. \itemize{ \item + \strong{Regarding counts of missing data}: Each count represents the number + of individuals with missing data at each locus. The last column, "mean" can + be thought of as the average number of individuals with missing data per + locus. \item \strong{Regarding percentage missing data}: This percentage is + \strong{relative to the population and locus}, not ot the enitre data set. + The last colum, "mean" represents the average percent of the population + with missing data per locus. }} + \subsection{For type = 'ploidy'}{ + This option is useful for data that has been imported with mixed ploidies. + It will summarize the relative levels of ploidy per individual per locus. + This is simply based off of observed alleles and does not provide any + further estimates.} +} +\examples{ +data(nancycats) +nancy.miss <- info_table(nancycats, plot = TRUE, type = "missing") +data(Pinf) +Pinf.ploid <- info_table(Pinf, plot = TRUE, type = "ploidy") +} +\author{ +Zhian N. Kamvar +} +\keyword{missing} +\keyword{ploidy} + diff --git a/man/informloci.Rd b/man/informloci.Rd index 75ce2e91..f95fec6f 100644 --- a/man/informloci.Rd +++ b/man/informloci.Rd @@ -1,3 +1,4 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{informloci} \alias{informloci} \title{Remove all non-phylogentically informative loci} @@ -5,36 +6,38 @@ informloci(pop, cutoff = 2/nInd(pop), quiet = FALSE) } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{pop}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object.} - \item{cutoff}{\code{numeric}. This is a number from 0 to - 1 representing the minimum percentage of differentiating - individuals. Defaults is 2 individuals.} +\item{cutoff}{\code{numeric}. This is a number from 0 to 1 representing the + minimum percentage of differentiating individuals. Defaults is 2 + individuals.} - \item{quiet}{\code{logical}. When \code{quiet = TRUE}, - messages indicating the loci removed will be printed to - screen. When \code{quiet = FALSE}, nothing will be - printed to screen.} +\item{quiet}{\code{logical}. When \code{quiet = TRUE}, messages indicating + the loci removed will be printed to screen. When \code{quiet = FALSE}, + nothing will be printed to screen.} } \value{ A \code{genind} object with user-defined informative loci. } \description{ -This function will facilitate in removing phylogenetically -uninformative loci from a \code{\link{genind}} object. The -user can specify what is meant by phylogenetically -uninformative with a specification of the cutoff -percentage. Any loci under the cutoff will be removed. For -convenience's sake, the default cutoff is set to 2 -individuals. +This function will facilitate in removing phylogenetically uninformative loci +from a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object. +The user can specify what is meant by phylogenetically uninformative with a +specification of the cutoff percentage. Any loci under the cutoff will be +removed. For convenience's sake, the default cutoff is set to 2 individuals. +} +\details{ +This function works by analyzing the genotypes at each locus. This + has the effect that if a locus has fixed heterozygotes for two alleles, it + will be removed as all individuals are invariant. } \note{ -This will have a few side effects that affect certain -analyses. First, the number of multilocus genotypes might -be reduced due to the reduced number of markers. Second, if -you plan on using this data for analysis of the index of -association, be sure to use the standardized version -(rbarD) that corrects for the number of observed loci. +This will have a few side effects that affect certain analyses. First, + the number of multilocus genotypes might be reduced due to the reduced + number of markers. Second, if you plan on using this data for analysis of + the index of association, be sure to use the standardized version (rbarD) + that corrects for the number of observed loci. } \examples{ # Load the data set H3N2 @@ -54,4 +57,7 @@ N.informten <- informloci(Nepal, cutoff = 10/nInd(Nepal)) # Decimate (10\%) N.informdecimated <- informloci(Nepal, cutoff = 0.1) } +\author{ +Zhian N. Kamvar +} diff --git a/man/is.genclone.Rd b/man/is.genclone.Rd new file mode 100644 index 00000000..c8dd41df --- /dev/null +++ b/man/is.genclone.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{is.genclone} +\alias{is.genclone} +\title{Check for validity of a genclone object} +\usage{ +is.genclone(x) +} +\arguments{ +\item{x}{a genclone object} +} +\description{ +Check for validity of a genclone object +} +\note{ +a \linkS4class{genclone} object will always be a valid +\linkS4class{genind} object. +} +\examples{ +data(nancycats) +nanclone <- as.genclone(nancycats) +is.genclone(nanclone) +} +\author{ +Zhian N. Kamvar +} + diff --git a/man/locus_table.Rd b/man/locus_table.Rd new file mode 100644 index 00000000..9cc5bbfd --- /dev/null +++ b/man/locus_table.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{locus_table} +\alias{locus_table} +\title{Create a table of summary statistics per locus.} +\usage{ +locus_table(x, index = "simpson", lev = "allele", population = "ALL", + information = TRUE) +} +\arguments{ +\item{x}{a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} + object.} + +\item{index}{Which diversity index to use. Choices are \itemize{ \item + \code{"simpson"} (Default) to give Simpson's index \item \code{"shannon"} + to give the Shannon-Wiener index \item \code{"invsimpson"} to give the + Inverse Simpson's index aka the Stoddard and Tayor index.}} + +\item{lev}{At what level do you want to analyze diversity? Choices are + \code{"allele"} (Default) or \code{"genotype"}.} + +\item{population}{Select the populations to be analyzed. This is the + parameter \code{sublist} passed on to the function \code{\link{popsub}}. + Defaults to \code{"ALL"}.} + +\item{information}{When \code{TRUE} (Default), this will print out a header + of information to the R console.} +} +\value{ +a table with 4 columns indicating the Number of alleles/genotypes + observed, Diversity index chosen, Nei's 1978 expected heterozygosity, and + Evenness. +} +\description{ +Create a table of summary statistics per locus. +} +\note{ +This will calculate statistics for polyploids as well by only counting + observed allelic states. +} +\examples{ +# Analyze locus statistics for the North American population of P. infestans. +data(Pinf) +locus_table(Pinf, population = "North America") +} +\author{ +Zhian N. Kamvar +} +\references{ +Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, Peter + R. Minchin, R. B. O'Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. + Stevens, and Helene Wagner. vegan: Community Ecology Package, 2012. R + package version 2.0-5. + + Niklaus J. Gr\"unwald, Stephen B. Goodwin, Michael G. Milgroom, and William + E. Fry. Analysis of genotypic diversity data for populations of + microorganisms. Phytopathology, 93(6):738-46, 2003 + + J.A. Ludwig and J.F. Reynolds. Statistical Ecology. A Primer on Methods and + Computing. New York USA: John Wiley and Sons, 1988. + + E.C. Pielou. Ecological Diversity. Wiley, 1975. + + J.A. Stoddart and J.F. Taylor. Genotypic diversity: estimation and + prediction in samples. Genetics, 118(4):705-11, 1988. + + Masatoshi Nei. Estimation of average heterozygosity and genetic distance + from a small number of individuals. Genetics, 89(3):583-590, 1978. + + Claude Elwood Shannon. A mathematical theory of communication. Bell Systems + Technical Journal, 27:379-423,623-656, 1948 +} +\seealso{ +\code{\link[vegan]{diversity}}, \code{\link{poppr}} +} + diff --git a/man/missingno.Rd b/man/missingno.Rd index effac2b8..54e18fe9 100755 --- a/man/missingno.Rd +++ b/man/missingno.Rd @@ -1,3 +1,4 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{missingno} \alias{missingno} \title{How to deal with missing data in a genind object.} @@ -5,50 +6,40 @@ missingno(pop, type = "loci", cutoff = 0.05, quiet = FALSE) } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{pop}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object.} - \item{type}{a \code{character} string: can be "zero", - "mean", "loci", or "geno" (see \code{Details} for - definitions).]} +\item{type}{a \code{character} string: can be "ignore", "zero", "mean", + "loci", or "geno" (see \code{Details} for definitions).} - \item{cutoff}{\code{numeric}. A number from 0 to 1 - indicating the allowable rate of missing data in either - genotypes or loci. This will be ignored for \code{type} - values of \code{"mean"} or \code{"zero"}.} +\item{cutoff}{\code{numeric}. A number from 0 to 1 indicating the allowable + rate of missing data in either genotypes or loci. This will be ignored for + \code{type} values of \code{"mean"} or \code{"zero"}.} - \item{quiet}{if \code{TRUE}, it will print to the screen - the action performed.} +\item{quiet}{if \code{TRUE}, it will print to the screen the action performed.} } \value{ -a \code{\link{genind}} object. +a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object. } \description{ -missingno gives the user four options to deal with missing -data. +missingno gives the user four options to deal with missing data. +} +\details{ +These methods provide a way to deal with systematic missing data and + to give a wrapper for \code{adegenet}'s \code{ \link{na.replace}} function. + ALL OF THESE ARE TO BE USED WITH CAUTION. + + \subsection{Treatment types}{ \itemize{ \item{\code{"ignore"} - does not + remove or replace missing data.} \item{\code{"loci"} - removes all loci + containing missing data in the entire data set. } \item{\code{"genotype"} - + removes any genotypes/isolates/individuals with missing data.} + \item{\code{"mean"} - replaces all NA's with the mean of the alleles for the + entire data set.} \item{\code{"zero"} or \code{"0"} - replaces all NA's with + "0". Introduces more diversity.}}} } \note{ \emph{"wild missingno appeared!"} } -\section{Details}{ - The default way that functions in \code{poppr} deal with - missing data is to simply ignore it. These methods - provide a way to deal with systematic missing data and to - give a wrapper for \code{adegenet}'s \code{ - \link{na.replace}} function. ALL OF THESE ARE TO BE USED - WITH CAUTION. - - \strong{\code{"loci"}} - removes all loci containing - missing data in the entire data set. - - \strong{\code{"geno"}} - removes any - genotypes/isolates/individuals with missing data. - - \strong{\code{"mean"}} - replaces all NA's with the mean - of the alleles for the entire data set. - - \strong{\code{"zero"}} or \strong{\code{"0"}} - replaces - all NA's with "0". Introduces more diversity. -} \examples{ data(nancycats) @@ -81,6 +72,7 @@ nancy.mean <- missingno(nancycats, type = "mean") Zhian N. Kamvar } \seealso{ -\code{\link{na.replace}}, \code{\link{poppr}} +\code{\link{na.replace}}, \code{\link{poppr}}, + \code{\link{poppr.amova}}, \code{\link{nei.dist}}, \code{\link{aboot}} } diff --git a/man/mlg.Rd b/man/mlg.Rd index dda20460..957288b5 100755 --- a/man/mlg.Rd +++ b/man/mlg.Rd @@ -1,6 +1,8 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{mlg} \alias{mlg} \alias{mlg.crosspop} +\alias{mlg.id} \alias{mlg.table} \alias{mlg.vector} \title{Create counts, vectors, and matrices of multilocus genotypes.} @@ -14,75 +16,110 @@ mlg.vector(pop) mlg.crosspop(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, indexreturn = FALSE, df = FALSE, quiet = FALSE) + +mlg.id(pop) } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{pop}{a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} object.} - \item{sublist}{a \code{vector} of population names or - indices that the user wishes to keep. Default to "ALL".} +\item{sublist}{a \code{vector} of population names or indices that the user +wishes to keep. Default to "ALL".} - \item{blacklist}{a \code{vector} of population names or - indices that the user wishes to discard. Default to - \code{NULL}.} +\item{blacklist}{a \code{vector} of population names or indices that the user +wishes to discard. Default to \code{NULL}.} - \item{mlgsub}{a \code{vector} of multilocus genotype - indices with which to subset \code{mlg.table} and - \code{mlg.crosspop}. NOTE: The resulting table from - \code{mlg.table} will only contain countries with those - MLGs} +\item{mlgsub}{a \code{vector} of multilocus genotype indices with which to +subset \code{mlg.table} and \code{mlg.crosspop}. NOTE: The resulting table +from \code{mlg.table} will only contain countries with those MLGs} - \item{quiet}{\code{Logical}. If FALSE, progress of - functions will be printed to the screen.} +\item{quiet}{\code{Logical}. If FALSE, progress of functions will be printed +to the screen.} - \item{bar}{\code{logical} If \code{TRUE}, a bar graph for - each population will be displayed showing the relative - abundance of each MLG within the population.} +\item{bar}{\code{logical} If \code{TRUE}, a bar graph for each population +will be displayed showing the relative abundance of each MLG within the +population.} - \item{indexreturn}{\code{logical} If \code{TRUE}, a - vector will be returned to index the columns of - \code{mlg.table}.} +\item{indexreturn}{\code{logical} If \code{TRUE}, a vector will be returned +to index the columns of \code{mlg.table}.} - \item{df}{\code{logical} If \code{TRUE}, return a data - frame containing the counts of the MLGs and what - countries they are in. Useful for making graphs with - \code{\link{ggplot}}.} +\item{df}{\code{logical} If \code{TRUE}, return a data frame containing the +counts of the MLGs and what countries they are in. Useful for making graphs +with \code{\link{ggplot}}.} - \item{total}{\code{logical} If \code{TRUE}, a row - containing the sum of all represented MLGs is appended to - the matrix produced by mlg.table.} +\item{total}{\code{logical} If \code{TRUE}, a row containing the sum of all +represented MLGs is appended to the matrix produced by mlg.table.} } \value{ -an integer of the number of multilocus genotypes within the -sample. - -a matrix with columns indicating unique multilocus -genotypes and rows indicating populations. - -a numeric vector naming the multilocus genotype of each -individual in the dataset. - -a \code{list} containing vectors of population names for -each MLG. +\subsection{mlg}{ +an integer describing the number of multilocus genotypes observed. +} +\subsection{mlg.table}{ +a matrix with columns indicating unique multilocus genotypes and rows +indicating populations. +} +\subsection{mlg.vector}{ +a numeric vector naming the multilocus genotype of each individual in + the dataset. +} +\subsection{mlg.crosspop}{ +\itemize{ +\item{default}{ a \code{list} where each element contains a named integer vector representing the number of individuals represented from each population in that MLG} +\item{\code{indexreturn = TRUE}}{ a \code{vector} of integers defining the multilocus genotypes that have individuals crossing populations} +\item{\code{df = TRUE}}{ A long form data frame with the columns: MLG, Population, Count. Useful for graphing with ggplot2} +} +} +\subsection{mlg.id}{ +a list of multilocus genotypes with the associated individual names per MLG. +} } \description{ -Create counts, vectors, and matrices of multilocus -genotypes. +Create counts, vectors, and matrices of multilocus genotypes. } \note{ -The resulting matrix of \code{mlg.table} can be used for -analysis with the \code{\link{vegan}} package. The names of -the multilocus genotypes represented will be those from the -entire dataset. If you wish to view those relative to a -subsetted dataset, you can use \code{mlg.bar(popsub(pop, -...))}. - -The numbers of \code{mlg.vector} will not match up with the -sequence of new genotypes found because sorting takes place -within the algorithm before the genotypes are called so -that the number of comparisons is \eqn{n-1} instead of -\eqn{\frac{n(n-1)}{2}}. +The resulting matrix of \code{mlg.table} can be used for analysis with +the \code{\link{vegan}} package. + +mlg.vector will recalculate the mlg vector for + \code{\linkS4class{genind}} objects and will return the contents of the mlg + slot in \code{\linkS4class{genclone}} objects. This means that MLGs will be + different for subsetted \code{\linkS4class{genind}} objects. } \examples{ +# Load the data set +data(Aeut) + +# Investigate the number of multilocus genotypes. +amlg <- mlg(Aeut) +amlg # 119 + +# show the multilocus genotype vector +avec <- mlg.vector(Aeut) +avec + +# Get a table +atab <- mlg.table(Aeut, bar = FALSE) +atab + +# See where multilocus genotypes cross populations +acrs <- mlg.crosspop(Aeut) # MLG.59: (2 inds) Athena Mt. Vernon + +# See which individuals belong to each MLG +aid <- mlg.id(Aeut) +aid["59"] # individuals 159 and 57 + +\dontrun{ + +# A simple example. 10 individuals, 5 genotypes. +mat1 <- matrix(ncol=5, 25:1) +mat1 <- rbind(mat1, mat1) +mat <- matrix(nrow=10, ncol=5, paste(mat1,mat1,sep="/")) +mat.gid <- df2genind(mat, sep="/") +mlg(mat.gid) +mlg.vector(mat.gid) +mlg.table(mat.gid) + +# Now for a more complicated example. +# Data set of 1903 samples of the H3N2 flu virus genotyped at 125 SNP loci. data(H3N2) mlg(H3N2, quiet=FALSE) @@ -95,7 +132,6 @@ H.tab <- mlg.table(H3N2, bar=FALSE, total=TRUE) # Show which genotypes exist accross populations in the entire dataset. res <- mlg.crosspop(H3N2, quiet=FALSE) -\dontrun{ # Let's say we want to visualize the multilocus genotype distribution for the # USA and Russia mlg.table(H3N2, sublist=c("USA", "Russia"), bar=TRUE) @@ -115,20 +151,12 @@ H.sub <- mlg.table(H3N2, mlgsub=inds) # analyze only the MLGs that are duplicated across populations. new.H <- H3N2[H.vec \%in\% inds, ] -# A simple example. 10 individuals, 5 genotypes. -mat1 <- matrix(ncol=5, 25:1) -mat1 <- rbind(mat1, mat1) -mat <- matrix(nrow=10, ncol=5, paste(mat1,mat1,sep="/")) -mat.gid <- df2genind(mat, sep="/") -mlg(mat.gid) -mlg.vector(mat.gid) -mlg.table(mat.gid) } } \author{ Zhian N. Kamvar } \seealso{ -\code{\link{diversity}} \code{\link{popsub}} +\code{\link[vegan]{diversity}} \code{\link{popsub}} } diff --git a/man/monpop.Rd b/man/monpop.Rd new file mode 100644 index 00000000..b7d5f633 --- /dev/null +++ b/man/monpop.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{data} +\name{monpop} +\alias{monpop} +\title{Peach brown rot pathogen \emph{Monilinia fructicola}} +\format{a \code{\linkS4class{genclone}} object with 3 hierarchical levels + coded into one population factor. These are named "Tree", "Year", and + "Symptom"} +\usage{ +data(monpop) +} +\description{ +This is microsatellite data for a population of the haploid + plant pathogen \emph{Monilinia fructicola} that causes disease within peach + tree canopies (Everhart & Scherm, 2014). Entire populations within trees + were sampled across 3 years (2009, 2010, and 2011) in a total of four + trees, where one tree was sampled in all three years, for a total of 6 + within-tree populations. Within each year, samples in the spring were taken + from affected blossoms (termed “BB” for blossom blight) and in late summer + from affected fruits (termed “FR” for fruit rot). There are a total of 694 + isolates with 65 to 173 isolates within each canopy population that were + characterized using a set of 13 microsatellite markers. +} +\examples{ +data(monpop) +splithierarchy(monpop) <- ~Tree/Year/Symptom +setpop(monpop) <- ~Symptom/Year +monpop +} +\references{ +SE Everhart, H Scherm, (2014) Fine-scale genetic structure of + \emph{Monilinia fructicola} during brown rot epidemics within individual peach + tree canopies. Phytopathology, submitted +} + diff --git a/man/partial_clone.Rd b/man/partial_clone.Rd index da91b74b..0c615ff9 100644 --- a/man/partial_clone.Rd +++ b/man/partial_clone.Rd @@ -1,23 +1,23 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{data} \name{partial_clone} \alias{partial_clone} \title{Simulated data illustrating a Minimum Spanning Network based on Bruvo's Distance} \format{a \code{\link{genind}} object with 50 individuals, 10 loci, and four -popualations.} + populations.} \usage{ data(partial_clone) } \description{ These data were simulated using SimuPOP version 1.0.8 with -99.9\% clonal reproduction over 10,000 generations. -Populations were assigned post-hoc and are simply present -for the purposes of demonstrating a minimum spanning -network with Bruvo's distance. + 99.9\% clonal reproduction over 10,000 generations. Populations were + assigned post-hoc and are simply present for the purposes of demonstrating + a minimum spanning network with Bruvo's distance. } \references{ -Bo Peng and Christopher Amos (2008) Forward-time -simulations of nonrandom mating populations using simuPOP. -\emph{bioinformatics}, 24 (11): 1408-1409. +Bo Peng and Christopher Amos (2008) Forward-time simulations of + nonrandom mating populations using simuPOP. \emph{bioinformatics}, 24 (11): + 1408-1409. } diff --git a/man/plot_poppr_msn.Rd b/man/plot_poppr_msn.Rd new file mode 100644 index 00000000..d051d110 --- /dev/null +++ b/man/plot_poppr_msn.Rd @@ -0,0 +1,151 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{plot_poppr_msn} +\alias{plot_poppr_msn} +\title{Plot minimum spanning networks produced in poppr.} +\usage{ +plot_poppr_msn(x, poppr_msn, gscale = TRUE, gadj = 3, glim = c(0, 0.8), + gweight = 1, wscale = TRUE, inds = "ALL", quantiles = TRUE, + nodelab = 2, cutoff = NULL, palette = NULL, layfun = layout.auto, + beforecut = FALSE, ...) +} +\arguments{ +\item{x}{a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} + object from which \code{poppr_msn} was derived.} + +\item{poppr_msn}{a \code{list} produced from either \code{\link{poppr.msn}} + or \code{\link{bruvo.msn}}. This list should contain a graph, a vector of + population names and a vector of hexadecimal color definitions for each + popualtion.} + +\item{inds}{a character vector indicating which individual names to label + nodes with. See details.} + +\item{quantiles}{\code{logical}. When set to \code{TRUE} (default), the scale + bar will be composed of the quantiles from the observed edge weights. When + set to \code{FALSE}, the scale bar will be composed of a smooth gradient + from the minimum edge weight to the maximum edge weight.} + +\item{nodelab}{an \code{integer} specifying the smallest size of node to + label. See details.} + +\item{cutoff}{a number indicating the longest distance to display in your + graph. This is performed by removing edges with weights greater than this + number.} + +\item{palette}{a function or character corresponding to a specific palette + you want to use to delimit your populations. The default is whatever + palette was used to produce the original graph.} + +\item{layfun}{a function specifying the layout of nodes in your graph. It + defaults to \code{\link[igraph]{layout.auto}}.} + +\item{beforecut}{if \code{TRUE}, the layout of the graph will be computed + before any edges are removed with \code{cutoff}. If \code{FALSE} (Default), + the layout will be computed after any edges are removed.} + +\item{...}{any other parameters to be passed on to + \code{\link[igraph]{plot.igraph}}.} + +\item{gadj}{"grey adjust". a positive \code{integer} greater than zero that +will serve as the exponent to the edge weight to scale the grey value to +represent that weight.} + +\item{glim}{"grey limit". Two numbers between zero and one. They determine +the upper and lower limits for the \code{\link{gray}} function. Default is 0 +(black) and 0.8 (20\% black).} + +\item{gweight}{"grey weight". an \code{integer}. If it's 1, the grey scale +will be weighted to emphasize the differences between closely related nodes. +If it is 2, the grey scale will be weighted to emphasize the differences +between more distantly related nodes.} + +\item{gscale}{"grey scale". If this is \code{TRUE}, this will scale the color + of the edges proportional to the observed distance, with the lines becoming + darker for more related nodes. See \code{\link{greycurve}} for details.} + +\item{wscale}{"width scale". If this is \code{TRUE}, the edge widths will be + scaled proportional to the inverse of the observed distance , with the + lines becoming thicker for more related nodes.} +} +\description{ +This function allows you to take the output of poppr.msn and bruvo.msn and +customize the plot by labeling groups of individuals, size of nodes, and +adjusting the palette and scale bar. +} +\details{ +The previous incarnation of msn plotting in poppr simply plotted the + minimum spanning network with the legend of populations, but did not + provide a scale bar and it did not provide the user a simple way of + manipulating the layout or labels. This function allows the user to + manipulate many facets of graph creation, making the creation of minimum + spanning networks ever so slightly more user friendly. Note that this + function will only plot individual names, not MLG names since the naming + convention for those are arbitrary. + + This function must have both the source data and the output msn to work. + The source data must contain the same population structure as the graph. + Every other parameter has a default setting. + + \subsection{Parameter details}{ \itemize{ \item \code{inds} This will take + in the name of a query individual in your data set and will use that to + query any other individuals that share multilocus genotypes and label their + node on the graph. The default is to label all the nodes, but you can set + it to a name that doesn't exist to label none of the nodes. \item + \code{nodelab} If a node is not labeled by individual, this will label the + size of the nodes greater than or equal to this value. If you don't want to + label the size of the nodes, simply set this to a very high number. \item + \code{cutoff} This is useful for when you want to investigate groups of + multilocus genotypes separated by a specific distance or if you have two + distinct populations and you want to physically separate them in your + network. \item \code{beforecut} This is an indicator useful if you want to + maintain the same position of the nodes before and after removing edges + with the \code{cutoff} argument. This works best if you set a seed before + you run the function.}} +} +\examples{ +# Using a data set of the Aphanomyces eutieches root rot pathogen. +data(Aeut) +adist <- diss.dist(Aeut, percent = TRUE) +amsn <- poppr.msn(Aeut, adist, showplot = FALSE) + +# Default +library(igraph) # To get all the layouts. +set.seed(500) +plot_poppr_msn(Aeut, amsn, gadj = 15, beforecut = TRUE) + +# Removing link between populations and labelling no individuals +set.seed(500) +plot_poppr_msn(Aeut, amsn, inds = "none", gadj = 15, beforecut = TRUE, cutoff = 0.2) + +# Labelling individual #57 because it is an MLG that crosses popualtions +# Showing clusters of MLGS with at most 5\% variation +# Notice that the Mt. Vernon population appears to be more clonal +set.seed(50) +plot_poppr_msn(Aeut, amsn, gadj = 15, cutoff = 0.05, inds = "57") + + +\dontrun{ +data(partial_clone) +pcmsn <- bruvo.msn(partial_clone, replen = rep(1, 10)) +plot_poppr_msn(partial_clone, pcmsn, palette = rainbow, inds = "sim 20") + +# Something pretty +data(microbov) +mdist <- diss.dist(microbov, percent = TRUE) +micmsn <- poppr.msn(microbov, mdist, showplot = FALSE) + +plot_poppr_msn(microbov, micmsn, palette = "terrain.colors", inds = "n", + quantiles = FALSE) +plot_poppr_msn(microbov, micmsn, palette = "terrain.colors", inds = "n", + cutoff = 0.3, quantiles = FALSE) +} +} +\author{ +Zhian N. Kamvar +} +\seealso{ +\code{\link[igraph]{layout.auto}} \code{\link[igraph]{plot.igraph}} +\code{\link{poppr.msn}} \code{\link{bruvo.msn}} \code{\link{greycurve}} +\code{\link[igraph]{delete.edges}} \code{\link{palette}} +} + diff --git a/man/poppr-package.Rd b/man/poppr-package.Rd new file mode 100644 index 00000000..b395cd41 --- /dev/null +++ b/man/poppr-package.Rd @@ -0,0 +1,132 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{package} +\name{poppr-package} +\alias{poppr-package} +\title{The \pkg{poppr} R package.} +\description{ +\pkg{Poppr} provides tools for population genetic analysis that +include genotypic diveristy measures, genetic distances with bootstrap +support, native organization and handling of population hierarchies, and +clone correction. + +To cite \pkg{poppr}, please use \code{citation("poppr")}. When referring to +\pkg{poppr} in your manuscript, please use lower case unless it occurs at the +beginning of a sentence. +} +\details{ +This package relies on the \pkg{\link[adegenet]{adegenet}} package. + It was built around the \code{\linkS4class{genind}} object, which stores + genetic information in a table of allele frequencies. \pkg{Poppr} has + extended this object into a new object called + \code{\linkS4class{genclone}}. This object tracks clones and organizes + different population hierarchical levels. + + \subsection{Documentation}{ Documentation is available for any function by + typing \code{?function_name} in the R console. Essential functions for + manipulating data are detailed within the \emph{Data import and + manipulation} vignette (\code{vignette("poppr_manual", package = "poppr")}) + and details on algorithms used in \pkg{poppr} are within the + \emph{Algorithms and equations} vignette (\code{vignette("algo", package = + "poppr")}). Examples of analyses are available in a primer written by + Niklaus J. Grünwald, Zhian N. Kamvar, and Sydney E. Everhart at + \url{http://grunwaldlab.github.io/Population_Genetics_in_R}.} + + \subsection{Getting help}{ If you have a specific question or issue with + \pkg{poppr}, feel free to contribute to the google group at + \url{https://groups.google.com/forum/#!forum/poppr}. If you find a bug and + are a github user, you can submit bug reports at + \url{https://github.com/grunwaldlab/poppr/issues}. Otherwise, leave a + message on the groups.} + + Below are descriptions and links to functions found in \pkg{poppr}. Be + aware that all functions in \pkg{\link[adegenet]{adegenet}} are also + available. +} +\section{Data import/export}{ + +\itemize{ +\item \code{\link{getfile}} - Provides a quick GUI to grab files for import +\item \code{\link{read.genalex}} - Read GenAlEx formatted csv files to a genind object +\item \code{\link{genind2genalex}} - Converts genind objects to GenAlEx formatted csv files +} +} + +\section{Data manipulation}{ + +\itemize{ +\item \code{\link{as.genclone}} - Converts genind objects to genclone objects +\item \code{\link{setpop}} - Set the population using defined hierarchies +\item \code{\link{splithierarchy}} - Split a concatenated hierarchy imported as a population +\item \code{\link{sethierarchy}} - Define a population hierarchy of a genclone object +\item \code{\link{gethierarchy}} - Extract the hierarchy data frame +\item \code{\link{addhierarchy}} - Add a vector or data frame to an existing hierarchy +\item \code{\link{namehierarchy}} - Rename a population hierarchy +\item \code{\link{missingno}} - Handles missing data +\item \code{\link{clonecorrect}} - Clone censors at a specified population hierarchy +\item \code{\link{informloci}} - Detects and removes phylogenetically uninformative loci +\item \code{\link{popsub}} - Subsets genind objects by population +\item \code{\link{shufflepop}} - Shuffles genotypes at each locus using four different shuffling algorithms +\item \code{\link{recode_polyploids}} - recode polyploid data sets with missing alleles imported as "0" +} +} + +\section{Genetic distances}{ + +\itemize{ +\item \code{\link{bruvo.dist}} - Bruvo’s distance +\item \code{\link{diss.dist}} - Absolute genetic distance (see provesti.dist) +\item \code{\link{nei.dist}} - Nei’s 1978 genetic distance +\item \code{\link{rogers.dist}} - Rogers’ euclidean distance +\item \code{\link{reynolds.dist}} - Reynolds’ coancestry distance +\item \code{\link{edwards.dist}} - Edwards’ angular distance +\item \code{\link{provesti.dist}} - Provesti’s absolute genetic distance +} +} + +\section{Bootstrapping}{ + +\itemize{ +\item \code{\link{aboot}} - Creates a bootstrapped dendrogram for any distance measure +\item \code{\link{bruvo.boot}} - Produces dendrograms with bootstrap support based on Bruvo’s distance +} +} + +\section{Analysis}{ + +\itemize{ +\item \code{\link{poppr.amova}} - Analysis of Molecular Variance (as implemented in ade4) +\item \code{\link{ia}} - Calculates the index of association +\item \code{\link{mlg}} - Calculates the number of multilocus genotypes +\item \code{\link{mlg.crosspop}} - Finds all multilocus genotypes that cross populations +\item \code{\link{mlg.table}} - Returns a table of populations by multilocus genotypes +\item \code{\link{mlg.vector}} - Returns a vector of a numeric multilocus genotype assignment for each individual +\item \code{\link{mlg.id}} - Finds all individuals associated with a single multilocus genotype +\item \code{\link{poppr}} - Returns a diversity table by population +\item \code{\link{poppr.all}} - Returns a diversity table by population for all compatible files specified +\item \code{\link{private_alleles}} - Tabulates the occurences of alleles that only occur in one population. +\item \code{\link{locus_table}} - Creates a table of summary statistics per locus. +} +} + +\section{Visulalization}{ + +\itemize{ +\item \code{\link{plot_poppr_msn}} - Plots minimum spanning networks produced in poppr with scale bar and legend +\item \code{\link{greycurve}} - Helper to determine the appropriate parameters for adjusting the grey level for msn functions +\item \code{\link{bruvo.msn}} - Produces minimum spanning networks based off Bruvo’s distance colored by population +\item \code{\link{poppr.msn}} - Produces a minimum spanning network for any pairwise distance matrix related to the data +\item \code{\link{info_table}} - Creates a heatmap representing missing data or observed ploidy +\item \code{\link{genotype_curve}} - Creates a series of boxplots to demonstrate how many markers are needed to represent the diversity of your data. +} +} +\author{ +Zhian N. Kamvar, Javier F. Tabima, Niklaus J. Grünwald + +Maintainer: Zhian N. Kamvar +} +\references{ +Kamvar ZN, Tabima JF, Grünwald NJ. (2014) Poppr: an R package for +genetic analysis of populations with clonal, partially clonal, and/or sexual +reproduction. PeerJ 2:e281 \url{http://dx.doi.org/10.7717/peerj.281} +} + diff --git a/man/poppr.Rd b/man/poppr.Rd index 207904ba..e00046b4 100755 --- a/man/poppr.Rd +++ b/man/poppr.Rd @@ -1,128 +1,115 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{poppr} \alias{poppr} \title{Produce a basic summary table for population genetic analyses.} \usage{ -poppr(pop, total = TRUE, sublist = c("ALL"), blacklist = c(NULL), - sample = 0, method = 1, missing = "ignore", cutoff = 0.05, - quiet = FALSE, clonecorrect = FALSE, hier = c(1), - dfname = "population_hierarchy", keep = 1, hist = TRUE, minsamp = 10) +poppr(dat, total = TRUE, sublist = "ALL", blacklist = NULL, sample = 0, + method = 1, missing = "ignore", cutoff = 0.05, quiet = FALSE, + clonecorrect = FALSE, hier = 1, dfname = "population_hierarchy", + keep = 1, hist = TRUE, minsamp = 10, legend = FALSE) } \arguments{ - \item{pop}{a \code{\link{genind}} object OR any fstat, - structure, genetix, genpop, or genalex formatted file.} - - \item{total}{default \code{TRUE}. Should indecies be - calculated for the combined populations represented in - the entire file?} - - \item{sublist}{a list of character strings or integers to - indicate specific population names (located in - \code{$pop.names} within the \code{\link{genind}} object) - Defaults to "ALL".} - - \item{blacklist}{a list of character strings or integers - to indicate specific populations to be removed from - analysis. Defaults to NULL.} - - \item{sample}{an integer indicating the number of - permutations desired to obtain p-values. Sampling will - shuffle genotypes at each locus to simulate a panmictic - population using the observed genotypes. Calculating the - p-value includes the observed statistics, so set your - sample number to one off for a round p-value (eg. - \code{sample = 999} will give you p = 0.001 and - \code{sample = 1000} will give you p = 0.000999001).} - - \item{method}{an integer from 1 to 4 indicating the - method of sampling desired. see \code{\link{shufflepop}} - for details.} - - \item{missing}{how should missing data be treated? - \code{"zero"} and \code{"mean"} will set the missing - values to those documented in \code{\link{na.replace}}. - \code{"loci"} and \code{"geno"} will remove any loci or - genotypes with missing data, respectively (see +\item{dat}{a \code{\linkS4class{genind}} object OR a + \code{\linkS4class{genclone}} object OR any fstat, structure, genetix, + genpop, or genalex formatted file.} + +\item{total}{When \code{TRUE} (default), indices will be calculated for the + pooled populations.} + +\item{sublist}{a list of character strings or integers to indicate specific + population names (located in \code{$pop.names} within the + \code{\link{genind}} object) Defaults to "ALL".} + +\item{blacklist}{a list of character strings or integers to indicate specific + populations to be removed from analysis. Defaults to NULL.} + +\item{sample}{an integer indicating the number of permutations desired to + obtain p-values. Sampling will shuffle genotypes at each locus to simulate + a panmictic population using the observed genotypes. Calculating the + p-value includes the observed statistics, so set your sample number to one + off for a round p-value (eg. \code{sample = 999} will give you p = 0.001 + and \code{sample = 1000} will give you p = 0.000999001).} + +\item{method}{an integer from 1 to 4 indicating the method of sampling + desired. see \code{\link{shufflepop}} for details.} + +\item{missing}{how should missing data be treated? \code{"zero"} and + \code{"mean"} will set the missing values to those documented in + \code{\link{na.replace}}. \code{"loci"} and \code{"geno"} will remove any + loci or genotypes with missing data, respectively (see \code{\link{missingno}} for more information.} - \item{cutoff}{\code{numeric} a number from 0 to 1 - indicating the percent missing data allowed for analysis. - This is to be used in conjunction with the flag - \code{missing} (see \code{\link{missingno}} for details)} - - \item{quiet}{Should the function print anything to the - screen while it is performing calculations? \code{TRUE} - prints nothing, \code{FALSE} (defualt) will print the - population name and a progress bar.} - - \item{clonecorrect}{default \code{FALSE}. must be used - with the \code{hier} and \code{dfname} parameters, or the - user will potentially get undesiered results. see - \code{\link{clonecorrect}} for details.} - - \item{hier}{a \code{numeric or character list}. This is - the list of vectors within a data frame (specified in - \code{dfname}) in the 'other' slot of the - \code{\link{genind}} object. The list should indicate the - population hierarchy to be used for clone correction.} - - \item{dfname}{a \code{character string}. This is the name - of the data frame or list containing the vectors of the +\item{cutoff}{\code{numeric} a number from 0 to 1 indicating the percent + missing data allowed for analysis. This is to be used in conjunction with + the flag \code{missing} (see \code{\link{missingno}} for details)} + +\item{quiet}{\code{FALSE} (default) will display a progress bar for each + population analyzed.} + +\item{clonecorrect}{default \code{FALSE}. must be used with the \code{hier} + and \code{dfname} parameters, or the user will potentially get undesired + results. see \code{\link{clonecorrect}} for details.} + +\item{hier}{\itemize{ \item \strong{for genclone objects} - a \code{formula} + indicating the hierarchical levels to be used. The hierarchies should be + present in the \code{hierarchy} slot. See \code{\link{sethierarchy}} for + details. \item \strong{for genind objects} - a \code{numeric or character} + vector OR a hierarchical formula. This is the list of columns within a data + frame (specified in \code{dfname}) in the 'other' slot of the + \code{\link{genind}} object. The list should indicate the population + hierarchy to be used for clone correction. }} + +\item{dfname}{a \code{character string}. (Only for genind objects) This is + the name of the data frame or heirarchy containing the vectors of the population hierarchy within the \code{other} slot of the \code{\link{genind}} object.} - \item{keep}{an \code{integer}. This indicates the levels - of the population hierarchy you wish to keep after clone - correcting your data sets. To combine the hierarchy, just - set keep from 1 to the length of your hierarchy. see - \code{\link{clonecorrect}} for details.} +\item{keep}{an \code{integer}. This indicates the levels of the population + hierarchy you wish to keep after clone correcting your data sets. To + combine the hierarchy, just set keep from 1 to the length of your + hierarchy. see \code{\link{clonecorrect}} for details.} + +\item{hist}{\code{logical} if \code{TRUE} (default) and \code{sampling > 0}, + a histogram will be produced for each population.} - \item{hist}{\code{logical} if \code{TRUE} a histogram - will be produced for each population.} +\item{minsamp}{an \code{integer} indicating the minimum number of individuals + to resample for rarefaction analysis. See \code{\link[vegan]{rarefy}} for + details.} - \item{minsamp}{an \code{integer} indicating the minimum - number of individuals to resample for rarefaction - analysis.} +\item{legend}{\code{logical}. When this is set to \code{TRUE}, a legend + describing the resulting table columns will be printed. Defaults to + \code{FALSE}} } \value{ -\item{Pop}{A vector indicating the pouplation factor} -\item{N}{An integer vector indicating the number of -individuals/isolates in the specified population.} -\item{MLG}{An integer vector indicating the number of -multilocus genotypes found in the specified poupulation, -(see: \code{\link{mlg}})} \item{eMLG}{The expected number -of MLG at the lowest common sample size (set by the -parameter \code{minsamp}.} \item{SE}{The standard error for -the rarefaction analysis} \item{H}{Shannon-Weiner Diversity -index} \item{G}{Stoddard and Taylor's Index} -\item{Hexp}{Expected heterozygosity or Nei's 1987 genotypic -diversity corrected for sample size.} \item{E.5}{Evenness} -\item{Ia}{A numeric vector giving the value of the Index of -Association for each population factor, (see -\code{\link{ia}}).} \item{p.Ia}{A numeric vector indicating -the p-value for Ia from the number of reshufflings -indicated in \code{sample}. Lowest value is 1/n where n is -the number of observed values.} \item{rbarD}{A numeric -vector giving the value of the Standardized Index of -Association for each population factor, (see -\code{\link{ia}}).} \item{p.rD}{A numeric vector indicating -the p-value for rbarD from the number of reshufflings -indicated in \code{sample}. Lowest value is 1/n where n is -the number of observed values.} \item{File}{A vector -indicating the name of the original data file.} +\item{Pop}{A vector indicating the pouplation factor} \item{N}{An + integer vector indicating the number of individuals/isolates in the + specified population.} \item{MLG}{An integer vector indicating the number + of multilocus genotypes found in the specified poupulation, (see: + \code{\link{mlg}})} \item{eMLG}{The expected number of MLG at the lowest + common sample size (set by the parameter \code{minsamp}.} \item{SE}{The + standard error for the rarefaction analysis} \item{H}{Shannon-Weiner + Diversity index} \item{G}{Stoddard and Taylor's Index} \item{Hexp}{Expected + heterozygosity or Nei's 1987 genotypic diversity corrected for sample + size.} \item{E.5}{Evenness} \item{Ia}{A numeric vector giving the value of + the Index of Association for each population factor, (see + \code{\link{ia}}).} \item{p.Ia}{A numeric vector indicating the p-value for + Ia from the number of reshufflings indicated in \code{sample}. Lowest value + is 1/n where n is the number of observed values.} \item{rbarD}{A numeric + vector giving the value of the Standardized Index of Association for each + population factor, (see \code{\link{ia}}).} \item{p.rD}{A numeric vector + indicating the p-value for rbarD from the number of reshuffles indicated + in \code{sample}. Lowest value is 1/n where n is the number of observed + values.} \item{File}{A vector indicating the name of the original data + file.} } \description{ -This function allows the user to quickly view indecies of -distance, heterozygosity, and inbreeding to aid in the -decision of a path to further analyze a specified dataset. -It natively takes \code{\link{genind}} formatted files, but -can convert any raw data formats that adegenet can take -(fstat, structure, genetix, and genpop) as well as genalex -files exported into a csv format (see -\code{\link{read.genalex}} for details). -} -\note{ -All values are rounded to three significant digits for the -final table. +This function allows the user to quickly view indicies of heterozygosity, +evenness, and inbreeding to aid in the decision of a path to further analyze +a specified dataset. It natively takes \code{\linkS4class{genind}} and +\code{\linkS4class{genclone}} objects, but can convert any raw data formats +that adegenet can take (fstat, structure, genetix, and genpop) as well as +genalex files exported into a csv format (see \code{\link{read.genalex}} for +details). } \examples{ data(nancycats) @@ -143,62 +130,50 @@ Zhian N. Kamvar } \references{ Paul-Michael Agapow and Austin Burt. Indices of multilocus -linkage disequilibrium. \emph{Molecular Ecology Notes}, -1(1-2):101-102, 2001 - -A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus -structure of natural populations of hordeum spontaneum. -\emph{Genetics}, 96(2):523-536, 1980. - -Niklaus J. Gr\"unwald, Stephen B. Goodwin, Michael G. -Milgroom, and William E. Fry. Analysis of genotypic -diversity data for populations of microorganisms. -Phytopathology, 93(6):738-46, 2003 - -Bernhard Haubold and Richard R. Hudson. Lian 3.0: detecting -linkage disequilibrium in multilocus data. Bioinformatics, -16(9):847-849, 2000. - -Kenneth L.Jr. Heck, Gerald van Belle, and Daniel -Simberloff. Explicit calculation of the rarefaction -diversity measurement and the determination of sufficient -sample size. Ecology, 56(6):pp. 1459-1461, 1975 - -S H Hurlbert. The nonconcept of species diversity: a -critique and alternative parameters. Ecology, -52(4):577-586, 1971. - -J.A. Ludwig and J.F. Reynolds. Statistical Ecology. A -Primer on Methods and Computing. New York USA: John Wiley -and Sons, 1988. - -Masatoshi Nei. Estimation of average heterozygosity and -genetic distance from a small number of individuals. -Genetics, 89(3):583-590, 1978. - -Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre -Legendre, Peter R. Minchin, R. B. O'Hara, Gavin L. Simpson, -Peter Solymos, M. Henry H. Stevens, and Helene Wagner. -vegan: Community Ecology Package, 2012. R package version -2.0-5. - -E.C. Pielou. Ecological Diversity. Wiley, 1975. - -Claude Elwood Shannon. A mathematical theory of -communication. Bell Systems Technical Journal, -27:379-423,623-656, 1948 - -J M Smith, N H Smith, M O'Rourke, and B G Spratt. How -clonal are bacteria? Proceedings of the National Academy of -Sciences, 90(10):4384-4388, 1993. - -J.A. Stoddart and J.F. Taylor. Genotypic diversity: -estimation and prediction in samples. Genetics, -118(4):705-11, 1988. + linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102, + 2001 + + A.H.D. Brown, M.W. Feldman, and E. Nevo. Multilocus structure of natural + populations of \emph{Hordeum spontaneum}. \emph{Genetics}, 96(2):523-536, 1980. + + Niklaus J. Gr\"unwald, Stephen B. Goodwin, Michael G. Milgroom, and William + E. Fry. Analysis of genotypic diversity data for populations of + microorganisms. Phytopathology, 93(6):738-46, 2003 + + Bernhard Haubold and Richard R. Hudson. Lian 3.0: detecting linkage + disequilibrium in multilocus data. Bioinformatics, 16(9):847-849, 2000. + + Kenneth L.Jr. Heck, Gerald van Belle, and Daniel Simberloff. Explicit + calculation of the rarefaction diversity measurement and the determination + of sufficient sample size. Ecology, 56(6):pp. 1459-1461, 1975 + + S H Hurlbert. The nonconcept of species diversity: a critique and + alternative parameters. Ecology, 52(4):577-586, 1971. + + J.A. Ludwig and J.F. Reynolds. Statistical Ecology. A Primer on Methods and + Computing. New York USA: John Wiley and Sons, 1988. + + Masatoshi Nei. Estimation of average heterozygosity and genetic distance + from a small number of individuals. Genetics, 89(3):583-590, 1978. + + Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, Peter + R. Minchin, R. B. O'Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. + Stevens, and Helene Wagner. vegan: Community Ecology Package, 2012. R + package version 2.0-5. + + E.C. Pielou. Ecological Diversity. Wiley, 1975. + + Claude Elwood Shannon. A mathematical theory of communication. Bell Systems + Technical Journal, 27:379-423,623-656, 1948 + + J M Smith, N H Smith, M O'Rourke, and B G Spratt. How clonal are bacteria? + Proceedings of the National Academy of Sciences, 90(10):4384-4388, 1993. + + J.A. Stoddart and J.F. Taylor. Genotypic diversity: estimation and + prediction in samples. Genetics, 118(4):705-11, 1988. } \seealso{ \code{\link{clonecorrect}}, \code{\link{poppr.all}}, -\code{\link{ia}}, \code{\link{missingno}}, -\code{\link{mlg}} + \code{\link{ia}}, \code{\link{missingno}}, \code{\link{mlg}} } diff --git a/man/poppr.all.Rd b/man/poppr.all.Rd index e06abb3c..9fe1bff8 100755 --- a/man/poppr.all.Rd +++ b/man/poppr.all.Rd @@ -1,3 +1,4 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{poppr.all} \alias{poppr.all} \title{Process a list of files with poppr} @@ -5,19 +6,17 @@ poppr.all(filelist, ...) } \arguments{ - \item{filelist}{a list of files in the current working - directory} +\item{filelist}{a list of files in the current working directory} - \item{...}{arguments passed on to poppr} +\item{...}{arguments passed on to poppr} } \value{ see \code{\link{poppr}} } \description{ -poppr.all is a wrapper function that will loop through a -list of files from the workind directory, execute -\code{\link{poppr}}, and concatenate the output into one -data frame. +poppr.all is a wrapper function that will loop through a list of files from +the working directory, execute \code{\link{poppr}}, and concatenate the +output into one data frame. } \examples{ \dontrun{ diff --git a/man/poppr.amova.Rd b/man/poppr.amova.Rd new file mode 100644 index 00000000..d10e0db4 --- /dev/null +++ b/man/poppr.amova.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{poppr.amova} +\alias{amova} +\alias{poppr.amova} +\title{Perform Analysis of Molecular Variance (AMOVA) on genind or genclone objects.} +\usage{ +poppr.amova(x, hier = NULL, clonecorrect = FALSE, within = TRUE, + dist = NULL, squared = TRUE, correction = "quasieuclid", + dfname = "population_hierarchy", sep = "_", missing = "loci", + cutoff = 0.05, quiet = FALSE) +} +\arguments{ +\item{x}{a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} + object} + +\item{hier}{a hierarchical \code{\link{formula}} that defines your population + hieararchy. (e.g.: ~Population/Subpopulation). \strong{See Details below.}} + +\item{clonecorrect}{\code{logical} if \code{TRUE}, the data set will be clone + corrected with respect to the lowest level of the hierarchy. The default is + set to \code{FALSE}. See \code{\link{clonecorrect}} for details.} + +\item{within}{\code{logical}. When this is set to \code{TRUE} (Default), + variance within individuals are calculated as well. If this is set to + \code{FALSE}, The lowest level of the hierarchy will be the sample level. + See Details below.} + +\item{dist}{an optional distance matrix calculated on your data.} + +\item{squared}{if a distance matrix is supplied, this indicates whether or + not it represents squared distances.} + +\item{correction}{a \code{character} defining the correction method for + non-euclidean distances. Options are \code{\link[ade4]{quasieuclid}} + (Default), \code{\link[ade4]{lingoes}}, and \code{\link[ade4]{cailliez}}. + See Details below.} + +\item{dfname}{if the input data set is a \code{\linkS4class{genind}} object, + specify the name of the data frame in the \code{\link[adegenet]{other}} + slot defining the population hierarchy. Defaults to + \code{"population_hierarchy"}} + +\item{sep}{A single character used to separate the hierarchical levels. This +defaults to "_".} + +\item{missing}{specify method of correcting for missing data utilizing + options given in the function \code{\link{missingno}}. Default is + \code{"loci"}.} + +\item{cutoff}{specify the level at which missing data should be + removed/modified. See \code{\link{missingno}} for details.} + +\item{quiet}{\code{logical} If \code{FALSE} (Default), messages regarding any + corrections will be printed to the screen. If \code{TRUE}, no messages will + be printed.} +} +\value{ +a list of class \code{amova} from the ade4 package. See + \code{\link[ade4]{amova}} for details. +} +\description{ +This function utilizes the ade4 implementation of AMOVA. See +\code{\link[ade4]{amova}} for details on the specific implementation. +} +\details{ +The poppr implementation of AMOVA is a very detailed wrapper for the + ade4 implementation. The output is an \code{\link[ade4]{amova}} class list + that contains the results in the first four elements. The inputs are contained in the + last three elements. The inputs required for the ade4 implementation are: + \enumerate{ + \item a distance matrix on all unique genotypes (haplotypes) + \item a data frame defining the hierarchy of the distance matrix + \item a genotype (haplotype) frequency table.} + All of this data can be constructed from a + \code{\linkS4class{genind}} object, but can be daunting for a novice R + user. \emph{This function automates the entire process}. Since there are many + variables regarding genetic data, some points need to be highlighted: + + \subsection{On Hierarchies:}{The hierarchy is defined by different hierarchical + levels that separate your data. In a \code{\linkS4class{genclone}} object, + these levels are inherently defined in the \code{hierarchy} slot. For + \code{\linkS4class{genind}} objects, these levels must be defined in a data + frame located within the \code{\link[adegenet]{other}} slot. It is best + practice to name this data frame \code{"population_hierarchy"}.} + + \subsection{On Within Individual Variance:}{ Heterozygosities within diploid + genotypes are sources of variation from within individuals and can be + quantified in AMOVA. When \code{within = TRUE}, poppr will split diploid + genotypes into haplotypes and use those to calculate within-individual + variance. No estimation of phase is made. This acts much like the default + settings for AMOVA in the Arlequin software package. Within individual + variance will not be calculated for haploid individuals or dominant + markers.} + + \subsection{On Euclidean Distances:}{ AMOVA, as defined by + Excoffier et al., utilizes an absolute genetic distance measured in the + number of differences between two samples across all loci. With the ade4 + implementation of AMOVA (utilized by poppr), distances must be Euclidean + (due to the nature of the calculations). Unfortunately, many genetic + distance measures are not always euclidean and must be corrected for before + being analyzed. Poppr automates this with three methods implemented in + ade4, \code{\link{quasieuclid}}, \code{\link{lingoes}}, and + \code{\link{cailliez}}. The correction of these distances should not + adversely affect the outcome of the analysis.} +} +\examples{ +data(Aeut) +agc <- as.genclone(Aeut) +agc +amova.result <- poppr.amova(agc, ~Pop/Subpop) +amova.result +amova.test <- randtest(amova.result) # Test for significance +plot(amova.test) +amova.test +\dontrun{ +amova.cc.result <- poppr.amova(agc, ~Pop/Subpop, clonecorrect = TRUE) +amova.cc.result +amova.cc.test <- randtest(amova.cc.result) +plot(amova.cc.test) +amova.cc.test +} +} +\references{ +Excoffier, L., Smouse, P.E. and Quattro, J.M. (1992) Analysis of +molecular variance inferred from metric distances among DNA haplotypes: +application to human mitochondrial DNA restriction data. \emph{Genetics}, +\strong{131}, 479–491. +} +\seealso{ +\code{\link[ade4]{amova}} \code{\link{clonecorrect}} + \code{\link{diss.dist}} \code{\link{missingno}} + \code{\link[ade4]{is.euclid}} \code{\link{sethierarchy}} +} +\keyword{amova} + diff --git a/man/poppr.msn.Rd b/man/poppr.msn.Rd index 0ad4c6d6..1da93246 100644 --- a/man/poppr.msn.Rd +++ b/man/poppr.msn.Rd @@ -1,85 +1,77 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{poppr.msn} +\alias{msn.poppr} \alias{poppr.msn} \title{Create a minimum spanning network of selected populations using a distance matrix.} \usage{ poppr.msn(pop, distmat, palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, - 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) + 0.8), gadj = 3, gweight = 1, wscale = TRUE, showplot = TRUE, ...) } \arguments{ - \item{pop}{a \code{\link{genind}} object} - - \item{distmat}{a distance matrix that has been derived - from your data set.} - - \item{palette}{a \code{function} defining the color - palette to be used to color the populations on the graph. - It defaults to \code{\link{topo.colors}}, but you can - easily create new schemes by using - \code{\link{colorRampPalette}} (see examples for - details)} - - \item{sublist}{a \code{vector} of population names or - indexes that the user wishes to keep. Default to "ALL".} - - \item{blacklist}{a \code{vector} of population names or - indexes that the user wishes to discard. Default to - \code{NULL}} - - \item{vertex.label}{a \code{vector} of characters to - label each vertex. There are two defaults: \code{"MLG"} - will label the nodes with the multilocus genotype from - the original data set and \code{"inds"} will label the - nodes with the representative individual names.} - - \item{gscale}{"grey scale". If this is \code{TRUE}, this - will scale the color of the edges proportional to the - observed distance, with the lines becoming darker for - more related nodes. See \code{\link{greycurve}} for - details.} - - \item{glim}{"grey limit". Two numbers between zero and - one. They determine the upper and lower limits for the - \code{\link{gray}} function. Default is 0 (black) and 0.8 - (20\% black). See \code{\link{greycurve}} for details.} - - \item{gadj}{"grey adjust". a positive \code{integer} - greater than zero that will serve as the exponent to the - edge weight to scale the grey value to represent that - weight. See \code{\link{greycurve}} for details.} - - \item{gweight}{"grey weight". an \code{integer}. If it's - 1, the grey scale will be weighted to emphasize the - differences between closely related nodes. If it is 2, - the grey scale will be weighted to emphasize the +\item{pop}{a \code{\link{genind}} object} + +\item{distmat}{a distance matrix that has been derived from your data set.} + +\item{palette}{a \code{function} defining the color palette to be used to + color the populations on the graph. It defaults to + \code{\link{topo.colors}}, but you can easily create new schemes by using + \code{\link{colorRampPalette}} (see examples for details)} + +\item{sublist}{a \code{vector} of population names or indexes that the user + wishes to keep. Default to "ALL".} + +\item{blacklist}{a \code{vector} of population names or indexes that the user + wishes to discard. Default to \code{NULL}} + +\item{vertex.label}{a \code{vector} of characters to label each vertex. There + are two defaults: \code{"MLG"} will label the nodes with the multilocus + genotype from the original data set and \code{"inds"} will label the nodes + with the representative individual names.} + +\item{gscale}{"grey scale". If this is \code{TRUE}, this will scale the color + of the edges proportional to the observed distance, with the lines becoming + darker for more related nodes. See \code{\link{greycurve}} for details.} + +\item{glim}{"grey limit". Two numbers between zero and one. They determine + the upper and lower limits for the \code{\link{gray}} function. Default is + 0 (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details.} + +\item{gadj}{"grey adjust". a positive \code{integer} greater than zero that + will serve as the exponent to the edge weight to scale the grey value to + represent that weight. See \code{\link{greycurve}} for details.} + +\item{gweight}{"grey weight". an \code{integer}. If it's 1, the grey scale + will be weighted to emphasize the differences between closely related + nodes. If it is 2, the grey scale will be weighted to emphasize the differences between more distantly related nodes. See \code{\link{greycurve}} for details.} - \item{wscale}{"width scale". If this is \code{TRUE}, the - edge widths will be scaled proportional to the inverse of - the observed distance , with the lines becoming thicker - for more related nodes.} +\item{wscale}{"width scale". If this is \code{TRUE}, the edge widths will be + scaled proportional to the inverse of the observed distance , with the + lines becoming thicker for more related nodes.} + +\item{showplot}{logical. If \code{TRUE}, the graph will be plotted. If + \code{FALSE}, it will simply be returned.} - \item{...}{any other arguments that could go into - plot.igraph} +\item{...}{any other arguments that could go into plot.igraph} } \value{ -\item{graph}{a minimum spanning network with nodes -corresponding to MLGs within the data set. Colors of the -nodes represent population membership. Width and color of -the edges represent distance.} \item{populations}{a vector -of the population names corresponding to the vertex colors} -\item{colors}{a vector of the hexadecimal representations -of the colors used in the vertex colors} +\item{graph}{a minimum spanning network with nodes corresponding to + MLGs within the data set. Colors of the nodes represent population + membership. Width and color of the edges represent distance.} + \item{populations}{a vector of the population names corresponding to the + vertex colors} \item{colors}{a vector of the hexadecimal representations of + the colors used in the vertex colors} } \description{ -Create a minimum spanning network of selected populations -using a distance matrix. +Create a minimum spanning network of selected populations using a distance +matrix. } \note{ -The edges of these graphs may cross each other if the graph -becomes too large. +The edges of these graphs may cross each other if the graph becomes too + large. } \examples{ # Load the data set and calculate the distance matrix for all individuals. @@ -116,9 +108,8 @@ round(E(micro.msn$graph)$weight, 3), NA), vertex.size=2, edge.label.color="red") Javier F. Tabima, Zhian N. Kamvar } \seealso{ -\code{\link{nancycats}}, \code{\link{upgma}}, -\code{\link{nj}}, \code{\link{nodelabels}}, -\code{\link{na.replace}}, \code{\link{missingno}}, -\code{\link{bruvo.msn}}, \code{\link{greycurve}}. +\code{\link{nancycats}}, \code{\link{upgma}}, \code{\link{nj}}, + \code{\link{nodelabels}}, \code{\link{na.replace}}, + \code{\link{missingno}}, \code{\link{bruvo.msn}}, \code{\link{greycurve}}. } diff --git a/man/popsub.Rd b/man/popsub.Rd index 54c1dc41..afdf6b75 100755 --- a/man/popsub.Rd +++ b/man/popsub.Rd @@ -1,33 +1,32 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{popsub} \alias{popsub} -\title{Subset a \code{\link{genind}} object by population} +\title{Subset a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object by population} \usage{ -popsub(pop, sublist = "ALL", blacklist = NULL, mat = NULL, drop = TRUE) +popsub(gid, sublist = "ALL", blacklist = NULL, mat = NULL, drop = TRUE) } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{gid}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object.} - \item{sublist}{a \code{vector} of population names or - indexes that the user wishes to keep. Default to "ALL".} +\item{sublist}{a \code{vector} of population names or indexes that the user +wishes to keep. Default to "ALL".} - \item{blacklist}{a \code{vector} of population names or - indexes that the user wishes to discard. Default to - \code{NULL}} +\item{blacklist}{a \code{vector} of population names or indexes that the user +wishes to discard. Default to \code{NULL}} - \item{mat}{a \code{matrix} object produced by - \code{\link{mlg.table}} to be subsetted. If this is - present, the subsetted matrix will be returned instead of - the genind object} +\item{mat}{a \code{matrix} object produced by \code{\link{mlg.table}} to be +subsetted. If this is present, the subsetted matrix will be returned instead +of the genind object} - \item{drop}{\code{logical}. If \code{TRUE}, unvariate - alleles will be dropped from the population.} +\item{drop}{\code{logical}. If \code{TRUE}, unvariate alleles will be dropped +from the population.} } \value{ A \code{genind} object or a matrix. } \description{ -Create a new dataset with specified populations or exclude -specified populations from the dataset. +Create a new dataset with specified populations or exclude specified +populations from the dataset. } \examples{ # Load the dataset microbov. @@ -47,4 +46,7 @@ micbig <- popsub(microbov, blacklist=c("NDama", "Montbeliard")) miclrg <- popsub(microbov, sublist=c("BlondeAquitaine", "Charolais")) } } +\author{ +Zhian N. Kamvar +} diff --git a/man/population-methods.Rd b/man/population-methods.Rd new file mode 100644 index 00000000..bc47fcb2 --- /dev/null +++ b/man/population-methods.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\docType{methods} +\name{setpop} +\alias{setpop} +\alias{setpop,genclone-method} +\alias{setpop<-} +\alias{setpop<-,genclone-method} +\title{Manipulate the population factor of genclone objects.} +\usage{ +setpop(x, formula = NULL) + +setpop(x) <- value +} +\arguments{ +\item{x}{a genclone object} + +\item{formula}{a nested formula indicating the order of the population +hierarchy.} + +\item{value}{same as formula} +} +\description{ +The following methods allow the user to quickly change the population of a +genclone object. +} +\examples{ +data(Aeut) +Aeut.gc <- as.genclone(Aeut) + +# Notice that there are two hierarchical levels, Pop and Subpop +Aeut.gc + +# Currently set on just Pop +head(pop(Aeut.gc)) + +# setting the hierarchy to both Pop and Subpop +setpop(Aeut.gc) <- ~Pop/Subpop +head(pop(Aeut.gc)) + +\dontrun{ + +# Can be used to create objects as well. +Aeut.old <- setpop(Aeut.gc, ~Pop) +head(pop(Aeut.old)) +} +} +\author{ +Zhian N. Kamvar +} + diff --git a/man/private_alleles.Rd b/man/private_alleles.Rd new file mode 100644 index 00000000..287d5d66 --- /dev/null +++ b/man/private_alleles.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{private_alleles} +\alias{private_alleles} +\title{Tabulate alleles the occur in only one population.} +\usage{ +private_alleles(gid, report = "table", level = "population") +} +\arguments{ +\item{gid}{a \code{\linkS4class{genind}} or \code{\linkS4class{genclone}} + object.} + +\item{report}{one of \code{"table", "vector",} or \code{"data.frame"}. Tables + (Default) and data frame will report counts along with populations or + individuals. Vectors will simply report which populations or individuals + contain private alleles. Tables are matrices with populations or + individuals in rows and alleles in columns. Data frames are long form.} + +\item{level}{one of \code{"population"} (Default) or \code{"individual"}.} +} +\value{ +a matrix, data.frame, or vector defining the populations or + individuals containing private alleles. If vector is chosen, alleles are + not defined. +} +\description{ +Tabulate alleles the occur in only one population. +} +\examples{ +data(Pinf) # Load P. infestans data. +setpop(Pinf) <- ~Country # Set the population to be at the country level +private_alleles(Pinf) +\dontrun{ +# An example of how this data can be displayed. +library(ggplot2) +Pinfpriv <- private_alleles(Pinf, report = "data.frame") +ggplot(Pinfpriv) + geom_tile(aes(x = population, y = allele, fill = count)) +} +} + diff --git a/man/read.genalex.Rd b/man/read.genalex.Rd index d14a2eac..ed5cbf42 100755 --- a/man/read.genalex.Rd +++ b/man/read.genalex.Rd @@ -1,46 +1,79 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{read.genalex} \alias{read.genalex} \title{Importing data from genalex formatted *.csv files.} \usage{ -read.genalex(genalex, ploidy = 2, geo = FALSE, region = FALSE) +read.genalex(genalex, ploidy = 2, geo = FALSE, region = FALSE, + genclone = TRUE, sep = ",") } \arguments{ - \item{genalex}{a *.csv file exported from genalex} +\item{genalex}{a *.csv file exported from genalex} - \item{ploidy}{indicate the ploidy of the dataset} +\item{ploidy}{indicate the ploidy of the dataset} - \item{geo}{indicates the presence of geographic data in - the file.} +\item{geo}{indicates the presence of geographic data in the file. This data + will be included in a data frame labeled \code{xy} in the + \code{\link{other}} slot.} - \item{region}{indicates the presence of regional data in - the file.} +\item{region}{indicates the presence of regional data in the file.} + +\item{genclone}{when \code{TRUE} (default), the output will be a + \code{\linkS4class{genclone}} object. When \code{FALSE}, the output will be + a \code{\linkS4class{genind}} object} + +\item{sep}{A character specifying the column separator of the data. Defaults + to ",".} } \value{ -A \code{\link{genind}} object. +A \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object. } \description{ -read.genalex will read in a genalex-formatted file that has -been exported in a comma separated format and will parse -most types of genalex data. The output is a -\code{\link{genind}} object. +read.genalex will read in a genalex-formatted file that has been exported in +a comma separated format and will parse most types of genalex data. The +output is a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +object. } -\note{ -This function cannot handle raw allele frequency data. +\details{ +\subsection{if \code{genclone = FALSE}}{ The resulting genind object + will have a data frame in the \code{other} slot called + \code{population_hierarchy}. This will contain a column for your population + data and a column for your Regional data if you have set the flag.} + + \subsection{if \code{genclone = TRUE}}{ The resulting genclone object will + have a single hierarchical level defined in the hierarchy slot. This will + be called "Pop" and will reflect the population factor defined in the + genalex input. If \code{region = TRUE}, a second column will be inserted + and labeled "Region". If you have more than two hierarchical levels within + your data set, you should run the command \code{\link{splithierarchy}} on + your data set to define the unique hierarchical levels. } -The resulting genind object will have a data frame in the -\code{other} slot called population_hierarchy. This will -contain a column for your population data and a column for -your Regional data if you have set the flag. + \subsection{FOR POLYPLOID (> 2n) DATA SETS}{ Adegenet's genind object has + an all-or-none approach to missing data. If a sample has missing data at a + particular locus, then the entire locus is considered missing. This works + for diploids and haploids where allelic dosage is unambiguous. For + polyploids this poses a problem as much of the data set would be + transformed into missing data. With this function, I have created a + workaround. -If there is geographic data, it will be included in a data -frame called xy in the \code{other} slot. + When importing polyploid data sets, missing data is scored as "0" and kept + within the genind object as an extra allele. This will break most analyses + relying on allele frequencies*. All of the functions in poppr will work + properly with these data sets as multilocus genotype analysis is agnostic + of ploidy and we have written both Bruvo's distance and the index of + association in such a way as to be able to handle polyploids presented in + this manner. + + * To restore functionality of analyses relying on allele frequencies, use + the \code{\link{recode_polyploids}} function.} +} +\note{ +This function cannot handle raw allele frequency data. -In the case that there are duplicated names within the -file, this function will assume separate individuals and -rename each one to a sequence of integers from 1 to the -number of individuals. A vector of the original names will -be saved in the \code{other} slot under -\code{original_names}. + In the case that there are duplicated names within the file, this function + will assume separate individuals and rename each one to a sequence of + integers from 1 to the number of individuals. A vector of the original + names will be saved in the \code{other} slot under \code{original_names}. } \examples{ \dontrun{ @@ -60,6 +93,7 @@ genalex4 <- read.genalex("genalex4.csv", region=TRUE, geo=TRUE) Zhian N. Kamvar } \seealso{ -\code{\link{clonecorrect}}, \code{\link{genind}} +\code{\link{clonecorrect}}, \code{\linkS4class{genclone}}, + \code{\linkS4class{genind}}, \code{\link{recode_polyploids}} } diff --git a/man/recode_polyploids.Rd b/man/recode_polyploids.Rd new file mode 100644 index 00000000..c46f6759 --- /dev/null +++ b/man/recode_polyploids.Rd @@ -0,0 +1,109 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand +\name{recode_polyploids} +\alias{recode_polyploids} +\title{Recode polyploid microsatellite data for use in frequency based statistics.} +\usage{ +recode_polyploids(poly, newploidy = poly@ploidy) +} +\arguments{ +\item{poly}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} +object that has a ploidy of >2} + +\item{newploidy}{an \code{integer}. This gives the user the option to reset + the ploidy of the data set. It's default is set to the ploidy of the + incoming data set.} +} +\value{ +a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object. +} +\description{ +As the genind object requires ploidy to be consistent across loci, a +workaround to importing polyploid data was to code missing alleles as "0" +(for microsatellite data sets). The advantage of this is that users would be +able to calculate Bruvo's distance, the index of association, and genotypic +diversity statistics. The tradeoff was the fact that this broke all other +analyses as they relied on allele frequencies and the missing alleles are +treated as extra alleles. This function removes those alleles and returns a +\code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object where +allele frequencies are coded based on the number of alleles observed at a +single locus per individual. See the examples for more details. +} +\details{ +The genind object has two caveats that make it difficult to work + with polyploid data sets: \enumerate{\item ploidy must be constant + throughout the data set \item missing data is treated as "all-or-none"} In + an ideal world, polyploid genotypes would be just as unambigouous as + diploid or haploid genotypes. Unfortunately, the world we live in is far + from ideal and a genotype of AB in a tetraploid organism could be AAAB, + AABB, or ABBB. In order to get polyploid data in to \pkg{adegenet} or + \pkg{poppr}, we must code all loci to have the same number of allelic + states as the ploidy or largest observed heterozygote (if ploidy is + unknown). The way to do this is to insert zeroes to pad the alleles. So, to + import two genotypes of: +\tabular{rrrr}{ +NA \tab 20 \tab 23 \tab 24\cr +20 \tab 24 \tab 26 \tab 43 +} +they should be coded as: +\tabular{rrrr}{ + 0 \tab 20 \tab 23 \tab 24\cr +20 \tab 24 \tab 26 \tab 43 +} +This zero is treated as an extra allele and is represented in the genind object as so: +\tabular{rrrrrr}{ +\strong{0} \tab \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr +0.25 \tab 0.25 \tab 0.25 \tab 0.25 \tab 0.00 \tab 0.00\cr +0.00 \tab 0.25 \tab 0.00 \tab 0.25 \tab 0.25 \tab 0.25 +} + + A homozygote would have the \strong{0} column at a value of 0.75. This + function remidies this problem by removing the zero column and rescaling the allele + frequencies to those observed. The above table would become: +\tabular{rrrrr}{ +\strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr +0.333 \tab 0.333 \tab 0.333 \tab 0.00 \tab 0.00\cr +0.25 \tab 0.00 \tab 0.25 \tab 0.25 \tab 0.25 +} + +With this, the user is able to calculate frequency based statistics on the +data set. +} +\note{ +This is an approximation, and a bad one at that. \pkg{Poppr} was not +originally intended for polyploids, but with the inclusion of Bruvo's +distance, it only made sense to attempt something beyond single use. + +\strong{Do not use recoded data with Bruvo's distance or the index of association.} +} +\examples{ +data(Pinf) +iPinf <- recode_polyploids(Pinf) + +# Obtaining basic summaries. Note the heterozygosity measures. +summary(Pinf) +summary(iPinf) + +\dontrun{ +library(ape) + +# Removing missing data. +Pinf <- missingno(Pinf, "geno", cutoff = 0) +iPinf <- recode_polyploids(Pinf) + +# Calculating Rogers' distance. +rog <- rogers.dist(Pinf) +irog <- rogers.dist(iPinf) + +# We will now plot neighbor joining trees. Note the decreased distance in the +# original data. +plot(nj(rog), type = "unrooted", show.tip.label = FALSE) +add.scale.bar(lcol = "red") +plot(nj(irog), type = "unrooted", show.tip.label = FALSE) +add.scale.bar(lcol = "red") +} +} +\author{ +Zhian N. Kamvar +} + diff --git a/man/shufflepop.Rd b/man/shufflepop.Rd index d0d043c6..18cd38d2 100755 --- a/man/shufflepop.Rd +++ b/man/shufflepop.Rd @@ -1,41 +1,38 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{shufflepop} \alias{shufflepop} -\title{Shuffle individuals in a \code{\link{genind}} object independently over each -locus.} +\title{Shuffle individuals in a \code{\linkS4class{genclone}} or +\code{\linkS4class{genind}} object independently over each locus.} \usage{ shufflepop(pop, method = 1) } \arguments{ - \item{pop}{a \code{\link{genind}} object} +\item{pop}{a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} + object} - \item{method}{an integer between 1 and 4. See details - below.} +\item{method}{an integer between 1 and 4. See details below.} } \value{ -a \code{\link{genind}} object shuffled by a specified -method +a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object + shuffled by a specified method } \description{ -Shuffle individuals in a \code{\link{genind}} object -independently over each locus. +Shuffle individuals in a \code{\linkS4class{genclone}} or +\code{\linkS4class{genind}} object independently over each locus. } -\section{Details}{ - This function will shuffle each locus in the data set - independently of one another, rendering them essentially - unlinked. The following methods are available to shuffle - your data: \enumerate{ \item \strong{Permute Alleles} - This will redistribute all alleles in the sample - throughout the locus. Missing data is fixed in place. - This maintains allelic structure, but heterozygosity is - variable. \item \strong{Parametric Bootstrap} This will - redistribute available alleles within the locus based on - their allelic frequencies. This means that both the - allelic state and heterozygosity will vary. The resulting - data set will not have missing data. \item - \strong{Non-Parametric Bootstrap} This will shuffle the - allelic state for each individual. Missing data is fixed - in place. \item \strong{Multilocus Style Permutation} - This will shuffle the genotypes at each locus, +\details{ +This function will shuffle each locus in the data set independently + of one another, rendering them essentially unlinked. The following methods + are available to shuffle your data: \enumerate{ \item \strong{Permute + Alleles} This will redistribute all alleles in the sample throughout the + locus. Missing data is fixed in place. This maintains allelic structure, + but heterozygosity is variable. \item \strong{Parametric Bootstrap} This + will redistribute available alleles within the locus based on their allelic + frequencies. This means that both the allelic state and heterozygosity will + vary. The resulting data set will not have missing data. \item + \strong{Non-Parametric Bootstrap} This will shuffle the allelic state for + each individual. Missing data is fixed in place. \item \strong{Multilocus + Style Permutation} This will shuffle the genotypes at each locus, maintaining the heterozygosity and allelic structure. } } \examples{ @@ -65,8 +62,7 @@ summary(shufflepop(Zebu, method=4)) Zhian N. Kamvar } \references{ -Paul-Michael Agapow and Austin Burt. 2001. Indices of -multilocus linkage disequilibrium. \emph{Molecular Ecology -Notes}, 1(1-2):101-102 +Paul-Michael Agapow and Austin Burt. 2001. Indices of multilocus + linkage disequilibrium. \emph{Molecular Ecology Notes}, 1(1-2):101-102 } diff --git a/man/splitcombine.Rd b/man/splitcombine.Rd index 416dc6f2..899c296e 100755 --- a/man/splitcombine.Rd +++ b/man/splitcombine.Rd @@ -1,59 +1,57 @@ +% Generated by roxygen2 (4.0.1): do not edit by hand \name{splitcombine} \alias{splitcombine} -\title{Split a or combine items within a data frame in \code{\link{genind}} objects.} +\title{Split a or combine items within a data frame in \code{\link{genind}} objects (DEPRECATED).} \usage{ splitcombine(pop, method = 1, dfname = "population_hierarchy", sep = "_", - hier = c(1), setpopulation = TRUE, fixed = TRUE) + hier = 1, setpopulation = TRUE, fixed = TRUE) } \arguments{ - \item{pop}{a \code{\link{genind}} object.} +\item{pop}{a \code{\link{genind}} object.} - \item{method}{an \code{integer}, 1 for splitting, 2 for - combining.} +\item{method}{an \code{integer}, 1 for splitting, 2 for combining.} - \item{dfname}{the name of the data frame containing the - population structure. for the splitting method, the - combined population structure must be in the first - column.} +\item{dfname}{the name of the data frame containing the population structure. +for the splitting method, the combined population structure must be in the +first column.} - \item{sep}{The separator used for separating or combining - the data. See note.} +\item{sep}{The separator used for separating or combining the data. See note.} - \item{hier}{a \code{vector} containing the population - hierarchy you wish to split or combine.} +\item{hier}{a \code{vector} containing the population hierarchy you wish to +split or combine.} - \item{setpopulation}{\code{logical}. if \code{TRUE}, the - population of the resulting genind object will be that of - the highest population structure (split method) or the - combined populations (combine method).} +\item{setpopulation}{\code{logical}. if \code{TRUE}, the population of the +resulting genind object will be that of the highest population structure +(split method) or the combined populations (combine method).} - \item{fixed}{\code{logical}. An argument to be passed - onto \code{\link{strsplit}}. If \code{TRUE}, \code{sep} - must match exactly to the populations for the split - method.} +\item{fixed}{\code{logical}. An argument to be passed onto +\code{\link{strsplit}}. If \code{TRUE}, \code{sep} must match exactly to the +populations for the split method.} } \value{ -a \code{\link{genind}} object with a modified data frame in -the \code{\link{other}} slot. +a \code{\link{genind}} object with a modified data frame in the +\code{\link{other}} slot. } \description{ -Often, one way a lot of file formats fail is that they do -not allow multiple population hierarchies. This can be -circumvented, however, by coding all of the hierarchies in -one string in the input file with a common separator (eg. -"_"). \code{splitcombine} will be able to recognise those -separators and create a data frame of all the population -structures for whatever subsetting you might need. +Often, one way a lot of file formats fail is that they do not allow multiple +population hierarchies. This can be circumvented, however, by coding all of +the hierarchies in one string in the input file with a common separator (eg. +"_"). \code{splitcombine} will be able to recognise those separators and +create a data frame of all the population structures for whatever subsetting +you might need. } \note{ -The separator field is sensitive to regular expressions. If -you do not know what those are, please use the default -underscore to separate your populations. Use \code{fixed = -TRUE} to ignore regular expressions. If you do not set the -\code{hier} flag for the split method, your new data frame -will have the names "comb", "h1", "h2" and so on; for the -combine method, your data frame will return the first -column of your data frame. +This function has been deprecated and replaced by functions like +\code{\link{splithierarchy}}. Please consider using the +\code{\linkS4class{genclone}} object for storing hierarchies. + +The separator +field is sensitive to regular expressions. If you do not know what those are, +please use the default underscore to separate your populations. Use \code{fixed += TRUE} to ignore regular expressions. If you do not set the \code{hier} flag +for the split method, your new data frame will have the names "comb", "h1", "h2" +and so on; for the combine method, your data frame will return the first column +of your data frame. } \examples{ \dontrun{ diff --git a/poppr.Rproj b/poppr.Rproj index f1341ea7..1934cb2f 100644 --- a/poppr.Rproj +++ b/poppr.Rproj @@ -9,12 +9,12 @@ UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 -RnwWeave: Sweave +RnwWeave: knitr LaTeX: pdfLaTeX BuildType: Package -PackageInstallArgs: --no-multiarch --with-keep.source +PackageInstallArgs: --no-multiarch --with-keep.source --install-tests PackageBuildArgs: --resave-data --compact-vignettes=gs+qpdf -PackageBuildBinaryArgs: --html, --latex, --build +PackageBuildBinaryArgs: --html --latex PackageCheckArgs: --as-cran PackageRoxygenize: rd,collate,namespace diff --git a/src/permut_shuffler.c b/src/permut_shuffler.c index b85809eb..98a4b824 100644 --- a/src/permut_shuffler.c +++ b/src/permut_shuffler.c @@ -94,3 +94,112 @@ SEXP permute_shuff(SEXP locus, SEXP alleles, SEXP allele_freq, SEXP ploidy) return Rout; } + +/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A slightly faster method of permuting alleles at a locus. + +Inputs: + locus - The matrix to be permuted. Used for reference. + alleles - a vector of integers from 0 to n alleles indicating the matrix cols + allele_freq - 1/ploidy + ploidy - self explanitory + ploidvec - a vector describing the ploidy of each sample. + zero_col - An integer describing the column that contains the padding zero + for the ploidy. + +Outputs; + Rout - the permuted matrix. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ + +SEXP new_permute_shuff(SEXP locus, SEXP alleles, SEXP allele_freq, SEXP ploidy, + SEXP ploidvec, SEXP zero_col) +{ + int rows, cols, i, j, count = 0, ploid, p, miss = 0, zc; + SEXP Rout; + SEXP Rdim; + Rdim = getAttrib(locus, R_DimSymbol); + rows = INTEGER(Rdim)[0]; + cols = INTEGER(Rdim)[1]; + PROTECT(Rout = allocMatrix(REALSXP, rows, cols)); + alleles = coerceVector(alleles, INTSXP); + ploidy = coerceVector(ploidy, INTSXP); + ploid = INTEGER(ploidy)[0]; + ploidvec = coerceVector(ploidvec, INTSXP); + zc = INTEGER(zero_col)[0]; + double *loc = REAL(locus), *outmat = REAL(Rout), *afreq = REAL(allele_freq); + int *alle = INTEGER(alleles), *pv = INTEGER(ploidvec); + for(i = 0; i < rows; i++) + { + // loop through all columns first and initialize + for(j = 0; j < cols; j++) + { + // maintain missing values + if (ISNA(loc[i + j*rows])) + { + outmat[i + j*rows] = loc[i + j*rows]; + miss = 1; + } + else + { + if (j == zc) + { + outmat[i + j*rows] = *(afreq) * (ploid - pv[i]); + } + else + { + outmat[i + j*rows] = 0.0; + } + } + } + if (miss == 1) + { + miss = 0; + } + else + { + // permute the alleles by adding the allele frequency + for(p = 0; p < pv[i]; p++) + { + outmat[i + alle[count++]*rows] += *(afreq); + } + } + } + UNPROTECT(1); + return Rout; +} + +/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Method for expanding indices for bootstrapping. Only slightly faster than R +version, but seems to scale better. + +Inputs: + indices - cumulative sum of the number of alleles at each locus. + length - number of loci + +Outputs; + res - a list the same length as the number of loci with continuous numbers. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ +SEXP expand_indices(SEXP indices, SEXP length) { + SEXP res; + SEXP tempvec; + int rows, i, j, *ind = INTEGER(indices); + int max, min = 1; + rows = INTEGER(length)[0]; + PROTECT(res = allocVector(VECSXP, rows)); + for (i = 0; i < rows; i++) + { + max = ind[i]; + int veclength = 1 + max - min; + PROTECT(tempvec = allocVector(INTSXP, veclength)); + for (j = 0; j < veclength ; j++) + { + INTEGER(tempvec)[j] = min + j; + } + SET_VECTOR_ELT(res, i, tempvec); + UNPROTECT(1); + min = ind[i] + 1; + } + UNPROTECT(1); + return res; +} diff --git a/src/poppr_distance.c b/src/poppr_distance.c index 3c608760..687435be 100755 --- a/src/poppr_distance.c +++ b/src/poppr_distance.c @@ -41,10 +41,90 @@ #include #include #include -int count; +int perm_count; double bruvo_dist(int *in, int *nall, int *perm, int *woo); +double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int *add); void permute(int *a, int i, int n, int *c); int fact(int x); +double mindist(int perms, int alleles, int *perm, double *dist); +void genome_add_calc(int perms, int alleles, int *perm, double *dist, + int zeroes, int *zero_ind, int curr_zero, int miss_ind, int *replacement, + int inds, int curr_ind, double *genome_add_sum, int *tracker); +void genome_loss_calc(int *genos, int nalleles, int *perm_array, int woo, + int *loss, int *add, int *zero_ind, int curr_zero, int zeroes, + int miss_ind, int curr_allele, double *genome_loss_sum, + int *loss_tracker); +void fill_short_geno(int *genos, int nalleles, int *perm_array, int *woo, + int *loss, int *add, int zeroes, int *zero_ind, int curr_zero, + int miss_ind, int *replacement, int inds, int curr_ind, double *res, + int *tracker); + + + +SEXP raw_pairdiffs(SEXP mat, SEXP ploidy) +{ + char binary_diffs, homozygote; + int count, row, col, bitcount, hz, a1, a2, ht;//, derp, i, j, k; + //SEXP Rout; + SEXP Rdim; + //SEXP dvector; + //SEXP Dvector; + Rdim = getAttrib(mat, R_DimSymbol); + row = INTEGER(Rdim)[0]; + col = INTEGER(Rdim)[1]; + /* + ploidy = coerceVector(ploidy, INTSXP); + PROTECT(dvector = allocVector(dvector, col)); + PROTECT(Dvector = allocVector(Dvector, ((row*(row-1)/2)))); + //mat = coerceVector(mat, RAWSXP); + for(i = 0; i < row - 1; i+ploidy) + { + for(j = i+ploidy; j < row; j+ploidy) + { + count = 0; + for(k = 0; k < col; k++) + { + binary_diffs = RAW(mat)[i+col*ploidy+k] ^ RAW(mat)[j+col*ploidy+k]; + for(bitcount = 7; bitcount >= 0; bitcount--) + { + dvector[count] += (binary_diffs >> bitcount) & 0x01; + } + count++; + } + } + } +*/ + + for(count = 0; count < row*col; count++) + { + if(count < (row*col)-1 && count % 2 == 0) + { + binary_diffs = RAW(mat)[count] ^ RAW(mat)[count+1]; + homozygote = RAW(mat)[count] & RAW(mat)[count+1]; + } + else + { + goto out; + } + for(bitcount = 7; bitcount >= 0; bitcount--) + { + //printf("Genotype:\t\t%d\n", (RAW(mat)[count] >> bitcount) & 0x01); + if(count < (row*col)-1) + { + hz = (homozygote >> bitcount) & 0x01; + a1 = (RAW(mat)[count] >> bitcount) & 0x01; + a2 = (RAW(mat)[count + 1] >> bitcount) & 0x01; + ht = (binary_diffs >> bitcount) & 0x01; + Rprintf("%d AND %d:\t%d\t\t", a1, a2, hz); + Rprintf("%d XOR %d:\t%d\t\t", a1, a2, ht); + Rprintf("RESULT:\t%d\n", hz+ht); + } + } + out: + Rprintf("\n"); + } + return R_NilValue; +} /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Calculates the root product of pairwise comparisons of each of the variances of each locus. @@ -83,6 +163,7 @@ Output: A vector of length n*(n-1)/2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ SEXP pairdiffs(SEXP freq_mat) { + int I, J, i, j, z, count; double P; SEXP Rout; @@ -129,7 +210,7 @@ SEXP permuto(SEXP perm) IMPORTANT: INITIALIZE THE COUNTER. THE POINTER IS NOT RELEASED FROM MEMORY OTHERWISE. */ - count = 0; + perm_count = 0; perm = coerceVector(perm, INTSXP); per = INTEGER(perm)[0]; int allele_array[per]; @@ -151,6 +232,8 @@ calcluated regardless of ploidy. For more information, see Bruvo et al. 2006 bruvo_mat - a matrix of individuals by loci, one column per allele. permutations - a vector of indeces for permuting the number of alleles. alleles - the ploidy of the population. +m_loss - an indicator for the genome loss model +m_add - an indicator for the genome addition model Returns: @@ -164,25 +247,29 @@ with missing data. In the wrapping R function, 100s will be converted to NAs and then the average over all loci will be taken. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ -SEXP single_bruvo(SEXP b_mat, SEXP permutations, SEXP alleles) +SEXP single_bruvo(SEXP b_mat, SEXP permutations, SEXP alleles, SEXP loss, SEXP add) { - int A, P, *pA, *pP; - SEXP Rval; - - P = length(permutations); - alleles = coerceVector(alleles, INTSXP); + int A, P, *pA, *pP; + SEXP Rval; + //SEXP Rdim; + P = length(permutations); + alleles = coerceVector(alleles, INTSXP); + loss = coerceVector(loss, INTSXP); + add = coerceVector(add, INTSXP); A = INTEGER(alleles)[0]; pA = &A; pP = &P; - b_mat = coerceVector(b_mat, INTSXP); - permutations = coerceVector(permutations, INTSXP); - PROTECT(Rval = allocVector(REALSXP, 1)); - REAL(Rval)[0] = bruvo_dist(INTEGER(b_mat), pA, INTEGER(permutations), pP); - UNPROTECT(1); - return Rval; + b_mat = coerceVector(b_mat, INTSXP); + permutations = coerceVector(permutations, INTSXP); + PROTECT(Rval = allocVector(REALSXP, 1)); + REAL(Rval)[0] = test_bruvo_dist(INTEGER(b_mat), pA, INTEGER(permutations), + pP, INTEGER(loss), INTEGER(add)); + UNPROTECT(1); + return Rval; } -SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles) + +SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles, SEXP m_loss, SEXP m_add) { /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ I = number of rows in bruvo_mat @@ -194,7 +281,7 @@ SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles) A matrix in R is built row by row. That's why there is a triple 'for' loop. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ - int I, J, A, P, i, j, a, count = 0, *pA, *pP; + int I, J, A, P, i, j, a, count = 0, *pA, *pP;//, add, loss, *padd, *ploss; //Initialization of R vectors. SEXP Rdim; SEXP Rval; @@ -207,6 +294,8 @@ SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles) A = INTEGER(alleles)[0]; pA = &A; pP = &P; + m_loss = coerceVector(m_loss, INTSXP); + m_add = coerceVector(m_add, INTSXP); bruvo_mat = coerceVector(bruvo_mat, INTSXP); permutations = coerceVector(permutations, INTSXP); // Protecting the vectors that will be modified. Rval is the output @@ -240,7 +329,8 @@ SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles) INTEGER(pair_matrix)[z] = INTEGER(bruvo_mat)[j+(a+z-A)*I]; } // Calculating Bruvo's distance over these two. - REAL(Rval)[count++] = bruvo_dist(INTEGER(pair_matrix), pA, INTEGER(permutations), pP); + REAL(Rval)[count++] = test_bruvo_dist(INTEGER(pair_matrix), pA, + INTEGER(permutations), pP, INTEGER(m_loss), INTEGER(m_add)); } } } @@ -248,13 +338,14 @@ SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles) return Rval; } -/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Internal C Functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ +/*============================================================================== +================================================================================ +* Internal C Functions +================================================================================ +==============================================================================*/ -/* - The algorithm for the permutation function is modified from: - http://www.geeksforgeeks.org/archives/767 */ +/* The algorithm for the permutation function is modified from: +* http://www.geeksforgeeks.org/archives/767 */ /* Function to swap values at two pointers */ void swap (int *x, int *y) @@ -266,12 +357,12 @@ void swap (int *x, int *y) } /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Function to print permutations of string - This function takes four parameters: - 1. String - 2. Starting index of the string - 3. Ending index of the string. - 4. pointer to array of size n*n! +* Function to print permutations of string +* This function takes four parameters: +* 1. String +* 2. Starting index of the string +* 3. Ending index of the string. +* 4. pointer to array of size n*n! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ void permute(int *a, int i, int n, int *c) { @@ -279,26 +370,26 @@ void permute(int *a, int i, int n, int *c) if (i == n) { /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 'a' will be the array containing the numeric sequence to be - permuted. It will be reshuffled into a new pattern each - time it reaches this control structure. To place the value - into the array 'c', the pointer for a needs to be incremented - over all its elements. + * 'a' will be the array containing the numeric sequence to be + * permuted. It will be reshuffled into a new pattern each + * time it reaches this control structure. To place the value + * into the array 'c', the pointer for a needs to be incremented + * over all its elements. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ - count += n+1; - int ind = count; + perm_count += n+1; + int ind = perm_count; for(j = n; j >= 0; j--) { - c[--ind] = *(a+j); + c[--ind] = *(a + j); } } else { for (j = i; j <= n; j++) { - swap((a+i), (a+j)); - permute(a, i+1, n, c); - swap((a+i), (a+j)); //backtrack + swap((a + i), (a + j)); + permute(a, i + 1, n, c); + swap((a + i), (a + j)); //backtrack } } } @@ -306,17 +397,19 @@ void permute(int *a, int i, int n, int *c) /* A factorial function for calculating permutations */ int fact(int x) { - int f=1; + int f = 1; int u; - for (u=x; u>1; u--) + for (u = x; u > 1; u--) { - f*=u; + f *= u; } return f; } /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + DEPRECATED + This will calculate bruvo's distance between two individuals. All that needs to be done from here is to have it do the pairwise calculations. @@ -336,7 +429,7 @@ int fact(int x) double bruvo_dist(int *in, int *nall, int *perm, int *woo) { int i, j, counter=0, n = 2, p = *nall, w = *woo, genos[2][p]; - double dist[p][p], da, res, minn=100; + double dist[p][p], da, res = 0, minn=100; // reconstruct the genotype table. for(i=0; i < n; i++) { @@ -345,8 +438,6 @@ double bruvo_dist(int *in, int *nall, int *perm, int *woo) // Missing data will return with distance of 100 if(in[counter] == 0) { - /* THIS WILL BE THE PLACE TO PUT A NEW FUNCTION FOR SPECIAL - CASES OF BRUVO'S DISTANCE */ return minn; } else @@ -361,7 +452,7 @@ double bruvo_dist(int *in, int *nall, int *perm, int *woo) { for(i=0; i < p; i++) { - da = 1- pow(2 ,-abs(genos[0][i]-genos[1][j])); + da = 1- pow(2 , -abs(genos[0][i] - genos[1][j])); dist[i][j] = da; } } @@ -389,3 +480,511 @@ double bruvo_dist(int *in, int *nall, int *perm, int *woo) } return minn/p; } + +/* Test code comparing current status to polysat's Bruvo2.distance: +================================================================================ +poppr_bruvo <- function(){ + return(c(.Call("single_bruvo", c(20,23,24,0,20,24,26,43), .Call("permuto", 4), 4, 0, 0), +.Call("single_bruvo", c(20,23,24,0,20,24,26,43), .Call("permuto", 4), 4, 1, 0), +.Call("single_bruvo", c(20,23,24,0,20,24,26,43), .Call("permuto", 4), 4, 0, 1), +.Call("single_bruvo", c(20,23,24,0,20,24,26,43), .Call("permuto", 4), 4, 1, 1) +)) +} + +polysat_bruvo <- function(){ + return(c(Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=FALSE, add=FALSE), +Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=T, add=FALSE), +Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=F, add=T), +Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=T, add=T) +)) +} + +library(polysat) +polysat_bruvo() +poppr_bruvo() +polysat_bruvo() == poppr_bruvo() +==============================================================================*/ +double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int *add) +{ + int i, j, counter = 0, n = 2, p = *nall, w = *woo, loss_indicator = *loss, + add_indicator = *add, genos[2][p], zerocatch[2], zero_ind[2][p], + zerodiff; + double dist[p][p], da, minn = 100, *distp; + // reconstruct the genotype table. + zerocatch[0] = 0; + zerocatch[1] = 0; + for(i=0; i < n; i++) + { + for(j = 0; j < p; j++) + { + // Catch missing data here. + if(in[counter] == 0) + { + if (zerocatch[i] == p - 1) + { + return minn; + } + zerocatch[i] += 1; + //printf("#"); + zero_ind[i][zerocatch[i] - 1] = j; + } + genos[i][j] = in[counter++]; + //printf("%d\t", genos[i][j]); + } + //printf("\n"); + } + //printf("\n"); + zerodiff = abs(zerocatch[0] - zerocatch[1]); + /*========================================================================== + * Removing superfluous zeroes from the data. This is in the case that both + * of the genotypes contain one or more zeroes. + * + * smaller and larger refer to the size of the genotypes. + ==========================================================================*/ + if (zerocatch[0] > 0 && zerocatch[1] > 0) + { + int smaller = 0, larger = 1, reduction = 0, i, j, + zero_counter, *perm_array, *new_genop; + + if (zerodiff == 0) + { + reduction = p - zerocatch[0]; + } + else + { + if(zerocatch[0] < zerocatch[1]) + { + smaller = 1; + larger = 0; + } + reduction = p - (zerocatch[smaller] - zerodiff); + } + int del = 1; + if (del > 0) + { + int new_alleles[reduction]; + for (i = 0; i < reduction; i++) + { + new_alleles[i] = i; + } + w = fact(reduction) * reduction; + perm_array = (int *) malloc(w * sizeof(int)); + perm_count = 0; + permute(new_alleles, 0, reduction - 1, perm_array); + // rebuild the array and make a pointer. + int new_geno[reduction*n]; + counter = 0; + for (i=0; i < n; i++) + { + zero_counter = zerocatch[larger]; + for (j = 0; j < p; j++) + { + if (genos[i][j] == 0 && zero_counter > 0) + { + zero_counter--; + } + else + { + new_geno[counter++] = genos[i][j]; + } + } + } + new_genop = (int *) &new_geno; + + minn = test_bruvo_dist(new_genop, &reduction, perm_array, &w, + &loss_indicator, &add_indicator); + free(perm_array); + } + + /* Questions of the proper way of permuting this arise: + * 1. If both of the genotypes are of equal length, but the user wants + * a non genome - addition model, how do we undertake that? + * should we simply run all combinations of both genotypes + * separately? + * 2. If one genotype is longer than the other, should we fill the + * larger or smaller genotype, or possibly, should we fill both and + * do a similar procedure as I described above? + + + else + { + int fill_tracker = 0, *pzero_ind, large_inds[p - zerocatch[larger]], + large_counter = 0, *plarge_inds; + double res = 0; + pzero_ind = (int *) &zero_ind[larger]; + plarge_inds = (int *) &large_inds; + for (i = 0; i < p; i++) + { + if (genos[larger][i] > 0) + { + large_inds[large_counter++] = i; + } + } + for (i = 0; i < reduction; i++) + { + fill_short_geno(in, p, perm, woo, loss, add, zerocatch[larger], + pzero_ind, 0, larger, plarge_inds, reduction, i, &res, + &fill_tracker); + } + minn = res/fill_tracker; + } + */ + return minn; + } + + // Construct distance matrix of 1 - 2^{-|x|}. + // This is constructed column by column. Genotype 1 in the rows. Genotype 2 + // in the columns. + for(j = 0; j < p; j++) + { + for(i = 0; i < p; i++) + { + da = 1 - pow(2, -abs(genos[0][i] - genos[1][j])); + dist[i][j] = da; + } + } + // This avoids warning: assignment from incompatible pointer type + distp = (double *) &dist; + if(zerocatch[0] > 0 || zerocatch[1] > 0) + { + int *genop, ind, miss_ind = 1, z, tracker = 0, loss_tracker = 0;// full_ind = 0; + double genome_add_sum = 0, genome_loss_sum = 0; + genop = (int *) &genos; + if (zerocatch[0] > 0) // The rows contain the zero value + { + miss_ind = 0; + //full_ind = 1; + } + ind = zero_ind[miss_ind][0]; + int short_inds[zerocatch[miss_ind]], short_counter = 0; + for (i = 0; i < p; i++) + { + if (genos[miss_ind][i] > 0) + { + short_inds[short_counter++] = i; + } + } + /*====================================================================== + * INFINITE MODEL + * Infinite model will simply replace the distance of the comparisons + * containing the missing allele to 1. + ======================================================================*/ + if(loss_indicator != 1 && add_indicator != 1) + { + for (z = 0; z < zerocatch[miss_ind]; z++) + { + ind = zero_ind[miss_ind][z]; + if (zerocatch[0] > 0) + { + for (j = 0; j < p; j++) + { + dist[ind][j] = 1; + } + } + else + { + for (j = 0; j < p; j++) + { + dist[j][ind] = 1; + } + } + } + return mindist(w, p, perm, distp)/p; + } + /*====================================================================== + * GENOME ADDITION MODEL + * Genome Addition model uses the observed values of the short + * genotype for the replacement allele. This is achieved by simply + * shifting the columns or rows of the distance matrix and + * recalculating the minimum distance. + ======================================================================*/ + if (add_indicator == 1) + { + int *pzero_ind, *pshort_inds; + pzero_ind = (int *) &zero_ind[miss_ind]; + pshort_inds = (int *) &short_inds; + for (i = 0; i < p - zerocatch[miss_ind]; i++) + { + genome_add_calc(w, p, perm, distp, zerocatch[miss_ind], + pzero_ind, 0, miss_ind, pshort_inds, p-zerocatch[miss_ind], + i, &genome_add_sum, &tracker); + } + } + /*====================================================================== + * GENOME LOSS MODEL + * Genome Loss model uses the alleles from the larger genotype to + * reconstruct the allelic state of the smaller. This means that + * they need to be replaced and passed through the function again. + ======================================================================*/ + if (loss_indicator == 1) + { + int *pzero_ind; + pzero_ind = (int *) &zero_ind[miss_ind]; + for (i = 0; i < p; i++) + { + genome_loss_calc(genop, p, perm, w, &loss_indicator, + &add_indicator, pzero_ind, 0, zerocatch[miss_ind], + miss_ind, i, &genome_loss_sum, &loss_tracker); + } + } + if (tracker == 0) + { + tracker = 1; + } + if (loss_tracker == 0) + { + loss_tracker = 1; + } + genome_loss_sum = genome_loss_sum/loss_tracker; + genome_add_sum = genome_add_sum/tracker; + int comparison_factor = loss_indicator + add_indicator; + return (genome_add_sum + genome_loss_sum)/(p*comparison_factor); + } + return mindist(w, p, perm, distp)/p; +} + + +/*============================================================================== +* GENOME ADDITION MODEL +* This will replace the portions of the distance matrix generated via the +* missing values with columns (or rows) of observed values in a combinatorial +* way. This will involve recursion. +* +* Arguments: +* --------- +* perms - number of permutations (passed to mindist) +* alleles - number of maximum alleles (also passed to mindist) +* *perm - permutation array (passed to mindist) +* *dist - distance matrix to be manipulated (also passed to mindist) +* +* zeroes - number of zeroes present in the shorter genotype. +* *zero_ind - array containing indices of the zero values of the short geno +* curr_zero - the index of the current zero index for zero_ind +* miss_ind - the index for the genotype with missing data. Necessary for +* determining rows or columns +* *replacement - array containing indices of replacement genotypes. +* inds - number of replacement genotypes. +* curr_ind - the index of the current replacement genotype. +* +* *genome_add_sum - pointer to the total number of the genome addition model. +* *tracker - pointer to counter for the number of calculations for addition. +==============================================================================*/ +void genome_add_calc(int perms, int alleles, int *perm, double *dist, + int zeroes, int *zero_ind, int curr_zero, int miss_ind, int *replacement, + int inds, int curr_ind, double *genome_add_sum, int *tracker) +{ + int i,j; + //========================================================================== + // Part 1: fill one row/column of the matrix. + // Note that we don't have the format of the 2D array here, so we are + // cheating a little bit. Instead of indexing by dist[i][j] over p columns, + // we use dist[i + p*j]. It works. + //========================================================================== + if(miss_ind > 0) + { + for (j = 0; j < alleles; j++) + { + dist[zero_ind[curr_zero] + alleles*j] = + dist[replacement[curr_ind] + alleles*j]; + } + } + else + { + for (j = 0; j < alleles; j++) + { + dist[j + alleles*zero_ind[curr_zero]] = + dist[j + alleles*replacement[curr_ind]]; + } + } + //========================================================================== + // Part 2: Iterate through the rest of the possible combinations. + // + // The first for loop iterates through all possible individuals. + // The first if loop will check if there are any more slots to be filled. + // if there aren't, then the minimum distance will be calculated on the + // matrix as it stands and then the sum will be returned. + //========================================================================== + for (i = curr_ind; i < inds; i++) + { + if (curr_zero < zeroes - 1) + { + genome_add_calc(perms, alleles, perm, dist, zeroes, zero_ind, + ++curr_zero, miss_ind, replacement, inds, i, genome_add_sum, + tracker); + if (curr_zero == zeroes - 1) + { + return; + } + } + else + { + *genome_add_sum += mindist(perms, alleles, perm, dist); + *tracker += 1; + if (zeroes == 1 || i == inds - 1) + { + return; + } + } + curr_zero--; + + } + return; +} + +/*============================================================================== +* Genome Loss Model +* Replace missing alleles in the shorter genotype with all possible +* combinations of alleles in the larger genotype and recall bruvo_dist. There +* are choose((n+k-1), k) possible combinations where n is the number of +* alleles in the larger genotype and k is the number of missing alleles in the +* shorter genotype. +* +* Arguments: +* --------- +* PASSED TO BRUVO_DIST: +* *genos - genotype array +* nalleles - number of maximum alleles. +* *perm_array - permutation array. +* *woo - nalleles * nalleles! +* *loss - genome loss model indicator. +* *add - genome addition indicator. +* +* UNIQUE TO THIS FUNCTION: +* *zero_ind - array containing indices of the zero values of the short geno +* curr_zero - the index of the current zero index for zero_ind +* zeroes - number of zeroes present in the shorter genotype. +* miss_ind - the index for the genotype with missing data. +* curr_allele - the current index for the replacement alleles of the full geno +* +* *genome_loss_sum - pointer to the total number of the genome loss model. +* *loss_tracker - pointer to counter for the number of calculations for loss. +==============================================================================*/ +void genome_loss_calc(int *genos, int nalleles, int *perm_array, int woo, + int *loss, int *add, int *zero_ind, int curr_zero, int zeroes, + int miss_ind, int curr_allele, double *genome_loss_sum, + int *loss_tracker) +{ + int i, full_ind; + full_ind = 1 + (0 - miss_ind); + genos[miss_ind*nalleles + zero_ind[curr_zero]] = + genos[full_ind*nalleles + curr_allele]; + for (i = curr_allele; i < nalleles; i++) + { + if (curr_zero < zeroes - 1) + { + genome_loss_calc(genos, nalleles, perm_array, woo, loss, add, + zero_ind, ++curr_zero, zeroes, miss_ind, i, genome_loss_sum, + loss_tracker); + if (curr_zero == zeroes - 1) + { + return; + } + } + else + { + *genome_loss_sum += test_bruvo_dist(genos, &nalleles, perm_array, + &woo, loss, add)*nalleles; + *loss_tracker += 1; + if (zeroes == 1 || i == nalleles - 1) + { + return; + } + } + curr_zero--; + } + return; +} + +/*============================================================================== +* Notes for fill_short_geno: This will act much in the same way as +* genome_loss_calc, except it will fill the shorter genotype with all possible +* combinations of that genotype before sending it through test_bruvo_dist with +* one full genotype. +* +* Things that need to be set before running this: +* - replacement is an array of the non-missing alleles from the shorter +* genotype. +* - inds is the number of non-missing alleles. +* - *res will be minn +* - *tracker will count the number of iterations this goes through in order +* to get an average. +==============================================================================*/ +void fill_short_geno(int *genos, int nalleles, int *perm_array, int *woo, + int *loss, int *add, int zeroes, int *zero_ind, int curr_zero, + int miss_ind, int *replacement, int inds, int curr_ind, double *res, + int *tracker) +{ + int i; //full_ind; + //full_ind = 1 + (0 - miss_ind); + genos[miss_ind*nalleles + zero_ind[curr_zero]] = + genos[miss_ind*nalleles + replacement[curr_ind]]; + for (i = curr_ind; i < inds; i++) + { + if (curr_zero < zeroes - 1) + { + fill_short_geno(genos, nalleles, perm_array, woo, loss, add, zeroes, + zero_ind, ++curr_zero, miss_ind, replacement, inds, i, res, + tracker); + if (curr_zero == zeroes - 1) + { + return; + } + } + else + { + *res += test_bruvo_dist(genos, &nalleles, perm_array, woo, loss, + add); + *tracker += 1; + if (zeroes == 1 || i == nalleles - 1) + { + return; + } + } + curr_zero--; + } + return; +} + + + +/* +// Multiset coefficient: fact(n+k-1)/(fact(k)*fact(n-1)) +*/ + +double mindist(int perms, int alleles, int *perm, double *dist) +{ + int i, j, w = perms, p = alleles, counter = 0; + double res = 0, minn = 100; + for(i = 0; i < w; i += p) + { + for(j = 0; j < p; j++) + { + if (j == 0) + { + res = dist[*(perm + counter++) + p*j]; + if(res > minn) + { + j = p; + counter = i + w/p; + i = counter; + } + } + else + { + res += dist[*(perm + counter++) + p*j]; + if(j < p-1 && res > minn) + { + counter += (p-j-1); + j = p; + } + } + } + /* Checking if the new calculated distance is smaller than the smallest + distance seen. */ + if ( res < minn ) + { + minn = res; + } + } + return minn; +} diff --git a/tests/test-all.R b/tests/test-all.R new file mode 100644 index 00000000..61789125 --- /dev/null +++ b/tests/test-all.R @@ -0,0 +1,2 @@ +library(testthat) +test_check("poppr") \ No newline at end of file diff --git a/tests/testthat/test-amova.R b/tests/testthat/test-amova.R new file mode 100644 index 00000000..ca8a7e38 --- /dev/null +++ b/tests/testthat/test-amova.R @@ -0,0 +1,23 @@ +context("Amova tests") + +test_that("Amova returns published values", { + data(Aeut, package = "poppr") + res <- poppr.amova(Aeut, ~Pop/Subpop, quiet = TRUE) + rescc <- poppr.amova(Aeut, ~Pop/Subpop, quiet = TRUE, clonecorrect = TRUE) + expect_that(res$componentsofcovariance[, 2], equals(c(70.0067859292295, + 8.40748251295027, + 21.5857315578203, + 100))) + expect_that(res$componentsofcovariance[, 1], equals(c(11.0634458464745, + 1.3286673034988, + 3.41127747798475, + 15.8033906279581))) + expect_that(rescc$componentsofcovariance[, 2], equals(c(66.7776803325885, + 6.10535452127678, + 27.1169651461347, + 100))) + expect_that(rescc$componentsofcovariance[, 1], equals(c(10.4131525344064, + 0.952054452775842, + 4.22855500416477, + 15.593761991347))) + }) \ No newline at end of file diff --git a/tests/testthat/test-distpop.R b/tests/testthat/test-distpop.R new file mode 100644 index 00000000..986c6a1b --- /dev/null +++ b/tests/testthat/test-distpop.R @@ -0,0 +1,16 @@ +context("Population Distance Tests") + +test_that("dist.genpop matches distance", { + data(nancycats, package = "adegenet") + nanpop <- genind2genpop(nancycats, quiet = TRUE) + nei <- as.vector(nei.dist(nanpop)) + edwards <- as.vector(edwards.dist(nanpop)) + reynolds <- as.vector(reynolds.dist(nanpop)) + rogers <- as.vector(rogers.dist(nanpop)) + provesti <- as.vector(provesti.dist(nanpop)) + expect_that(as.vector(dist.genpop(nanpop, method = 1)), equals(nei)) + expect_that(as.vector(dist.genpop(nanpop, method = 2)), equals(edwards)) + expect_that(as.vector(dist.genpop(nanpop, method = 3)), equals(reynolds)) + expect_that(as.vector(dist.genpop(nanpop, method = 4)), equals(rogers)) + expect_that(as.vector(dist.genpop(nanpop, method = 5)), equals(provesti)) +}) \ No newline at end of file diff --git a/tests/testthat/test-genclone.R b/tests/testthat/test-genclone.R new file mode 100644 index 00000000..b8d81ec8 --- /dev/null +++ b/tests/testthat/test-genclone.R @@ -0,0 +1,40 @@ +context("Genclone coercion tests") + +test_that("A genclone object contains a genind object", { + + data(partial_clone, package = "poppr") + pc <- as.genclone(partial_clone) + expect_that(slotNames(partial_clone), equals(slotNames(pc)[-c(1:2)])) + expect_that(partial_clone@tab, equals(pc@tab)) + expect_that(partial_clone@loc.names, is_identical_to(pc@loc.names)) + expect_that(partial_clone@loc.nall, is_identical_to(pc@loc.nall)) + expect_that(partial_clone@all.names, is_identical_to(pc@all.names)) + expect_that(partial_clone@ind.names, is_identical_to(pc@ind.names)) + expect_that(partial_clone@pop, is_identical_to(pc@pop)) + expect_that(partial_clone@pop.names, is_identical_to(pc@pop.names)) + expect_that(partial_clone@ploidy, is_identical_to(pc@ploidy)) + expect_that(partial_clone@type, is_identical_to(pc@type)) + expect_that(partial_clone@other, is_identical_to(pc@other)) + expect_that(pc@mlg, is_identical_to(mlg.vector(partial_clone))) +}) + +test_that("Hierarchy methods work for genclone objects.", { + data(Aeut, package = "poppr") + agc <- as.genclone(Aeut) + expect_that(length(gethierarchy(agc)), equals(3)) + expect_that(agc@pop.names, equals(c(P1 = "Athena", P2 = "Mt. Vernon"))) + expect_that({agcsplit <- splithierarchy(agc, ~Pop/Subpop)}, gives_warning()) + expect_that(gethierarchy(agcsplit), equals(gethierarchy(agc, ~Pop/Subpop, combine = FALSE))) + expect_that(sethierarchy(agc, gethierarchy(agcsplit)), equals(agcsplit)) + namehierarchy(agcsplit) <- ~Field/Core + expect_that(names(gethierarchy(agcsplit)), equals(c("Field", "Core"))) + setpop(agc) <- ~Pop/Subpop + expect_that(agc@pop.names, equals(c("Athena_1", "Athena_2", "Athena_3", + "Athena_4", "Athena_5", "Athena_6", + "Athena_7", "Athena_8", "Athena_9", + "Athena_10", "Mt. Vernon_1", + "Mt. Vernon_2", "Mt. Vernon_3", + "Mt. Vernon_4", "Mt. Vernon_5", + "Mt. Vernon_6", "Mt. Vernon_7", + "Mt. Vernon_8"))) +}) \ No newline at end of file diff --git a/tests/testthat/test-greyscale.R b/tests/testthat/test-greyscale.R new file mode 100644 index 00000000..1fc7e62b --- /dev/null +++ b/tests/testthat/test-greyscale.R @@ -0,0 +1,10 @@ +context("Greyscale tests") + +test_that("rerange works as expected", { + xnorm <- rnorm(1000) + xrange <- seq(0, 1, length = 100) + xrange2 <- seq(0, 2, length = 100) + expect_that(range(poppr:::rerange(xnorm)), equals(c(0, 1))) + expect_that(range(poppr:::rerange(xrange)), equals(c(0, 1))) + expect_that(range(poppr:::rerange(xrange2)), equals(c(0, 1))) +}) \ No newline at end of file diff --git a/tests/testthat/test-missing.R b/tests/testthat/test-missing.R new file mode 100644 index 00000000..8d3466a1 --- /dev/null +++ b/tests/testthat/test-missing.R @@ -0,0 +1,18 @@ +context("Missing tests") + +data(nancycats, package = "adegenet") + +test_that("missingno removes genotypes", { + expect_that(genmiss <- missingno(nancycats, "geno", cutoff = 0.05), prints_text("38 genotypes contained missing values greater than 5%.")) + expect_that(nInd(genmiss), equals(199)) +}) + +test_that("missingno removes loci", { + expect_that(locmiss <- missingno(nancycats, "loci", cutoff = 0.05), prints_text("2 loci contained missing values greater than 5%.")) + expect_that(nLoc(locmiss), equals(7)) +}) + +test_that("missingno matches na.replace", { + expect_identical(missingno(nancycats, "zero", quiet = TRUE), na.replace(nancycats, "0", quiet = TRUE)) + expect_identical(missingno(nancycats, "mean", quiet = TRUE), na.replace(nancycats, "mean", quiet = TRUE)) +}) \ No newline at end of file diff --git a/tests/testthat/test-mlg.R b/tests/testthat/test-mlg.R new file mode 100644 index 00000000..6074ae9f --- /dev/null +++ b/tests/testthat/test-mlg.R @@ -0,0 +1,149 @@ +context("Multilocus genotype tests") + +test_that("multilocus genotype vector is same length as samples", { + data(Aeut, package = "poppr") + data(partial_clone, package = "poppr") + data(nancycats, package = "adegenet") + amlg <- mlg.vector(Aeut) + pmlg <- mlg.vector(partial_clone) + nmlg <- mlg.vector(nancycats) + expect_that(length(amlg), equals(nInd(Aeut))) + expect_that(length(pmlg), equals(nInd(partial_clone))) + expect_that(length(nmlg), equals(nInd(nancycats))) + expect_that(length(unique(amlg)), equals(mlg(Aeut, quiet = TRUE))) + expect_that(length(unique(pmlg)), equals(mlg(partial_clone, quiet = TRUE))) + expect_that(length(unique(nmlg)), equals(mlg(nancycats, quiet = TRUE))) +}) + +test_that("multilocus genotype matrix matches mlg.vector and data", { + data(Aeut, package = "poppr") + data(partial_clone, package = "poppr") + data(nancycats, package = "adegenet") + aclone <- as.genclone(Aeut) + atab <- mlg.table(Aeut, bar = FALSE) + ptab <- mlg.table(partial_clone, bar = FALSE) + ntab <- mlg.table(nancycats, bar = FALSE) + expect_that(nrow(atab), equals(length(Aeut@pop.names))) + expect_that(nrow(ptab), equals(length(partial_clone@pop.names))) + expect_that(nrow(ntab), equals(length(nancycats@pop.names))) + expect_that(ncol(atab), equals(mlg(Aeut, quiet = TRUE))) + expect_that(ncol(ptab), equals(mlg(partial_clone, quiet = TRUE))) + expect_that(ncol(ntab), equals(mlg(nancycats, quiet = TRUE))) + expect_that(sum(atab), equals(nInd(Aeut))) + expect_that(sum(ptab), equals(nInd(partial_clone))) + expect_that(sum(ntab), equals(nInd(nancycats))) +}) + +test_that("mlg.crosspop will work with subsetted genclone objects", { + data(Aeut, package = "poppr") + agc <- as.genclone(Aeut) + Athena <- popsub(agc, "Athena") + setpop(Athena) <- ~Subpop + expected_output <- structure(list(MLG.13 = structure(c(1L, 1L), .Names = c("8", +"9")), MLG.23 = structure(c(1L, 1L), .Names = c("4", "6")), MLG.24 = structure(c(1L, +1L), .Names = c("9", "10")), MLG.32 = structure(c(1L, 1L), .Names = c("7", +"9")), MLG.52 = structure(c(1L, 1L), .Names = c("5", "9")), MLG.63 = structure(c(1L, +1L), .Names = c("1", "5"))), .Names = c("MLG.13", "MLG.23", "MLG.24", +"MLG.32", "MLG.52", "MLG.63")) + expected_mlgout <- c(13, 23, 24, 32, 52, 63) + + expect_that(x <- mlg.crosspop(Athena, quiet = TRUE), equals(expected_output)) + expect_that(y <- mlg.crosspop(Athena, indexreturn = TRUE), equals(expected_mlgout)) + expect_warning(z <- mlg.crosspop(Athena, mlgsub = c(14, 2:5)), "The following multilocus genotypes are not defined in this dataset: 2, 3, 4, 5") +}) + +test_that("mlg.id Aeut works", { + data(Aeut, package = "poppr") + expected_output <- structure(list(`1` = "55", `2` = c("101", "103"), `3` = "111", + `4` = "112", `5` = "110", `6` = "102", `7` = "20", `8` = "7", + `9` = "68", `10` = "69", `11` = "73", `12` = "75", `13` = c("72", + "80"), `14` = c("74", "76", "77"), `15` = "79", `16` = c("4", + "9"), `17` = c("3", "8"), `18` = "95", `19` = "94", `20` = c("22", + "23", "24", "25", "27", "28", "29", "30", "31"), `21` = "60", + `22` = "43", `23` = c("38", "59"), `24` = c("84", "90"), + `25` = "63", `26` = "5", `27` = "71", `28` = "32", `29` = "78", + `30` = "26", `31` = c("89", "92"), `32` = c("65", "81"), + `33` = "53", `34` = "51", `35` = c("46", "48", "50"), `36` = c("45", + "47"), `37` = "88", `38` = "87", `39` = "56", `40` = "91", + `41` = "82", `42` = "6", `43` = "83", `44` = "13", `45` = "17", + `46` = "85", `47` = "61", `48` = "62", `49` = "66", `50` = "64", + `51` = "15", `52` = c("52", "86"), `53` = "2", `54` = "115", + `55` = "151", `56` = "113", `57` = "42", `58` = "109", `59` = c("159", + "57"), `60` = c("67", "70"), `61` = "58", `62` = "49", `63` = c("1", + "54"), `64` = "96", `65` = "40", `66` = c("33", "34", "36", + "39", "41"), `67` = "37", `68` = "35", `69` = c("145", "146", + "148", "149"), `70` = c("124", "126", "127", "131", "133" + ), `71` = "156", `72` = c("152", "154"), `73` = "116", `74` = c("139", + "140", "141"), `75` = c("134", "135", "137", "142", "147" + ), `76` = c("125", "162"), `77` = c("160", "168", "170"), + `78` = c("169", "177"), `79` = "175", `80` = c("107", "108", + "117", "120", "121", "122", "164", "167", "172", "183"), + `81` = c("130", "182"), `82` = "99", `83` = "100", `84` = "114", + `85` = "157", `86` = "98", `87` = c("158", "171"), `88` = c("123", + "166"), `89` = "118", `90` = c("128", "163"), `91` = c("104", + "173"), `92` = "132", `93` = "10", `94` = "11", `95` = "180", + `96` = c("138", "144"), `97` = c("181", "184", "185", "186" + ), `98` = "143", `99` = c("136", "165"), `100` = "150", `101` = c("174", + "187"), `102` = "176", `103` = c("178", "179"), `104` = "129", + `105` = "153", `106` = "119", `107` = "161", `108` = "97", + `109` = "93", `110` = "18", `111` = "21", `112` = "12", `113` = "16", + `114` = "19", `115` = "155", `116` = "106", `117` = "105", + `118` = "14", `119` = "44"), .Names = c("1", "2", "3", "4", + "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", + "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", + "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", + "49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", + "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70", + "71", "72", "73", "74", "75", "76", "77", "78", "79", "80", "81", + "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", + "93", "94", "95", "96", "97", "98", "99", "100", "101", "102", + "103", "104", "105", "106", "107", "108", "109", "110", "111", + "112", "113", "114", "115", "116", "117", "118", "119")) + x <- mlg.id(Aeut) + Avec <- mlg.vector(Aeut) + expect_that(x, equals(expected_output)) + expect_that(length(x), equals(length(unique(Avec)))) + expect_that(sapply(x, length), is_equivalent_to(as.vector(table(Avec)))) + expect_that(names(x[1]), equals("1")) + }) + +test_that("mlg.id Pinf works", { + data(Pinf, package = "poppr") + expected_output <- structure(list(`1` = "PiEC06", `4` = "PiMX03", `5` = "PiMX04", + `6` = "PiMXT01", `7` = "PiPE03", `8` = "PiPE01", `10` = "PiPE07", + `11` = "PiPE06", `12` = c("PiPE10", "PiPE26"), `13` = "PiMX01", + `14` = "PiEC02", `15` = "PiPE04", `17` = c("PiCO01", "PiCO03", + "PiCO04"), `19` = "PiEC03", `21` = "PiMX42", `22` = "PiEC01", + `23` = "PiPE09", `24` = "PiCO02", `25` = "PiPE05", `30` = "PiMX07", + `33` = "PiMX20", `34` = c("PiMX48", "PiMX49", "PiMX50"), + `35` = "PiPE13", `36` = c("PiPE11", "PiPE12", "PiPE14"), + `37` = "PiMX06", `38` = "PiMX02", `39` = "PiMX12", `40` = "PiMXT06", + `41` = "PiMX19", `42` = "PiMX17", `45` = "PiMX13", `46` = "PiMX24", + `47` = c("PiPE02", "PiPE08"), `50` = "PiMX23", `51` = "PiMX10", + `52` = "PiMX29", `53` = "PiMX05", `54` = "PiCO05", `55` = "PiMXT07", + `56` = "PiMX11", `57` = "PiMX26", `58` = "PiMX22", `59` = "PiMX14", + `61` = "PiMX18", `62` = "PiMX15", `63` = c("PiPE22", "PiPE24", + "PiPE25"), `68` = "PiPE23", `69` = "PiEC10", `71` = "PiPE21", + `72` = "PiPE20", `74` = "PiEC12", `75` = c("PiEC13", "PiEC14" + ), `77` = "PiMX28", `79` = "PiEC11", `80` = "PiMX16", `83` = "PiEC08", + `84` = "PiEC07", `93` = "PiMX30", `94` = "PiMX41", `95` = "PiMX27", + `96` = "PiMX43", `97` = c("PiMX44", "PiMX45", "PiMX46", "PiMX47" + ), `98` = "PiMX25", `99` = "PiMX40", `104` = "PiMXT02", `105` = "PiMXT05", + `106` = "PiPE27", `109` = "PiMXT03", `110` = "PiMX21", `115` = "PiMXT04", + `116` = "PiMXt48", `117` = "PiMXt68"), .Names = c("1", "4", + "5", "6", "7", "8", "10", "11", "12", "13", "14", "15", "17", + "19", "21", "22", "23", "24", "25", "30", "33", "34", "35", "36", + "37", "38", "39", "40", "41", "42", "45", "46", "47", "50", "51", + "52", "53", "54", "55", "56", "57", "58", "59", "61", "62", "63", + "68", "69", "71", "72", "74", "75", "77", "79", "80", "83", "84", + "93", "94", "95", "96", "97", "98", "99", "104", "105", "106", + "109", "110", "115", "116", "117")) + x <- mlg.id(Pinf) + Pvec <- mlg.vector(Pinf) + expect_that(x, equals(expected_output)) + expect_that(length(x), equals(length(unique(Pvec)))) + expect_that(sapply(x, length), is_equivalent_to(as.vector(table(Pvec)))) + expect_that(names(x[1]), equals("1")) +}) + diff --git a/tests/testthat/test-polyploids.R b/tests/testthat/test-polyploids.R new file mode 100644 index 00000000..ca92542f --- /dev/null +++ b/tests/testthat/test-polyploids.R @@ -0,0 +1,15 @@ +context("Polyploid Tests") + +test_that("recode_polyploids works as expected", { + testdf <- data.frame(test = c("00/20/23/24", "20/24/26/43")) + testgid <- df2genind(testdf, ploidy = 4, sep = "/") + testrec <- recode_polyploids(testgid) + expect_equal(testgid@ploidy, testrec@ploidy) + expect_equivalent(testgid@tab[2, -1], testrec@tab[2, ]) + expect_equivalent(testrec@tab[1, ], c(1/3, 1/3, 1/3, 0, 0)) +}) + +test_that("recode_polyploids won't take diploid data", { + data(nancycats, package = "adegenet") + expect_warning(recode_polyploids(nancycats), "Input is not a polyploid data set, returning original.") +}) \ No newline at end of file diff --git a/tests/testthat/test-poppr.R b/tests/testthat/test-poppr.R new file mode 100644 index 00000000..ff07d800 --- /dev/null +++ b/tests/testthat/test-poppr.R @@ -0,0 +1,69 @@ +context("Poppr table tests") + +test_that("poppr returns expected PA values", { + data(Aeut, package = "poppr") + A.tab <- poppr(Aeut, quiet = TRUE) + comparison <- structure(list(Pop = structure(1:3, + .Label = c("Athena", "Mt. Vernon", "Total"), + class = "factor"), + N = c(97, 90, 187), + MLG = c(70, 50, 119), + eMLG = c(65.9808393377379, 50, 68.4525682280634), + SE = c(1.24567688244012, 0, 2.98857840353058), + H = c(4.06272002528149, 3.66843094399907, 4.55798828426928), + G = c(42.1928251121076, 28.7234042553191, 68.9723865877712), + Hexp = c(0.986469072164949, 0.976029962546817, 0.990799838997182), + E.5 = c(0.721008688944842, 0.725926650260449, 0.720112175857993), + Ia = c(2.90602921191748, 13.3024309367662, 14.3707995986407), + rbarD = c(0.0723700801886747, 0.281642324983496, 0.270617053778004), + File = structure(c(1L, 1L, 1L), class = "factor", .Label = "rootrot.csv")), + .Names = c("Pop", "N", "MLG", "eMLG", "SE", "H", "G", "Hexp", "E.5", "Ia", "rbarD", "File"), + row.names = c(NA, -3L), + class = c("popprtable", "data.frame")) + + expect_that(A.tab$Pop, is_equivalent_to(comparison$Pop)) + expect_that(A.tab$N, equals(comparison$N)) + expect_that(A.tab$MLG, equals(comparison$MLG)) + expect_that(A.tab$eMLG, equals(comparison$eMLG)) + expect_that(A.tab$SE, equals(comparison$SE)) + expect_that(A.tab$H, equals(comparison$H)) + expect_that(A.tab$G, equals(comparison$G)) + expect_that(A.tab$Hexp, equals(comparison$Hexp)) + expect_that(A.tab$E.5, equals(comparison$E.5)) + expect_that(A.tab$Ia, equals(comparison$Ia)) + expect_that(A.tab$rbarD, equals(comparison$rbarD)) +}) + +test_that("poppr returns expected codominant values", { + data(partial_clone, package = "poppr") + p.tab <- poppr(partial_clone, quiet = TRUE) + comparison <- structure(list(Pop = structure(1:5, .Label = c("1", "2", "3", +"4", "Total"), class = "factor"), N = c(13, 13, 12, 12, 50), + MLG = c(10, 12, 11, 9, 26), eMLG = c(9.46153846153846, 11.1538461538462, + 11, 9, 9.93701353621932), SE = c(0.498518515262143, 0.36080121229411, + 0, 0, 1.12881059579593), H = c(2.24503527412618, 2.45831132968308, + 2.36938211969468, 2.09472904752765, 3.07152395656842), G = c(8.89473684210526, + 11.2666666666667, 10.2857142857143, 7.2, 17.8571428571429 + ), Hexp = c(0.961538461538461, 0.987179487179487, 0.984848484848485, + 0.939393939393939, 0.963265306122449), E.5 = c(0.935312405238733, + 0.960842907662783, 0.958200460752105, 0.870390481833875, + 0.819311895784525), Ia = c(2.1580763424628, 1.87492360969648, + 1.15572679509632, 1.157153633392, 1.93513179817012), rbarD = c(0.243225877705591, + 0.212786561587854, 0.132460530412697, 0.13328661732193, 0.217470007471919 + ), File = structure(c(1L, 1L, 1L, 1L, 1L), class = "factor", .Label = "partial_clone.dat")), .Names = c("Pop", +"N", "MLG", "eMLG", "SE", "H", "G", "Hexp", "E.5", "Ia", "rbarD", +"File"), row.names = c(NA, -5L), class = c("popprtable", "data.frame" +)) + + expect_that(p.tab$Pop, is_equivalent_to(comparison$Pop)) + expect_that(p.tab$N, equals(comparison$N)) + expect_that(p.tab$MLG, equals(comparison$MLG)) + expect_that(p.tab$eMLG, equals(comparison$eMLG)) + expect_that(p.tab$SE, equals(comparison$SE)) + expect_that(p.tab$H, equals(comparison$H)) + expect_that(p.tab$G, equals(comparison$G)) + expect_that(p.tab$Hexp, equals(comparison$Hexp)) + expect_that(p.tab$E.5, equals(comparison$E.5)) + expect_that(p.tab$Ia, equals(comparison$Ia)) + expect_that(p.tab$rbarD, equals(comparison$rbarD)) +}) \ No newline at end of file diff --git a/tests/testthat/test-popsub.R b/tests/testthat/test-popsub.R new file mode 100644 index 00000000..d78ad820 --- /dev/null +++ b/tests/testthat/test-popsub.R @@ -0,0 +1,96 @@ +context("Population subset tests") + +test_that("subsetting needs a genind object", { + expect_error(popsub(1:10), "popsub requires a genind object\n") +}) + +test_that("sublist needs to match populations", { + data(microbov, package = "adegenet") + expect_error(popsub(microbov, "missingno"), + poppr:::unmatched_pops_warning(microbov@pop.names, "missingno")) +}) + +test_that("sum of the pops equal the whole", { + data(microbov, package = "adegenet") + mb1to7 <- popsub(microbov, 1:7) + mb8to15 <- popsub(microbov, 8:15) + expect_that(nInd(mb1to7) + nInd(mb8to15), equals(nInd(microbov))) + }) + +test_that("subsetting works with populations", { + data(nancycats, package = "adegenet") + temp <- nancycats@pop=="P04" | nancycats@pop=="P08" + p48 <- nancycats[temp, ] + p4 <- nancycats[nancycats@pop == "P04", , drop = TRUE] + nan48 <- popsub(nancycats, c(4, 8), drop = FALSE) + + # Matrices equivalent + expect_that(nan48@tab, equals(p48@tab)) + ## Dropping columns + expect_that(popsub(nan48, "4")@tab, equals(p4@tab)) + expect_that(popsub(nan48, 1)@tab, equals(p4@tab)) + expect_that(popsub(nancycats, 4)@tab, equals(p4@tab)) + # Populations equivalent + expect_that(as.character(pop(nan48)), is_identical_to(as.character(pop(p48)))) + # Individuals equivalent + expect_that(nan48@ind.names, is_identical_to(p48@ind.names)) + # Rejects unknown populations + expect_that(popsub(nancycats, 18), gives_warning()) + # Rejects equivalent blacklist and sublist + ## As numeric + expect_that(popsub(nancycats, sublist = 1, blacklist = 1), gives_warning()) + ## As characters + expect_that(popsub(nancycats, sublist = "1", blacklist = "1"), gives_warning()) + expect_that(popsub(nancycats, sublist = "1", blacklist = 1), gives_warning()) + expect_that(popsub(nancycats, sublist = 1, blacklist = "1"), gives_warning()) + expect_that(popsub(nancycats, sublist = c(4, 8), blacklist = c(4, 8)), + gives_warning()) + expect_that(popsub(nancycats, sublist = c(4, 8), blacklist = "8")@tab, + equals(p4@tab)) + # numeric and character are the same + expect_that(popsub(nancycats, c("4", "8"), drop = FALSE)@tab, equals(nan48@tab)) +}) + +test_that("subsetting doesn't work without populations", { + data(partial_clone, package = "poppr") + p1 <- 1:50 %% 4 == 1 + expect_that(nInd(popsub(partial_clone, 1)), equals(nInd(partial_clone[p1, ]))) + expect_that(popsub(partial_clone, 1)@tab, equals(partial_clone[p1, , drop = TRUE]@tab)) + pop(partial_clone) <- NULL + expect_that(popsub(partial_clone, 1), gives_warning()) +}) + +test_that("subsetting works with genclone objects", { + data(nancycats, package = "adegenet") + temp <- nancycats@pop=="P04" | nancycats@pop=="P08" + p48 <- nancycats[temp, ] + p4 <- nancycats[nancycats@pop == "P04", , drop = TRUE] + nancycats <- as.genclone(nancycats) + nan48 <- popsub(nancycats, c(4, 8), drop = FALSE) + + # Matrices equivalent + expect_that(nan48@tab, equals(p48@tab)) + ## Dropping columns + expect_that(popsub(nan48, "4")@tab, equals(p4@tab)) + expect_that(popsub(nan48, 1)@tab, equals(p4@tab)) + expect_that(popsub(nancycats, 4)@tab, equals(p4@tab)) + # Populations equivalent + expect_that(as.character(pop(nan48)), is_identical_to(as.character(pop(p48)))) + # Individuals equivalent + expect_that(nan48@ind.names, is_identical_to(p48@ind.names)) + # Rejects unknown populations + expect_that(popsub(nancycats, 18), gives_warning()) + # Rejects equivalent blacklist and sublist + ## As numeric + expect_that(popsub(nancycats, sublist = 1, blacklist = 1), gives_warning()) + ## As characters + expect_that(popsub(nancycats, sublist = "1", blacklist = "1"), gives_warning()) + expect_that(popsub(nancycats, sublist = "1", blacklist = 1), gives_warning()) + expect_that(popsub(nancycats, sublist = 1, blacklist = "1"), gives_warning()) + expect_that(popsub(nancycats, sublist = c(4, 8), blacklist = c(4, 8)), + gives_warning()) + expect_that(popsub(nancycats, sublist = c(4, 8), blacklist = "8")@tab, + equals(p4@tab)) + # numeric and character are the same + expect_that(popsub(nancycats, c("4", "8"), drop = FALSE)@tab, equals(nan48@tab)) +}) diff --git a/tests/testthat/test-values.R b/tests/testthat/test-values.R new file mode 100644 index 00000000..69caf0e3 --- /dev/null +++ b/tests/testthat/test-values.R @@ -0,0 +1,32 @@ +context("Analytical value tests") + +test_that("Bruvo's distance works as expected.", { + testdf <- data.frame(test = c("00/20/23/24", "20/24/26/43")) + testgid <- df2genind(testdf, ploidy = 4, sep = "/") + addloss <- as.vector(bruvo.dist(testgid, add = FALSE, loss = FALSE)) + ADDloss <- as.vector(bruvo.dist(testgid, add = TRUE, loss = FALSE)) + addLOSS <- as.vector(bruvo.dist(testgid, add = FALSE, loss = TRUE)) + ADDLOSS <- as.vector(bruvo.dist(testgid, add = TRUE, loss = TRUE)) + expect_that(addloss, equals(0.46875000000000)) + expect_that(ADDloss, equals(0.34374987334013)) + expect_that(addLOSS, equals(0.458333164453506)) + expect_that(ADDLOSS, equals(0.401041518896818)) +}) + +test_that("Dissimilarity distance works as expected.", { + data(nancycats, package = "adegenet") + nan1 <- popsub(nancycats, 1) + nanmat <- diss.dist(nan1, mat = TRUE) + expect_that(diss.dist(nan1), is_a("dist")) + expect_that(nanmat, is_a("matrix")) + #expect_that(diss.dist(nan1, mat = TRUE, percent = FALSE), equals(nanmat*2*9)) + #expect_that(nanmat[2, 1], equals(0.222222222222222)) + expect_that(diss.dist(nan1, mat = TRUE, percent = TRUE), equals((nanmat/2)/9)) + expect_that(nanmat[2, 1], equals(4)) +}) + +test_that("Index of association works as expected.", { + data(Aeut, package = "poppr") + res <- c(Ia = 14.3707995986407, rbarD = 0.270617053778004) + expect_that(ia(Aeut), equals(res)) +}) \ No newline at end of file diff --git a/vignettes/.gitignore b/vignettes/.gitignore index c947d857..b5d45b3c 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -22,7 +22,6 @@ poppr_manual-subnor.pdf poppr_manual-bruvo.msn.pdf poppr_manual.tex poppr_manual.log -poppr_manual.pdf poppr_manual.toc poppr_manual.bbl poppr_manual.synctex.gz @@ -58,4 +57,6 @@ poppr_manual-112.pdf *.bbl *.toc *.log -figure +*.blg +figures/ +figure/ diff --git a/vignettes/algo-concordance.tex b/vignettes/algo-concordance.tex new file mode 100644 index 00000000..16650ce6 --- /dev/null +++ b/vignettes/algo-concordance.tex @@ -0,0 +1,2 @@ +\Sconcordance{concordance:algo.tex:algo.Rnw:% +1 64 1 46 0 1 8 413 1 4 0 22 1 11 0 48 1} diff --git a/vignettes/algo.Rnw b/vignettes/algo.Rnw new file mode 100644 index 00000000..609a7d57 --- /dev/null +++ b/vignettes/algo.Rnw @@ -0,0 +1,555 @@ +\documentclass[letterpaper]{article} +%\VignetteIndexEntry{Algorightms and Equations} +%\VignetteEngine{knitr::knitr} +\usepackage{graphicx} +\usepackage[colorlinks = true, + urlcolor = blue, + citecolor = blue, + linkcolor = blue]{hyperref} +\usepackage{array} +\usepackage{color} +\usepackage[usenames,dvipsnames,svgnames,table]{xcolor} +\usepackage[utf8]{inputenc} % for UTF-8/single quotes from sQuote() +\usepackage{fullpage} +\usepackage{mathtools} +\usepackage{makeidx} +\usepackage{longtable} + +% for bold symbols in mathmode +\usepackage{bm} +\newcommand{\R}{\mathbb{R}} +\newcommand{\m}[1]{\mathbf{#1}} +\newcommand{\tab}{\hspace*{1em}} +\newcolumntype{H}{>{\setbox0=\hbox\bgroup}c<{\egroup}@{}} +\newcommand{\cmdlink}[2]{% + \texttt{\hyperref[#1]{#2}}% +} +\newcommand{\seclink}[2]{% + \textsc{\hyperref[#1]{#2}}% +} + +\newcommand{\poppr}{\textit{poppr}} +\newcommand{\Poppr}{\textit{Poppr}} +\newcommand{\adegenet}{\textit{adegenet}} +\newcommand{\Adegenet}{\textit{Adegenet}} +\newcommand{\tline}{ + \noindent + \rule{\textwidth}{1pt} + \par +} +\newcommand{\bline}{ + \noindent + \rule{\textwidth}{1pt} + \kern1pt +} + +\newcommand{\jala}{ + \includegraphics[height = 5mm, keepaspectratio=true]{jalapeno-poppers} +} + +\newcommand{\revjala}{ + \scalebox{-1}[1]{\jala{}} +} + +\title{Algorithms and equations utilized in poppr version 1.1.0} +\author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) +Department of Botany and Plant Pathology, Oregon State University, Corvallis, +OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, +Corvallis, OR}} + + + + +\begin{document} + +<>= +knitr::opts_knit$set(out.format = "latex") +thm <- knitr::knit_theme$get("acid") +knitr::knit_theme$set(thm) +knitr::opts_chunk$set(concordance=TRUE) +knitr::opts_chunk$set(size = 'footnotesize', message = FALSE, warning = FALSE) +knitr::opts_chunk$set(out.width = '0.95\\linewidth', fig.align = "center", fig.show = 'asis') +@ +\maketitle + +\begin{abstract} +This vignette is focused on simply explaining the different algorithms utilized +in calculations such as the index of association and different distance measures. +Many of these are previously described in other papers and it would be prudent +to cite them properly if they are used. +\end{abstract} + +% \begin{figure}[b] +% \centering +% \label{logo} +% \includegraphics{popprlogo} +% \end{figure} +% +% \newpage +\begingroup + \hypersetup{linkcolor=black} + \tableofcontents +\endgroup + +\section{Mathematical representation of data in \adegenet{} and \poppr{}} + +The sections dealing with the index of association and genetic distances will be +based on the same data structure, a matrix with samples in rows and alleles in +columns. The number of columns is equal to the total number of alleles observed +in the data set. Much of this description is derived from \adegenet{}'s +\texttt{dist.genpop} manual page. + +\begin{quote} +Let \textbf{A} be a table containing allelic frequencies with $t$ +samples\footnote{populations or individuals} (rows) and $m$ alleles (columns).\\ +\end{quote} +The above statement describes the table present in genind or genpop object +where, instead of having the number of columns equal the number of loci, the +number of columns equals the number of observed alleles in the entire data set. + +\begin{quote} +Let $\nu$ be the number of loci. The locus $j$ gets $m(j)$ alleles. + +\begin{equation} + m=\sum_{j=1}^{\nu} m(j) +\end{equation} +\end{quote} + +So, if you had a data set with 5 loci that had 2 alleles each, your table +would have ten columns. Of course, codominant loci like microsatellites have +varying numbers of alleles. + + +\begin{quote} +For the row $i$ and the modality $k$ of the variable $j$, notice the value + +\begin{equation} + a_{ijk}\\ (1 \leq i \leq t,\\ 1 \leq j \leq \nu,\\ 1 \leq k \leq m(j)) +\end{equation} + +\begin{equation} + a_{ij\cdot}=\sum_{k=1}^{m(j)}a_{ijk} +\end{equation} + +\begin{equation} + p_{ijk}=\frac{a_{ijk}}{a_{ij\cdot}} +\end{equation} +\end{quote} + +The above couple of equations are basically defining the allele counts +($a_{ijk}$) and frequency ($p_{ijk}$). Remember that $i$ is individual, $j$ is +locus, and $k$ is allele. The following continues to describe properties of the +frequency table used for analysis: + +\begin{quote} +\begin{equation} + p_{ij\cdot}=\sum_{k=1}^{m(j)}p_{ijk}=1 +\end{equation} +\end{quote} +The sum of all allele frequencies for a single population (or individual) at a +single locus is one. + +\begin{quote} +\begin{equation} +p_{i{\cdot}\cdot}=\sum_{j=1}^{\nu}p_{ij\cdot}=\nu +\end{equation} +\end{quote} +The sum of all allele frequencies over all loci is equal to the number of loci. + +\begin{quote} +\begin{equation} +p_{{\cdot}{\cdot}\cdot}=\sum_{j=1}^{\nu}p_{i{\cdot}\cdot}=t\nu +\end{equation} +\end{quote} +The the sum of the entire table is the sum of all loci multiplied by the number +of populations (or individuals). + + +% % +% %-----------------------------------------------------------------------------% +% % + + +\section{The Index of Association} +\label{indexassoc} + +The index of association was originally developed by A.H.D. Brown analyzing +population structure of wheat and has been widely used as a tool to detect +clonal reproduction within populations \cite{Brown:1980, Smith:1993}. +Populations whose members are undergoing sexual reproduction, whether it be +selfing or out-crossing, will produce gametes via meiosis, and thus have a +chance to shuffle alleles in the next generation. Populations whose members are +undergoing clonal reproduction, however, generally do so via mitosis. + +The most likely mechanism for a change in genotype, for a clonal organism, is +mutation. The rate of mutation varies from species to species, but it is rarely +sufficiently high to approximate a random shuffling of alleles. The index of +association is a calculation based on the ratio of the variance of the raw +number of differences between individuals and the sum of those variances over +each locus \cite{Smith:1993}. It can also be thought of as the observed variance +over the expected variance. If both variances are equal, then the index is zero +after subtracting one (from Maynard-Smith, 1993 \cite{Smith:1993}): +\begin{equation} +\label{eq:I_A} +I_A = \frac{V_O}{V_E}-1 +\end{equation} +Any sort of marker can be used for this analysis as it only counts differences +between pairs of samples. This can be thought of as a distance whose maximum is +equal to the number of loci multiplied by the ploidy of the sample. This is +calculated using an absolute genetic distance. + +Remember that in \poppr{}, genetic data is stored in a table where the rows +represent samples and the columns represent potential allelic states grouped by +locus. Notice also that the sum of the rows all equal one. \Poppr{} uses this to +calculate distances by simply taking the sum of the absolute values of the +differences between rows. + +The calculation for the distance between two individuals at a single locus $j$ +with $m(j)$ allelic states and a ploidy of $l$ is as follows\footnote{Individuals +with Presence / Absence data will have the $l/2$ term dropped.}: +\begin{equation} +\label{eq:ia_d} + d(a,b)=\frac{l}{2} \sum_{k=1}^{m(j)} + |p_{ajk} - p_{bjk}| +\end{equation} + +% \begin{equation} + +% d = \displaystyle \frac{k}{2}\sum_{i=1}^{a} \mid ind_{Ai} - ind_{Bi}\mid +% \end{equation} +\noindent +To find the total number of differences between two individuals over all loci, +you just take $d$ over $\nu$ loci, a value we'll call $D$: + +\begin{equation} +\label{eq:ia_D} +D(a,b) = \displaystyle \sum_{i=1}^{\nu} d_i +\end{equation} +An interesting observation: $D(a,b)/(l\nu)$ is Provesti's distance. + +These values are calculated over all possible combinations of individuals in the +data set, ${n \choose 2}$ after which you end up with ${n \choose 2}\cdot{}\nu$ +values of $d$ and ${n \choose 2}$ values of $D$. Calculating the observed +variances is fairly straightforward (modified from Agapow and Burt, 2001) +\cite{Agapow:2001}: + +\begin{equation} +\label{eq:V_O} +V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - \frac{\left(\displaystyle\sum_{i=1}^{n \choose 2} D_{i}\right)^2}{{n \choose 2}}}{{n \choose 2}} +\end{equation} + +Calculating the expected variance is the sum of each of the variances of the +individual loci. The calculation at a single locus, $j$ is the same as the +previous equation, substituting values of $D$ for $d$ \cite{Agapow:2001}: + +\begin{equation} +\label{eq:var_j} +var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - \frac{\left(\displaystyle\sum_{i=1}^{n \choose 2} d_i\right)^2}{{n \choose 2}}}{{n \choose 2}} +\end{equation} + +The expected variance is then the sum of all the variances over all $\nu$ loci +\cite{Agapow:2001}: + +\begin{equation} +\label{eq:V_E} +V_E = \displaystyle \sum_{j=1}^{\nu} var_j +\end{equation} + +Now you can plug the sums of equations (\ref{eq:V_O}) and (\ref{eq:V_E}) into +equation (\ref{eq:I_A}) to get the index of association. Of course, Agapow and +Burt showed that this index increases steadily with the number of loci, so they +came up with an approximation that is widely used, $\bar r_d$ +\cite{Agapow:2001}. For the derivation, see the manual for \textit{multilocus}. +The equation is as follows, utilizing equations (\ref{eq:V_O}), +(\ref{eq:var_j}), and (\ref{eq:V_E}) \cite{Agapow:2001}: + +\begin{equation} +\label{eq:r_d} +\bar{r}_d = \frac{V_O - V_E} +{2\displaystyle \sum_{j=1}^{\nu}\displaystyle \sum_{k \neq j}^{\nu}\sqrt{var_j\cdot{}var_k}} +\end{equation} + +\section{Genetic distances} + +Genetic distances are great tools for analyzing diversity in +populations as they are the basis for creating dendrograms with bootstrap +support and also for AMOVA. This section will simply present different genetic +distances along with a few notes about them. Most of these distances are derived +from the \textit{ade4} and \adegenet{} packages, where they were implemented as +distances between populations. \Poppr{} extends the implementation to individuals +as well (with the exception of Bruvo's distance). + +\begin{table}[ht] +\centering +\caption{Distance measures and their respective assumptions} +\begin{tabular}{lllll} + \hline + Method & Function & Assumption & Euclidean & Citation\\ + \hline +Provesti & \texttt{provesti.dist} & - & No & \cite{prevosti1975distances}\\ + & \texttt{diss.dist} & & & \\ +Nei & \texttt{nei.dist} & Infinite Alleles & No & \cite{nei1972genetic, nei1978estimation}\\ + & & Genetic Drift & & \\ +Edwards & \texttt{edwards.dist} & Genetic Drift & Yes & \cite{edwards1971distances}\\ +Reynolds & \texttt{reynolds.dist} & Genetic Drift & Yes & \cite{reynolds1983estimation}\\ +Rogers & \texttt{rogers.dist} & - & Yes & \cite{rogers1972measures}\\ +Bruvo & \texttt{bruvo.dist} & Stepwise Mutation & No & \cite{Bruvo:2004}\\ + \hline +\end{tabular} +\end{table} + +\subsection{Distances that assume genetic drift} +\subsubsection{Nei's 1978 Distance} +\label{distance:nei} +\begin{equation} + D_{Nei}(a,b)= -\ln\left(\frac{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + p_{ajk} p_{bjk}}{\sqrt{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + {(p_{ajk}) }^2}\sqrt{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + {(p_{bjk})}^2}}\right) +\end{equation} + +Note: if comparing individuals in \poppr{}, those that do not share any alleles +normally receive a distance of $\infty$. As you cannot draw a dendrogram with +infinite branch lengths, infinite values are converted to a value that is equal +to an order of magnitude greater than the largest finite value. + +\subsubsection{Edwards' angular distance} +\label{distance:edwards} +\begin{equation} + D_{Edwards}(a,b)=\sqrt{1-\frac{1}{\nu} \sum_{k=1}^{\nu} + \sum_{j=1}^{m(k)} \sqrt{p_{ajk} p_{bjk}}} +\end{equation} + +\subsubsection{Reynolds' coancestry distance} +\label{distance:reynolds} +\begin{equation} + D_{Reynolds}(a,b)=\sqrt{\frac{\sum_{k=1}^{\nu} + \sum_{j=1}^{m(k)}{(p_{ajk} - p_{bjk})}^2}{2 \sum_{k=1}^{\nu} \left(1- + \sum_{j=1}^{m(k)}p_{ajk} p_{bjk}\right)}} +\end{equation} + +\subsection{Distances without assumptions} +\subsubsection{Rogers' distance} +\label{distance:rogers} +\begin{equation} + D_{Rogers}(a,b)=\frac{1}{\nu} \sum_{k=1}^{\nu} \sqrt{\frac{1}{2} + \sum_{j=1}^{m(k)}{(p_{ajk} - p_{bjk})}^2} +\end{equation} + + +\subsubsection{Provesti's absolute genetic distance} +\label{distance:provesti} +\begin{equation} + D_{Provesti}(a,b)=\frac{1}{2{\nu}} \sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + |p_{ajk} - p_{bjk}| +\end{equation} +Note: for AFLP data, the $2$ is dropped. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Bruvo's distance (stepwise mutation for microsatellites)} +\label{bruvo} + +Bruvo's distance between two individuals calculates the minimum distance across +all combinations of possible pairs of alleles at a single locus and then +averaging that distance across all loci \cite{Bruvo:2004}. The distance between +each pair of alleles is calculated as\footnote{Notation presented unmodified +from Bruvo et al, 2004}\cite{Bruvo:2004}: + +\begin{equation} +\label{eq:m_x} +m_x = 2^{-\mid x \mid} +\end{equation} + +\begin{equation} +\label{eq:d_a} +d_a = 1 - m_x +\end{equation} + +Where $x$ is the number of steps between each allele. So, let's say we were +comparing two haploid $(k = 1)$ individuals with alleles 228 and 244 at a locus +that had a tetranucleotide repeat pattern (CATG$)^n$. The number of steps for +each of these alleles would be $228/4 = 57$ and $244/4 =61$, respectively. The +number of steps between them is then $\mid 57 - 61 \mid = 4$. Bruvo's distance +at this locus between these two individuals is then $1-2^{-4} = 0.9375$. For +samples with higher ploidy ($k$), there would be $k$ such distances of which we +would need to take the sum \cite{Bruvo:2004}. + +\begin{equation} +\label{eq:s_i} +s_i = \displaystyle \sum_{a=1}^{k} d_a +\end{equation} + +Unfortunately, it's not as simple as that since we do not assume to know phase. +Because of this, we need to take all possible combinations of alleles into +account. This means that we will have $k^2$ values of $d_a$, when we only want +$k$. How do we know which $k$ distances we want? We will have to invoke +parsimony for this and attempt to take the minimum sum of the alleles, of which +there are $k!$ possibilities \cite{Bruvo:2004}: + +\begin{equation} +\label{eq:d_l} +d_l = \frac{1}{k}\left(\min_{i \dotsc k!} s_i\right) +\end{equation} +\noindent +Finally, after all of this, we can get the average distance over all loci +\cite{Bruvo:2004}. + +\begin{equation} +\label{eq:D} +D = \frac{1}{l}\sum_{i=1}^l d_i +\end{equation} +\noindent +This is calculated over all possible combinations of individuals and results in +a lower triangle distance matrix over all individuals. + +\subsubsection{Special cases of Bruvo's distance} +\label{appendix:algorithm:bruvospecial} +As shown in the above section, ploidy is irrelevant with respect to +calculation of Bruvo's distance. However, since it makes a comparison between +all alleles at a locus, it only makes sense that the two loci need to have the +same ploidy level. Unfortunately for polyploids, it's often difficult to fully +separate distinct alleles at each locus, so you end up with genotypes that +appear to have a lower ploidy level than the organism \cite{Bruvo:2004}. + +To help deal with these situations, Bruvo has suggested three methods for dealing +with these differences in ploidy levels \cite{Bruvo:2004}: +\begin{itemize} + \item{Infinite Model -} The simplest way to deal with it is to count all + missing alleles as infinitely large so that the distance between it and + anything else is 1. Aside from this being computationally simple, it will tend + to inflate distances between individuals. + \item{Genome Addition Model -} If it is suspected that the organism has gone + through a recent genome expansion, the missing alleles will be replace with + all possible combinations of the observed alleles in the shorter genotype. For + example, if there is a genotype of [69, 70, 0, 0] where 0 is a missing allele, + the possible combinations are: [69, 70, 69, 69], [69, 70, 69, 70], and [69, + 70, 70, 70]. The resulting distances are then averaged over the number of + comparisons. + \item{Genome Loss Model -} This is similar to the genome addition model, + except that it assumes that there was a recent genome reduction event and uses + the observed values in the full genotype to fill the missing values in the + short genotype. As with the Genome Addition Model, the resulting distances are + averaged over the number of comparisons. + \item{Combination Model -} Combine and average the genome addition and loss + models. +\end{itemize} + +As mentioned above, the infinite model is biased, but it is not nearly as +computationally intensive as either of the other models. The reason for this is +that both of the addition and loss models requires replacement of alleles and +recalculation of Bruvo's distance. The number of replacements required is equal +to the multiset coefficient: $\left({n \choose k}\right) == {(n-k+1) \choose k}$ +where $n$ is the number of potential replacements and $k$ is the number of +alleles to be replaced. So, for the example given above, The genome addition +model would require $\left({2 \choose 2}\right) = 3$ calculations of Bruvo's +distance, whereas the genome loss model would require $\left({4 \choose +2}\right) = 10$ calculations. + +To reduce the number of calculations and assumptions otherwise, Bruvo's distance +will be calculated using the largest observed ploidy in pairwise comparisons. +This means that when +comparing [69,70,71,0] and [59,60,0,0], they will be treated as triploids. + +\subsection{Tree topology} + +All of these distances were designed for analysis of populations. When applying +them to individuals, we must change our interpretations. For example, with Nei's +distance, branch lengths increase linearly with mutation +\cite{nei1972genetic,nei1978estimation}. When two populations share no alleles, +then the distance becomes infinite. However, we expect two individuals to +segregate for different alleles more often than entire populations, thus we +would expect exaggerated internal branch lengths separating clades. To +demonstrate the effect of the different distances on tree topology, we will use +5 diploid samples at a single locus demonstrating a range of possibilities: + +\begin{table}[ht] +\centering +\begin{tabular}{c} + \hline +Genotype \\ + \hline +1/1 \\ + 1/2 \\ + 2/3 \\ + 3/4 \\ + 4/4 \\ + \hline +\end{tabular} +\caption{Table of genotypes to be used for analysis} +\end{table} + +<<>>= +library(poppr) +dat.df <- data.frame(Genotype = c("1/1", "1/2", "2/3", "3/4", "4/4")) +dat <- as.genclone(df2genind(dat.df, sep = "/", ind.names = dat.df[[1]])) +@ + +We will now compute the distances and construct neighbor-joining dendrograms +using the package \textit{ape}. This allows us to see the effect of the +different distance measures on the tree topology. + +<<>>= +distances <- c("Nei", "Rogers", "Edwards", "Reynolds", "Provesti") +dists <- lapply(distances, function(x){ + DISTFUN <- match.fun(paste(tolower(x), "dist", sep = ".")) + DISTFUN(dat) +}) +names(dists) <- distances + +# Adding Bruvo's distance at the end because we need to specify repeat length. +dists$Bruvo <- bruvo.dist(dat, replen = 1) +library(ape) +par(mfrow = c(2, 3)) +x <- lapply(names(dists), function(x){ + plot(nj(dists[[x]]), main = x, type = "unrooted") + add.scale.bar(lcol = "red") +}) +@ + +\section{AMOVA} + +AMOVA in \poppr{} acts as a wrapper for the \textit{ade4} implementation, which +is an implementation of Excoffier's original formulation +\cite{excoffier1992analysis}. As the calculation relies on a genetic distance +matrix, \poppr{} calculates the distance matrix as the number of differing sites +between two genotypes using equation \ref{eq:ia_D}. This is equivalent to +Provesti's distance multiplied by the ploidy and number of loci. This is also +equivalent to Kronecker's delta as presented in \cite{excoffier1992analysis}. It +should also be mentioned that \textbf{at this moment, it is not possible to calculate +AMOVA on polyploid data.} + +\section{Genotypic Diversity} + +Many of the calculations of genotypic diversity exist within the package \textit{vegan} in the \texttt{diversity} +function. Descriptions of most calculations can be found in the paper by Gr\"unwald et al \cite{Grunwald:2003}. + +\subsection{Rarefaction, Shannon-Wiener, Stoddart and Taylor indices} + +Stoddart and Taylor's index is also known as Inverse Simpson's index. +A detailed description of these can be found in the ``Diversity'' vignette in \textit{vegan}. You can access it by typing \texttt{vignette("diversity-vegan")} + +\subsection{Evenness ($E_{5}$)} +Evenness ($E_{5}$) is essentially the ratio of the number of abundant genotypes to the number of rarer genotypes. + +This is simply calculated as + +\begin{equation} +E_{5} = \frac{(1/\lambda) - 1}{e^{H}} +\end{equation} + +Where $1/\lambda$ is Stoddart and Taylor's index and $H$ is Shannon diversity \cite{Stoddart:1988,Shannon:1948}. + +\subsection{Hexp} + +Essentially, $H_{exp}$ is an analog of Nei's expected heterozygosity and is equivalent to an unbiased estimator for Simpson's index. It was implemented in \poppr{} as it was previously in \textit{multilocus} \cite{Agapow:2001}. The calculation is as follows: + +\begin{equation} +H_{exp} = \frac{N}{N-1} \sum_{i=1}^{g}{p^{2}_{i}} +\end{equation} +\noindent +Where $p_i$ is the frequency of the $i$th genotype, $g$ is the number of +genotypes observed and $N$ is the sample size. + +\bibliographystyle{pnas.bst} +\bibliography{poppr_man} +\end{document} \ No newline at end of file diff --git a/vignettes/algo.pdf b/vignettes/algo.pdf new file mode 100644 index 00000000..bfc9939e Binary files /dev/null and b/vignettes/algo.pdf differ diff --git a/vignettes/algo.tex b/vignettes/algo.tex new file mode 100644 index 00000000..ca6c5a7a --- /dev/null +++ b/vignettes/algo.tex @@ -0,0 +1,609 @@ +\documentclass[letterpaper]{article}\usepackage[]{graphicx}\usepackage[]{color} +%% maxwidth is the original width if it is less than linewidth +%% otherwise use linewidth (to make sure the graphics do not exceed the margin) +\makeatletter +\def\maxwidth{ % + \ifdim\Gin@nat@width>\linewidth + \linewidth + \else + \Gin@nat@width + \fi +} +\makeatother + +\definecolor{fgcolor}{rgb}{0, 0, 0} +\newcommand{\hlnum}[1]{\textcolor[rgb]{0.502,0,0.502}{\textbf{#1}}}% +\newcommand{\hlstr}[1]{\textcolor[rgb]{0.651,0.522,0}{#1}}% +\newcommand{\hlcom}[1]{\textcolor[rgb]{1,0.502,0}{#1}}% +\newcommand{\hlopt}[1]{\textcolor[rgb]{1,0,0.502}{\textbf{#1}}}% +\newcommand{\hlstd}[1]{\textcolor[rgb]{0,0,0}{#1}}% +\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.733,0.475,0.467}{\textbf{#1}}}% +\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.502,0.502,0.753}{\textbf{#1}}}% +\newcommand{\hlkwc}[1]{\textcolor[rgb]{0,0.502,0.753}{#1}}% +\newcommand{\hlkwd}[1]{\textcolor[rgb]{0,0.267,0.4}{#1}}% + +\usepackage{framed} +\makeatletter +\newenvironment{kframe}{% + \def\at@end@of@kframe{}% + \ifinner\ifhmode% + \def\at@end@of@kframe{\end{minipage}}% + \begin{minipage}{\columnwidth}% + \fi\fi% + \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep + \colorbox{shadecolor}{##1}\hskip-\fboxsep + % There is no \\@totalrightmargin, so: + \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% + \MakeFramed {\advance\hsize-\width + \@totalleftmargin\z@ \linewidth\hsize + \@setminipage}}% + {\par\unskip\endMakeFramed% + \at@end@of@kframe} +\makeatother + +\definecolor{shadecolor}{rgb}{.97, .97, .97} +\definecolor{messagecolor}{rgb}{0, 0, 0} +\definecolor{warningcolor}{rgb}{1, 0, 1} +\definecolor{errorcolor}{rgb}{1, 0, 0} +\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX + +\usepackage{alltt} +%\VignetteIndexEntry{Algorightms and Equations} +%\VignetteEngine{knitr::knitr} +\usepackage{graphicx} +\usepackage[colorlinks = true, + urlcolor = blue, + citecolor = blue, + linkcolor = blue]{hyperref} +\usepackage{array} +\usepackage{color} +\usepackage[usenames,dvipsnames,svgnames,table]{xcolor} +\usepackage[utf8]{inputenc} % for UTF-8/single quotes from sQuote() +\usepackage{fullpage} +\usepackage{mathtools} +\usepackage{makeidx} +\usepackage{longtable} + +% for bold symbols in mathmode +\usepackage{bm} +\newcommand{\R}{\mathbb{R}} +\newcommand{\m}[1]{\mathbf{#1}} +\newcommand{\tab}{\hspace*{1em}} +\newcolumntype{H}{>{\setbox0=\hbox\bgroup}c<{\egroup}@{}} +\newcommand{\cmdlink}[2]{% + \texttt{\hyperref[#1]{#2}}% +} +\newcommand{\seclink}[2]{% + \textsc{\hyperref[#1]{#2}}% +} + +\newcommand{\poppr}{\textit{poppr}} +\newcommand{\Poppr}{\textit{Poppr}} +\newcommand{\adegenet}{\textit{adegenet}} +\newcommand{\Adegenet}{\textit{Adegenet}} +\newcommand{\tline}{ + \noindent + \rule{\textwidth}{1pt} + \par +} +\newcommand{\bline}{ + \noindent + \rule{\textwidth}{1pt} + \kern1pt +} + +\newcommand{\jala}{ + \includegraphics[height = 5mm, keepaspectratio=true]{jalapeno-poppers} +} + +\newcommand{\revjala}{ + \scalebox{-1}[1]{\jala{}} +} + +\title{Algorithms and equations utilized in poppr version 1.1.0} +\author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) +Department of Botany and Plant Pathology, Oregon State University, Corvallis, +OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, +Corvallis, OR}} +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +\begin{document} + + +\maketitle + +\begin{abstract} +This vignette is focused on simply explaining the different algorithms utilized +in calculations such as the index of association and different distance measures. +Many of these are previously described in other papers and it would be prudent +to cite them properly if they are used. +\end{abstract} + +% \begin{figure}[b] +% \centering +% \label{logo} +% \includegraphics{popprlogo} +% \end{figure} +% +% \newpage +\begingroup + \hypersetup{linkcolor=black} + \tableofcontents +\endgroup + +\section{Mathematical representation of data in \adegenet{} and \poppr{}} + +The sections dealing with the index of association and genetic distances will be +based on the same data structure, a matrix with samples in rows and alleles in +columns. The number of columns is equal to the total number of alleles observed +in the data set. Much of this description is derived from \adegenet{}'s +\texttt{dist.genpop} manual page. + +\begin{quote} +Let \textbf{A} be a table containing allelic frequencies with $t$ +samples\footnote{populations or individuals} (rows) and $m$ alleles (columns).\\ +\end{quote} +The above statement describes the table present in genind or genpop object +where, instead of having the number of columns equal the number of loci, the +number of columns equals the number of observed alleles in the entire data set. + +\begin{quote} +Let $\nu$ be the number of loci. The locus $j$ gets $m(j)$ alleles. + +\begin{equation} + m=\sum_{j=1}^{\nu} m(j) +\end{equation} +\end{quote} + +So, if you had a data set with 5 loci that had 2 alleles each, your table +would have ten columns. Of course, codominant loci like microsatellites have +varying numbers of alleles. + + +\begin{quote} +For the row $i$ and the modality $k$ of the variable $j$, notice the value + +\begin{equation} + a_{ijk}\\ (1 \leq i \leq t,\\ 1 \leq j \leq \nu,\\ 1 \leq k \leq m(j)) +\end{equation} + +\begin{equation} + a_{ij\cdot}=\sum_{k=1}^{m(j)}a_{ijk} +\end{equation} + +\begin{equation} + p_{ijk}=\frac{a_{ijk}}{a_{ij\cdot}} +\end{equation} +\end{quote} + +The above couple of equations are basically defining the allele counts +($a_{ijk}$) and frequency ($p_{ijk}$). Remember that $i$ is individual, $j$ is +locus, and $k$ is allele. The following continues to describe properties of the +frequency table used for analysis: + +\begin{quote} +\begin{equation} + p_{ij\cdot}=\sum_{k=1}^{m(j)}p_{ijk}=1 +\end{equation} +\end{quote} +The sum of all allele frequencies for a single population (or individual) at a +single locus is one. + +\begin{quote} +\begin{equation} +p_{i{\cdot}\cdot}=\sum_{j=1}^{\nu}p_{ij\cdot}=\nu +\end{equation} +\end{quote} +The sum of all allele frequencies over all loci is equal to the number of loci. + +\begin{quote} +\begin{equation} +p_{{\cdot}{\cdot}\cdot}=\sum_{j=1}^{\nu}p_{i{\cdot}\cdot}=t\nu +\end{equation} +\end{quote} +The the sum of the entire table is the sum of all loci multiplied by the number +of populations (or individuals). + + +% % +% %-----------------------------------------------------------------------------% +% % + + +\section{The Index of Association} +\label{indexassoc} + +The index of association was originally developed by A.H.D. Brown analyzing +population structure of wheat and has been widely used as a tool to detect +clonal reproduction within populations \cite{Brown:1980, Smith:1993}. +Populations whose members are undergoing sexual reproduction, whether it be +selfing or out-crossing, will produce gametes via meiosis, and thus have a +chance to shuffle alleles in the next generation. Populations whose members are +undergoing clonal reproduction, however, generally do so via mitosis. + +The most likely mechanism for a change in genotype, for a clonal organism, is +mutation. The rate of mutation varies from species to species, but it is rarely +sufficiently high to approximate a random shuffling of alleles. The index of +association is a calculation based on the ratio of the variance of the raw +number of differences between individuals and the sum of those variances over +each locus \cite{Smith:1993}. It can also be thought of as the observed variance +over the expected variance. If both variances are equal, then the index is zero +after subtracting one (from Maynard-Smith, 1993 \cite{Smith:1993}): +\begin{equation} +\label{eq:I_A} +I_A = \frac{V_O}{V_E}-1 +\end{equation} +Any sort of marker can be used for this analysis as it only counts differences +between pairs of samples. This can be thought of as a distance whose maximum is +equal to the number of loci multiplied by the ploidy of the sample. This is +calculated using an absolute genetic distance. + +Remember that in \poppr{}, genetic data is stored in a table where the rows +represent samples and the columns represent potential allelic states grouped by +locus. Notice also that the sum of the rows all equal one. \Poppr{} uses this to +calculate distances by simply taking the sum of the absolute values of the +differences between rows. + +The calculation for the distance between two individuals at a single locus $j$ +with $m(j)$ allelic states and a ploidy of $l$ is as follows\footnote{Individuals +with Presence / Absence data will have the $l/2$ term dropped.}: +\begin{equation} +\label{eq:ia_d} + d(a,b)=\frac{l}{2} \sum_{k=1}^{m(j)} + |p_{ajk} - p_{bjk}| +\end{equation} + +% \begin{equation} + +% d = \displaystyle \frac{k}{2}\sum_{i=1}^{a} \mid ind_{Ai} - ind_{Bi}\mid +% \end{equation} +\noindent +To find the total number of differences between two individuals over all loci, +you just take $d$ over $\nu$ loci, a value we'll call $D$: + +\begin{equation} +\label{eq:ia_D} +D(a,b) = \displaystyle \sum_{i=1}^{\nu} d_i +\end{equation} +An interesting observation: $D(a,b)/(l\nu)$ is Provesti's distance. + +These values are calculated over all possible combinations of individuals in the +data set, ${n \choose 2}$ after which you end up with ${n \choose 2}\cdot{}\nu$ +values of $d$ and ${n \choose 2}$ values of $D$. Calculating the observed +variances is fairly straightforward (modified from Agapow and Burt, 2001) +\cite{Agapow:2001}: + +\begin{equation} +\label{eq:V_O} +V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - \frac{\left(\displaystyle\sum_{i=1}^{n \choose 2} D_{i}\right)^2}{{n \choose 2}}}{{n \choose 2}} +\end{equation} + +Calculating the expected variance is the sum of each of the variances of the +individual loci. The calculation at a single locus, $j$ is the same as the +previous equation, substituting values of $D$ for $d$ \cite{Agapow:2001}: + +\begin{equation} +\label{eq:var_j} +var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - \frac{\left(\displaystyle\sum_{i=1}^{n \choose 2} d_i\right)^2}{{n \choose 2}}}{{n \choose 2}} +\end{equation} + +The expected variance is then the sum of all the variances over all $\nu$ loci +\cite{Agapow:2001}: + +\begin{equation} +\label{eq:V_E} +V_E = \displaystyle \sum_{j=1}^{\nu} var_j +\end{equation} + +Now you can plug the sums of equations (\ref{eq:V_O}) and (\ref{eq:V_E}) into +equation (\ref{eq:I_A}) to get the index of association. Of course, Agapow and +Burt showed that this index increases steadily with the number of loci, so they +came up with an approximation that is widely used, $\bar r_d$ +\cite{Agapow:2001}. For the derivation, see the manual for \textit{multilocus}. +The equation is as follows, utilizing equations (\ref{eq:V_O}), +(\ref{eq:var_j}), and (\ref{eq:V_E}) \cite{Agapow:2001}: + +\begin{equation} +\label{eq:r_d} +\bar{r}_d = \frac{V_O - V_E} +{2\displaystyle \sum_{j=1}^{\nu}\displaystyle \sum_{k \neq j}^{\nu}\sqrt{var_j\cdot{}var_k}} +\end{equation} + +\section{Genetic distances} + +Genetic distances are great tools for analyzing diversity in +populations as they are the basis for creating dendrograms with bootstrap +support and also for AMOVA. This section will simply present different genetic +distances along with a few notes about them. Most of these distances are derived +from the \textit{ade4} and \adegenet{} packages, where they were implemented as +distances between populations. \Poppr{} extends the implementation to individuals +as well (with the exception of Bruvo's distance). + +\begin{table}[ht] +\centering +\caption{Distance measures and their respective assumptions} +\begin{tabular}{lllll} + \hline + Method & Function & Assumption & Euclidean & Citation\\ + \hline +Provesti & \texttt{provesti.dist} & - & No & \cite{prevosti1975distances}\\ + & \texttt{diss.dist} & & & \\ +Nei & \texttt{nei.dist} & Infinite Alleles & No & \cite{nei1972genetic, nei1978estimation}\\ + & & Genetic Drift & & \\ +Edwards & \texttt{edwards.dist} & Genetic Drift & Yes & \cite{edwards1971distances}\\ +Reynolds & \texttt{reynolds.dist} & Genetic Drift & Yes & \cite{reynolds1983estimation}\\ +Rogers & \texttt{rogers.dist} & - & Yes & \cite{rogers1972measures}\\ +Bruvo & \texttt{bruvo.dist} & Stepwise Mutation & No & \cite{Bruvo:2004}\\ + \hline +\end{tabular} +\end{table} + +\subsection{Distances that assume genetic drift} +\subsubsection{Nei's 1978 Distance} +\label{distance:nei} +\begin{equation} + D_{Nei}(a,b)= -\ln\left(\frac{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + p_{ajk} p_{bjk}}{\sqrt{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + {(p_{ajk}) }^2}\sqrt{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + {(p_{bjk})}^2}}\right) +\end{equation} + +Note: if comparing individuals in \poppr{}, those that do not share any alleles +normally receive a distance of $\infty$. As you cannot draw a dendrogram with +infinite branch lengths, infinite values are converted to a value that is equal +to an order of magnitude greater than the largest finite value. + +\subsubsection{Edwards' angular distance} +\label{distance:edwards} +\begin{equation} + D_{Edwards}(a,b)=\sqrt{1-\frac{1}{\nu} \sum_{k=1}^{\nu} + \sum_{j=1}^{m(k)} \sqrt{p_{ajk} p_{bjk}}} +\end{equation} + +\subsubsection{Reynolds' coancestry distance} +\label{distance:reynolds} +\begin{equation} + D_{Reynolds}(a,b)=\sqrt{\frac{\sum_{k=1}^{\nu} + \sum_{j=1}^{m(k)}{(p_{ajk} - p_{bjk})}^2}{2 \sum_{k=1}^{\nu} \left(1- + \sum_{j=1}^{m(k)}p_{ajk} p_{bjk}\right)}} +\end{equation} + +\subsection{Distances without assumptions} +\subsubsection{Rogers' distance} +\label{distance:rogers} +\begin{equation} + D_{Rogers}(a,b)=\frac{1}{\nu} \sum_{k=1}^{\nu} \sqrt{\frac{1}{2} + \sum_{j=1}^{m(k)}{(p_{ajk} - p_{bjk})}^2} +\end{equation} + + +\subsubsection{Provesti's absolute genetic distance} +\label{distance:provesti} +\begin{equation} + D_{Provesti}(a,b)=\frac{1}{2{\nu}} \sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} + |p_{ajk} - p_{bjk}| +\end{equation} +Note: for AFLP data, the $2$ is dropped. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Bruvo's distance (stepwise mutation for microsatellites)} +\label{bruvo} + +Bruvo's distance between two individuals calculates the minimum distance across +all combinations of possible pairs of alleles at a single locus and then +averaging that distance across all loci \cite{Bruvo:2004}. The distance between +each pair of alleles is calculated as\footnote{Notation presented unmodified +from Bruvo et al, 2004}\cite{Bruvo:2004}: + +\begin{equation} +\label{eq:m_x} +m_x = 2^{-\mid x \mid} +\end{equation} + +\begin{equation} +\label{eq:d_a} +d_a = 1 - m_x +\end{equation} + +Where $x$ is the number of steps between each allele. So, let's say we were +comparing two haploid $(k = 1)$ individuals with alleles 228 and 244 at a locus +that had a tetranucleotide repeat pattern (CATG$)^n$. The number of steps for +each of these alleles would be $228/4 = 57$ and $244/4 =61$, respectively. The +number of steps between them is then $\mid 57 - 61 \mid = 4$. Bruvo's distance +at this locus between these two individuals is then $1-2^{-4} = 0.9375$. For +samples with higher ploidy ($k$), there would be $k$ such distances of which we +would need to take the sum \cite{Bruvo:2004}. + +\begin{equation} +\label{eq:s_i} +s_i = \displaystyle \sum_{a=1}^{k} d_a +\end{equation} + +Unfortunately, it's not as simple as that since we do not assume to know phase. +Because of this, we need to take all possible combinations of alleles into +account. This means that we will have $k^2$ values of $d_a$, when we only want +$k$. How do we know which $k$ distances we want? We will have to invoke +parsimony for this and attempt to take the minimum sum of the alleles, of which +there are $k!$ possibilities \cite{Bruvo:2004}: + +\begin{equation} +\label{eq:d_l} +d_l = \frac{1}{k}\left(\min_{i \dotsc k!} s_i\right) +\end{equation} +\noindent +Finally, after all of this, we can get the average distance over all loci +\cite{Bruvo:2004}. + +\begin{equation} +\label{eq:D} +D = \frac{1}{l}\sum_{i=1}^l d_i +\end{equation} +\noindent +This is calculated over all possible combinations of individuals and results in +a lower triangle distance matrix over all individuals. + +\subsubsection{Special cases of Bruvo's distance} +\label{appendix:algorithm:bruvospecial} +As shown in the above section, ploidy is irrelevant with respect to +calculation of Bruvo's distance. However, since it makes a comparison between +all alleles at a locus, it only makes sense that the two loci need to have the +same ploidy level. Unfortunately for polyploids, it's often difficult to fully +separate distinct alleles at each locus, so you end up with genotypes that +appear to have a lower ploidy level than the organism \cite{Bruvo:2004}. + +To help deal with these situations, Bruvo has suggested three methods for dealing +with these differences in ploidy levels \cite{Bruvo:2004}: +\begin{itemize} + \item{Infinite Model -} The simplest way to deal with it is to count all + missing alleles as infinitely large so that the distance between it and + anything else is 1. Aside from this being computationally simple, it will tend + to inflate distances between individuals. + \item{Genome Addition Model -} If it is suspected that the organism has gone + through a recent genome expansion, the missing alleles will be replace with + all possible combinations of the observed alleles in the shorter genotype. For + example, if there is a genotype of [69, 70, 0, 0] where 0 is a missing allele, + the possible combinations are: [69, 70, 69, 69], [69, 70, 69, 70], and [69, + 70, 70, 70]. The resulting distances are then averaged over the number of + comparisons. + \item{Genome Loss Model -} This is similar to the genome addition model, + except that it assumes that there was a recent genome reduction event and uses + the observed values in the full genotype to fill the missing values in the + short genotype. As with the Genome Addition Model, the resulting distances are + averaged over the number of comparisons. + \item{Combination Model -} Combine and average the genome addition and loss + models. +\end{itemize} + +As mentioned above, the infinite model is biased, but it is not nearly as +computationally intensive as either of the other models. The reason for this is +that both of the addition and loss models requires replacement of alleles and +recalculation of Bruvo's distance. The number of replacements required is equal +to the multiset coefficient: $\left({n \choose k}\right) == {(n-k+1) \choose k}$ +where $n$ is the number of potential replacements and $k$ is the number of +alleles to be replaced. So, for the example given above, The genome addition +model would require $\left({2 \choose 2}\right) = 3$ calculations of Bruvo's +distance, whereas the genome loss model would require $\left({4 \choose +2}\right) = 10$ calculations. + +To reduce the number of calculations and assumptions otherwise, Bruvo's distance +will be calculated using the largest observed ploidy in pairwise comparisons. +This means that when +comparing [69,70,71,0] and [59,60,0,0], they will be treated as triploids. + +\subsection{Tree topology} + +All of these distances were designed for analysis of populations. When applying +them to individuals, we must change our interpretations. For example, with Nei's +distance, branch lengths increase linearly with mutation +\cite{nei1972genetic,nei1978estimation}. When two populations share no alleles, +then the distance becomes infinite. However, we expect two individuals to +segregate for different alleles more often than entire populations, thus we +would expect exaggerated internal branch lengths separating clades. To +demonstrate the effect of the different distances on tree topology, we will use +5 diploid samples at a single locus demonstrating a range of possibilities: + +\begin{table}[ht] +\centering +\begin{tabular}{c} + \hline +Genotype \\ + \hline +1/1 \\ + 1/2 \\ + 2/3 \\ + 3/4 \\ + 4/4 \\ + \hline +\end{tabular} +\caption{Table of genotypes to be used for analysis} +\end{table} + +\begin{knitrout}\footnotesize +\definecolor{shadecolor}{rgb}{0.933, 0.933, 0.933}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlkwd{library}\hlstd{(poppr)} +\hlstd{dat.df} \hlkwb{<-} \hlkwd{data.frame}\hlstd{(}\hlkwc{Genotype} \hlstd{=} \hlkwd{c}\hlstd{(}\hlstr{"1/1"}\hlstd{,} \hlstr{"1/2"}\hlstd{,} \hlstr{"2/3"}\hlstd{,} \hlstr{"3/4"}\hlstd{,} \hlstr{"4/4"}\hlstd{))} +\hlstd{dat} \hlkwb{<-} \hlkwd{as.genclone}\hlstd{(}\hlkwd{df2genind}\hlstd{(dat.df,} \hlkwc{sep} \hlstd{=} \hlstr{"/"}\hlstd{,} \hlkwc{ind.names} \hlstd{= dat.df[[}\hlnum{1}\hlstd{]]))} +\end{alltt} +\end{kframe} +\end{knitrout} + +We will now compute the distances and construct neighbor-joining dendrograms +using the package \textit{ape}. This allows us to see the effect of the +different distance measures on the tree topology. + +\begin{knitrout}\footnotesize +\definecolor{shadecolor}{rgb}{0.933, 0.933, 0.933}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlstd{distances} \hlkwb{<-} \hlkwd{c}\hlstd{(}\hlstr{"Nei"}\hlstd{,} \hlstr{"Rogers"}\hlstd{,} \hlstr{"Edwards"}\hlstd{,} \hlstr{"Reynolds"}\hlstd{,} \hlstr{"Provesti"}\hlstd{)} +\hlstd{dists} \hlkwb{<-} \hlkwd{lapply}\hlstd{(distances,} \hlkwa{function}\hlstd{(}\hlkwc{x}\hlstd{)\{} + \hlstd{DISTFUN} \hlkwb{<-} \hlkwd{match.fun}\hlstd{(}\hlkwd{paste}\hlstd{(}\hlkwd{tolower}\hlstd{(x),} \hlstr{"dist"}\hlstd{,} \hlkwc{sep} \hlstd{=} \hlstr{"."}\hlstd{))} + \hlkwd{DISTFUN}\hlstd{(dat)} +\hlstd{\})} +\hlkwd{names}\hlstd{(dists)} \hlkwb{<-} \hlstd{distances} + +\hlcom{# Adding Bruvo's distance at the end because we need to specify repeat length.} +\hlstd{dists}\hlopt{$}\hlstd{Bruvo} \hlkwb{<-} \hlkwd{bruvo.dist}\hlstd{(dat,} \hlkwc{replen} \hlstd{=} \hlnum{1}\hlstd{)} +\hlkwd{library}\hlstd{(ape)} +\hlkwd{par}\hlstd{(}\hlkwc{mfrow} \hlstd{=} \hlkwd{c}\hlstd{(}\hlnum{2}\hlstd{,} \hlnum{3}\hlstd{))} +\hlstd{x} \hlkwb{<-} \hlkwd{lapply}\hlstd{(}\hlkwd{names}\hlstd{(dists),} \hlkwa{function}\hlstd{(}\hlkwc{x}\hlstd{)\{} + \hlkwd{plot}\hlstd{(}\hlkwd{nj}\hlstd{(dists[[x]]),} \hlkwc{main} \hlstd{= x,} \hlkwc{type} \hlstd{=} \hlstr{"unrooted"}\hlstd{)} + \hlkwd{add.scale.bar}\hlstd{(}\hlkwc{lcol} \hlstd{=} \hlstr{"red"}\hlstd{)} +\hlstd{\})} +\end{alltt} +\end{kframe} + +{\centering \includegraphics[width=0.95\linewidth]{figure/unnamed-chunk-3} + +} + + + +\end{knitrout} + +\section{AMOVA} + +AMOVA in \poppr{} acts as a wrapper for the \textit{ade4} implementation, which +is an implementation of Excoffier's original formulation +\cite{excoffier1992analysis}. As the calculation relies on a genetic distance +matrix, \poppr{} calculates the distance matrix as the number of differing sites +between two genotypes using equation \ref{eq:ia_D}. This is equivalent to +Provesti's distance multiplied by the ploidy and number of loci. This is also +equivalent to Kronecker's delta as presented in \cite{excoffier1992analysis}. It +should also be mentioned that \textbf{at this moment, it is not possible to calculate +AMOVA on polyploid data.} + +\section{Genotypic Diversity} + +Many of the calculations of genotypic diversity exist within the package \textit{vegan} in the \texttt{diversity} +function. Descriptions of most calculations can be found in the paper by Gr\"unwald et al \cite{Grunwald:2003}. + +\subsection{Rarefaction, Shannon-Wiener, Stoddart and Taylor indices} + +Stoddart and Taylor's index is also known as Inverse Simpson's index. +A detailed description of these can be found in the ``Diversity'' vignette in \textit{vegan}. You can access it by typing \texttt{vignette("diversity-vegan")} + +\subsection{Evenness ($E_{5}$)} +Evenness ($E_{5}$) is essentially the ratio of the number of abundant genotypes to the number of rarer genotypes. + +This is simply calculated as + +\begin{equation} +E_{5} = \frac{(1/\lambda) - 1}{e^{H}} +\end{equation} + +Where $1/\lambda$ is Stoddart and Taylor's index and $H$ is Shannon diversity \cite{Stoddart:1988,Shannon:1948}. + +\subsection{Hexp} + +Essentially, $H_{exp}$ is an analog of Nei's expected heterozygosity and is equivalent to an unbiased estimator for Simpson's index. It was implemented in \poppr{} as it was previously in \textit{multilocus} \cite{Agapow:2001}. The calculation is as follows: + +\begin{equation} +H_{exp} = \frac{N}{N-1} \sum_{i=1}^{g}{p^{2}_{i}} +\end{equation} +\noindent +Where $p_i$ is the frequency of the $i$th genotype, $g$ is the number of +genotypes observed and $N$ is the sample size. + +\bibliographystyle{pnas.bst} +\bibliography{poppr_man} +\end{document} diff --git a/vignettes/getfile.pdf b/vignettes/getfile.pdf new file mode 100644 index 00000000..36bc4792 Binary files /dev/null and b/vignettes/getfile.pdf differ diff --git a/vignettes/getfile.png b/vignettes/getfile.png deleted file mode 100755 index 048f640f..00000000 Binary files a/vignettes/getfile.png and /dev/null differ diff --git a/vignettes/jalapeno-poppers.pdf b/vignettes/jalapeno-poppers.pdf new file mode 100644 index 00000000..c7fb4b24 Binary files /dev/null and b/vignettes/jalapeno-poppers.pdf differ diff --git a/vignettes/pnas.bst b/vignettes/pnas.bst new file mode 100644 index 00000000..669ec28d --- /dev/null +++ b/vignettes/pnas.bst @@ -0,0 +1,1033 @@ +% pnas.bst + +% BibTex format for Proceedings of the National Academy of Science +% this version matches the current PNAS format. + +% history: +% version = 3.02 of pnas.bst 2009 Oct 19 +% 2009 Oct 19, 3.02: revised by TDS: there is no emph on et al. +% see: +% http://www.pnas.org/site/misc/iforc.shtml#order +% http://www.pnas.org/content/106/41/17261 +% http://www.pnas.org/content/106/41/17261.full +% reference 4: +% http://www.pnas.org/content/106/41/17261.full#ref-4 +% +% 2009 Oct 19, 3.01: revised by MYW to clean up et~al.,s (emphasis, +% comma before), and remove comma between last name +% and initials in authors' names. +% 2009 Jun 28, 3.00: revised modernize emphasis function +% 2009 Jun 26 renamed pnas.bst by Tom Schneider to replace the 1999 version. +% 2008 Jan 30 received by Tom Schneider +% From: Ben Bolker +%%% BMB modified version from http://www.pnas.org/misc/iforc.shtml +%%% 28 Jan 2008 + +%% References. References must be in PNAS style. Only published or +%% in-press papers and books may be cited in the reference +%% list. Unpublished abstracts of papers presented at meetings are not +%% permitted. References should be cited in numerical order as they +%% appear in text. Because tables and figures will be inserted in the +%% text where first cited, references in these sections should be +%% numbered accordingly. Manuscripts must include the full title for each +%% cited article. Authors must translate foreign language titles into +%% English, with a notation of the original language. All authors (unless +%% there are more than five) should be named in the citation. If there +%% are more than five, list the first author's name followed by et +%% al. Provide inclusive page ranges for journal articles and book +%% chapters. Databases are cited in the text or as footnotes. + +%% The corresponding author must be prepared to provide a signed +%% authorization for the citation of unpublished data and personal +%% communications. + +%% Journal articles are cited as follows: + +%% 10. Neuhaus J-M, Sitcher L, Meins F, Jr, Boller T (1991) A short C-terminal +%% sequence is necessary and sufficient for the targeting of chitinases +%% to the plant vacuole. Proc Natl Acad Sci USA 88:10362-10366. +%% +%% For correct abbreviations of journal titles, refer to Chemical Abstracts Service Source Index (CASSI). +%% +%% Articles or chapters in books are cited as follows: +%% +%% 14. Hill AVS (1991) in Molecular Evolution of the Major Histocompatibility +%% Complex, eds Klein J, Klein D (Springer, Heidelberg), pp 403-420. +%% + +%%% ==================================================================== +%%% @BibTeX-style-file{ +%%% author = "Alan R. Rogers", +%%% version = "1.0", +%%% date = "12 November 1992", +%%% time = "16:07:20 MST", +%%% filename = "pnas.bst", +%%% address = "Department of Anthropology +%%% University of Utah +%%% Salt Lake City, UT 84112 +%%% USA", +%%% telephone = "+1 801 581 5529", +%%% FAX = "+1 801 581 6252", +%%% checksum = "37034 967 2316 17724", +%%% email = "rogers@anthro.utah.edu (Internet)", +%%% codetable = "ISO/ASCII", +%%% keywords = "", +%%% supported = "yes", +%%% abstract = "", +%%% docstring = "This file produces bibliographies in the +%%% format of ``Proceedings of the National +%%% Academy of Sciences, USA.''", +%%% } +%%% ==================================================================== +% This is "pnas.bst", modified from "unsrt.bst" +% BibTeX standard bibliography style `unsrt' + % version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. + % Copyright (C) 1985, all rights reserved. + % Copying of this file is authorized only if either + % (1) you make absolutely no changes to your copy, including name, or + % (2) if you do make changes, you name it something other than + % btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. + % This restriction helps ensure that all standard styles are identical. + % The file btxbst.doc has the documentation for this style. + +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = +% { ", " * write$ } + { " " * write$ } + { output.state after.block = +% BMB hack + { + % add.period$ + write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ +% { add.period$ " " * write$ } + { ", " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } +% { "{\em " swap$ * "}" * } +% 2009 Jun 28: modernize the emphasis call + { "\emph{" swap$ * "}" * } + if$ +} + +FUNCTION {parenthesize} +{ duplicate$ empty$ + { pop$ "" } + { "(" swap$ * ")" * } + if$ +} + +FUNCTION {boldface} +{ duplicate$ empty$ + { pop$ "" } + { "{\bf " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames #5 > % et al for >5 names (copied from chicago.bst) + { s #1 "{vv~}{ll}{, jj}{ f{}}" format.name$ ", {et~al.}" * } + { + numnames 'namesleft := + { namesleft #0 > } + { s nameptr "{vv~}{ll}{, jj}{ f{}}" format.name$ 't := + nameptr #1 > + { namesleft #1 > %% >1 name left? + { ", " * t * } %% yes: add comma, next name + { t "others" = + { ", {et~al.}" * } + { ", " * t * } %% add comma, last name + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ %% while namesleft>0 + } + if$ +} + +FUNCTION {format.publisher} +{ publisher empty$ + { address empty$ + { "" } + { address } + if$ + } + { address empty$ + { publisher} + { publisher ", " * address * } + if$ + } + if$ +} + +FUNCTION {format.editors.as.authors} +{ editor empty$ + { organization } + { editor format.names + editor num.names$ #1 > + { ", eds" * } %% BMB removed period + { ", ed" * } %% BMB ditto + if$ + } + if$ +} + +FUNCTION {format.authors} +{ author empty$ + { editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", eds" * } %% BMB removed period + { ", ed" * } %% BMB ditto + if$ + } + if$ + } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { author empty$ %in this case, see format.authors + { "" } + { editor num.names$ #1 > + { "eds{} " } + { "ed{} " } + if$ + editor format.names * + } + if$ + } + if$ +} + +FUNCTION {format.title} +% BMB hack -- add period +{ title "t" change.case$ add.period$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { "year?" parenthesize } + { year parenthesize } + if$ +} + +FUNCTION {format.btitle.series} +{ title empty$ + { "" } + { title emphasize } + if$ + series empty$ + { "" * } + { ", " * series * } + if$ +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {comma.connect} +{ duplicate$ empty$ + { pop$ } + { swap$ + duplicate$ empty$ + { pop$ } + { swap$ ", " swap$ * * } + if$ + } + if$ +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume.number} +{ volume empty$ + { number empty$ + { "" } + { "No.{}" number tie.or.space.connect } + if$ + } + { "Vol.{}" volume tie.or.space.connect + "volume and number" number either.or.check + } + if$ +} + +FUNCTION {format.number} +{ volume empty$ + { number empty$ + { "" } + { "No.{}" number tie.or.space.connect } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pp" pages n.dashify tie.or.space.connect } + { "p" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume % boldface %% BB -- not boldface + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":" * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { "in " booktitle emphasize * } + if$ + series empty$ + { "" * } + { booktitle empty$ + { "XXmissing booktitle \& seriesXX" * } + { ", " * } + if$ + series * + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + { "all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ ", {et~al.}" * } + { #2 < + 'skip$ + { editor #2 "{vv }{ll}{ jj}{ f}" format.name$ "others" = + { ", {et~al.}" * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Vol.{}" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "in {\em " booktitle * "\/}" * } + if$ + } + { "in " key * } + if$ + } + { "in " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.date "year" output.check + format.title "title" output.check + new.block + crossref missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + new.block + format.date "year" output.check + format.btitle.series "title" output.check + crossref missing$ + { format.editors output + new.block + format.publisher parenthesize output + format.bvolume.number output + new.sentence + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + new.sentence + format.pages output + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.date output + format.title "title" output.check + howpublished address comma.connect parenthesize output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + format.authors output.nonnull + new.block + format.date "year" output.check + format.btitle.series "title" output.check + new.sentence + crossref missing$ + { format.editors output + new.block + format.publisher parenthesize output + format.bvolume.number output + new.sentence + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + new.sentence + format.pages output + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.date "year" output.check + format.in.ed.booktitle "title" output.check + new.sentence + crossref missing$ + { format.editors output + new.block + format.publisher parenthesize output + format.bvolume.number output + new.sentence + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + format.edition output + new.sentence + format.pages output + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + new.block + format.date "year" output.check + crossref missing$ + { format.btitle.series "title" output.check + format.editors output + new.block + organization publisher comma.connect + address comma.connect + parenthesize output + new.sentence + format.bvolume.number output + new.sentence + } + { format.incoll.inproc.crossref output.nonnull + } + if$ + format.pages output + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.date "year" output.check + format.btitle.series "title" output.check + author empty$ + { "" } + { organization } + if$ + publisher comma.connect address comma.connect parenthesize output + new.sentence + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.date "year" output.check + "Master's thesis" format.thesis.type output.nonnull + school address comma.connect parenthesize output + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + new.block + format.date output + format.title output + howpublished parenthesize output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.date "year" output.check + "Ph.D. thesis" format.thesis.type output.nonnull + school address comma.connect parenthesize output + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + format.editors.as.authors output + new.block + format.date output + format.btitle.series "title" output.check + organization publisher comma.connect + address comma.connect parenthesize output + format.bvolume.number output + new.sentence + format.pages output + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.date output + format.title "title" output.check + new.sentence + institution address comma.connect parenthesize output + new.sentence + format.tr.number output.nonnull + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.date output + format.title "title" output.check + new.block + note "note" output.check + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} + +READ + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * "}" * write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} diff --git a/vignettes/poppr_man.bib b/vignettes/poppr_man.bib index 15f98d66..b28d0731 100644 --- a/vignettes/poppr_man.bib +++ b/vignettes/poppr_man.bib @@ -1,4 +1,100 @@ -@article {Agapow:2001, +@Book{ggplot2, + author = {Hadley Wickham}, + title = {ggplot2: elegant graphics for data analysis}, + publisher = {Springer New York}, + year = {2009}, + isbn = {978-0-387-98140-6}, + url = {http://had.co.nz/ggplot2/book}, + } + +@article{goss2014irish, +author = {Goss, Erica M. and Tabima, Javier F. and Cooke, David E. L. and Restrepo, Silvia and Fry, William E. and Forbes, Gregory A. and Fieland, Valerie J. and Cardenas, Martha and Grünwald, Niklaus J.}, +title = {The Irish potato famine pathogen Phytophthora infestans originated in central Mexico rather than the Andes}, +year = {2014}, +doi = {10.1073/pnas.1401884111}, +volume = {111}, +pages={8791--8796}, +number={24}, +abstract ={Phytophthora infestans is a destructive plant pathogen best known for causing the disease that triggered the Irish potato famine and remains the most costly potato pathogen to manage worldwide. Identification of P. infestan’s elusive center of origin is critical to understanding the mechanisms of repeated global emergence of this pathogen. There are two competing theories, placing the origin in either South America or in central Mexico, both of which are centers of diversity of Solanum host plants. To test these competing hypotheses, we conducted detailed phylogeographic and approximate Bayesian computation analyses, which are suitable approaches to unraveling complex demographic histories. Our analyses used microsatellite markers and sequences of four nuclear genes sampled from populations in the Andes, Mexico, and elsewhere. To infer the ancestral state, we included the closest known relatives Phytophthora phaseoli, Phytophthora mirabilis, and Phytophthora ipomoeae, as well as the interspecific hybrid Phytophthora andina. We did not find support for an Andean origin of P. infestans; rather, the sequence data suggest a Mexican origin. Our findings support the hypothesis that populations found in the Andes are descendants of the Mexican populations and reconcile previous findings of ancestral variation in the Andes. Although centers of origin are well documented as centers of evolution and diversity for numerous crop plants, the number of plant pathogens with a known geographic origin are limited. This work has important implications for our understanding of the coevolution of hosts and pathogens, as well as the harnessing of plant disease resistance to manage late blight.}, +URL = {http://www.pnas.org/content/early/2014/05/29/1401884111.abstract}, +eprint = {http://www.pnas.org/content/early/2014/05/29/1401884111.full.pdf+html}, +journal = {Proceedings of the National Academy of Sciences} +} + +@article{excoffier1992analysis, + title={Analysis of molecular variance inferred from metric distances among DNA haplotypes: application to human mitochondrial DNA restriction data}, + author={Excoffier, L and Smouse, PE and Quattro, JM}, + journal={Genetics}, + volume={131}, + number={2}, + pages={479--491}, + url = {http://www.genetics.org/content/131/2/479.abstract}, + year={1992}, + publisher={Genetics Soc America} +} + +@article{nei1972genetic, + title={Genetic distance between populations}, + author={Nei, Masatoshi}, + journal={American naturalist}, + pages={283--292}, + year={1972}, + publisher={JSTOR} +} + +@article{nei1978estimation, + title={Estimation of average heterozygosity and genetic distance from a small number of individuals}, + author={Nei, Masatoshi}, + journal={Genetics}, + volume={89}, + number={3}, + pages={583--590}, + year={1978}, + publisher={Genetics Soc America} +} + +@article{edwards1971distances, + title={Distances between populations on the basis of gene frequencies}, + author={Edwards, AWF}, + journal={Biometrics}, + pages={873--881}, + year={1971}, + publisher={JSTOR} +} + +@article{reynolds1983estimation, + title={Estimation of the coancestry coefficient: basis for a short-term genetic distance}, + author={Reynolds, John and Weir, Bruce S and Cockerham, C Clark}, + journal={Genetics}, + volume={105}, + number={3}, + pages={767--779}, + year={1983}, + publisher={Genetics Soc America} +} + +% Optional fields: editor, volume/number, series, type, chapter, pages, address, edition, month, note +@INCOLLECTION{rogers1972measures, + author = {Rogers, J S}, + title = {Measures of genetic similarity and genetic distances}, + booktitle = {Studies in Genetics}, + publisher = {University of Texas Publishers}, + year = {1972}, + pages = {145--153}, + number = {7213} +} +@article{prevosti1975distances, + title={Distances between populations of \textit{{Drosophila subobscura}}, based on chromosome arrangement frequencies}, + author={Prevosti, Antoni and Oca{\~n}a, Jy and Alonso, G}, + journal={Theoretical and Applied Genetics}, + volume={45}, + number={6}, + pages={231--241}, + year={1975}, + publisher={Springer} +} + +@article {Agapow:2001, author = {Agapow, Paul-Michael and Burt, Austin}, title = {Indices of multilocus linkage disequilibrium}, journal = {Molecular Ecology Notes}, @@ -29,10 +125,10 @@ @article{Smith:1993 @article{Brown:1980, author = {Brown,A.H.D. and Feldman,M.W. and Nevo,E.}, -title = {MULTILOCUS STRUCTURE OF NATURAL POPULATIONS OF HORDEUM SPONTANEUM}, +title = {MULTILOCUS STRUCTURE OF NATURAL POPULATIONS OF \textit{{Hordeum spontaneum}}}, volume = {96}, -number = {2}, -pages = {523-536}, +number = {2}, +pages = {523--536}, year = {1980}, abstract ={The association of alleles among different loci was studied in natural populations of Hordeum spontaneum, the evolutionary progenitor of cultivated barley. The variance of the number of heterozygous loci in two randomly chosen gametes affords a useful measure of such association. The behavior of this statistic in several particular models is described. Generally, linkage (gametic phase) disequilibrium tends to increase the variance above the value expected under complete independence. This increase is greatest when disequilibria are such as to maximize the sum of squares of the two-locus gametic frequencies.—When data on several loci per individual are available, the observed variance may be tested for its agreement with that expected under the hypothesis of complete interlocus independence, using the sampling theory of this model. When applied to allozyme data from 26 polymorphic populations of wild barley, this test demonstrated the presence of geographically widespread multilocus organization. On average, the variance was 80% higher than expected under random association. Gametic frequencies for four esterase loci in both of these populations of wild barley and two composite crosses of cultivated barley were analyzed. Most generations of the composites showed less multilocus structure, as measured by the indices of association, than the wild populations.}, URL = {http://www.genetics.org/content/96/2/523.abstract}, @@ -43,7 +139,7 @@ @article{Brown:1980 @article{Jombart:2008, author = {Jombart, Thibaut}, -title = {adegenet: a R package for the multivariate analysis of genetic markers}, +title = {adegenet: a {R} package for the multivariate analysis of genetic markers}, volume = {24}, number = {11}, pages = {1403-1405}, @@ -57,7 +153,7 @@ @biomserv.univ-lyon1.frSupplementary @article{Jombart:2011, author = {Jombart, Thibaut and Ahmed, Ismaïl}, -title = {adegenet 1.3-1: new tools for the analysis of genome-wide SNP data}, +title = {adegenet 1.3-1: new tools for the analysis of genome-wide {SNP} data}, volume = {27}, number = {21}, pages = {3070-3071}, @@ -70,8 +166,8 @@ @imperial.ac.ukSupplementary } @article{Grunwald:2006, -title={Hierarchical Analysis of Diversity, Selfing, and Genetic Differentiation in Populations of the Oomycete Aphanomyces euteiches.}, -author={Grünwald, N.J. and Hoheisel, G.}, +title={Hierarchical Analysis of Diversity, Selfing, and Genetic Differentiation in Populations of the Oomycete \textit{{Aphanomyces euteiches}}.}, +author={Grünwald, N. J. and Hoheisel, G.}, journal={Phytopathology}, volume={96}, number={10}, @@ -184,14 +280,14 @@ @article{Peakall:2006 journal = {Molecular Ecology Notes}, number = {1}, pages = {288-295+}, - title = {{GENALEX} 6: genetic analysis in Excel. Population genetic software for teaching and research}, + title = {{GenAlEx} 6: genetic analysis in Excel. Population genetic software for teaching and research}, volume = {6}, year = {2006} } @article{Peakall:2012, author = {Peakall, Rod and Smouse, Peter E.}, -title = {GenAlEx 6.5: genetic analysis in Excel. Population genetic software for teaching and research—an update}, +title = {{GenAlEx} 6.5: genetic analysis in Excel. Population genetic software for teaching and research—an update}, volume = {28}, number = {19}, pages = {2537-2539}, @@ -213,7 +309,7 @@ @Manual{vegan @article{Haubold:2000, author = {Haubold, Bernhard and Hudson, Richard R.}, -title = {LIAN 3.0: detecting linkage disequilibrium in multilocus data}, +title = {{LIAN} 3.0: detecting linkage disequilibrium in multilocus data}, volume = {16}, number = {9}, pages = {847-849}, diff --git a/vignettes/poppr_manual.Rnw b/vignettes/poppr_manual.Rnw old mode 100755 new mode 100644 index 608210c2..091a06ae --- a/vignettes/poppr_manual.Rnw +++ b/vignettes/poppr_manual.Rnw @@ -1,7 +1,11 @@ \documentclass[letterpaper]{article} -%\VignetteIndexEntry{Poppr User Manual} +%\VignetteIndexEntry{Data import and manipulation} +%\VignetteEngine{knitr::knitr} \usepackage{graphicx} -\usepackage[colorlinks=true,urlcolor=blue]{hyperref} +\usepackage[colorlinks = true, + urlcolor = blue, + citecolor = blue, + linkcolor = blue]{hyperref} \usepackage{array} \usepackage{color} \usepackage[usenames,dvipsnames,svgnames,table]{xcolor} @@ -9,131 +13,311 @@ \usepackage{fullpage} \usepackage{mathtools} \usepackage{makeidx} -% \usepackage{lineno} -% Doublespacing. -% \usepackage{setspace} -% \setstretch{2} +\usepackage{longtable} % for bold symbols in mathmode \usepackage{bm} \newcommand{\R}{\mathbb{R}} -\newcommand{\beq}{\begin{equation}} -\newcommand{\eeq}{\end{equation}} \newcommand{\m}[1]{\mathbf{#1}} \newcommand{\tab}{\hspace*{1em}} +\newcolumntype{H}{>{\setbox0=\hbox\bgroup} c<{\egroup}@{}} +\newcommand{\cmdlink}[2]{% + \texttt{\hyperref[#1]{#2}}% +} +\newcommand{\seclink}[2]{% + \textsc{\hyperref[#1]{#2}}% +} + +\newcommand{\poppr}{\textit{poppr}} +\newcommand{\Poppr}{\textit{Poppr}} +\newcommand{\adegenet}{\textit{adegenet}} +\newcommand{\Adegenet}{\textit{Adegenet}} +\newcommand{\tline}{ + \noindent + \rule{\textwidth}{1pt} + \par +} +\newcommand{\bline}{ + \noindent + \rule{\textwidth}{1pt} + \kern1pt +} + +\newcommand{\jala}{ + \includegraphics[height = 5mm, keepaspectratio=true]{jalapeno-poppers} +} + +\newcommand{\revjala}{ + \scalebox{-1}[1]{\jala{}} +} -\title{Poppr 1.0.5: An R package for genetic analysis of populations with mixed (clonal/sexual) reproduction} -\author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) Department of Botany and Plant Pathology, Oregon State University, Corvallis, OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, Corvallis, OR}} +\title{Data import and manipulation in Poppr version 1.1.0} +\author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) +Department of Botany and Plant Pathology, Oregon State University, Corvallis, +OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, +Corvallis, OR}} \begin{document} % Set the width of figures. \setkeys{Gin}{width=0.5\textwidth} -\SweaveOpts{concordance=TRUE} -\definecolor{Sinput}{rgb}{0.75,0.19,0.19} -\definecolor{Soutput}{rgb}{0,0,0} -\definecolor{Scode}{rgb}{0.75,0.19,0.19} -\definecolor{light-gray}{gray}{0.95} + +<>= +knitr::opts_knit$set(out.format = "latex") +thm <- knitr::knit_theme$get("acid") +knitr::knit_theme$set(thm) +knitr::opts_chunk$set(concordance=TRUE) +knitr::opts_chunk$set(size = 'footnotesize', message = FALSE, warning = FALSE) +knitr::opts_chunk$set(out.width = '0.5\\linewidth', fig.align = "center", fig.show = 'asis') +@ + +<>= +print_command <- function(funk){ + fargs <- formals(funk) + + lapply(names(fargs), function(arg_name, fargs){ + arg <- fargs[[arg_name]] + if (missing(arg)){ + fargs[[arg_name]] <<- as.symbol(arg_name) + names(fargs)[names(fargs) == arg_name] <<- "" + } + }, fargs) + fargs$call <- as.symbol(funk) + fargs <- fargs[c(length(fargs), 1:(length(fargs) - 1))] + return(as.call(fargs)) +} +@ + +\definecolor{light-gray}{gray}{0.97} \definecolor{salmon}{HTML}{F0AAAA} -\DefineVerbatimEnvironment{Sinput}{Verbatim} -{formatcom={\color{Sinput}},fontsize=\footnotesize, baselinestretch=0.75} -\DefineVerbatimEnvironment{Soutput}{Verbatim} -{formatcom={\color{Soutput}},fontsize=\footnotesize, baselinestretch=0.75} -% The first page will have the title, abstract, and then the \textit{Poppr} logo at the bottom. -\maketitle -\begin{abstract} -\textit{Poppr} provides open-source, cross-platform tools for quick analysis of population genetic data enabling focus on data analysis and interpretation. While there are a plethora of packages for population genetic analysis, few are able to offer quick and easy analysis of populations with mixed reproductive modes. \textit{Poppr}'s main advantage is the ease of use and integration with other packages such as \textit{adegenet} and \textit{vegan}, including support for novel methods such as clone correction, multilocus genotype analysis, calculation of Bruvo's distance and the index of association. -\end{abstract} -% Inserting the \textit{Poppr} logo here -\begin{figure}[b] + +\maketitle +\begin{abstract} +\Poppr{} provides open-source, cross-platform tools for quick analysis of +population genetic data enabling focus on data analysis and interpretation. +While there are a plethora of packages for population genetic analysis, few are +able to offer quick and easy analysis of populations with mixed reproductive +modes. \Poppr{}'s main advantage is the ease of use and integration with other +packages such as \adegenet{} and \textit{vegan}, including support for novel +methods such as clone correction, multilocus genotype analysis, calculation of +Bruvo's distance and the index of association. +\end{abstract} +% Inserting the \Poppr{} logo here + +\begin{figure}[b] \centering - \label{logo} - \includegraphics{popprlogo} -\end{figure} -\newpage + \label{logo} + \includegraphics{popprlogo} +\end{figure} + +\newpage \begingroup -\hypersetup{linkcolor=black} -\tableofcontents -\endgroup + \hypersetup{linkcolor=black} + \tableofcontents +\endgroup %\linenumbers -\section{Introduction}\label{intro} -\subsection{Purpose}\label{intro:purpose} -\tab\tab\textit{Poppr} is an R package with convenient functions for analysis of genetic data with mixed modes of reproduction including sexual and clonal reproduction. While there are many R packages in CRAN and other repositories with tools for population genetic analyses, few are appropriate for populations with mixed modes of reproduction. There are several stand alone programs that can handle these types of data sets, but they are often platform specific and often only accept specific data types. Furthermore, a typical analysis often involves switching between many programs, and converting data to each specific format. +%=============================================================================% +%=============================================================================% +% +% +% +%=============================================================================% +%=============================================================================% +\section{Introduction} +\label{intro} -\textit{Poppr} is designed to make analysis of populations with mixed reproductive modes more streamlined and user friendly so that the researcher using it can focus on data analysis and interpretation. \textit{Poppr} allows analysis of haploid and diploid dominant/co-dominant marker data including microsattelites, Single Nucleotide Polymorphisms (SNP), and Amplified Fragment Length Polymorphisms (AFLP). To avoid creating yet another file format that is specific to a program, \textit{poppr} was created on the backbone of the popular R package \textit{adegenet} and can take all the file formats that \textit{adegenet} can take (Genpop, Genetix, Fstat, and Structure) and newly introduces compatibility with GenAlEx formatted files (exported to CSV). This means that anything you can analyze in \textit{adegenet} can be further analyzed with \textit{poppr}. +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Purpose} +\label{intro:purpose} + +\Poppr{} is an R package with convenient functions for analysis of +genetic data with mixed modes of reproduction including sexual and clonal +reproduction. While there are many R packages in CRAN and other repositories +with tools for population genetic analyses, few are appropriate for populations +with mixed modes of reproduction. There are several stand alone programs that +can handle these types of data sets, but they are often platform specific and +often only accept specific data types. Furthermore, a typical analysis often +involves switching between many programs, and converting data to each specific +format. + +\Poppr{} is designed to make analysis of populations with mixed reproductive +modes more streamlined and user friendly so that the researcher using it can +focus on data analysis and interpretation. \Poppr{} allows analysis of haploid +and diploid dominant/co-dominant marker data including microsattelites, Single +Nucleotide Polymorphisms (SNP), and Amplified Fragment Length Polymorphisms +(AFLP). To avoid creating yet another file format that is specific to a program, +\poppr{} was created on the backbone of the popular R package \adegenet{} and +can take all the file formats that \adegenet{} can take (Genpop, Genetix, Fstat, +and Structure) and newly introduces compatibility with GenAlEx formatted files +(exported to CSV). This means that anything you can analyze in \adegenet{} can +be further analyzed with \poppr{}. + +The real power of \poppr{} is in the data manipulation and analytic tools. +\Poppr{} has the ability to define multiple population hierarchies, clone- +censor, and subset data sets. With \poppr{} you can also quickly calculate +Bruvo's distance, the index of association, and easily determine which +multilocus genotypes are shared across populations. + +\subsection{Resources} + +This vignette will cover all of the material you need to know to efficiently +analyze data in \poppr{}. For information on methods of analysis (eg. index of +association, distance measures, AMOVA, ...), please read the manual pages +provided for each function. + +As \poppr{} expanded from version 1.0, the vignette also expanded to be 80+ +pages. As a result, it became clear that over 22,000 was less of a manual and +more of a novella with a terrible plot. To remedy this, this vignette will focus +only on data manipulation and a separate vignette, ``algo", has been written +to give algorithmic details of analyses introduced with \poppr{}. + +As of spring 2014, Drs. Niklaus J. Gr\"unwald, Sydney E. Everhart, and I have +co-authored a primer on using R for population genetic analysis. It is located +\href{http://grunwaldlab.cgrb.oregonstate.edu/popgen}{here} and the source code +can be found \href{https://github.com/grunwaldlab/Population_Genetics_in_R}{on +our github site}. + +\subsection{Getting Help} + +If you have any questions or feedback, feel free to send a message to the \poppr{} forum at \url{http://groups.google.com/group/poppr}. You can submit bug reports +there or on our github site: \url{https://github.com/grunwaldlab/poppr} + +\subsection{Acknowledgements} + +Much thanks goes to Sydney E. Everhart for alpha testing, beta testing, feature +requests, proofreading, data contribution, and moral support throught the writing +of this package and manual. Thanks also to Brian Knaus, Ignazio Carbone, David Cooke, +Corine Schoebel, Jane Stewart, and Zaid Abdo for beta testing and feedback. + +The following data sets are included in \poppr{}: \texttt{Pinf} \cite{goss2014irish}, \texttt{monpop} (Sydney E. Everhart, unpublished), +\texttt{Aeut} \cite{Grunwald:2006} + +\subsection{Citation} + +To cite \poppr{}, please type: + +<>= +citation(package = "poppr") +@ + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Installation} +\label{intro:install} -The real power of \textit{poppr} is in the data manipulation and analytic tools. \textit{Poppr} has the ability to define multiple population hierarchies, clone-censor, and subset data sets. With \textit{poppr} you can also quickly calculate Bruvo's distance, the index of association, and easily determine which multilocus genotypes are shared across populations. -\subsection{Installation}\label{intro:install} -\tab\tab This manual assumes that you have already installed R. If you have not, please refer to The CRAN home page at \url{http://cran.r-project.org/}. The author also recommends utilizing an R gui such as Rstudio (\url{http://www.rstudio.com/}) for a better R experience. +This manual assumes you have installed R. If you have not, +please refer to The CRAN home page at \url{http://cran.r-project.org/}. We also +recommend the Rstudio IDE (\url{http://www.rstudio.com/}), which allows the user +to view the R console, environment, scripts, and plots in a single window. \subsubsection{From CRAN} -\tab\tab To install \textit{poppr} from CRAN is as simple as selecting ``Package Installer" from the menu ``Packages \& Data" in the gui or by typing in your command line: +\tab\tab To install \poppr{} from CRAN, select ``Package +Installer" from the menu ``Packages \& Data" in the gui or type: + <>= install.packages("poppr", dependencies=TRUE) @ -If everything is working perfectly, all the dependencies (\textit{adegenet, pegas, vegan, ggplot2, phangorn, ape} and \textit{igraph}) should be installed. In the unfortunate case this does not work, consult \url{http://cran.r-project.org/doc/manuals/R-admin.html#Installing-packages}. + +All dependencies (\textit{adegenet, pegas, vegan, ggplot2, phangorn, ape, +reshape2} and \textit{igraph}) will also be installed. In the unfortunate case +this does not work, consult +\url{http://cran.r-project.org/doc/manuals/R-admin.html#Installing-packages}. \subsubsection{From Source} -\tab\tab The tarball for \textit{poppr} can be from CRAN: \url{http://cran.r-project.org/package=poppr}, the Gr\"unwald Lab website: \url{http://http://grunwaldlab.cgrb.oregonstate.edu/} under the \textsc{Resources} tab, or github at \url{https://github.com/grunwaldlab/poppr}. +\tab\tab The tarball for \poppr{} can be downloaded from CRAN: +\url{http://cran.r-project.org/package=poppr}, under the \textsc{Resources} tab +in the Gr\"unwald Lab website: +\url{http://http://grunwaldlab.cgrb.oregonstate.edu/}, or via github at +\url{https://github.com/grunwaldlab/poppr}. -Since \textit{poppr} contains C code, it needs to be compiled, which means that you need a working C compiler. If you are on Linux, you shouldn't have to worry too much about that, but if you are on Windows or OSX, you might need to download some special tools: +Since \poppr{} contains C code, it needs to be compiled, which means that you +need a working C compiler. If you are on Linux, you should have that, but if you +are on Windows or OSX, you might need to download some special tools: \begin{description} \item[Windows] Download Rtools: \url{http://cran.r-project.org/bin/windows/Rtools/} \item[OSX] Download Xcode: \url{https://developer.apple.com/xcode/} \end{description} -If you choose to install \textit{poppr} from a source file, you should first make sure to install all of the dependencies with the following command: +If you choose to install \poppr{} from a source file, you should first make sure +to install all of the dependencies with the following command: + <>= install.packages(c("adegenet", "pegas", "vegan", "ggplot2", "phangorn", "ape", "igraph")) @ -\textbf{If you want to install from github, you may skip to the next section.} +\textbf{If you want to install from github, skip to the next section.} -After installing dependencies, download the package to your computer and then you can install it with: +After installing dependencies, download the package to your computer and then +install it with: <>= install.packages("/path/to/poppr.tar.gz", type="source", repos=NULL) @ \subsubsection{From github} -\tab\tab Github is a repository where you can find all stable and development versions of \textit{poppr}. Installing from github requires a C compiler, so be sure to read the section above for instructions on how to obtain that if you aren't on a Linux system. - -To install from github, you do not need to actually download the tarball since there is a package called \textit{devtools} that will download and install the package for you directly from github. After you have installed all dependencies (see above section), you should download \textit{devtools}: +\tab\tab Github is a repository where you can find all stable and development +versions of \poppr{}. Installing from github requires a C compiler, so be sure +to read the section above for instructions on how to obtain that if you aren't +on a Linux system. + +To install from github, you do not need to download the tarball since +there is a package called \textit{devtools} that will download and install the +package for you directly from github. After you have installed all dependencies +(see above section), you should download \textit{devtools}: <>= install.packages("devtools") @ -Now you can execute the command \texttt{install\_github} with the user and repository name: +Now you can execute the command \texttt{install\_github} with the user and +repository name: <>= library(devtools) install_github(repo = "grunwaldlab/poppr") @ -If you are the adventurous type and are willing to test out unreleased versions of the package, you can also install the development version: +If you are the adventurous type and are willing to test out unreleased versions +of the package, you can also install the development version: <>= library(devtools) install_github(repo = "grunwaldlab/poppr", ref = "devel") @ -Users who install this version do so at their own risk. Since it is a development version, documentation may be rough or nonexistant for new functions. +Users who install this version do so at their own risk. Since it is a +development version, documentation may be incomplete or nonexistant for new +functions. -\subsection{Quick start}\label{intro:qstart} - -\tab\tab The author assumes that if you have reached this point in the manual, then you have successfully installed R and \textit{poppr}. Before proceeding, you should be aware that R is case sensitive. This means that the words ``Case" and ``case" are different from R's perspective. You should also know where your R package Library is located. +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Quick start} +\label{intro:qstart} + +\tab\tab The author assumes that if you have reached this point in the manual, +then you have successfully installed R and \poppr{}. Before proceeding, you +should be aware that R is case sensitive. This means that the words ``Case" and +``case" are different. You should also know where your R +package Library is located. \begin{center} - \fcolorbox{black}{light-gray}{ + \fcolorbox{light-gray}{light-gray}{ \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{What or where is my R package library?}}\\ -R is as powerful as it is through a community of people who submit extra code called ``Packages" to help it do specific things. These packages live in a certain place on your computer called an R library. You can find out where this library is by typing -.libPaths() + {\large \textsc{What or where is my R package library?}}\\ + R is as powerful as it is through a community of people who submit + extra code called ``Packages" to help it do specific things. These + packages live in a certain place on your computer called an R library. + You can find out where this library is by typing \texttt{.libPaths()} \end{minipage} } \end{center} -Importing a file into R involves you knowing the path to your file and then typing that into R's console. \texttt{getfile()} will help provide a point and click interface for selecting a file. There are two steps: +Importing a file into R involves you knowing the path to your file and then +typing that into R's console. \texttt{getfile()} will help provide a point and +click interface for selecting a file. There are two steps: <>= library(poppr) x <- list(files="/path/to/R/poppr/files/rootrot.csv", path="/path/to/R/poppr/files") @ -Before you do anything, you'll want to tell your computer to search R's library to find the \textit{poppr} and load the package: +Tell your computer to search R's library to find the \poppr{} and load the package: <>= library(poppr) @ @@ -141,50 +325,79 @@ After that, you can use \texttt{getfile()} <>= x <- getfile() @ -At this point, a pop up window will appear like this\footnote{This window sometimes appears behind your current session of R, depending on the GUI and you will have to toggle to this window}: +A pop up window will appear like this\footnote{This window +sometimes appears behind your current session of R, depending on the GUI and you +will have to toggle to this window}: \begin{figure}[h!] \centering \caption{\footnotesize \footnotesize A popup window as it appears in OSX (Mountain Lion).} \label{getfile window} -\includegraphics{getfile.png} + \includegraphics{getfile} \end{figure} +\newpage \begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Hey! My window doesn't look like that!}}\\ -Now, this window will not match up to your window on your computer because you will probably not be in the right directory. Remember the first path in \texttt{.libPaths()}? Move to a folder called \textbf{poppr} in that path. In that folder, you will find another folder called \textbf{files}. Move there and your window will match the one displayed. - \end{minipage} + \fcolorbox{light-gray}{light-gray}{ + \begin{minipage}[t]{0.8\textwidth} + {\large \textsc{Hey! My window doesn't look like that!}}\\ + Now, this window will not match up to your window on your computer + because you will probably not be in the right directory. Remember the + first path in \texttt{.libPaths()}? Move to a folder called + \textbf{poppr} in that path. In that folder, you will find another + folder called \textbf{files}. Move there and your window will match the + one displayed. + \end{minipage} } \end{center} -\newpage -We can navigate throughout your entire computer through this little window and tell R where to go. The example I'm using goes to your R library directory. If you don't know where that is, you can find it by typing \texttt{.libPaths()} into the R command line. -Once we select a file, the file name and its path will be stored in the variable, x. We can confirm what we selected by simply typing \texttt{x} into R's command line. + +We can navigate throughout your entire computer through this window and tell R +where to go. The example I'm using goes to your R library directory where +\poppr{} is stored. If you don't know where that is, you can find it by typing +\texttt{find.package('poppr')} into the R command line. Once we select a file, +the file name and its path will be stored in the variable, x. We can confirm +that by typing \texttt{x} into R's command line. <>= x @ -Here we can see that \texttt{x} is a list with two entries: \texttt{\$files} giving you the files you selected and \texttt{\$path} giving you the path to those files. +Here we can see that \texttt{x} is a list with two entries: \texttt{\$files} +shows the files you selected and \texttt{\$path} shows the path to +those files. \begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Not sure what I mean by path or working directory?}}\\ -For anyone who has never used a command line, this is a new concept. -You can think of the path as an address. So instead of \texttt{"/path/to/R"}, you could have \texttt{"/USA/Oregon/Corvallis"}. Or on your computer, it could be \texttt{"C:/users/poppr-user/R/win-library/2.15"} on Windows (where "poppr-user" is your username) or \texttt{"/Library/Frameworks/R.framework/Versions/2.15/Resources/library"} on OSX. Each slash represents a folder that you would click through when you are using the mouse. - -A working directory is simply the folder that R is working in. It is where you can access and write files. When you tell R to read a file, it will only look for that file in your working directory. Note that you will not endanger your files by reading them into R. R works by making a copy of the file into memory. This means that you can manipulate the data in any way that you want without ever losing the content.\\ - -To find out your current working directory, type \texttt{getwd()} into the R console. Usually, you will start off a session in your "home" directory, which will look like this: \texttt{"$\sim$/"}. -The command \texttt{setwd()} will change your working directory to any place of your choice on your computer as indicated by the path that you provide. -For more information, see Quick R at \url{http://www.statmethods.net}. - \end{minipage} - } + \fcolorbox{light-gray}{light-gray}{ + \begin{minipage}[t]{0.8\textwidth} + {\large \textsc{Not sure what I mean by path or working directory?}}\\ + For anyone who has never used a command line, this is a new concept. You + can think of the path as an address. So instead of + \texttt{"/path/to/R"}, you could have \texttt{"/USA/Oregon/Corvallis"}. + Or on your computer, it could be + \texttt{"C:/users/poppr-user/R/win-library/3.1"} on Windows (where ``poppr-user" + is your username) or + \texttt{"/Library/Frameworks/R.framework/Versions/3.1/Resources/library"} + on OSX. Each slash represents a folder that you would click on when + you are using the mouse. + + A working directory is the folder that R is working in. It is where + you can access and write files. When you tell R to read a file, it will + only look for that file in your working directory. Note that you will not + endanger your files by reading them into R. R works by making a copy of + the file into memory. This means that you can manipulate the data in any + way that you want without ever changing the original file.\\ + + To find what your current working directory is set to, type + \texttt{getwd()} into the R console. Usually, you will start off a session + in your ``home" directory, which will look like this: \texttt{"$\sim$/"}. + The command \texttt{setwd()} will change your working directory to any + folder of your choice on your computer as indicated by the path that you + provide. For more information, see Quick R at + \url{http://www.statmethods.net}. + \end{minipage} + } \end{center} -% Now we can set our working directory. That is, we can tell R to go to the folder that contains all of our data with the \texttt{setwd} command. -% <>= -% setwd(x$path) -% @ -We will use \texttt{x\$files} to access the file. The \texttt{poppr()} function provides a quick and convenient first analysis of your data directly from the file on the your disk (For information on importing your data into R, see section \ref{intro:import}, \textit{Get out of my dreams and into my R}). +We will use \texttt{x\$files} to access the file. The \texttt{poppr()} function +provides a simple first analysis of your data directly from the file on the your +disk (For information on importing your data into R, see +section \seclink{intro:import}{Get out of my dreams and into my R.}) <>= popdata <- poppr(x$files) @ @@ -192,50 +405,97 @@ popdata <- poppr(x$files) options(width=90) popprsoutput <- poppr(system.file("files/rootrot.csv", package="poppr")) @ -The output of \texttt{poppr()} was assigned to the variable \texttt{popdata}, so let's look at the data. +The output of \texttt{poppr()} was assigned to the variable \texttt{popdata}, so +let's look at the data. <>= popdata @ <>= popprsoutput @ -One thing to note about this output is the \texttt{NaN} in the column labeled \texttt{SE}. This is produced from calculation of a standard error based on rarefaction analysis. Occasionally, this calculation will encounter a situation in which it must attempt to take a square root of a negative number. As you no doubt have learned in high school mathematics, the root of any negative number is not defined in the set of real numbers, and must have an imaginary component, $i$. Unfortunately, R is a computer program without any imagination and, thus imaginary numbers cannot be represented. To account for this, R represents the square roots of negatives as ``not a number" or \texttt{NaN}.\\ The fields you see in the output include: \begin{itemize} - \item \texttt{Pop -} Population name (Note that ``Total" also means ``Pooled"). + \item \texttt{Pop -} Population name (Note that ``Total" also means + ``Pooled"). \item \texttt{N -} Number of individuals observed. \item \texttt{MLG -} Number of multilocus genotypes (MLG) observed. - \item \texttt{eMLG -} The number of expected MLG at the smallest sample size $\geq 10$ based on rarefaction. \cite{Hurlbert:1971} + \item \texttt{eMLG -} The number of expected MLG at the smallest sample size + $\geq 10$ based on rarefaction. \cite{Hurlbert:1971} \item \texttt{SE -} Standard error based on \texttt{eMLG} \cite{Heck:1975} \item \texttt{H -} Shannon-Wiener Index of MLG diversity. \cite{Shannon:1948} - \item \texttt{G -} Stoddart and Taylor's Index of MLG diversity. \cite{Stoddart:1988} - \item \texttt{Hexp -} Nei's 1978 genotypic diversity (corrected for sample size), or Expected Heterozygosity. \cite{Nei:1978} - \item \texttt{E.5 -} Evenness, $E_5$. \cite{Pielou:1975}\cite{Ludwig:1988}\cite{Grunwald:2003} - \item \texttt{Ia -} The index of association, $I_A$. \cite{Brown:1980} \cite{Smith:1993} \cite{Agapow:2001} - \item \texttt{rbarD -} The standardized index of association, $\bar r_d$. \cite{Agapow:2001} + \item \texttt{G -} Stoddart and Taylor's Index of MLG diversity. + \cite{Stoddart:1988} + \item \texttt{Hexp -} Nei's 1978 genotypic diversity (corrected for sample + size), or Expected Heterozygosity. \cite{Nei:1978} + \item \texttt{E.5 -} Evenness, $E_5$. + \cite{Pielou:1975}\cite{Ludwig:1988}\cite{Grunwald:2003} + \item \texttt{Ia -} The index of association, $I_A$. \cite{Brown:1980} + \cite{Smith:1993} \cite{Agapow:2001} + \item \texttt{rbarD -} The standardized index of association, $\bar r_d$. + \cite{Agapow:2001} \end{itemize} -These fields are further described in section \ref{summary}, \textit{I know what you did last summary table} at the end of this vignette. - -\subsection{Get out of my dreams and into my R \{importing data into poppr\}}\label{intro:import} -There are several ways of reading data into R. -\subsubsection{Function: getfile}\label{intro:import:getfile} -\tab\tab \texttt{getfile} gives the user an easy way to point R to the directory in which your data is stored. It is only meant for R GUIs such as Rstudio. Using this on the command line has very little advantage over setting the working directory manually. +These fields are further described in the function \texttt{poppr}. You can +access the help page for \texttt{poppr} by typing \texttt{?poppr} in your R +console. + +One thing to note about this output is the \texttt{NaN} in the column labeled +\texttt{SE}. In R, \texttt{NaN} means ``Not a number''. This is produced from +calculation of a standard error based on rarefaction analysis. Occasionally, +this calculation will encounter a situation in which it must attempt to take a +square root of a negative number. Since the root of any negative number is not +defined in the set of real numbers, it must therefore have an imaginary +component, $i$. Unfortunately, R will not represent the imaginary components of +numbers unless you specifically tell it to do so. By default, R represents these +as \texttt{NaN}.\\ + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Importing data into poppr \{Get out of my dreams and into my R\}} +\label{intro:import} + + +There are several ways of reading data into R. One way is using the function +\texttt{getfile}. +\subsubsection{Function: getfile} +\label{intro:import:getfile} +\tab\tab \texttt{getfile} gives the user an easy way to point R to the directory +in which your data is stored. It is only meant for R GUIs such as Rstudio. Using +this on the command line has little advantage over setting the working +directory manually.\\ +\tline{} \begin{quote} -Default Command:\\ -\texttt{getfile(multi = FALSE, pattern = NULL, combine = TRUE)} +Default Command: +% \texttt{getfile(multi = FALSE, pattern = NULL, combine = TRUE)} +<>= +funk <- "getfile" +print_command(funk) +@ \end{quote} \begin{itemize} - \item \texttt{multi -} This is normally set to \texttt{FALSE}, meaning that it will only grab the file you selected. If it's \texttt{TRUE}, it will grab all files within the directory, constrained only by what you type into the \texttt{pattern} field. - \item \texttt{pattern -} A pattern that you want to filter the files you get. This accepts regular expressions, so you must be careful with anything that is not an alphanumeric character. - \item \texttt{combine -} This tells \texttt{getfile} to combine the path and all the files. This is set to \texttt{TRUE} by default so that you can access your files no matter what working directory you are in. + \item \texttt{multi -} This is normally set to \texttt{FALSE}, meaning that it + will only grab the file you selected. If it's \texttt{TRUE}, it will grab all + files within the directory, constrained only by what you type into the + \texttt{pattern} field. + \item \texttt{pattern -} A pattern that you want to filter the files you get. + This accepts regular expressions, so you must be careful with anything that is + not an alphanumeric character. + \item \texttt{combine -} This tells \texttt{getfile} to combine the path and + all the files. This is set to \texttt{TRUE} by default so that you can access + your files no matter what working directory you are in. \end{itemize} +\bline{} -This method works for a single file, but let's say you had a lot of data sets you wanted to import. You would have to do all of these one by one, right? Not so. \texttt{getfile} has a nice little flag called \texttt{multi} telling the computer that you want to grab multiple files in the folder. You would use this with \texttt{poppr.all} to produce a summary table for all of your files\footnote{These files do not need to be similar in any way to do this analysis}: +This method works for a single file, but let's say you had a lot of data sets +you wanted to import. Instead of doing these one-by-one, \texttt{getfile} has a +flag called \texttt{multi} telling the computer that you want to grab multiple +files in the folder: <>= x <- getfile(multi=TRUE) @ -A window would pop up again, and you should navigate to the same directory as you had before, and select any of the files in that directory. +A window would pop up again, and you should navigate to the same directory as +you had before, and select any of the files in that directory. <>= x @ @@ -244,27 +504,41 @@ x$files <- list.files(dirname(system.file("files/rootrot.csv", package="poppr")) x$files <- paste(x$path, x$files, sep="/") x @ -As you can see, now all of the files that existed in that directory are there! Now you can look at all those files at once! +As you can see, now all of the files that existed in that directory are there! +Now you can look at all those files at once! We will use \texttt{poppr.all} to +produce a summary table for all of your files\footnote{These files do not need +to be similar in any way to do this analysis}. Let's set \texttt{digits = 2} to +only print 2 significant digits. <>= -poppr.all(x$files) +all_files <- poppr.all(x$files) +print(all_files, digits = 2) @ <>= -poppr.all(c(system.file("files/rootrot.csv", package="poppr"), system.file("files/rootrot2.csv", package="poppr"), system.file("files/simulated.dat", package="poppr"))) -@ - -You've seen examples of how to use \texttt{getfile} to extract a single file and all the files in a directory, but what if you wanted many files, but only wanted ones that were of a certain type or had a certain name? This is what you would use the \texttt{pattern} argument for. -A perfect use would be the example data contained in the \textit{adegenet} package. Let's take a look at the names of these files. +print(poppr.all(c(system.file("files/rootrot.csv", package="poppr"), + system.file("files/rootrot2.csv", package="poppr"), + system.file("files/simulated.dat", package="poppr"))), + digits = 2) +@ + +You've seen examples of how to use \texttt{getfile} to extract a single file and +all the files in a directory, but what if you wanted many files only of a +certain type or with a certain name? This is what you would use the +\texttt{pattern} argument for. For example, there are several data files with +different formats in the \adegenet{} folder in your R library. Let's take a look +at the names of these files. \begin{center} - \fcolorbox{black}{salmon}{ - \begin{minipage}[t]{0.8\textwidth} - For the rest of this section, remember that every time you invoke \texttt{getfile()}, a window will pop up and you should select a file before hitting enter. - \end{minipage} - } + \fcolorbox{light-gray}{salmon}{ + \begin{minipage}[t]{0.8\textwidth} + \jala{} For the rest of this section, remember that every time you invoke + \texttt{getfile()}, a window will pop up and you should select a file + before hitting enter. + \end{minipage} + } \end{center} <>= getfile(multi=TRUE) @ -Navigate to the \textit{adegenet} folder in your R library. +Navigate to the \adegenet{} folder in your R library. <>= nancylist <- dir(dirname(system.file("files/nancycats.gtx",package="adegenet"))) list( @@ -273,7 +547,12 @@ list( ) @ -We can see that we have a mix of files with different formats. If we tried to run all of these files using poppr, we would have a problem because some of the file formats have no direct import into a \texttt{genind} object (*.fasta, or *.snp), or just simply are not supported (eg. *.rda files). We want to be able to filter these files out, and we will do so with the \texttt{pattern} argument. Let's say we only wanted the files that have the word "nancy" in them. +We can see that we have a mix of files with different formats. If we tried to +run all of these files using poppr, we would have a problem because some of the +file formats have no direct import into a \texttt{genind} object (*.fasta, or +*.snp), or just simply are not supported (eg. *.rda files). To filter these +files, use the \texttt{pattern} argument. Let's say we only wanted the files +that have the word ``nancy'' in them. <>= getfile(multi=TRUE, pattern="nancy") @ @@ -307,7 +586,16 @@ list( ) @ -Uh-oh. We've run into a problem. Three out of our four files are not FSTAT files. Why did this happen? It happened because they happen to have \texttt{"dat"} within their name. This problem can be solved, by using regular expressions. If you are unfamiliar with regular expressions, you can think of them as special characters that you can use to make your search pattern more strict or more flexible. Since the topic of regular expressions can take up several lectures, I will spare you the gory details. For this situation, the only one you need to know is ``\texttt{\$}". The dollar sign indicates the end of a word or string. If we want specific file extensions all we have to do is add this to the end of the search term like so: +Uh-oh. We've run into a problem. Three out of our four files are not FSTAT +files. Why did this happen? It happened because they happen to have +\texttt{"dat"} within their name. This problem can be solved, by using regular +expressions. If you are unfamiliar with regular expressions, you can think of +them as special characters that you can use to make your search pattern more +strict or more flexible. Since the topic of regular expressions can take up +several lectures, I will spare you the gory details. For this situation, the +only one you need to know is ``\texttt{\$}". The dollar sign indicates the end +of a word or string. If we want specific file extensions all we have to do is +add this to the end of the search term like so: <>= getfile(multi=TRUE, pattern="dat$") @ @@ -319,109 +607,224 @@ list( ) @ Now we have our FSTAT file! -\subsubsection{Function: read.genalex}\label{intro:import:read.genalex} - -\tab\tab A very popular program for population genetics is GenAlEx (\url{http://biology.anu.edu.au/GenAlEx/Welcome.html}) \cite{Peakall:2012, Peakall:2006}. GenAlEx runs within the Excel environment and can be very powerful in its analyses. \textit{Poppr} has added the ability to read *.CSV files\footnote{*.CSV files are comma separated files that are easily machine readable.} produced in the GenAlEx format. It can handle data types containing regions and geographic coordinates, but currently it cannot import allelic frequency data from GenAlEx. All the user has to do is to export a single sheet of GenAlEx data from Excel into a *.CSV file, and the \textit{poppr} function \texttt{read.genalex} will import it into \textit{adegenet}'s \texttt{genind} object (more information on that below). For ways of formatting a GenAlEx file, see the manual here: \url{http://biology.anu.edu.au/GenAlEx/Download_files/GenAlEx\%206.5\%20Guide.pdf} -\begin{quote} -Default Command:\\ -\texttt{read.genalex(genalex, ploidy = 2, geo = FALSE, region = FALSE)} -\end{quote} -\begin{itemize} - \item \texttt{genalex -} a *.CSV file exported from GenAlEx on your disk (For example: \texttt{"my\_genalex\_file.csv"}). - \item \texttt{ploidy -} a number indicating the ploidy for the data set (eg 2 for diploids, 1 for haploids). - \item \texttt{geo -} GenAlEx allows you to have geographic data within your file. To do this for \textit{poppr}, you will need to follow the first format outlined in the GenAlEx manual and place the geographic data AFTER all genetic and demographic data with one blank column separating it (See the GenAlEx Manual for details). If you have geographic information in your file, set this flag to \texttt{TRUE} and it will be included within the resulting genind object in the \texttt{@other} slot. (If you don't know what that is, don't worry. It will be explained later in section \ref{intro:genind:other}). - \item \texttt{region -} To format your GenAlEx file to include regions along with your populations, You can choose to include a separate column for regional data, or, since regional data must be in contiguous blocks, you can simply format it in the same way you would any other data (see the GenAlEx manual for details). If you have your file organized in this manner, select this option and the regional information will be stored in the resulting genind object in the \texttt{@other} slot. -\end{itemize} - -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{If you are unfamiliar with exporting data from Excel}}\\ -\begin{enumerate} - \item Click the Microsoft Office Button in the top left corner of Excel. (Or go to the File menu if you have an older version) - \item Click Save As... - \item In the ``Save as type" drop down box, select CSV (comma delimited). -\end{enumerate} - \end{minipage} - } -\end{center} - -Note that regional data and geographic data are not mutually exclusive. You can have both in one file, just make sure that they are on the same sheet and that the geographic data is always placed after all genetic and demographic data. - -We have a short example of genalex formatted data with no geographic or regional formatting. We will first see where the data is using the command \texttt{system.file()} -<>= -system.file("files/rootrot.csv", package="poppr") -@ -<>= -paste("/path/to/R/library/poppr/files/rootrot.csv") -@ -Now import the data into \textit{poppr} like so: -<>= -rootrot <- read.genalex(system.file("files/rootrot.csv", package="poppr")) -@ -Executing rootrot shows that this file is now in genind format (ie. the format required by \textit{poppr} and \textit{adegenet}). -<>== -rootrot -@ - -\subsubsection{Genalex formatting shortcuts}\label{intro:import:genalex.short} - -\tab\tab The GenAlEx format is a nice way to import data because it allows you to have geographic coordinates and two hierarchical levels of sampling (Region and population). If you have multiple levels of hierarchy, you will need to code them so that you combine multiple columns of hierarchy into one using a common separator (For an example, see section \ref{data.manip:hier:splitcombine} of this manual). A problem arises when it becomes more work than it's worth to do that since, for the GenAlEx format, you must provide the sizes of each population in the header. Here, I'll show you a simple way to circumvent that. First, let's use the microbov data set from \textit{adegenet} (for details, type \texttt{help("microbov")} into your R console). It contains three demographic factors: Country, Species and Breed contained within the \texttt{@other} slot (detailed in section \ref{intro:genind:other}). We will combine these and save the file to our desktop. We will cover these functions later in this manual. For now, just know they exist. +\subsubsection{Function: read.genalex} +\label{intro:import:read.genalex} + +\tab\tab A very popular program for population genetics is GenAlEx +(\url{http://biology.anu.edu.au/GenAlEx/Welcome.html}) +\cite{Peakall:2012, Peakall:2006}. GenAlEx runs within the Excel environment and +can be very powerful in its analyses. \Poppr{} has added the ability to read +*.CSV files\footnote{*.CSV files are comma separated files that are easily +machine readable.} produced in the GenAlEx format. It can handle data types +containing regions and geographic coordinates, but currently cannot import +allelic frequency data from GenAlEx. Using the \poppr{} function +\texttt{read.genalex} will import your data into \adegenet{}'s \texttt{genind} +object or \poppr{}'s \texttt{genclone} object (more information on that below). +For ways of formatting a GenAlEx file, see the manual here: +\url{http://biology.anu.edu.au/GenAlEx/Download_files/GenAlEx\%206.5\%20Guide.pdf} + + +Below is an example of the GenAlEx format. We will use the data set called +\texttt{microbov} from the \adegenet{} package to generate it. The data contains +three demographic factors: Country, Species and Breed contained within the +\texttt{@other} slot (detailed in \seclink{intro:genind:other}{The other slot}). +We will combine these and save the file to the desktop. Details of these +functions are presented elsewhere in this manual. <>= library(poppr) data(microbov) -microbov@other$population_hierarchy <- data.frame(list(Country = microbov@other$coun, - Species = microbov@other$spe, Breed = microbov@other$breed)) -microbov <- splitcombine(microbov, method=2, hier=c("Country", "Species", "Breed")) +microbov <- as.genclone(microbov) +sethierarchy(microbov) <- data.frame(other(microbov)) +setpop(microbov) <- ~coun/breed/spe genind2genalex(microbov, file="~/Desktop/microbov.csv") @ <>= cat("Extracting the table ... Writing the table to ~/Desktop/microbov.csv ... Done.") @ -After we do this, we can open the file in our favorite spreadsheet editor and see the following image. \setkeys{Gin}{width=\textwidth} \begin{figure}[h!] \centering - \caption{\footnotesize \footnotesize The first 15 individuals and 4 loci of the microbov data set. The first column contains the individual names, the second column contains the population names, and each subsequent column represents microsatellite genetic data. Highlighted in red is a list of populations and their relative sizes.} + \caption{\footnotesize \footnotesize The first 15 individuals and 4 loci of + the microbov data set. The first column contains the individual names, the + second column contains the population names, and each subsequent column + represents microsatellite genetic data. Highlighted in red is a list of + populations and their relative sizes.} \label{microbov unmodified} \includegraphics{unmod_dat} \end{figure} +The GenAlEx format contains individuals in rows and loci in columns. Individual +data begins at row 4. Column A always contains individual names and column B +defines the population of each individual. Notice here that the three +demographic factors from the data have been concatenated with a ``\_''. This +allows us to import more than one population factor to use as hierarchical +levels in a \seclink{intro:genclone}{genclone object.} + \newpage -All that \textit{poppr} needs from the first header row are the first three numbers (unless you are including regional data, but it's not terribly necessary with the hierarchical support \textit{poppr} provides.), which represent the number of loci, individuals, and populations, respectively. After that, you have counts of individuals per population in each subsequent cell. For \textit{poppr}, These cells don't matter because we already have that information in column 2. -If you have a large data set with many population levels, you can use the following shortcut by setting the number in the third cell to 1. The number in cell 4 is arbitrary (but must be there). In the following figure, it is set to the number of individuals in your data set, but can easily be replaced with any other number (perhaps your favorite number?). +The First three rows contain information pretaining to the global data set. The +only important information for \poppr{} is the information contained in row 3 +and the first three columns of row 1. + +\begin{table}[h!] +\centering +\begin{tabular}{|l|l|l|l|ll|} + \hline + & \textbf{A} & \textbf{B} & \textbf{C} & \textbf{D} & \\ + \hline + \textbf{1} & \# of Loci & \# of Individuals & \# of Populations & Pop1 Size & ...\\ + \hline + \textbf{2} & - & - & - & Pop1 Name & ...\\ + \hline + \textbf{3} & - & - & Locus 1 & ... & \\ + \hline +\end{tabular} +\end{table} + +Highlighted in red in figure \ref{microbov unmodified} are definitions of the +number of populations and their respective sizes. As this is redundant +information, we can remove it. Below is an example of a valid data set that can +be imported into \poppr{}. \begin{figure}[h!] \centering - \caption{\footnotesize \footnotesize The first 15 individuals and 4 loci of the microbov data set. This is the same figure as above, however the populations and counts have been removed from the header row and the third number in the header has been replaced by 1.} + \caption{\footnotesize \footnotesize The first 15 individuals and 4 loci of + the microbov data set. This is the same figure as above, however the + populations and counts have been removed from the header row and the third + number in the header has been replaced by 1.} + \label{microbov modified} \includegraphics{mod_dat} \end{figure} \setkeys{Gin}{width=0.5\textwidth} -\subsubsection{Other ways of importing data}\label{intro:import:other} +All GenAlEx formatted data can be imported with the command \texttt{read.genalex}, +detailed below: +\newpage +\tline{} +\begin{quote} +Default Command: +<>= +funk <- "read.genalex" +print_command(funk) +@ +\end{quote} +\begin{itemize} + \item \texttt{genalex -} a *.CSV file exported from GenAlEx on your disk (For + example: \texttt{"my\_genalex\_file.csv"}). + \item \texttt{ploidy -} a number indicating the ploidy for the data set (eg 2 + for diploids, 1 for haploids). + \item \texttt{geo -} GenAlEx allows you to have geographic data within your + file. To do this for \poppr{}, you will need to follow the first format + outlined in the GenAlEx manual and place the geographic data AFTER all genetic + and demographic data with one blank column separating it (See the GenAlEx + Manual for details). If you have geographic information in your file, set this + flag to \texttt{TRUE} and it will be included within the resulting genind + object in the \texttt{@other} slot. (If you don't know what that is, don't + worry. It will be explained later in \seclink{intro:genind:other}{The other + slot.}) + \item \texttt{region -} To format your GenAlEx file to include regions, you + can choose to include a separate column for regional data, or, since regional + data must be in contiguous blocks, you can simply format it in the same way + you would any other data (see the GenAlEx manual for details). If you have + your file organized in this manner, select this option and the regional + information will be stored in the \texttt{@other} slot of the resulting + genind object or be incorporated into the hierarchy of the genclone object. + \item \texttt{genclone -} This flag will convert your data into a + \texttt{genclone} object (see \seclink{intro:genclone}{Send in the clones} for + more info). + \item \texttt{sep -} The separator argument for columns in your data. It + defaults to ``,". +\end{itemize} +\bline{} +\begin{center} + \fcolorbox{light-gray}{light-gray}{ + \begin{minipage}[t]{0.8\textwidth} + {\large \textsc{If you are unfamiliar with exporting data from Excel}}\\ + \begin{enumerate} + \item Click the Microsoft Office Button in the top left corner of Excel. + (Or go to the File menu if you have an older version) + \item Click Save As... + \item In the ``Save as type" drop down box, select CSV (comma delimited). + \end{enumerate} + \end{minipage} + } +\end{center} + +Note that regional data and geographic data are not mutually exclusive. You can +have both in one file, just make sure that they are on the same sheet and that +the geographic data is always placed after all genetic and demographic data. + +We have a short example of genalex formatted data with no geographic or regional +formatting. We will first see where the data is using the command +\texttt{system.file()} +<>= +system.file("files/rootrot.csv", package="poppr") +@ +<>= +paste("/path/to/R/library/poppr/files/rootrot.csv") +@ +Now import the data into \poppr{} like so: +<>= +rootrot <- read.genalex(system.file("files/rootrot.csv", package="poppr")) +@ +Executing rootrot shows that this file is now in genclone format and can be used +with any function in \poppr{} and \adegenet{} +<>= +rootrot +@ + + +\subsubsection{Other ways of importing data} +\label{intro:import:other} + +\tab\tab \Adegenet{} already supports the import of FSTAT, Structure, Genpop, +and Genetix formatted files, so if you have data in those formats, you can +import them using the function \texttt{import2genind}. For sequence data, check +if you can use \texttt{read.dna} from the \textit{ape} package to import your +data. If you can, then you can use the \adegenet{} function +\texttt{DNAbin2genind}. If you don't have any of these formats handy, you can +still import your data using R's \texttt{read.table} along with +\texttt{df2genind} from \adegenet{}. For more information, see \adegenet{}'s +``Getting Started" vignette. -\tab\tab \textit{Adegenet} already supports the import of FSTAT, Structure, Genpop, and Genetix formatted files, so if you have those formats, you can import them using the function \texttt{import2genind}. For sequence data, check if you can use \texttt{read.dna} from the \textit{ape} package to import your data. If you can, then you can use the \textit{adegenet} function \texttt{DNAbin2genind}. If you don't have any of these formats handy, you can still import your data using R's \texttt{read.table} along with \texttt{df2genind} from \textit{adegenet}. For more information, see \textit{adegenet}'s ``Getting Started" vignette. +\subsubsection{Function: genind2genalex} +\label{intro:import:genind2genalex} -\subsubsection{Function: genind2genalex}\label{intro:import:genind2genalex} +\tab\tab Of course, being able to export data is just as useful as being able to +import it, so we have this handy little function that will write a GenAlEx +formatted file to wherever you desire.\\ +\textbf{WARNING: This will overwrite any file that exists with the same name.} -\tab\tab Of course, being able to export data is just as useful as being able to import it, so we have this handy little function that will write a GenAlEx formatted file to wherever you desire.\\ -WARNING: This will overwrite any file that exists with the same name. +\tline{} \begin{quote} -Default Command:\\ -\texttt{genind2genalex(pop, filename = "genalex.csv", quiet = FALSE)} +Default Command: +<>= +funk <- "genind2genalex" +print_command(funk) +@ +% \texttt{genind2genalex(pop, filename = "genalex.csv", quiet = FALSE)} \end{quote} \begin{itemize} \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{filename -} This is where you specify where you want the file to go. If you simply type the file name, it will deposit the file in the directory R is currently in. If you don't know what directory you are in, you can type \texttt{getwd()} to find out. - \item \texttt{quiet -} If this is set to \texttt{FALSE}, a message will be printed to the screen. - \item \texttt{geo -} This is set to \texttt{FALSE} by default. If it is set to \texttt{TRUE}, then that means you have a data frame or matrix in the \texttt{@other} slot of your genind object that contains geographic coordinates for all individuals or all populations. Setting this to \texttt{TRUE} means that you want the resulting file to have two extra columns at the end of your file with geographic coordinates. - \item \texttt{geodf -} The name of the data frame or matrix containing the geographic coordinates. The default is \texttt{geodf = "xy"}. + \item \texttt{filename -} This is where you specify the path to the new file + you wish to create. If you specify only a filename with no path, it will place + the file in your current working directory. + \item \texttt{quiet -} If this is set to \texttt{FALSE}, a status message will + be printed to the console as the extraction progresses. + \item \texttt{geo -} Set to \texttt{TRUE}, if you have a data frame or matrix + in the \texttt{@other} slot of your genind object that contains geographic + coordinates for all individuals or all populations. Setting this to + \texttt{TRUE} means the resulting file will have two extra columns at the end + of your file with geographic coordinates. + \item \texttt{geodf -} The name of the data frame or matrix containing the + geographic coordinates. + \item \texttt{sep -} A separator to separate columns in the resulting file. \end{itemize} +\bline{} First, a simple example for the rootrot data we demonstrated in section 1.4.2: <>= @@ -431,19 +834,23 @@ genind2genalex(rootrot, "~/Desktop/rootrot.csv") cat("Extracting the table ... Writing the table to ~/Desktop/rootrot.csv ... Done.\n") @ -Now here's an example of exporting the nancycats data set into GenAlEx format with geographic information. If we look at the nancycats geographic information, we can see it's coordinates for each population, but not each individual: +Here's an example of exporting the nancycats data set into GenAlEx format +with geographic information. If we look at the nancycats geographic information, +we can see it's coordinates for each population, but not each individual: <>= data(nancycats) nancycats@other$xy @ -And we can export it easily: + +To export it: <>= -genind2genalex(nancycats, "~/Desktop/nancycats_pop_xy.csv") +genind2genalex(nancycats, "~/Desktop/nancycats_pop_xy.csv", geo = TRUE) @ <>= cat("Extracting the table ... Writing the table to ~/Desktop/nancycats_pop_xy.csv ... Done.\n") @ -If we wanted to assign a geographic coordinate to each individual, we can simply use this little repetition trick knowing that there are 17 populations in the data set: +If we wanted to assign a geographic coordinate to each individual, we can use +this trick knowing that there are 17 populations in the data set: <>= nan2 <- nancycats nan2@other$xy <- nan2@other$xy[rep(1:17, table(pop(nan2))), ] @@ -451,468 +858,1530 @@ head(nan2@other$xy) @ Now we can export it to a different file. <>= -genind2genalex(nan2, "~/Desktop/nancycats_inds_xy.csv") +genind2genalex(nan2, "~/Desktop/nancycats_inds_xy.csv", geo = TRUE) @ <>= cat("Extracting the table ... Writing the table to ~/Desktop/nancycats_inds_xy.csv ... Done.\n") @ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Getting to know adegenet's genind object}\label{intro:genind} - -\tab\tab Since \textit{poppr} was built around adegenet's framework, it is important to know how \textit{adegenet} stores data in the genind object, as that is the object used by \textit{poppr}. To create a genind object, \textit{adegenet} takes a data frame of genotypes (rows) across multiple loci (columns) and converts them into a matrix of individual allelic frequencies at each locus \cite{Jombart:2008}. - -For example, if you had a data frame with 3 diploid individuals each with 3 loci that had 3, 4, and 5 allelic states respectively, the resulting \texttt{genind} object would contain a matrix that has 3 rows and 12 columns. -\noindent Here's the example data frame: -<>= +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Getting to know \adegenet{}'s genind object} +\label{intro:genind} + +\tab\tab Since \poppr{} was built around \adegenet{}'s framework, it is +important to know how \adegenet{} stores data in the genind object, as that is +the object used by \poppr{}. To create a genind object, \adegenet{} takes a data +frame of genotypes (rows) across multiple loci (columns) and converts them into +a matrix of individual allelic frequencies at each locus \cite{Jombart:2008}. + +For example, Let's say we had data with 3 diploid individuals each with 3 loci +that had 3, 4, and 5 allelic states respectively: + +% latex table generated in R 3.1.0 by xtable 1.7-3 package +% Sat Apr 19 19:03:55 2014 +\begin{table}[ht] +\centering +\begin{tabular}{rlll} + \hline + & locus1 & locus2 & locus3 \\ + \hline +1 & 101/101 & 201/201 & 301/302 \\ + 2 & 102/103 & 202/203 & 301/303 \\ + 3 & 102/102 & 203/204 & 304/305 \\ + \hline +\end{tabular} +\label{example_table} +\end{table} +\noindent +The resulting \texttt{genind} object would contain a matrix that has 3 rows and +12 columns. Below is a schematic of what that would look like. Each column +represents a separate allele, each row represents an individual and each color +represents a different locus. + +<>= library(adegenet) df <- data.frame(list(locus1=c("101/101", "102/103", "102/102"), - locus2=c("201/201","202/203","203/204"), + locus2=c("201/201", "202/203", "203/204"), locus3=c("301/302", "301/303", "304/305"))) -df -@ -And the resulting matrix after importing to genind. +dat <- df2genind(df, sep="/")$tab +tdat <- dat +tdat[] <- 1 +barplot(tdat, axes = FALSE) +barplot(rep(3, 12), col = rep(rainbow(3, alpha = 0.5), 3:5), axes = FALSE, add = TRUE) +axis(2, at = 1:3 - 0.5, labels = 1:3, tick = FALSE) +axis(3, at = c(2, 6.125, 11.5), labels = names(df), tick = FALSE) +@ +\noindent +When we look at the data derived from table \ref{example_table}, we see that we +have a matrix of individual allele frequencies at each locus. <>= -df2genind(df, sep="/")$tab +dat +@ +\noindent +At each locus, the allele frequencies for each individual sum to one. +Homozygotes are denoted as having an allele frequency of 1 at a particular +allele while heterozygotes have their allele frequencies represented as $1/p$ +wher $p$ = ploidy. Along with this matrix, are elements that define the names of +the individuals, loci, alleles, and populations. If you wish to know more, see +the \adegenet{} ``Getting Started'' manual. + +\subsubsection{The other slot} +\label{intro:genind:other} + +The other slot is a place in the genind object that can be used to store useful +information about the data. We saw earlier that it could store demographic +information, now let's explore a different example. Bruvo's +distance is based off of a stepwise mutation model for microsatellites. This +requires us to know the length of the repeat of each locus. We could store the +repeat lengths in a separate variable in our R environment, but we are at risk +of losing that. One wa to prevent it from being lost would be to place it in the +``other'' slot. For the purpose of this example, we will use the ``nancycats'' +data set from the \adegenet{} package and assume that it has dinucleotide repeats +at all of its loci. + +<>= +data(nancycats) # Load the data +other(nancycats) # geographical coordinates +repeats <- rep(2, nLoc(nancycats)) #nLoc = number of loci +repeats +other(nancycats)$repeat_lengths <- repeats +other(nancycats) # two items named xy and repeat_lengths +@ + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{The genclone object \{send in the clones\}} +\label{intro:genclone} + +The \texttt{genclone} class was defined in order to make working with +hierarchies more intuitive. It is built off of the \texttt{genind} object and +has dedicated slots for the population hierarchy and defined multilocus +genotypes. The name genclone refers to the fact that it has the ability to +handle genotypes of clonal organisms (but it is also used for sexual +populations). + +The main difference between the genclone and genind objects is how they handle +populations: With the \texttt{genind} object, the user must find the vector +defining the population and set it using that vector. The \texttt{genclone} +object already defines a data frame with different population factors or +hierarchical levels in the object. The user simply supplies a formula defining +the desired hierarchy with which to set the population. This formula driven +method is also used for clone correction, combining hierarchical levels and +conducting AMOVA. These will all be explained in later chapters. For examples +and details, type \texttt{help("genclone")} in your R console. + +The function \texttt{as.genclone} allows the user to convert a \texttt{genind} +object to a \texttt{genclone} object. The following example will demonstrate +that the \texttt{genclone} object is an extention of the \texttt{genind} object +as well as the advantages of having populations pre-defined in your data set. + +\subsubsection{Function: as.genclone} +\label{intro:genclone:as.genclone} + +\tline{} +\begin{quote} +Default Command: +<>= +funk <- "as.genclone" +print_command(funk) +@ +\end{quote} +\begin{itemize} + \item \textbf{x -} a \texttt{genind} object to be converted. + \item \textbf{hierarchy -} an optional data frame where each column represents + a hierarchical level of the population hierarchy in the data set. +\end{itemize} +\bline{} + +Let's show an example of a \texttt{genclone} object. First, we will take an +existing \texttt{genind} object and convert it using the function +\texttt{as.genclone} (We can also use the function +\cmdlink{intro:import:read.genalex}{read.genalex} to import as \texttt{genclone} +or \texttt{genind} objects). We will use the \texttt{Aeut} data set because it +is a clonal data set that has a simple population hierarchy +\cite{Grunwald:2006}. The data set is here: +\url{http://dx.doi.org/10.6084/m9.figshare.877104} and it is AFLP data of the +root rot pathogen \textit{Aphanomyces euteiches} collected from two different +fields in NW Oregon and W Washington, USA. These fields were divided up into +subplots from which samples were collected. The fields represent the population +and the subplots represent the subpopulation. Let's take a look at what the +\texttt{genind} object looks like: + +<>= +library(poppr) +data(Aeut) +Aeut @ -The first three columns represent the alleles of locus 1, the next four represent locus 2, and the last five represent locus 3. -Do you see what I mean when I say individual allele frequencies at each locus? For a diploid individual, you only have three possible allele frequencies at each locus: 1, 0.5, or 0. Now, this is not the entire genind object, but it is the main feature. The object also has various elements associated with it that give you information about the population membership, the names of loci, individuals, and alleles among other things that \textit{poppr} uses to work \cite{Jombart:2008}. If you wish to know more, see the \textit{adegenet} ``Getting Started" manual. +This gives us a lot of information about the object, and is useful once you +become more comfortable with programming. Once we convert it to a +\texttt{genclone} object, the multilocus genotypes will be defined and the +population hierarchy (if a data frame is defined in the +\texttt{@other} slot called ``population\_hierarchy") will be set. -\subsubsection{The other slot}\label{intro:genind:other} +<>= +agc <- as.genclone(Aeut) +agc -\tab\tab The element that you as a \textit{poppr} user needs to be concerned with is the ``other" slot. No, I'm not trying to be cryptic. If you look at an \textit{adegenet} object, you will see that it has several ``slots" (starting with ``@"). \cite{Jombart:2008} Let's start by recreating that data frame I showed you earlier. -<>= -df <- data.frame(list(locus1=c("101/101", "102/103", "102/102"), - locus2=c("201/201", "202/203", "203/204"), - locus3=c("301/302", "301/303", "304/305") - ) - ) -dfg <- df2genind(df, sep="/") +# We can also manually set the hierarchy. +as.genclone(Aeut, hierarchy = other(Aeut)$population_hierarchy[-1]) @ -Next we will display the contents of the \texttt{genind} object \texttt{dfg} -<>= -dfg + +We can see here that it shows less information, but it gives us a very simple +overview of our data. Don't be fooled, however, because it contains the same +information as a \texttt{genind} object and the advantage of the +\texttt{genclone} object is that setting the population from the hierarchy +becomes \textbf{much} easier. + +<>= +c(is.genind(Aeut), is.genclone(Aeut), is.genind(agc), is.genclone(agc)) + +# Adegenet functions work the same, too +c(nInd(Aeut), nInd(agc)) @ -The matrix containing our allelic frequencies is located in the \texttt{@tab} slot. All of the slots below that have very specific properties related to the matrix in \texttt{@tab}, but the \texttt{@other} slot is more or less a grab bag, where you can place anything you want, even if it doesn't make sense! +<>= +# We'll look at the population names +Aeut$pop.names + +# genind way: +# Extract the combined hierarchical levels. +pophier <- other(Aeut)$population_hierarchy$Pop_Subpop +pop(Aeut) <- pophier +Aeut$pop.names + +# genclone way: +agc +setpop(agc) <- ~Pop/Subpop +agc + +# Notice that you only see the first and last three population names. +# Use the print function to display all population names. -Here, I'll give you an example of placing the genind object inside itself. Notice first, that the \texttt{@other} slot is empty and pay attention to the commands I use, noting that you can use either ``\$" or ``@" to access the slots. -<>= -# First off, how big is the object? -print(object.size(dfg), units="auto") -dfg$other$dfg <- dfg -dfg # we can now see that the @other slot is now filled. -dfg$other$dfg -print(object.size(dfg), units="auto") # How big is it now? +print(agc) @ + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Accessing the population hierarchy} +\label{intro:genclone:access} + +The hierarchy slot in the genclone object allows us to access and manipulate +different levels of a population hierarchy without the complication of creating +new data sets. We implemented the use of hierarchical nested formulae: + +<>= +hier = ~Population/Subpopulation/Year +@ +\noindent +\jala{} \textbf{NOTE:} The $\sim$ symbol is absolutely required for formulas, even if +you only are specifying one level.\\ +The \texttt{"/"} symbolizes a hierarchical nesting. The above formula represents +years nested within subpopulations, nested within populations. This allows the +user to easily restructure the population hierarchy. Manipulation of the +hierarchy is necessary for \seclink{data.manip:cc}{clone correction} and AMOVA +analysis. See the section \seclink{data.manip:hier}{Can you take me +hier(archy)} for more details on manipulation of hierarchies. +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{About polyploid data} +\label{intro:import:polyploid} + \begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{What is the \# sign for?}}\\ -This is called a comment. If you type something in R with the ``\#" sign in front of it, R will not interpret it. - \end{minipage} - } + \fcolorbox{light-gray}{salmon}{ + \begin{minipage}[t]{0.8\textwidth} + {\begin{center} \jala{}\jala{} \textbf{WARNING}\revjala{}\revjala{}\end{center}} + Treat polyploid data with care. Please read this section carefully and + consult the help pages for all functions mentioned here. + \end{minipage} + } \end{center} -And we can continue to do this until we reach the limit of our available memory. Why am I showing this silliness to you? For one thing I want to show you that you can stick anything you want into that slot and the object will not be hurt in any way. It's also important when considering how you are going to deal with the population structure of your genind object. For the \textit{poppr} functions \texttt{clonecorrect} (Section \ref{data.manip:cc:clonecorrect}) and \texttt{splitcombine} (Section \ref{data.manip:hier:splitcombine}) to work, a data frame of the population hierarchy must be present in the \texttt{@other} slot and it must have the same number of rows as individuals in the data set. There are several ways to go about this. If you know how to create a data frame or import data into R, the command is no more difficult than \texttt{obj\$other\$population\_hierarchy <- df}. If you do not know how to create a data frame or import data into R, you can visit Quick R at \url{http://www.statmethods.net/input/importingdata.html}. -\subsubsection{Setting the population factor \{adegenet's function: pop\}}\label{intro:genind:pop} +With diploid or haploid data, genotypes are unambiguous. It is often +clear when it is homozygous or heterozygous. With polyploid data, genotypes can +be ambiguous. For example, a tetraploid individual with the apparent genotype of +\textbf{A/B} could actually have one of three genotypes: \textbf{A/A/A/B}, +\textbf{A/A/B/B}, or \textbf{A/B/B/B}. This ambiguity prevents a researcher from +accurately calling all alleles present. In \adegenet{}, it was previously +difficult to import polyploid data because of this ambiguity as data was +required to be unambiguous or missing. + +A solution to this problem is to code missing alleles as ``0''. An example of +this is found within the \texttt{Pinf} data set in \poppr{} +\cite{goss2014irish}. We look at the last six samples over two loci, Pi63 (3 +alleles, triploid) and Pi70 (3 alleles, diploid) to examine how the data is +represented. + +<>= +data(Pinf) +Pinf +tail(truenames(Pinf[loc = c("L09", "L10")])$tab) +@ +\noindent +Each column in this data represents a different allele at a particular locus. +\texttt{Pi63.148} is the allele \texttt{148} at locus \texttt{Pi63}. Each row is +an individual. The numbers represent the fraction of a given allele that makes +up the individual genotype at a particular locus. What we can see here is that +the number of columns is 8 when we expect only 6 (2 loci $\times$ 3 alleles). +The first allele at each locus is \texttt{000}. Let's take a look at the data in +a human-readable format. +<>= +Pinfdf <- genind2df(Pinf, sep = "/") +tail(Pinfdf[10:11]) +@ +\noindent +It's more clear now that we have a data set of tetraploid individuals where some +genotypes appear diploid (\texttt{\Sexpr{Pinfdf[85, 10]}}) and some appear +triploid (\texttt{\Sexpr{Pinfdf[86, 10]}}). The tetraploid genotype is padded +with zeroes to make up the difference in ploidy. + +This method allows \textsc{Bruvo's Distance} \cite{Bruvo:2004} and the +\textsc{Index of Association} \cite{Brown:1980,Smith:1993,Agapow:2001} to work +with polyploids as they specifically recognize the zeroes as being missing data. +A side effect, unfortunately is that the extra zeroes appear as extra alleles. +As this affects all frequency-based statistics (except for the ones noted +above), the user should reformat their data set with the function +\texttt{recode\_polyploids}, which will remove the zeroes and recode the allele +frequencies to the observed frequencies. + +<>= +Pinf_rc <- recode_polyploids(Pinf, newploidy = 2) +tail(truenames(Pinf_rc[loc = c("L09", "L10")])$tab) +@ +\noindent +Notice that the triploid locus now has frequencies that are multiples of +$\frac{1}{3}$ and the diploid locus has multiples of $\frac{1}{2}$. Below, we +show the observed genotypes: +<>= +Pinfrcdf <- genind2df(Pinf_rc, sep = "/") +tail(Pinfrcdf[10:11]) +@ + +%=============================================================================% +%=============================================================================% +% +% +% +%=============================================================================% +%=============================================================================% +\section{Data Manipulation} +\label{data.manip} + +\tab\tab One tedious aspect of population genetic analysis is the need for +repeated data manipulation. \Adegenet{} has some functions for manipulating data +that are limited to replacing missing data and dividing data into populations, +loci, or by sample size \cite{Jombart:2008}. \Poppr{} includes novel functions +for clone-censoring your data sets or sub-setting a genind object by specific +populations. + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Population hierarchy construction \{Can you take me hier(archy)?\}} +\label{data.manip:hier} + +\tab\tab in \poppr{}, the \seclink{intro:genclone}{genclone} object contains a +slot called \texttt{hierarchy}. This slot contains a data frame used to define +hierarchical levels of population factors describing your data. The preferred +way of defining these hierarchical levels is to concatenate them using `\_' and +use them to define a single population in your data before you import it into +\poppr{}. Examples of this format can be found in figures \ref{microbov +unmodified}, \ref{microbov modified}, and at +\url{http://dx.doi.org/10.6084/m9.figshare.877104}. + +In this section, we will show you how to \seclink{data.manip:hier:define}{define} +hierarchical levels, \seclink{data.manip:hier:view}{view} those levels to ensure +that they are correctly defined, \seclink{data.manip:hier:manip}{manipulate} your +hierarchical levels by adding and renaming them, and use these levels +to \seclink{data.manip:hier:setpop}{set the population} in +your \seclink{intro:genclone}{genclone} object using the following methods: + +% latex table generated in R 3.0.3 by xtable 1.7-3 package +% Sat Mar 22 20:24:23 2014 +\begin{table}[ht] +\centering +\begin{tabular}{llll} + \hline + Method & Function & Input & Result\\ + \hline + split & \cmdlink{data.manip:hier:define:split}{splithierarchy} & formula & defined hierarchical levels\\ + set & \cmdlink{data.manip:hier:define:set}{sethierarchy} & data frame & new hierarchical levels\\ + get & \cmdlink{data.manip:hier:view}{gethierarchy} & formula & data frame\\ + name & \cmdlink{data.manip:hier:manip:name}{namehierarchy} & formula & new hierarchical level names\\ + add & \cmdlink{data.manip:hier:manip:add}{addhierarchy} & vector or data frame & new hierarchical level\\ + \hline +\end{tabular} +\end{table} + + +\begin{center} + \fcolorbox{light-gray}{light-gray}{ + \begin{minipage}[t]{0.8\textwidth} + \jala{} {\large \textsc{A note about formulas}} \revjala{}\\ + The formulas used by genclone objects always start with a $\sim$ and + are hierarchical levels are always separated by a $/$. Some examples + are: + + $\sim$Country/City/District -\tab\tab A genind object can contain several populations, and, if you have differing population structures, you might want to switch among them for different analyses. The tools you as the user would need, are the slot \texttt{@pop.names} and the \textit{adegenet} function \texttt{pop()}. I'll use the H3N2 data set packaged with \textit{adegenet} as an example. -<>= + $\sim$Field/Year + + Refer to \seclink{intro:genclone:access}{Accessing hierarchies} for more + details on how to access hierarchies. + \end{minipage} + } +\end{center} +\noindent +In the next section, we'll explore two ways of defining hierarchical levels. + +\subsubsection{Defining hierarchies} +\label{data.manip:hier:define} + +\tab\tab As explained above, the best way to define hierarchical levels is to +concatenate them using `\_' and set that as your population factor. We will use +the example data set from \url{http://dx.doi.org/10.6084/m9.figshare.877104}. It +is an AFLP data set of the root rot pathogen \textit{Aphanomyces euteiches} from +two fields and multiple soil cores per field. First, we will follow the link and +copy the download link from figshare. +<>= +aphan <- read.genalex("http://files.figshare.com/1314228/rootrot.csv") +aphan +@ +<>= +aphan <- namehierarchy(sethierarchy(agc, gethierarchy(agc, ~Pop_Subpop)), ~Pop) +aphan +@ +\noindent +The supplemental information in the data defined two hierarchical leves, yet we +only see one here labeled `Pop'. This is how populations are automatically +defined when importing to a \seclink{intro:genclone}{genclone} object since it +does not know how many hierarchical levels you have defined. To define these +levels present in the data set, we will need to split them up using the function +\texttt{splithierarchy}: +\label{data.manip:hier:define:split} +<>= +splithierarchy(aphan) <- ~field/sample +aphan +@ +\noindent +Now we have sucessfully defined our hierarchies. If you have imported your data +in this manner, you may skip to the \seclink{data.manip:hier:setpop}{setting +population hierarchies}, \seclink{data.manip:hier:view}{viewing hierarchies}, or +\seclink{data.manip:hier:manip}{manipulating hierarchical levels.} + +\bline{} + +If you have imported your data with a single population and want to add +hierarchical levels separately, you can use the function \texttt{sethierarchy} +with a data frame containing your hierarchical levels. For this example, we will +use the data set \texttt{H3N2}, which contains SNP data from the H3N2 virus. +This data set holds a data frame in the the \seclink{intro:genind:other}{other +slot} that contains many variables including country, year, and month of +collection. We will first load the data and write that data frame to a file on +the desktop. +\label{data:virus} +<>= +data(H3N2) +write.table(other(H3N2)$x, file = "~/Desktop/virus_info.csv", row.names = FALSE) +@ +<>= data(H3N2) -H3N2 -pop(H3N2) -H3N2$pop.names +virus_info <- as.data.frame(other(H3N2)$x) # extracting the data frame @ +\noindent +Now we have our data and we have a separate table in a file on our desktop +defining our hierarchical levels. Let's import those levels into R with +\texttt{read.table} and see what they are: -Notice how both the pop and pop.names are empty. This means that the population information needs to be set. Notice, however that there are 1903 individuals in the data set and that the \texttt{@other} slot is not empty. Let's investigate an object in this slot. -<>= -head(H3N2$other$x) -nrow(H3N2$other$x) +<>= +virus_info <- read.table("~/Desktop/virus_info.csv", header = TRUE) +names(virus_info) +@ +<>= +names(virus_info) @ -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{What is head()?}}\\ -\texttt{head()} is a command that will show you only the top portion of an R object. By default it will show you the first six elements (or rows of a data frame or matrix). This is so that you can quickly check the contents of an object. - \end{minipage} - } -\end{center} -We can see that it's a data frame containing a wealth of information that we could use to subset our data. So, let's start by setting the population structure by country. How do we do that? Well, the function \texttt{pop()} will allow us to set that structure using a vector that is the same length as the number of individuals in the data set. Since the number of rows in the data frame \texttt{x} meets that criteria, we can use any item in that data frame. Let's take a look. -<>= -pop(H3N2) <- H3N2$other$x$country -head(pop(H3N2)) -H3N2$pop.names +From here we will convert our genind object to a genclone object and use +\texttt{sethierarchy} to define the hierarchical levels with the table we just +imported. + +\label{data.manip:hier:define:set} +<>= +virus <- as.genclone(H3N2) # Converting it to a genclone object. +sethierarchy(virus) <- virus_info # Setting the hierarchy +virus @ -Notice how useful the \texttt{@other} slot can be. We now have population structure in the data set and you now know how to set the population factor. The other slot will become useful later on when we are talking about multilocus genotypes. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\noindent +In this data, levels such as host and segment are unimportant levels because +they are all the same. Let's say that we are only interested in year and country. +To make things easier to view, we will set the hierarchical levels to these two +columns: +<>= +sethierarchy(virus) <- virus_info[c("country", "year")] +virus +@ + +Notice that there are no populations defined. Now that we have the hierarchical +levels in place, we can use it to define the population hierarchy. We will use +the function \texttt{setpop} to define the population as year with respect to +country: +<>= +setpop(virus) <- ~year/country +virus +@ + +\subsubsection{Viewing hierarchies} +\label{data.manip:hier:view} + +If you wanted to view your hierarchies to make sure that you made no spelling +errors in your population definitions, you can extract the data frame from your +genclone object by using the function \texttt{gethierarchy}: + +\begin{quote} +Default Command: +<>= +funk <- "gethierarchy" +print_command(funk) +@ +\end{quote} +\noindent +Where \textbf{x} is the genclone object, \textbf{formula} defines the hierarchical +levels, and \textbf{combine} indicates whether or not you want the lower levels +of the hierarchy combined with the higher levels. For example, in the root rot +data above, the hierarchical levels are explicitly hierarchical and should be +combined. Note, if you don't supply a formula argument, the data frame as it +exists will be returned. + +<>= +head(gethierarchy(aphan)) +head(gethierarchy(aphan, ~field/sample)) +@ + +If the hierarchical levels are not nested, or you simply want to use this +hierarchy for another data set, you might want to set the \textbf{combine} flag +to \texttt{FALSE}. Let's use the virus data as an example: + +<>= +head(gethierarchy(virus, ~year/country)) +head(gethierarchy(virus, ~year/country, combine = FALSE)) +@ + +It will return only the levels you ask it to return: + +<>= +head(gethierarchy(virus, ~country)) +@ + +\subsubsection{Manipulating hierarchical levels} +\label{data.manip:hier:manip} + +Once we have our hierarchies set in place, we want to be able to rename and add +to them. For this example, we will revisit the \seclink{data:virus}{virus +example} from above. We have set the population hierarchy to be based on year and +country, but we've noticed that we left out month. And let's say that we +accidentally overwrote the data object like this: +<>= +virus_info <- virus_info[["month"]] +names(virus_info) +@ +\noindent +If we were saving our script the whole time, we could just go back and retrieve +the data frame, but that defeats the purpose of this section where we imagine +that we've recieved new information and wanted to add it to our hierarchy. If we +want to add this to our hierarchy, we just use the function +\texttt{addhierarchy} defined as thus: +\begin{quote} +Default Command: +<>= +funk <- "addhierarchy" +print_command(funk) +@ +\end{quote} +\noindent +We can use this function to add on a new column to the data frame. +\label{data.manip:hier:manip:add} +<>= +addhierarchy(virus) <- virus_info +virus +@ +\noindent +Notice that the new hierarchical level is simply labeled as \texttt{NEW}. We +will customize the name of the hierarchical levels with the function +\texttt{namehierarchy}. +\label{data.manip:hier:manip:name} + +<>= +namehierarchy(virus) <- ~country/year/month +virus +@ +\noindent +Of course, perhaps a better way still would be to use a data frame: +<>= +addhierarchy(virus) <- data.frame(month = virus_info) +@ + +\subsubsection{Defining populations with hierarchies} +\label{data.manip:hier:setpop} + +Now that we have defined the hierarchical levels in the data set, setting the +population hierarchy allows us to group our data according to the hierarchical +level of your choice. This is a necessary step. For this example, we will use a +data set of \textit{Phytophthora infestans} collected from North America and +South America. + +<>= +data(Pinf) +Pinf +@ +\noindent +Above we have two hierarchies for Continent and Country, but the +populations only show Continent level populations. If we wanted to investigate +each country separately, we would need to reset the population to be represented +by Country. This can be done with the function \texttt{setpop}. This function +utilizes the defined population hierarchies to set the population. We'll use our +data set above to illustrate this: + +<>= +setpop(Pinf) <- ~Country +Pinf # Now set by country +@ +\noindent +The beauty about it is the fact that it will also combine all the hierarchical +levels you want to use. Let's see when we ask it to set the population of +Country with respect to Continent. + +<>= +setpop(Pinf) <- ~Continent/Country +Pinf +@ +\noindent +Nice! +%-----------------------------------------------------------------------------% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Data Manipulation}\label{data.manip} +%-----------------------------------------------------------------------------% +\subsection{Replace or remove missing data \{Inside the golden days of missing data\}} +\label{data.manip:missing} + +\tab\tab A data set without missing data is always ideal, but often not +achievable. Many functions in \adegenet{} cannot handle missing data and thus +the function \texttt{na.replace} exists \cite{Jombart:2008}. It will replace +missing data with with either ``0" representing a mysterious extra allele in the +data set resulting in more diversity or the mean of allelic frequencies at the +locus. There is no set method, however, for simply removing missing data from +analyses, which is why the \poppr{} function \texttt{missingno} (see below) +exists. If the name makes you uneasy it's because it should. Missing data can +mean different things based on your data type. For microsatellites, missing data +might represent any source of error that could cause a PCR product to not +amplify in gel electrophoresis, which may or may not be biologically relevant. +For a DNA alignment, missing data could mean something as simple as an insertion +or deletion, which is biologically relevant. The choice to exclude or estimate +data has very different implications for the type of data you have. -\tab\tab One tedious aspect of population genetic analysis is the need for repeated data manipulation. \textit{Adegenet} has some functions for manipulating data that are limited to replacing missing data and dividing data into populations, loci, or by sample size \cite{Jombart:2008}. \textit{Poppr} includes novel functions for clone-censoring your data sets or sub-setting a genind object by specific populations. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Inside the golden days of missing data \{replace or remove missing data\}}\label{data.manip:missing} +\begin{center} + \fcolorbox{light-gray}{salmon}{ + \begin{minipage}[t]{0.8\textwidth} + {\begin{center} \jala{}\jala{} \textbf{WARNING}\revjala{}\revjala{}\end{center}} + Treatment of Missing data is a non-trivial task. You should understand the nature of missing data in your data set before treatment. + \end{minipage} + } +\end{center} -\tab\tab A data set without missing data is always ideal, but often not achievable. Many functions in \textit{adegenet} cannot handle missing data and thus the function \texttt{na.replace} exists \cite{Jombart:2008}. It will replace missing data with with either ``0" representing a mysterious extra allele in the data set resulting in more diversity or the mean of allelic frequencies at the locus. There is no set method, however, for simply removing missing data from analyses, which is why the \textit{poppr} function \texttt{missingno} (see below) exists. If the name makes you uneasy it's because it should. Missing data can mean different things based on your data type. For microsatellites, missing data might represent any source of error that could cause a PCR product to not amplify in gel electrophoresis, which may or may not be biologically relevant. For a DNA alignment, missing data could mean something as simple as an insertion or deletion, which is biologically relevant. The choice to exclude or estimate data has very different implications for the type of data you have. -\subsubsection{Function: missingno}\label{data.manip:missing:missingno} +\tline{} +\subsubsection{Function: missingno} +\label{data.manip:missing:missingno} -\tab\tab \texttt{missingno} is a function that serves partially as a wrapper for adegenet's \texttt{na.replace} to replace missing data and as a way to exclude specific areas that contain systematic missing data. +\tab\tab \texttt{missingno} is a function that serves partially as a wrapper for +\adegenet{}'s \texttt{na.replace} to replace missing data and as a way to +exclude specific areas that contain systematic missing data. \begin{quote} -Default Command:\\ -\texttt{missingno(pop, type = "loci", cutoff = 0.05, quiet = FALSE)} +Default Command: +<>= +funk <- "missingno" +print_command(funk) +@ +% \texttt{missingno(pop, type = "loci", cutoff = 0.05, quiet = FALSE)} \end{quote} \begin{itemize} \item \texttt{pop -} a \texttt{genind} object. \item \texttt{type -} This could be one of four options: \begin{description} - \item[ ``mean"] This replaces missing data with the mean allele frequencies in the entire data set. - \item[ ``zero" or ``0"] This replaces missing data with zero, signifying a new allele. - \item[ ``loci"] This is to be used for a data set that has systematic problems with certain loci that contain null alleles or simply failed to amplify. This will remove loci with a defined threshold of missing data from the data set. - \item[ ``geno"] This is to be used for genotypes (individuals) in your data set where many null alleles are present. Individuals with a defined threshold missing data will be removed. + \item[ ``mean"] This replaces missing data with the mean allele + frequencies in the entire data set. + \item[ ``zero" or ``0"] This replaces missing data with zero, signifying a + new allele. + \item[ ``loci"] This is to be used for a data set that has systematic + problems with certain loci that contain null alleles or simply failed to + amplify. This will remove loci with a defined threshold of missing data + from the data set. + \item[ ``geno"] This is to be used for genotypes (individuals) in your + data set where many null alleles are present. Individuals with a defined + threshold missing data will be removed. \end{description} - \item \texttt{cutoff -} This is a numeric value from 0 to 1 indicating the percent allowable missing data for either loci or genotypes. If you have, for example, two loci containing missing 5\% and 10\% missing data, respectively and you set \texttt{cutoff = 0.05}, \texttt{missingno} will remove the second locus. Percent missing data for genotypes is considered the percent missing loci over number of total loci. - \item \texttt{quiet -} When this is set to \texttt{FALSE}, the number of missing values replaced will be printed to screen if the method is ``zero" or ``mean". It will print the number of loci or individuals removed if the method is ``loci" or ``geno". + \item \texttt{cutoff -} This is a numeric value from 0 to 1 indicating the + percent allowable missing data for either loci or genotypes. If you have, + for example, two loci containing missing 5\% and 10\% missing data, + respectively and you set \texttt{cutoff = 0.05}, \texttt{missingno} will + remove the second locus. Percent missing data for genotypes is considered + the percent missing loci over number of total loci. + \item \texttt{quiet -} When this is set to \texttt{FALSE}, the number of + missing values replaced will be printed to screen if the method is ``zero" + or ``mean". It will print the number of loci or individuals removed if the + method is ``loci" or ``geno". \end{itemize} +\bline{} -Of course, seeing is believing. Let's take a look at what this does by focusing in on areas with missing data. Note that I will be using some sub-setting functions here that are described in adegenet's \textit{Getting Started} vignette. First, let's take a look at what the missing data in R looks like as well as how many loci and individuals the data set nancycats contains. We need to first tell R to look in its library for the package \textit{poppr}. -<>= +Let's take a look at what this does by focusing in on areas with missing data. +We'll use the data set \texttt{nancycats} as an example. Using the \poppr{} +function \texttt{info\_table}, we can assess missing data within populations. +<>= library(poppr) -@ -Next, we'll initialize the \textit{adegenet} data set nancycats and load it into memory. -<>= data(nancycats) +info_table(nancycats, plot = TRUE) @ -Now, we'll take a quick look at the nancycats data set using \textit{adegenet}'s \texttt{summary()} \index{summary} function: -<>= -summary(nancycats) -@ +\label{function:info-table} -We can see here a lot of summary statistics about nancycats. Here we can see that there are 17 populations, 237 individuals, and 9 loci. Nancycats also has a little over 2.3\% missing data. Let's take a look at the names of the loci and the structure of the data. In order to save space, I will only show you the first five individuals (rows) and a portion of the alleles in the first locus (columns). +We can see that locus \Sexpr{locNames(nancycats)[1]} has a lot of missing data. +To demonstrate the function \cmdlink{data.manip:missing:missingno}{missingno}, we +will zoom into the first five individuals at the first locus. <>= -nancycats$loc.names # Names of the loci nancycats$tab[1:5, 8:13] @ -When looking at this data set, recall how a \texttt{genind} object is formatted. You have a matrix of 0's, 1's and 0.5's. For diploids, if you see 0.5, that means it is heterozygous at that allele, and a 1 means it's homozygous. Here we see three heterozygotes and two individuals with missing data (indicated by NA). Now, there are more places with missing data in the data set, but I'm only showing a little bit at one locus so it's easier to digest. Let's first replace it by zero and mean, respectively. +When looking at this data set, recall how a \texttt{genind} object is formatted. +You have a matrix of 0's, 1's and 0.5's. For diploids, if you see 0.5, that +means it is heterozygous at that allele, and a 1 means it's homozygous. Here we +see three heterozygotes and two individuals with missing data (indicated by NA). +Let's first replace it by zero and mean, respectively. <>= nanzero <- missingno(nancycats, type = "zero") nanmean <- missingno(nancycats, type = "mean") nanzero$tab[1:5, 8:13] nanmean$tab[1:5, 8:13] @ -You notice how the values of NA changed, yet the basic structure stayed the same. These are the replacement options from adegenet. Let's look at the same example with the exclusion options (set to the default cutoff of 5\%). +You notice how the values of NA changed, yet the basic structure stayed the +same. These are the replacement options from \adegenet{}. Let's look at the same +example with the exclusion options (set to the default cutoff of 5\%). <>= nanloci <- missingno(nancycats, "loci") nangeno <- missingno(nancycats, "geno") nanloci$tab[1:5, 8:13] @ -Notice how we now see columns named ``L2.01" and ``L2.02". This is showing us another locus because we have removed the first. Recall from the summary table that the first locus had 16 alleles, and the second had 11. Now that we've removed loci containing missing data, all others have shifted over.\\ -Let's look at the loci names and number of individuals. +Notice how we now see columns named ``L2.01" and ``L2.02". This is showing us +another locus because we have removed the first. Recall from the summary table +that the first locus had 16 alleles, and the second had 11. Now that we've +removed loci containing missing data, all others have shifted over.\\ Let's look +at the loci names and number of individuals. <>= -length(nanloci$ind.names) # Individuals -nanloci$loc.names # Names of the loci +nInd(nanloci) # Individuals +locNames(nanloci) # Names of the loci @ -You can see that the number of individuals stayed the same but the loci ``fca8", ``fca45", and ``fca96" were removed.\\ +You can see that the number of individuals stayed the same but the loci ``fca8", +``fca45", and ``fca96" were removed.\\ Let's look at what happened when we removed individuals. <>= nangeno$tab[1:5, 8:13] -length(nangeno$ind.names) # Individuals -nangeno$loc.names # Names of the loci +nInd(nangeno) # Individuals +locNames(nangeno) # Names of the loci @ -We can see here that the number of individuals decreased, yet we have the same number of loci. Notice how the frequency matrix changes in both scenarios? In the scenario with ``loci", we removed several columns of the data set, and so with our sub-setting, we see alleles from the second locus. In the scenario with ``geno", we removed several rows of the data set so we see other individuals in our sub-setting. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Can you take me hier(archy)? \{population hierarchy construction\}}\label{data.manip:hier} +We can see here that the number of individuals decreased, yet we have the same +number of loci. Notice how the frequency matrix changes in both scenarios? In +the scenario with ``loci", we removed several columns of the data set, and so +with our sub-setting, we see alleles from the second locus. In the scenario with +``geno", we removed several rows of the data set so we see other individuals in +our sub-setting. +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Extract populations \{Divide (populations) and conquer (your analysis)\}} +\label{data.manip:divide} -\tab\tab Remember all that fuss we made about the \texttt{@other} slot above in section \ref{intro:genind:other}? The way you can achieve hierarchical analysis in \textit{poppr} is through a data frame in that slot. Many of the file formats that \textit{adegenet} and \textit{poppr} can import do not allow for more than two hierarchies. If you need more levels, you have a couple of choices: -\begin{enumerate} - \item Import them into R as a data frame with each column being a separate hierarchical element. - \item Collapse them into a single population factor so that you can trick these file formats into taking multiple population hierarchies (eg. instead of ``Pop1", ``Subpop1", ``Subsubpop1", you would have ``Pop1\_Subpop1\_Subsubpop1"). -\end{enumerate} -Whichever choice you make, The \textit{poppr} function \texttt{splitcombine} can help you divide and combine those factors in any way you can think of. -\subsubsection{Function: splitcombine}\label{data.manip:hier:splitcombine} +\tab\tab \Adegenet{} provides methods for subsetting data by individual or +splitting all of the data into a list of populations. If you only want one or +two populations, these methods become tedious. The \poppr{} function +\texttt{popsub} makes this easier: -\tab\tab This function will allow you to combine your population hierarchies in ways meaningful to your data without needing to know R programming. It can either split a vector of combined population hierarchies or it can combine columns of a data frame containing population hierarchies (Note that it will only split the first column of the data frame if you choose \texttt{method = 1}). -\begin{quote} -Default Command:\\ -\texttt{splitcombine(pop, method = 1, dfname = "population\_hierarchy", sep = "\_", hier = c(1), setpopulation = TRUE, fixed = TRUE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object with a data frame in the \texttt{@other} slot. - \item \texttt{method -} An integer indicating what you want to do on your data frame: - \begin{enumerate} - \item \textbf{split} Any populations combined using a common separator in your data frame. So, a population hierarchy of ``Pop1\_Subpop1\_Subsubpop1" would be split into a data frame containing the columns ``Pop1", ``Subpop1", ``Subsubpop1". Since it will split the population factor, it needs only to be used once. - \item \textbf{combine} If you have your population hierarchy split into a data frame, you can do the exact opposite of method 1 and combine separate elements into one. - \end{enumerate} - \item \texttt{dfname -} This is the name of the data frame containing your population factor. Note that you are not limited to one data frame in your genind object. If you do not have that data frame in the \texttt{@other} slot, a warning will be returned and nothing will happen. - \item \texttt{sep -} A separation factor you want to separate your populations with. Note, that you can choose whatever you want, but be careful because some characters have special meanings (regular expressions) in R and could give you incorrect results (``\_" is the suggested default). - \item \texttt{hier -} This can be a vector of words or numbers referring to what you want to name your population hierarchies in \texttt{method = 1}, or specific column names in your data frame in \texttt{method = 2}. - \item \texttt{setpopulation -} if \texttt{TRUE} (default), this will automatically set the population factor to either the highest population factor (with \texttt{method = 1}, split) or the combined population hierarchy (with \texttt{method = 2}, combine). if this is set to \texttt{FALSE}, the population factor will not be set. - \item \texttt{fixed -} This is an option to be passed onto the \textit{base} function \texttt{strsplit}. For those not familiar with regular expressions, it will tell R whether or not the character in \texttt{split} should be treated as a special character or not. If you don't know regular expressions, don't touch it. -\end{itemize} +\subsubsection{Function: popsub} +\label{data.manip:divide:popsub} -Let's give an example using AFLP data of different populations of \textit{A. euteiches} collected in Washington and Oregon. \cite{Grunwald:2006} -<>= -Aeut <- read.genalex(system.file("files/rootrot.csv", package="poppr")) -summary(Aeut) -@ -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Does this summary seem a little lacking?}}\\ -The data that we have here is presence absence data. This means that many of the functions that \textit{adegenet} uses to calculate heterozygosity and number of alleles are slightly useless in this regard. - \end{minipage} - } -\end{center} -Notice that we have 18 different ``populations" here, but they are really a hierarchy. Let's say we want to analyze the diversity statistics of the two overall populations. Take a look at how the combined population factor is kept in the data frame. -<>= -head(Aeut$other$population_hierarchy) -@ -We'll use \texttt{splitcombine} to split that into a population and sub-population and set the population factor to the population. -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Important point about splitcombine}}\\ -Ideally, method split should only be used once after you read in your data. The reason for this is that when you select this method, it will look in the first column of your data frame to choose the combined population factor to split. -If you do not name your hierarchy or if you attempt to give your hierarchy too many names, it will automatically name the columns ``h1", ``h2", etc. - \end{minipage} - } -\end{center} -<>= -Aeut.pop <- splitcombine(Aeut, method=1, dfname="population_hierarchy", hier=c("Pop", "Subpop"), setpopulation=TRUE) -head(Aeut.pop$other$population_hierarchy) -summary(Aeut.pop) -@ +\tab\tab The command \texttt{popsub} is powerful in that it allows you to choose +exactly what populations you choose to include or exclude from your analyses. As +with many R functions, you can also use this within a function to avoid +creating a new variable to keep track of. -Now we can see that we have a data frame with all of our population factors separated, and we still have our original combined hierarchy, but it is now called ``Pop\_Subpop". This allows you to keep track of what you named your population hierarchies. We can now run the function \texttt{poppr} to get a diversity analysis. -<>= -poppr(Aeut.pop, quiet=TRUE) +\tline{} +\begin{quote} +Default Command: +<>= +funk <- "popsub" +print_command(funk) @ -%\newpage -It's as simple as that. Now, let's take a look at the same data set, except the input file is a GenAlEx file that has been formatted with Regional data (See section \ref{intro:import:read.genalex} for details). First, let's see how the data set is laid out: - -\setkeys{Gin}{width=0.9\textwidth} - -\begin{figure}[h!] - \centering - \caption{\footnotesize \footnotesize Part of the rootrot2.csv data set. Note the last two columns denoting the Regions and the number of individuals per region.} - \label{rootrot2_csv} -\includegraphics{rootrot2} -\end{figure} -\setkeys{Gin}{width=0.5\textwidth} -% We can reverse it by simply changing the method. -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{The amazing disappearing options!}}\\ -Notice that I'm not writing in many of the options? This is because they have defaults. Since the data frame in my \texttt{@other} slot is called ``population hierarchy", I don't have to specify that every time I do the function call, and that saves a lot of typing! -% Also, since the hierarchy has already been split into its components, I can now index the data frame by column number. So, since I want to combine the ``Pop" and ``Subpop" components, I will tell the function to use columns 2 through 3. - \end{minipage} - } -\end{center} - -We'll import our data using \texttt{read.genalex} and take a look at the population hierarchy. -<>= -Aeut2 <- read.genalex(system.file("files/rootrot2.csv", package="poppr"), region=TRUE) -head(Aeut2@other$population_hierarchy) -summary(Aeut2) -@ -What we see is that we have both of the population factors, but the names have changed and they are not combined. Note that since we specified ``Athena" and ``Mt. Vernon" as regions, the other level of the hierarchy was set as the population factor. We'll use \texttt{splitcombine} to combine both of these in the proper order. Note that we can use the indexes of the data frame columns to index these. - -<>= -Aeut2.combine <- splitcombine(Aeut2, method=2, hier=2:1) -head(Aeut2.combine@other$population_hierarchy) -summary(Aeut2.combine) -@ -% <>= -% Aeut.combine <- splitcombine(Aeut.pop, method=2, hier=2:3) -% head(Aeut.combine$other$population_hierarchy) -% summary(Aeut.combine) -% @ -% -% Let's imagine for a second that these ``hierarchies" are not actually hierarchical, but rather they represent independent variables (For example: the ``Subpop" factor could represent months of the year). In this case, we would want to only analyze the ``Subpop" factor. We can do this by using \texttt{splitcombine} with only one hierarchical level. -% <>= -% Aeut.subpop <- splitcombine(Aeut.pop, method=2, hier="Subpop") -% summary(Aeut.subpop) -% poppr(Aeut.subpop, quiet=TRUE) -% @ -Having these hierarchies in your data set is important when it comes to clone-censoring your data set (see section \ref{data.manip:cc} \textit{Attack of the Clone Correction}). -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Divide (populations) and conquer (your analysis) \{extract populations\}}\label{data.manip:divide} - -\tab\tab As I've mentioned before, \textit{adegenet} has many ways of sub-setting the data, but you cannot easily subset a \texttt{genind} object by population in an efficient way. \textit{Poppr} allows sub-setting a population from a \texttt{genind} object with one command. -\subsubsection{Function: popsub}\label{data.manip:divide:popsub} - -\tab\tab The command \texttt{popsub} is powerful in that it allows you to choose exactly what populations you choose to include or exclude from your analyses. As with many R functions, you can easily use this within a function to avoid creating a new variable to keep track of. -\begin{quote} -Default Command:\\ -\texttt{popsub(pop, sublist = "ALL", blacklist = NULL, mat = NULL)} \end{quote} \begin{itemize} \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sublist -} The vector of populations or integers representing the populations in your data set you wish to retain. For example: \texttt{sublist = c("pop\_z", "pop\_y")} or \texttt{sublist = 1:2}. - \item \texttt{blacklist -} The vector of populations or integers representing the populations in your data set you wish to exclude. This can take the same type of arguments as sublist, and can be used in conjunction with sublist for when you want a range of populations, but know that there is one in there that you do not want to analyze. For example: \texttt{sublist = 1:15, blacklist = "pop\_x"}. - One very useful thing about the blacklist is that it allows the user to be extremely paranoid about the data. You can set the blacklist to contain populations that are not even in your data set and it will still work! - \item \texttt{mat -} (see section \ref{mlg}, \textit{Multilocus Genotype Analysis} for more information) This is where you would put a matrix that's produced by \texttt{mlg.table} to be subsetted instead of the genind object. If you do this, the matrix will return with only the rows equal to your populations and only the multilocus genotypes (columns) pertaining to those populations. + \item \texttt{sublist -} vector of populations or integers representing + the populations in your data set you wish to \textbf{retain}. For example: + \texttt{sublist = c("pop\_z", "pop\_y")} or \texttt{sublist = 1:2}. + \item \texttt{blacklist -} vector of populations or integers representing + the populations in your data set you wish to \textbf{exclude}. This can take the same + type of arguments as sublist, and can be used in conjunction with sublist for + when you want a range of populations, but know that there is one in there that + you do not want to analyze. For example: \texttt{sublist = 1:15, blacklist = + "pop\_x"}. One very useful thing about the blacklist is that it allows the + user to be extremely paranoid about the data. You can set the blacklist to + contain populations that are not even in your data set and it will still work! + \item \texttt{mat -} (see section \seclink{mlg}{Multilocus Genotype + Analysis} for more information) A matrix produced from + the \cmdlink{mlg:table:mlg.table}{mlg.table} function. This overrides the + \textit{pop} argument and subsets this table instead. \end{itemize} +\bline{} -To demonstrate this tool, let's revisit the H3N2 data set. Let's say we wanted to analyze only the data in North America. To make sure we are all on the same page, we will reset the population factor to ``country". Remember that this is located in a data frame in the \texttt{@other} slot called ``x". +To demonstrate this tool, let's revisit the \seclink{data:virus}{virus} data set +that we saw in \seclink{data.manip:hier:define}{Defining hierarchies.} Let's say +we wanted to analyze only the data in North America. To make sure we are all on +the same page, we will reset the population factor to ``country". <>= -data(H3N2) -pop(H3N2) <- H3N2$other$x$country -H3N2$pop.names # Only two countries from North America. -H.na <- popsub(H3N2, sublist=c("USA", "Canada")) -H.na$pop.names -@ -Since this is a larger data set, running the \texttt{summary} function might take a few seconds longer than we want it to. If we want to see the population size, we can use the \textit{adegenet} function \texttt{nInd()}: +setpop(virus) <- ~country +virus$pop.names # Only two countries from North America. +vna <- popsub(virus, sublist=c("USA", "Canada")) +vna$pop.names +@ +\noindent +If we want to see the population size, we can use the \adegenet{} function +\texttt{nInd()}: <>= -nInd(H.na) -nInd(H3N2) +c(NorthAmerica = nInd(vna), Total = nInd(virus)) @ -You can see that the population factors are correct and that the size of the data set is considerably smaller. Let's see the data set without the North American countries. +\noindent +You can see that the population factors are correct and that the size of the +data set is considerably smaller. Let's see the data set without the North +American countries. <>= -H.minus.na <- popsub(H3N2, blacklist=c("USA", "Canada")) -H.minus.na$pop.names +vnaminus <- popsub(virus, blacklist=c("USA", "Canada")) +vnaminus$pop.names @ -Let's make sure that the number of individuals in both data sets added up equals the number of individuals in our original data set: +\noindent +Let's make sure that the number of individuals in both data sets is equal to +the number of individuals in our original data set: <>= -(nInd(H.minus.na) + nInd(H.na)) == nInd(H3N2) +(nInd(vnaminus) + nInd(vna)) == nInd(virus) @ -Now we have data sets with and without North America. Let's try something a bit more challenging. Let's say that we want The first 10 populations in alphabetical order, but we know that we still don't want any countries in North America. We can easily do this by using the \textit{base} function \texttt{sort}. +Now we have data sets with and without North America. Let's try something a bit +more challenging. Let's say that we want the first 10 populations in +alphabetical order, but we know that we still don't want any countries in North +America. We can easily do this by using the \textit{base} function +\texttt{sort}. <>= -Hsort <- sort(H3N2$pop.names)[1:10] -Hsort -H.alph <- popsub(H3N2, sublist=Hsort, blacklist=c("USA", "Canada")) -H.alph$pop.names +vsort <- sort(virus$pop.names)[1:10] +vsort +valph <- popsub(virus, sublist=vsort, blacklist=c("USA", "Canada")) +valph$pop.names @ +\noindent And that, is how you subset your data with poppr! -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Attack of the clone correction \{clone-censor data sets\}}\label{data.manip:cc} - -\tab\tab Clone correction refers to the ability of keeping one observation per clone in a given population (or sub-population). Clone correcting can be hazardous if its done by hand (even on small data sets) and it requires a defined population hierarchy to get relevant results. \textit{Poppr} has a clone correcting function that is able to correct at the lowest level of any defined population hierarchy. Note that clone correction in \textit{poppr} is sensitive to missing data, as it treats all missing data as a single extra allele. -\subsubsection{Function: clonecorrect}\label{data.manip:cc:clonecorrect} - -\tab\tab This function will return a clone corrected data set corrected for the lowest population level. Population levels are specified with the \texttt{hier} flag. You can choose to combine the population hierarchy to analyze at the lowest population level by choosing \texttt{combine = TRUE}. +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Clone-censor data sets \{Attack of the clone correction\}} +\label{data.manip:cc} + +\tab\tab Clone correction refers to the ability of keeping one observation of +each MLG in a given population (or sub-population). Clone correcting can be +hazardous if its done by hand (even on small data sets) and it requires a +defined population hierarchy to get relevant results. \Poppr{} has a clone +correcting function that that will correct down to the lowest level of any +defined population hierarchy. Note that clone correction in \poppr{} is +sensitive to missing data, as it treats all missing data as a single extra +allele. + +This function will create new data sets, but it is also utilized by the functions + \texttt{poppr} and \texttt{poppr.amova} natively. + +\subsubsection{Function: clonecorrect} +\label{data.manip:cc:clonecorrect} + +\tab\tab This function will return a clone corrected data set corrected for the +lowest population level. Population levels are specified with the \texttt{hier} +flag. You can choose to combine the population hierarchy to analyze at the +lowest population level by choosing \texttt{combine = TRUE}. + +\tline{} \begin{quote} -Default Command:\\ -\texttt{clonecorrect(pop, hier = c(1), dfname = "population\_hierarchy", combine = FALSE, keep = 1)} +Default Command: +<>= +funk <- "clonecorrect" +print_command(funk) +@ \end{quote} \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object that has a population hierarchy data frame in the \texttt{@other} slot. Note, the \texttt{genind} object does not necessarily require a population factor to begin with. - \item \texttt{hier -} This can be a vector of words or numbers referring to specific column names in your data frame in the \texttt{@other} slot. - \item \texttt{dfname -} The name of a data frame you have in the \texttt{@other} slot with the population factors. - \item \texttt{combine -} Do you want to combine the population hierarchy? If it's set to \texttt{FALSE} (default), you will be returned a genind object with the top most hierarchical level as a population factor. - \item \texttt{keep -} This flag is to be used if you set \texttt{combine = FALSE}. This will tell clone correct to return a specific combination of your hierarchy. For example, imagine a hierarchy that needs to be clone corrected at three levels: \textit{Population} by \textit{Year} by \textit{Month}. If you wanted to only run an analysis on the \textit{Population} level, you would set \texttt{keep = 1} since \textit{Population} is the first level of the hierarchy. On the other hand, if you wanted to run analysis on \textit{Year} by \textit{Month}, you would set \texttt{keep = 2:3} since those are the second and third levels of the hierarchy. + \item \texttt{pop -} a \texttt{genclone} object with a defined hierarchy or a + \texttt{genind} object that has a population hierarchy data frame in the + \texttt{@other} slot. Note, the \texttt{genind} object does not necessarily + require a population factor to begin with. + \item \texttt{hier -} A hierarchical formula (eg. \texttt{$\sim$Pop/Subpop}), + representing the hierarchical levels in your data. + \item \texttt{dfname -} \textbf{Only for use in genind objects, otherwise, + deprecated} The name of a data frame you have in the \texttt{@other} slot + with the population factors. + \item \texttt{combine -} Do you want to combine the population hierarchy? If + it's set to \texttt{FALSE} (default), you will be returned an object with the + top most hierarchical level as a population factor unless the \textbf{keep} + argument is defined. If set to \texttt{TRUE}, the hierarchy will be returned + combined. + \item \texttt{keep -} This flag is to be used if you set \texttt{combine = + FALSE}. This will tell clone correct to return a specific combination of your + hierarchy defined as integers. For example, imagine a hierarchy that needs to + be clone corrected at three levels: + \textit{Population} by \textit{Year} by \textit{Month}. If you wanted to only + run an analysis on the \textit{Population} level, you would set \texttt{keep + = 1} since \textit{Population} is the first level of the hierarchy. On the + other hand, if you wanted to run analysis on \textit{Year} by \textit{Month}, + you would set + \texttt{keep = 2:3} since those are the second and third levels of the + hierarchy. \end{itemize} - -Let's look at ways to clone-correct our data. We'll look at our \textit{A. euteichies} data since that data set is known to include clonal populations \cite{Grunwald:2006}. Notice that I am not including the options \texttt{dfname} and \texttt{combine} because the default arguments suit my needs. +\bline{} + +Let's look at ways to clone-correct our data. We'll look at our \textit{A. +euteichies} data that we loaded earlier in the +section \seclink{data.manip:hier}{Can you take me hier(archy)} since that data +set is known to include clonal populations \cite{Grunwald:2006}. Try playing +around with the data and see what different combinations of the \texttt{hier}, +and \texttt{keep} flags produce. Below, I will give three examples of clone +corrections at the sample level with respect to field, at the field level, and +finally, at the level of the entire data set. + +First, we will examine the original data set. <>= -data(Aeut) -A.cc <- clonecorrect(Aeut, hier=c("Pop", "Subpop"), keep=1) -poppr(A.cc, quiet=TRUE) +aphan # Original object @ -Now let's compare the clone corrected analysis to the uncorrected data set: -<>= -poppr(Aeut, quiet=TRUE) +\noindent +Now we correct by sample with respect to field and keep the field as the population. +<>= +clonecorrect(aphan, hier = ~field/sample) +# Your turn: Use the same hierarchy and use combine = TRUE and then +# keep = 1:2. Is there any difference? +@ +\noindent +Correcting by field. Notice how the number of MLG is much closer to our census. +<>= +clonecorrect(aphan, hier = ~field) +@ +\noindent +Correcting over whole data set. Our MLG is equal to our census. +<>= +clonecorrect(aphan, hier = NA) @ -As you can see from the summary tables, everything all sub-populations have been clone censored to the sub population level with respect to the population hierarchy. Notice how the observed number of individuals \texttt{(N)} decreases in the clone corrected data set. - -\subsection{Every day I'm shuffling (data sets) \{permutations and bootstrap resampling\}}\label{data.manip:shuffle} - -\tab\tab A common null hypothesis for populations with mixed reproductive modes is panmixia, or to put it simply: lots of sex. A handy way to test for that is permutation analysis to assess random linkage among loci whereupon you randomly shuffle your data. \textit{Poppr} uses randomly shuffled data sets in order to calculate P-values for the index of association ($I_A$ and $\bar r_d$) \cite{Agapow:2001}. Since there might be other tests where a permutation analysis would be pertinent, a shuffler for \texttt{genind} objects was created with four shuffling schemes: two schemes shuffling without replacement and two shuffling with replacement. Details below. -\subsubsection{Function: shufflepop}\label{data.manip:shuffle:shufflepop} +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Permutations and bootstrap resampling \{every day I'm shuffling (data sets)\}} +\label{data.manip:shuffle} + +\tab\tab A common null hypothesis for populations with mixed reproductive modes +is panmixia, or to put it simply: lots of sex. \Poppr{} randomly shuffles data +sets in order to calculate P-values for the index of association ($I_A$ and +$\bar{r} _d$) \cite{Agapow:2001} using 4 different methods: + +% latex table generated in R 3.1.0 by xtable 1.7-3 package +% Fri May 2 23:17:17 2014 +\begin{table}[ht] +\centering +\begin{tabular}{rll} + \hline +method & strategy & units sampled \\ + \hline +1 & permutation & alleles \\ + 2 & simulation & alleles \\ + 3 & simulation & alleles \\ + 4 & permutation & genotypes \\ + \hline +\end{tabular} +\end{table} + +These methods are detailed below. We will create a dummy data set to be shuffled +by each example below. Let's assume a single diploid locus with four alleles (1, +2, 3, 4) with the frequencies of 0.1, 0.2, 0.3, and 0.4, respectively: + +% latex table generated in R 3.0.3 by xtable 1.7-3 package +% Sun Mar 23 11:52:35 2014 +\begin{table}[h!] +\centering +\begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 4/4 \\ + 2 & 4/1 \\ + 3 & 4/3 \\ + 4 & 2/2 \\ + 5 & 3/3 \\ + \hline + \end{tabular} + \caption{Original} + \label{tab:original} +\end{table} +\noindent +The 4 methods are detailed below. + +\tline{} +\subsubsection{Function: shufflepop} +\label{data.manip:shuffle:shufflepop} \begin{quote} -Default Command:\\ -\texttt{shufflepop(pop, method = 1)} +Default Command: +<>= +funk <- "shufflepop" +print_command(funk) +@ +% \texttt{shufflepop(pop, method = 1)} \end{quote} \begin{itemize} \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{method -} a number indicating the method of sampling you wish to use. - The following methods are available for use: + \item \texttt{method -} a number indicating the method of sampling you wish to + use. The following methods are available for use: \begin{enumerate} - \item \textbf{Permute Alleles (default)} This is a sampling scheme that will permute alleles within the locus. For example, a single diploid locus with four alleles (1, 2, 3, 4) with the frequencies of 0.1, 0.2, 0.3, and 0.4, respectively: -<>= -exmat <- matrix(c(4,4, - 4,1, - 4,3, - 2,2, - 3,3), 5, byrow=TRUE) -exmat -@ -might become: -<>= -set.seed(1001) -matrix(sample(exmat), 5, byrow=T) -@ -As you can see, The heterozygosity has changed, yet the allelic frequencies remain the same. Overall this would show you, for example, what would happen if the sample you had underwent panmixis within this sample itself. - \item \textbf{Parametric Bootstrap} The previous scheme reshuffled the observed sample, but the parametric bootstrap uses the allelic frequencies as estimates of what the true allelic frequencies are and uses those as probabilities for each allele when resampling the data with replacement. Here are two samples to show you what I mean. -<>= -set.seed(1001) -cat("First Sample") -matrix(sample(1:4, 10, prob=c(0.1,0.2,0.3,0.4), replace=TRUE), 5, byrow=T) -cat("Second Sample") -matrix(sample(1:4, 10, prob=c(0.1,0.2,0.3,0.4), replace=TRUE), 5, byrow=T) -@ - -Notice how the heterozygosity has changed along with the allelic frequencies. The frequencies for alleles 3 and 4 have switched in the first data set, and we've lost allele 1 in the second data set purely by chance! This type of sampling scheme attempts to show you what the true population would look like if it were panmictic and your original sample gave you a basis for estimating expected allele frequencies. Since estimates are made from the observed allele frequencies, small samples will produce skewed results. - \item \textbf{Non-Parametric Bootstrap} The third method is sampling with replacement, but with no assumption about the distribution of the alleles. -<>= -set.seed(1001) -matrix(sample(1:4, 10, prob=rep(1, 4), replace=TRUE), 5, byrow=T) -@ -Again, heterozygosity and allele frequencies are not maintained, but now all of the alleles have a 1 in 4 chance of being chosen. - \item \textbf{Multilocus permutation} This is called Multilocus permutation because it does the same thing as the permutation analysis in the program \textit{multilocus} by Paul Agapow and Austin Burt \cite{Agapow:2001}. This will shuffle the genotypes at each locus. Using our example above, here it is shuffled with method 4: -<>= -set.seed(1001) -exmat[sample(1:5), ] -@ - -Note that you have the same genotypes after shuffling, so at each locus, you will maintain the same allelic frequencies and heterozygosity. So, in this sample, you will only see a homozygote with allele 2. This also ensures that the P-values associated with $I_A$ and $\bar r_d$ are exactly the same (for an explanation, see the end of section \ref{index:iard:ia} of this manual). Unfortunately, if you are trying to simulate a sexual population, this does not make much biological sense as it assumes that alleles are not independently assorting within individuals. + \item \textbf{Permute Alleles (default)} This is a sampling scheme that will \textbf{permute alleles within the locus.} +% <>= +% exmat <- matrix(c(4,4, +% 4,1, +% 4,3, +% 2,2, +% 3,3), 5, byrow=TRUE) +% exmat +% @ + +The example above might become tables \ref{tab:permsinglebest} and \ref{tab:permtwobest}. +\begin{table}[h!] +\centering +\makebox[0pt][c]{\parbox{\textwidth}{% + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 3/4 \\ + 2 & 2/3 \\ + 3 & 4/4 \\ + 4 & 2/1 \\ + 5 & 3/4 \\ + \hline + \end{tabular} + \caption{Permute 1} + \label{tab:permsinglebest} + \end{minipage} + \hfill + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 1/3 \\ + 2 & 2/4 \\ + 3 & 3/4 \\ + 4 & 4/3 \\ + 5 & 4/2 \\ + \hline + \end{tabular} + \caption{Permute 2} + \label{tab:permtwobest} + \end{minipage} +}} +\end{table} + +As you can see, The heterozygosity has changed, yet the allelic frequencies +remain the same. Overall this would show you what would happen if +the sample you had underwent panmixis within this sample itself. + \item \textbf{Parametric Bootstrap} The previous scheme reshuffled the + observed sample, but the parametric bootstrap \textbf{draws samples from a + multinomial distribution using the observed allele frequencies as weights.} + Tables \ref{tab:param1} and \ref{tab:param2} are examples of what I mean. +% <>= +% set.seed(1001) +% cat("First Sample") +% matrix(sample(1:4, 10, prob=c(0.1,0.2,0.3,0.4), replace=TRUE), 5, byrow=T) +% cat("Second Sample") +% matrix(sample(1:4, 10, prob=c(0.1,0.2,0.3,0.4), replace=TRUE), 5, byrow=T) +% @ + +\begin{table}[h!] +\centering +\makebox[0pt][c]{\parbox{\textwidth}{% + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 1/3 \\ + 2 & 3/3 \\ + 3 & 3/2 \\ + 4 & 4/4 \\ + 5 & 4/2 \\ + \hline + \end{tabular} + \caption{Parametric 1} + \label{tab:param1} + \end{minipage} + \hfill + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 3/4 \\ + 2 & 2/3 \\ + 3 & 4/2 \\ + 4 & 4/4 \\ + 5 & 4/2 \\ + \hline + \end{tabular} + \caption{Parametric 2} + \label{tab:param2} + \end{minipage} +}} +\end{table} + +Notice how the heterozygosity has changed along with the allelic frequencies. +The frequencies for alleles 3 and 4 have switched in the first data set, and +we've lost allele 1 in the second data set purely by chance! This type of +sampling scheme attempts to show you what the true population would look like if +it were panmictic and your original sample gave you a basis for estimating +expected allele frequencies. Since estimates are made from the observed allele +frequencies, small samples will produce skewed results. + \item \textbf{Non-Parametric Bootstrap} The third method is sampling with + replacement, again \textbf{drawing from a multinomial distribution, but with no + assumption about the allele frequencies} (tables \ref{tab:nonparam1} and \ref{tab:nonparam2}). +% <>= +% set.seed(1001) +% matrix(sample(1:4, 10, prob=rep(1, 4), replace=TRUE), 5, byrow=T) +% @ + +% latex table generated in R 3.0.3 by xtable 1.7-3 package +% Sun Mar 23 11:57:06 2014 + +\begin{table}[h!] +\centering +\makebox[0pt][c]{\parbox{\textwidth}{% + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 1/3 \\ + 2 & 3/3 \\ + 3 & 3/1 \\ + 4 & 2/2 \\ + 5 & 3/1 \\ + \hline + \end{tabular} + \caption{Non-parametric 1} + \label{tab:nonparam1} + \end{minipage} + \hfill + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 1/3 \\ + 2 & 3/1 \\ + 3 & 2/3 \\ + 4 & 2/1 \\ + 5 & 4/3 \\ + \hline + \end{tabular} + \caption{Non-parametric 2} + \label{tab:nonparam2} + \end{minipage} +}} +\end{table} + +Again, heterozygosity and allele frequencies are not maintained, but now all of +the alleles have a 1 in 4 chance of being chosen. + \item \textbf{Multilocus permutation} This is called Multilocus permutation + because it does the same thing as the permutation analysis in the program + \textit{multilocus} by Paul Agapow and Austin Burt \cite{Agapow:2001}. This + will shuffle the genotypes at each locus. Using our example above, tables + \ref{tab:singlebest} and \ref{tab:twobest} + are shuffled with method 4. +% <>= +% set.seed(1001) +% exmat[sample(1:5), ] +% @ + +\begin{table}[h!] +\centering +\makebox[0pt][c]{\parbox{\textwidth}{% + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 3/3 \\ + 2 & 4/1 \\ + 3 & 2/2 \\ + 4 & 4/4 \\ + 5 & 4/3 \\ + \hline + \end{tabular} + \caption{ML 1} + \label{tab:singlebest} + \end{minipage} + \hfill + \begin{minipage}[b]{0.45\hsize}\centering + \begin{tabular}{lc} + \hline + & A1/A2 \\ + \hline + 1 & 4/4 \\ + 2 & 2/2 \\ + 3 & 3/3 \\ + 4 & 4/3 \\ + 5 & 4/1 \\ + \hline + \end{tabular} + \caption{ML 2} + \label{tab:twobest} + \end{minipage} +}} +\end{table} + +Note that you have the same genotypes after shuffling, so at each locus, you +will maintain the same allelic frequencies and heterozygosity. So, in this +sample, you will only see a homozygote with allele 2. This also ensures that the +P-values associated with $I_A$ and $\bar{r} _d$ are exactly the same. +Unfortunately, if you are trying to simulate a sexual population, this does not +make much biological sense as it \textbf{assumes that alleles are not independently +assorting within individuals.} \end{enumerate} \end{itemize} -These shuffling schemes have been implemented for the index of association, but there may be other summary statistics you can use \texttt{shufflepop} for. All you have to do is use the function \texttt{replicate}. Let's use $I_A$ as an example: -<>= + +\bline{} + +These shuffling schemes have been implemented for the index of association, but +there may be other summary statistics you can use \texttt{shufflepop} for. All +you have to do is use the function \texttt{replicate}. Let's use +average Bruvo's distance with the first population of the +data set \texttt{nancycats} as an example: +<>= data(nancycats) nan1 <- popsub(nancycats, 1) -ia(nan1) -replicate(10, ia(shufflepop(nan1, method = 2), quiet=TRUE)) -@ -You could use this method to replicate the resampling 999 times and then create a histogram to visualize a distribution of what would happen under different assumptions of panmixia. - -\subsection{Cut It Out! \{removing uninformative loci\}}\label{data.manip:informloci} - \tab\tab Phylogenetically uninformative loci are those that have only one sample differentiating from the rest. This can lead to biased results when using multilocus analyses such as the index of association (See \ref{index:iard} and \ref{summary}). These nuisance loci can be removed with the following function. -\subsubsection{Function: informloci}\label{data.manip:informloci:informloci} +reps <- rep(2, 9) # Assuming dinucleotide repeats. +observed <- mean(bruvo.dist(nan1, replen = reps)) +observed +@ +<>= +set.seed(9999) +bd.test <- replicate(999, mean(bruvo.dist(shufflepop(nan1, method = 2), replen = reps))) +@ +<>= +bd.test <- c(0.383180217978395, 0.415172887731481, 0.395401716820988, 0.36112075617284, +0.381246383101852, 0.391417100694444, 0.368165268132716, 0.44764419367284, +0.39599006558642, 0.392292390046296, 0.416001157407407, 0.359629388503086, +0.375589554398148, 0.405174575617284, 0.409643856095679, 0.391482204861111, +0.382218123070988, 0.41436149691358, 0.383174189814815, 0.404762249228395, +0.409695698302469, 0.351765046296296, 0.375487075617284, 0.394843508873457, +0.403155140817901, 0.395619936342593, 0.35908203125, 0.396396363811728, +0.387588011188272, 0.375771604938272, 0.374916811342593, 0.371144386574074, +0.375567853009259, 0.372751494984568, 0.41537181712963, 0.390055941358025, +0.372065489969136, 0.38319106867284, 0.418766878858025, 0.389811197916667, +0.391309799382716, 0.380735194830247, 0.392493730709877, 0.407855902777778, +0.373485725308642, 0.393967013888889, 0.419801311728395, 0.419554157021605, +0.349768518518519, 0.368592062114198, 0.396170910493827, 0.38164665316358, +0.397676745756173, 0.359424430941358, 0.390004099151235, 0.375790895061728, +0.415828751929012, 0.396510898919753, 0.398491753472222, 0.399006558641975, +0.378590374228395, 0.410376880787037, 0.373421826774691, 0.401580584490741, +0.388343942901235, 0.389790702160494, 0.406230709876543, 0.410212914737654, +0.411181037808642, 0.397419945987654, 0.409830729166667, 0.38501880787037, +0.375995852623457, 0.374977092978395, 0.404217303240741, 0.381970968364198, +0.427674093364198, 0.421094955632716, 0.392737268518519, 0.372973331404321, +0.396556712962963, 0.41493537808642, 0.367643229166667, 0.417230902777778, +0.37703872492284, 0.388992573302469, 0.390311535493827, 0.404422260802469, +0.325649836033951, 0.382992139274691, 0.396333670910494, 0.385146604938272, +0.40302734375, 0.369362461419753, 0.40317684220679, 0.400687210648148, +0.402665653935185, 0.383434606481481, 0.398591820987654, 0.403266059027778, +0.33031322337963, 0.381671971450617, 0.398115596064815, 0.401163435570988, +0.416612413194444, 0.403988233024691, 0.421760464891975, 0.425095244984568, +0.431723813657407, 0.401032021604938, 0.390247636959877, 0.431340422453704, +0.390993923611111, 0.421104600694444, 0.339672550154321, 0.437996720679012, +0.358885513117284, 0.368224344135802, 0.361101466049383, 0.384192949459877, +0.410045331790123, 0.368414834104938, 0.340546633873457, 0.341089168595679, +0.387595244984568, 0.371152826003086, 0.386452305169753, 0.438270399305556, +0.414109519675926, 0.376890432098765, 0.367886766975309, 0.326484133873457, +0.385950761959877, 0.429093123070988, 0.414934172453704, 0.395864679783951, +0.385291280864198, 0.405513358410494, 0.373255449459877, 0.380583285108025, +0.371088927469136, 0.367192322530864, 0.39708236882716, 0.337759211033951, +0.412621768904321, 0.394872444058642, 0.381664737654321, 0.423829330632716, +0.389819637345679, 0.420828510802469, 0.404482542438272, 0.414830488040123, +0.383400848765432, 0.389350646219136, 0.38399040316358, 0.378415557484568, +0.401076630015432, 0.367534722222222, 0.429076244212963, 0.419275655864198, +0.369282889660494, 0.402340133101852, 0.341787229938272, 0.379204041280864, +0.386735628858025, 0.446989535108025, 0.402697000385802, 0.433216386959877, +0.367690248842593, 0.375049430941358, 0.368416039737654, 0.402647569444444, +0.40395809220679, 0.425563030478395, 0.45283203125, 0.369378134645062, +0.355145640432099, 0.419715711805556, 0.321680893132716, 0.400511188271605, +0.399321228780864, 0.408874662422839, 0.380805121527778, 0.357367621527778, +0.387917148919753, 0.46065055941358, 0.397224633487654, 0.365272955246914, +0.419821807484568, 0.395893614969136, 0.434101321373457, 0.420037615740741, +0.344226224922839, 0.380940152391975, 0.364171006944444, 0.419171971450617, +0.419407069830247, 0.370770640432099, 0.3603515625, 0.408857783564815, +0.417365933641975, 0.352559558256173, 0.399633487654321, 0.389433834876543, +0.394340760030864, 0.383954234182099, 0.388071469907407, 0.359601658950617, +0.363640528549383, 0.357619598765432, 0.37666015625, 0.39146412037037, +0.360531201774691, 0.37396556712963, 0.396057581018519, 0.436072530864198, +0.426344280478395, 0.404943094135802, 0.418567949459877, 0.360329861111111, +0.44258415316358, 0.382421875, 0.396319203317901, 0.404704378858025, +0.352875434027778, 0.401589023919753, 0.389913676697531, 0.38095100308642, +0.390802228009259, 0.401812065972222, 0.410579427083333, 0.401623987268519, +0.437044270833333, 0.403096064814815, 0.335836226851852, 0.393484760802469, +0.39955150462963, 0.391191647376543, 0.347428385416667, 0.397435619212963, +0.372844328703704, 0.405638744212963, 0.381063126929012, 0.377789834104938, +0.406129436728395, 0.385578221450617, 0.371550684799383, 0.397008825231481, +0.410454041280864, 0.383675733024691, 0.344425154320988, 0.352178578317901, +0.395658516589506, 0.410639708719136, 0.372582706404321, 0.401175491898148, +0.364237316743827, 0.401288821373457, 0.393103780864198, 0.371531394675926, +0.369349199459877, 0.386342592592593, 0.43155502507716, 0.420232928240741, +0.448606288580247, 0.345263069058642, 0.399540653935185, 0.376935040509259, +0.357755835262346, 0.382535204475309, 0.408638358410494, 0.421451822916667, +0.408481626157407, 0.416130160108025, 0.429947916666667, 0.387145543981481, +0.427443817515432, 0.334611304012346, 0.389210792824074, 0.370070167824074, +0.408222415123457, 0.400714940200617, 0.429736930941358, 0.365889033564815, +0.416772762345679, 0.364687017746914, 0.407233796296296, 0.403978587962963, +0.371117862654321, 0.43031322337963, 0.368804253472222, 0.376974826388889, +0.432552083333333, 0.400812596450617, 0.379140142746914, 0.397846739969136, +0.346225163966049, 0.372135416666667, 0.408092206790123, 0.414579716435185, +0.404601900077161, 0.40112003279321, 0.364250578703704, 0.359230324074074, +0.390632233796296, 0.392872299382716, 0.406500771604938, 0.396532600308642, +0.433258584104938, 0.407444782021605, 0.369508342978395, 0.375815007716049, +0.404651331018519, 0.398518277391975, 0.403392650462963, 0.393666811342593, +0.407813705632716, 0.424363425925926, 0.389061294367284, 0.369866415895062, +0.396776138117284, 0.409172453703704, 0.416928288966049, 0.378232301311728, +0.40765697337963, 0.417314091435185, 0.389046826774691, 0.415744357638889, +0.393634259259259, 0.36586612654321, 0.401743344907407, 0.426003086419753, +0.39360050154321, 0.412666377314815, 0.41353443287037, 0.395747733410494, +0.390230758101852, 0.386709104938272, 0.387943672839506, 0.35624397183642, +0.415022183641975, 0.409261670524691, 0.396379484953704, 0.38554084683642, +0.363905767746914, 0.397246334876543, 0.396917197145062, 0.370558449074074, +0.326807243441358, 0.380318045910494, 0.371108217592593, 0.402881462191358, +0.421170910493827, 0.377513744212963, 0.406248794367284, 0.432503858024691, +0.389230082947531, 0.374774546682099, 0.385296103395062, 0.356553819444444, +0.429593460648148, 0.399232011959877, 0.413334297839506, 0.357564139660494, +0.380099826388889, 0.392191116898148, 0.425604021990741, 0.382099971064815, +0.396692949459877, 0.378401089891975, 0.382632860725309, 0.379443962191358, +0.353100887345679, 0.350179639274691, 0.400798128858025, 0.391773967978395, +0.390212673611111, 0.421403597608025, 0.383235677083333, 0.404604311342593, +0.382499035493827, 0.404053337191358, 0.380803915895062, 0.409326774691358, +0.360821759259259, 0.352030285493827, 0.382887249228395, 0.362068383487654, +0.395824893904321, 0.426868730709877, 0.350993441358025, 0.351259886188272, +0.376520302854938, 0.405735194830247, 0.333004195601852, 0.375434027777778, +0.454225742669753, 0.385618007330247, 0.397746672453704, 0.399845679012346, +0.381020929783951, 0.390805844907407, 0.37349537037037, 0.366712480709877, +0.34423828125, 0.384405140817901, 0.376293643904321, 0.423095100308642, +0.362887008101852, 0.410147810570988, 0.407542438271605, 0.397905815972222, +0.438437982253086, 0.391622058256173, 0.359475067515432, 0.412031008873457, +0.395044849537037, 0.406966145833333, 0.377733169367284, 0.353202160493827, +0.382794415509259, 0.39121696566358, 0.411637972608025, 0.406511622299383, +0.395616319444444, 0.352868200231481, 0.411424575617284, 0.386510175540123, +0.388442804783951, 0.381898630401235, 0.44444565007716, 0.414210792824074, +0.415461033950617, 0.43555290316358, 0.386957465277778, 0.349611786265432, +0.360434751157407, 0.417320119598765, 0.381280140817901, 0.383104263117284, +0.404839409722222, 0.363233024691358, 0.379905719521605, 0.405243296682099, +0.364553192515432, 0.345275125385802, 0.414390432098765, 0.405825617283951, +0.371305941358025, 0.382529176311728, 0.404598283179012, 0.369455295138889, +0.409336419753086, 0.394952015817901, 0.380199893904321, 0.405327690972222, +0.405947386188272, 0.384239969135802, 0.40400390625, 0.397398244598765, +0.386231674382716, 0.398750964506173, 0.366917438271605, 0.413671875, +0.377615017361111, 0.441937934027778, 0.373066165123457, 0.426989293981481, +0.393100163966049, 0.384252025462963, 0.386873070987654, 0.447501929012346, +0.392793933256173, 0.350059076003086, 0.350538917824074, 0.375974151234568, +0.394501109182099, 0.364362702546296, 0.359079619984568, 0.398760609567901, +0.365193383487654, 0.413980516975309, 0.336530671296296, 0.367936197916667, +0.427888695987654, 0.411747685185185, 0.361118344907407, 0.340543016975309, +0.374469521604938, 0.376176697530864, 0.339312065972222, 0.36255184220679, +0.428757957175926, 0.396603732638889, 0.398220486111111, 0.377998408564815, +0.458297164351852, 0.411371527777778, 0.388263165509259, 0.38179012345679, +0.393031442901235, 0.378683207947531, 0.365017361111111, 0.395235339506173, +0.361461950231481, 0.387102141203704, 0.446594087577161, 0.402951388888889, +0.413870804398148, 0.383012635030864, 0.392099488811728, 0.380672501929012, +0.369762731481481, 0.404457224151235, 0.437080439814815, 0.42606819058642, +0.344309413580247, 0.370612702546296, 0.386519820601852, 0.330557966820988, +0.414650848765432, 0.376538387345679, 0.367432243441358, 0.434569106867284, +0.410765094521605, 0.393921199845679, 0.430010609567901, 0.424990354938272, +0.37861930941358, 0.389408516589506, 0.397084780092593, 0.418393132716049, +0.404865933641975, 0.419696421682099, 0.42328197337963, 0.35384837962963, +0.440784143518519, 0.339085407021605, 0.398503809799383, 0.38325496720679, +0.343084490740741, 0.365447771990741, 0.394507137345679, 0.437507233796296, +0.373072193287037, 0.383201919367284, 0.399937307098765, 0.379497010030864, +0.356886574074074, 0.420395688657407, 0.356246383101852, 0.388859953703704, +0.445442708333333, 0.361277488425926, 0.371222752700617, 0.388386140046296, +0.385187596450617, 0.375283323688272, 0.387521701388889, 0.388689959490741, +0.357333863811728, 0.453578317901235, 0.36547550154321, 0.39613956404321, +0.417972366898148, 0.374092158564815, 0.382782359182099, 0.381862461419753, +0.423981240354938, 0.414672550154321, 0.348871527777778, 0.387929205246914, +0.413448832947531, 0.389042004243827, 0.336711516203704, 0.407901716820988, +0.370625964506173, 0.412831548996914, 0.381561053240741, 0.366015625, +0.37412712191358, 0.407411024305556, 0.378071952160494, 0.372794897762346, +0.419258777006173, 0.402212336033951, 0.403891782407407, 0.425675154320988, +0.415992717978395, 0.392179060570988, 0.407238618827161, 0.402026668595679, +0.395384837962963, 0.380525414737654, 0.363785204475309, 0.412713396990741, +0.398572530864198, 0.395569299768519, 0.383186246141975, 0.36587215470679, +0.36796272183642, 0.365980661651235, 0.357057773919753, 0.41279899691358, +0.395349874614198, 0.390680459104938, 0.377873022762346, 0.389040798611111, +0.377109857253086, 0.348837770061728, 0.368975453317901, 0.357876398533951, +0.374716676311728, 0.394544511959877, 0.394398630401235, 0.417626350308642, +0.382323013117284, 0.351409384645062, 0.425944010416667, 0.388559751157407, +0.428446903935185, 0.398743730709877, 0.392685426311728, 0.398725646219136, +0.345968364197531, 0.377530623070988, 0.382658179012346, 0.349848090277778, +0.396012972608025, 0.353338396990741, 0.430840084876543, 0.377238859953704, +0.394731385030864, 0.411692226080247, 0.37016300154321, 0.399468315972222, +0.453606047453704, 0.368296682098765, 0.33921802662037, 0.417118778935185, +0.406395881558642, 0.389037181712963, 0.417344232253086, 0.374247685185185, +0.358676938657407, 0.404021990740741, 0.412682050540123, 0.347416329089506, +0.394303385416667, 0.333446662808642, 0.361119550540123, 0.397492283950617, +0.450394241898148, 0.381412760416667, 0.429846643518519, 0.376713204089506, +0.40488884066358, 0.372957658179012, 0.404951533564815, 0.3615234375, +0.380840084876543, 0.375343605324074, 0.378115354938272, 0.386223234953704, +0.407086709104938, 0.390598476080247, 0.38189380787037, 0.426994116512346, +0.398075810185185, 0.355055217978395, 0.40372299382716, 0.384253231095679, +0.377801890432099, 0.421995563271605, 0.438786410108025, 0.39292896412037, +0.373168643904321, 0.396023823302469, 0.406528501157407, 0.385123697916667, +0.411454716435185, 0.407146990740741, 0.354187162422839, 0.385707224151235, +0.45199893904321, 0.423301263503086, 0.426093508873457, 0.357736545138889, +0.41849681712963, 0.397376543209877, 0.388570601851852, 0.369656635802469, +0.389693045910494, 0.364955873842593, 0.41753833912037, 0.381816647376543, +0.366625675154321, 0.405238474151235, 0.381565875771605, 0.395021942515432, +0.412459008487654, 0.359935619212963, 0.361991222993827, 0.385581838348765, +0.415857687114198, 0.383632330246914, 0.434061535493827, 0.352875434027778, +0.42680362654321, 0.364462770061728, 0.401983265817901, 0.371050347222222, +0.382638888888889, 0.341284481095679, 0.388495852623457, 0.40468147183642, +0.398787133487654, 0.434308690200617, 0.415680459104938, 0.407871576003086, +0.431914303626543, 0.387229938271605, 0.388371672453704, 0.377091772762346, +0.409389467592593, 0.37120346257716, 0.39843147183642, 0.385562548225309, +0.366154272762346, 0.373057725694444, 0.399386332947531, 0.357748601466049, +0.436159336419753, 0.362225115740741, 0.365024594907407, 0.396502459490741, +0.415729890046296, 0.37396556712963, 0.361053240740741, 0.37051986882716, +0.409051890432099, 0.406489920910494, 0.399397183641975, 0.346672453703704, +0.385487798996914, 0.382908950617284, 0.387248022762346, 0.370142505787037, +0.361493296682099, 0.373767843364198, 0.419709683641975, 0.432388117283951, +0.345741705246914, 0.386319685570988, 0.368377459490741, 0.377933304398148, +0.355714699074074, 0.379832175925926, 0.376674623842593, 0.352170138888889, +0.402731963734568, 0.35858772183642, 0.409573929398148, 0.430726755401235, +0.398097511574074, 0.387141927083333, 0.401740933641975, 0.397419945987654, +0.374105420524691, 0.405016637731481, 0.382298900462963, 0.392015094521605, +0.342367139274691, 0.41382137345679, 0.36112075617284, 0.39140625, +0.391415895061728, 0.362373408564815, 0.418616174768519, 0.385100790895062, +0.37310474537037, 0.359172453703704, 0.344196084104938, 0.405438609182099, +0.393285831404321, 0.426271942515432, 0.407395351080247, 0.358265817901235, +0.413771942515432, 0.348013117283951, 0.364741271219136, 0.38968822337963, +0.404763454861111, 0.38482349537037, 0.415417631172839, 0.385947145061728, +0.386601803626543, 0.356246383101852, 0.372376543209877, 0.420251012731481, +0.380846113040123, 0.402442611882716, 0.404289641203704, 0.374849295910494, +0.405134789737654, 0.424883053626543, 0.369742235725309, 0.394878472222222, +0.402324459876543, 0.408324893904321, 0.36083984375, 0.401047694830247, +0.370205198688272, 0.399468315972222, 0.365071614583333, 0.387427662037037, +0.386566840277778, 0.387134693287037, 0.341704041280864, 0.386242525077161, +0.389988425925926, 0.394380545910494, 0.373389274691358, 0.386977961033951, +0.372048611111111, 0.361917679398148, 0.360828993055556, 0.338351176697531, +0.39232494212963, 0.376856674382716, 0.377413676697531, 0.375761959876543, +0.417889178240741, 0.408604600694444, 0.419092399691358, 0.392327353395062, +0.426388888888889, 0.386496913580247, 0.418935667438272, 0.369508342978395, +0.389066116898148, 0.35234375, 0.409077208719136, 0.397744261188272, +0.402287085262346, 0.412764033564815, 0.399705825617284, 0.395793547453704, +0.390901089891975, 0.390275366512346, 0.425829475308642, 0.422141444830247, +0.40127555941358, 0.425836709104938, 0.408755304783951, 0.43119212962963, +0.394044174382716, 0.445629581404321, 0.38599537037037, 0.429912953317901, +0.356674382716049, 0.410478153935185, 0.384129050925926, 0.423949893904321, +0.419736207561728, 0.354499421296296, 0.411144868827161, 0.437016541280864, +0.371636284722222, 0.384575135030864, 0.369868827160494, 0.417621527777778, +0.384243586033951, 0.411648823302469, 0.398116801697531, 0.388364438657407, +0.386033950617284, 0.415200617283951, 0.407120466820988, 0.373558063271605, +0.41405647183642, 0.406703317901235, 0.378138261959877, 0.399733555169753, +0.388875626929012, 0.42020278742284, 0.355137201003086, 0.351853057484568, +0.377156876929012, 0.415581597222222, 0.397840711805556, 0.351316550925926, +0.43052059220679, 0.408252555941358, 0.373283179012346, 0.364079378858025, +0.370189525462963, 0.39624324845679, 0.371161265432099, 0.436905623070988, +0.372820216049383, 0.416814959490741, 0.367123601466049, 0.382185570987654, +0.422733410493827, 0.412718219521605, 0.380322868441358, 0.385768711419753, +0.429175106095679, 0.392156153549383, 0.336970727237654, 0.42496021412037, +0.404698350694444, 0.385574604552469, 0.417764998070988, 0.347227044753086, +0.387035831404321, 0.418354552469136, 0.401291232638889, 0.382189187885802, +0.367764998070988, 0.366671489197531, 0.399421296296296, 0.348514660493827, +0.392493730709877, 0.362890625, 0.391638937114198, 0.426443142361111, +0.379252266589506, 0.411027922453704, 0.370541570216049, 0.378446903935185, +0.394773582175926, 0.413078703703704, 0.443235194830247, 0.401103153935185, +0.403628954475309, 0.431617717978395, 0.394756703317901, 0.41430362654321, +0.387205825617284, 0.356032986111111, 0.377866994598765, 0.427340133101852, +0.430457899305556, 0.369756703317901, 0.40421127507716, 0.42396556712963, +0.394716917438272, 0.344619261188272, 0.429246238425926, 0.358376736111111, +0.373515866126543, 0.376716820987654, 0.415721450617284, 0.398945071373457, +0.390829957561728, 0.415590036651235, 0.372623697916667, 0.411378761574074, +0.362014130015432, 0.385200858410494, 0.42539665316358, 0.358989197530864, +0.39154369212963, 0.409165219907407, 0.354264322916667, 0.382320601851852, +0.392441888503086, 0.38319106867284, 0.415165653935185, 0.419118923611111, +0.438945553626543, 0.383711902006173, 0.406321132330247, 0.420744116512346, +0.393296682098765, 0.373635223765432, 0.382902922453704, 0.370631992669753, +0.386036361882716, 0.395402922453704, 0.386788676697531, 0.386981577932099, +0.38959659529321, 0.391449652777778, 0.355647183641975, 0.376593846450617, +0.354891251929012, 0.343709008487654, 0.404910542052469, 0.393446180555556, +0.363194444444444, 0.393581211419753, 0.381512827932099, 0.385577015817901, +0.361928530092593, 0.436765769675926, 0.38978587962963) +@ +You could use this method to replicate the resampling 999 times and then create +a histogram to visualize a distribution of what would happen under different +assumptions of panmixia. + +<>= +hist(bd.test, xlab = "Bruvo's Distance", main = "Average Bruvo's distance over 999 randomizations") +abline(v = observed, col = "red") +legend('topleft', legend="observed", col="red", lty = 1) +@ + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Removing uninformative loci \{Cut It Out!\}} +\label{data.manip:informloci} +\tab\tab Phylogenetically uninformative loci are those that have only one +sample differentiating from the rest. This can lead to biased results when +using multilocus analyses such as the index of association \cite{Brown:1980,Smith:1993}. +These nuisance loci can be removed with the following function. +\subsubsection{Function: informloci} +\label{data.manip:informloci:informloci} + +\tline{} \begin{quote} -Default Command:\\ -\texttt{informloci(pop, cutoff = 2/nInd(pop), quiet = FALSE)} +Default Command: +<>= +funk <- "informloci" +print_command(funk) +@ +% \texttt{informloci(pop, cutoff = 2/nInd(pop), quiet = FALSE)} \end{quote} \begin{itemize} \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{cutoff -} this represents the minimum fraction of individuals needed for a locus to be considered informative. The default is set to $2/n$ with $n$ being the number of individuals in the data set (represented by the \textit{adegenet} function \texttt{nInd}). Essentially, this means that any locus with fewer than 2 observations differing will be removed. The user can also specify a fraction of observations for the cutoff (eg. 0.05). - \item \texttt{quiet -} if \texttt{TRUE}, nothing will be printed to the screen, if \texttt{FALSE}, the cutoff value in percentage and number of individuals will be printed as well as the names of the uninfomrative loci found. + \item \texttt{cutoff -} this represents the minimum fraction of individuals + needed for a locus to be considered informative. The default is set to $2/n$ + with $n$ being the number of individuals in the data set (represented by the + \adegenet{} function \texttt{nInd}). Essentially, this means that any locus + with fewer than 2 observations differing will be removed. The user can also + specify a fraction of observations for the cutoff (eg. 0.05). + \item \texttt{quiet -} if \texttt{TRUE}, nothing will be printed to the + screen, if \texttt{FALSE} (default), the cutoff value in percentage and + number of individuals will be printed as well as the names of the + uninfomrative loci found. \end{itemize} +\bline{} -Here's a quick example. +Here's a quick example using the H3N2 virus SNP data set. We will only retain +loci that have a minor allele frequency of $\geq 5\%$ <>= data(H3N2) H.five <- informloci(H3N2, cutoff = 0.05) @@ -921,951 +2390,2199 @@ H.five <- informloci(H3N2, cutoff = 0.05) res <- c(157,177,233,243,262,267,280,303,313,327,357,382,384,399,412,418,424,425,429,433,451,470,529,546,555,557,564,576,592,595,597,602,612,627,642,647,648,654,658,663,667,681,717,806,824,837,882) cat("cutoff value: 5 percent ( 95 individuals ).\n","47 uninfomative loci found:", res, fill = 80) @ -Now what happens when you have all informative loci: +Now what happens when you have all informative loci. We'll use the nancycats +data set, which has microsatellite loci. It is important to note that this is +searching for loci with a specified genotype frequency as fixed heterozygous +sites are also uninformative: <>= data(nancycats) naninform <- informloci(nancycats, cutoff = 0.05) @ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Multilocus Genotype Analysis}\label{mlg} - -\tab\tab In populations with mixed sexual and clonal reproduction, it is not uncommon to have multiple samples from the same population have the same genotype across multiple loci (multilocus genotype, MLG). Here, we introduce tools for tracking MLGs within and across populations in \texttt{genind} objects from the \textit{adegenet} package. We will be using SNP data from isolates of the H3N2 virus from 2002 to 2006. -\subsection{Just a peek \{How many multilocus genotypes are in our data set?\}}\label{mlg:mlg} - -\tab\tab First, let's take a quick look at how many Multilocus Genotypes are present within the H3N2 data set using the \texttt{mlg} function. This will tell us if any MLG analysis is needed. -\subsubsection{Function: mlg}\label{mlg:mlg:mlg} +%=============================================================================% +%=============================================================================% +% +% +% +%=============================================================================% +%=============================================================================% +\section{Multilocus Genotype Analysis} +\label{mlg} -\tab\tab The function \texttt{mlg} allows for the counting of the number of MLGs in a \texttt{genind} object. This is a very simple command for quick reference to determine if your data set needs further multilocus genotype analysis. +\tab\tab In populations with mixed sexual and clonal reproduction, it common to have multiple samples from the same population that have the same set of alleles at all loci. Here, we introduce tools for tracking MLGs within and across populations in \seclink{intro:genind}{genind} objects from the \adegenet{} package. We will be using the \seclink{data:virus}{virus} data set containing SNP data from isolates of the H3N2 virus from 2002 to 2006. Note that genclone objects are optimal for these analyses. %-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{How many multilocus genotypes are in our data set? \{Just a peek\}} +\label{mlg:mlg} +\tab\tab Counting the number of MLGs in a population is the +first step for these analyses as they allow us to see how many clones +exist. With the \seclink{intro:genclone}{genclone} object, This information is +already displayed when we view the object. +<>= +virus +@ +\noindent +If we need to store the number of MLGs as a variable, we can simply run the +\texttt{mlg} command. +<>= +virus_mlg <- mlg(virus) +virus_mlg +@ +\label{mlg:mlg:mlg} + +Since the number of individuals exceeds the number of multilocus +genotypes, we conclude that this data set contains clones. Let's examine what populations these clones belong to. +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{MLGs across populations \{clone-ing around\}} +\label{mlg:cross} + +\tab\tab Since you have the ability to define hierarchical levels of your +data set freely, it is quite possible to see some of the same MLGs across +different populations. Tracking them by hand can be a nightmare with large data +sets. Luckily, \texttt{mlg.crosspop} has you covered in that regard. +\subsubsection{Function: mlg.crosspop} +\label{mlg:cross:mlg.crosspop} + +\tab\tab Analyze the MLGs that cross populations within your data set. This has +three output modes. The default one gives a list of MLGs, and for each MLG, it +gives a named numeric vector indicating the abundance of that MLG in each +population. Alternate outputs are described with \texttt{indexreturn} and +\texttt{df}. + +\tline{} \begin{quote} -Default Command:\\ -\texttt{mlg(pop, quiet = FALSE)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{quiet -} if \texttt{TRUE}, the number of individuals and multilocus genotypes will be printed to the screen, if \texttt{FALSE}, nothing will be printed to the screen and the number of multilocus genotypes will be reported. - \end{itemize} -<>= -data(H3N2) -mlg(H3N2, quiet=FALSE) +Default Command: +<>= +funk <- "mlg.crosspop" +print_command(funk) @ -We can see that since the number of individuals exceeds the number of multilocus genotypes, this data set contains clones. Let's take a look at where those clones are with respect to populations. -\subsection{Clone-ing around \{MLGs across populations\}}\label{mlg:cross} - -\tab\tab Since you have the ability to change the population structure of your data set freely, it is quite possible to see some of the same MLGs across different populations. Tracking them by hand can be a nightmare with large data sets. Luckily, \texttt{mlg.crosspop} has you covered in that regard. -\subsubsection{Function: mlg.crosspop}\label{mlg:cross:mlg.crosspop} - -\tab\tab Analyze the MLGs that cross populations within your data set. This has three output modes. The default one gives a list of MLGs, and for each MLG, it gives a named numeric vector indicating the abundance of that MLG in each population. Alternate outputs are described with \texttt{indexreturn} and \texttt{df}. -\begin{quote} -Default Command:\\ -\texttt{mlg.crosspop(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, indexreturn = FALSE, df = FALSE, quiet = FALSE)} \end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sublist -} see \texttt{mlg.table}, Section \ref{mlg:table:mlg.table}. Analyze specified populations. - \item \texttt{blacklist -} see \texttt{mlg.table}, Section \ref{mlg:table:mlg.table}. Do not include specified populations. - \item \texttt{mlgsub -} see \texttt{mlg.table}, Section \ref{mlg:table:mlg.table}. Only analyze specified MLGs. The vector for this flag can be produced by this function as you will see later in this vignette. - \item \texttt{indexreturn -} return a vector of indices of MLGs. (You can use these in the \texttt{mlgsub} flag, or you can use them to subset the columns of an MLG table). - \item \texttt{df -} return a data frame containing the MLGs, the populations they cross, and the number of copies you find in each population. This is useful for making graphs in \textit{ggplot2}. - \item \texttt{quiet -} \texttt{TRUE} or \texttt{FALSE}. Should the populations be printed to screen as they are processed? (will print nothing if \texttt{indexreturn} is \texttt{TRUE}) - \end{itemize} +\begin{itemize} + \item \texttt{pop -} a \texttt{genind} object. + \item \texttt{sublist -} Populations to include (Defaults to ``ALL''). + see \cmdlink{data.manip:divide:popsub}{popsub.} + \item \texttt{blacklist -} Populations to exclude. see \cmdlink{data.manip:divide:popsub}{popsub.} + \item \texttt{mlgsub -} see \cmdlink{mlg:table:mlg.table}{mlg.table.} Only + analyze specified MLGs. The vector for this flag can be produced by this + function as you will see later in this vignette. + \item \texttt{indexreturn -} return a vector of indices of MLGs. (You can + use these in the \texttt{mlgsub} flag, or you can use them to subset the + columns of an MLG table). + \item \texttt{df -} return a data frame containing the MLGs, the populations + they cross, and the number of copies you find in each population. This is + useful for making graphs in \textit{ggplot2}. + \item \texttt{quiet -} \texttt{TRUE} or \texttt{FALSE}. Should the + populations be printed to screen as they are processed? (will print nothing + if \texttt{indexreturn} is \texttt{TRUE}) +\end{itemize} -We can see what Multilocus Genotypes cross different populations and then give a vector that shows how many populations each multi-population MLG crosses. -<>= -pop(H3N2) <- H3N2$other$x$country -H.dup <- mlg.crosspop(H3N2, quiet=TRUE) +\bline{} + +We can see what MLGs cross different populations and then give a +vector that shows how many populations each one of those MLGs crosses. +<>= +setpop(virus) <- ~country +v.dup <- mlg.crosspop(virus, quiet=TRUE) @ -Here is a snippet of what the output looks like when \texttt{quiet} is \texttt{FALSE}. It will print out the MLG name, the total number of individuals that make up that MLG, and the populations where that MLG can be found. +Here is a snippet of what the output looks like when \texttt{quiet} is +\texttt{FALSE}. It will print out the MLG name, the total number of individuals +that make up that MLG, and the populations where that MLG can be found. <>= -H.inds <- mlg.crosspop(H3N2, indexreturn=TRUE) -Hadoo <- mlg.crosspop(H3N2, mlgsub=H.inds[1:10]) +setpop(virus) <- ~country +v.dup <- structure(list(MLG.3 = structure(c(4L, 8L), .Names = c("USA", +"Denmark")), MLG.9 = structure(c(1L, 13L, 1L, 1L), .Names = c("Japan", +"USA", "Finland", "Denmark")), MLG.31 = structure(c(2L, 7L), .Names = c("Japan", +"Canada")), MLG.75 = structure(c(2L, 8L, 2L, 1L, 6L, 2L, 1L, +1L), .Names = c("Japan", "USA", "Finland", "Norway", "Denmark", +"Austria", "Russia", "Ireland")), MLG.80 = structure(c(1L, 1L +), .Names = c("USA", "Denmark")), MLG.86 = structure(3:4, .Names = c("Denmark", +"Austria")), MLG.95 = structure(c(1L, 1L), .Names = c("USA", +"Bangladesh")), MLG.97 = structure(c(1L, 5L, 1L, 1L), .Names = c("USA", +"Austria", "Bangladesh", "Romania")), MLG.104 = structure(1:2, .Names = c("USA", +"France")), MLG.110 = structure(c(2L, 3L, 11L), .Names = c("Japan", +"USA", "China"))), .Names = c("MLG.3", "MLG.9", "MLG.31", "MLG.75", +"MLG.80", "MLG.86", "MLG.95", "MLG.97", "MLG.104", "MLG.110")) +printthings <- function(ind, x){ + cat(paste0(names(x)[ind], ":"), + paste0("(", sum(x[[ind]])," inds)"), + names(x[[ind]]), + "\n") +} +invisible(lapply(1:10, printthings, v.dup)) @ -The output of this function is a list of MLGs, each containing a vector indicating the number of copies in each population. We'll count the number of populations each MLG crosses using the function \texttt{sapply} with \texttt{length}. +The output of this function is a list of MLGs, each containing a vector +indicating the number of copies in each population. We'll count the number of +populations each MLG crosses using the function \texttt{sapply} to loop over +the data with the function \texttt{length}. <>= -head(H.dup) -H.num <- sapply(H.dup, length) # count the number of populations each MLG crosses. -H.num +head(v.dup) +v.num <- sapply(v.dup, length) # count the number of populations each MLG crosses. +head(v.num) @ %\newpage -\subsection{Bringing something to the table \{producing MLG tables and graphs\}}\label{mlg:table} +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Producing MLG tables and graphs \{bringing something to the table\}} +\label{mlg:table} -We can also create a table of multilocus genotypes per population as well as bar graphs to give us a visual representation of the data. This is achieved through the function \texttt{mlg.table} -\subsubsection{Function: mlg.table}\label{mlg:table:mlg.table} +We can also create a table of MLGs per population as well as bar graphs to give +us a visual representation of the data. This is achieved through the function +\texttt{mlg.table} +\subsubsection{Function: mlg.table} +\label{mlg:table:mlg.table} -\tab\tab Produce a matrix containing counts of MLGs (columns) per population (rows). If there is no population structure to your data set, a vector will be produced instead. +\tab\tab This function will produce a matrix containing counts of MLGs (columns) +per population (rows). If there are not populations defined in your data set, a +vector will be produced instead. + +\tline{} \begin{quote} -Default Command:\\ -\texttt{mlg.table(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, bar = TRUE, total = FALSE, quiet = FALSE)} +Default Command: +<>= +funk <- "mlg.table" +print_command(funk) +@ +% \texttt{mlg.table(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, bar = TRUE, total = FALSE, quiet = FALSE)} \end{quote} \begin{itemize} \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sublist -} a vector indicating which specific populations you want to produce a table for. This can be a numeric or character vector. See section \ref{data.manip:divide:popsub} for details. - \item \texttt{blacklist -} a vector indicating which specific populations you do not want to include in your table. This can be a numeric or character vector, and does not necessarily have to be the same type as \texttt{sublist}. eg. \texttt{sublist=1:10, blacklist="USA"}. See section \ref{data.manip:divide:popsub} for details. - \item \texttt{mlgsub -} a vector containing the indices of MLGs you wish to subset your table with. - \item \texttt{bar -} \texttt{TRUE} or \texttt{FALSE}. If \texttt{TRUE}, a bar plot will be printed for each population with more than one individual. - \item \texttt{total -} \texttt{TRUE} or \texttt{FALSE}. Should the entire data set be included in the table? This is equivalent to evoking \texttt{colSums} on the table. - \item \texttt{quiet -} \texttt{TRUE} or \texttt{FALSE}. When \texttt{bar} is \texttt{TRUE}, should the populations be printed to screen as they are processed? + \item \texttt{sublist -} Populations to include (Defaults to ``ALL''). + see \cmdlink{data.manip:divide:popsub}{popsub.} + \item \texttt{blacklist -} Populations to exclude. + see \cmdlink{data.manip:divide:popsub}{popsub.} + \item \texttt{mlgsub -} a vector containing the indices of MLGs you wish to + subset your table with. + \item \texttt{bar -} \texttt{TRUE} or \texttt{FALSE}. If \texttt{TRUE}, a + bar plot will be printed for each population with more than one individual. + \item \texttt{total -} When set to \texttt{TRUE}, the pooled data set will + be added to the table. Defaults to \texttt{FALSE}. + \item \texttt{quiet -} Defaults to \texttt{FALSE}: population names will + be printed to the console as they are processed. \end{itemize} +\bline{} + <>= -H.tab <- mlg.table(H3N2, quiet=TRUE, bar=TRUE) -H.tab[1:10, 1:10] # Showing the first 10 columns and rows of the table. +v.tab <- mlg.table(virus, quiet=TRUE, bar=TRUE) +v.tab[1:10, 1:10] # Showing the first 10 columns and rows of the table. @ <>= -H.tab <- mlg.table(H3N2, quiet=TRUE, bar=FALSE) -H.tab[1:10, 1:10] +v.tab <- mlg.table(virus, quiet=TRUE, bar=FALSE) +v.tab[1:10, 1:10] @ \begin{figure}[h!] \centering - \caption{\footnotesize An example of a bar-chart produced by \texttt{mlg.table}. Note that this data set would produce several such charts.} + \caption{\footnotesize An example of a bar-chart produced by + \texttt{mlg.table}. Note that this data set would produce several such + charts but only the chart for Norway is shown here.} \label{nortable} -<>= -mlg.table(H3N2, sublist="Norway", quiet=TRUE, bar=TRUE) +<>= +mlg.table(virus, sublist="Norway", quiet=TRUE, bar=TRUE) @ \end{figure} \newpage -The MLG table is not restricted for use with just \textit{Poppr}. One of the main advantages of the function \texttt{mlg.table} is that it allows easy access to diversity functions present in the package \textit{vegan} \cite{vegan}. One very simple example is to create a rarefaction curve for each population in your data set giving the number of expected MLGs for a given sample size. For more information, type \texttt{help("diversity", package="vegan")} in your R console. - -For the sake of a simple example, instead of drawing a curve for each of the 37 countries represented in this sample, let's change the population structure to be the different years of the epidemics. -<>= -H.year <- H3N2 -pop(H.year) <- H.year$other$x$year -summary(H.year) # Check the data to make sure it's correct. +The MLG table is not limited to use with \poppr{}. In fact, one of the main +advantages of \texttt{mlg.table} is that it allows easy access to diversity +functions present in the package \textit{vegan} \cite{vegan}. One example is to +create a rarefaction curve for each population in your data set giving the +number of expected MLGs for a given sample size. For more information, type +\texttt{help("diversity", package="vegan")} in your R console. + +For the sake of this example, instead of drawing a curve for each of the 37 +countries represented in this sample, let's set the hierarchical level to year. +<>= +setpop(virus) <- ~year +summary(virus) # Check the data to make sure it's correct. +@ +<>= +setpop(virus) <- ~year +res <- structure(list(N = 1903L, pop.eff = structure(c(158, 415, 399, +469, 462), .Names = c("2002", "2003", "2004", "2005", "2006")), + loc.nall = structure(c(3L, 3L, 4L, 2L, 4L, 2L, 3L, 2L, 4L, + 3L, 4L, 2L, 4L, 3L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, + 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 3L, 3L, 3L, 4L, + 2L, 2L, 2L, 4L, 3L, 2L, 3L, 4L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, + 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 3L, 3L, 2L, 3L, + 4L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 4L, 2L, 3L, 2L, 2L, + 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, + 2L, 3L, 3L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, + 3L, 3L, 3L, 3L, 4L, 2L, 3L, 3L, 4L, 3L, 2L), .Names = c("L001", + "L002", "L003", "L004", "L005", "L006", "L007", "L008", "L009", + "L010", "L011", "L012", "L013", "L014", "L015", "L016", "L017", + "L018", "L019", "L020", "L021", "L022", "L023", "L024", "L025", + "L026", "L027", "L028", "L029", "L030", "L031", "L032", "L033", + "L034", "L035", "L036", "L037", "L038", "L039", "L040", "L041", + "L042", "L043", "L044", "L045", "L046", "L047", "L048", "L049", + "L050", "L051", "L052", "L053", "L054", "L055", "L056", "L057", + "L058", "L059", "L060", "L061", "L062", "L063", "L064", "L065", + "L066", "L067", "L068", "L069", "L070", "L071", "L072", "L073", + "L074", "L075", "L076", "L077", "L078", "L079", "L080", "L081", + "L082", "L083", "L084", "L085", "L086", "L087", "L088", "L089", + "L090", "L091", "L092", "L093", "L094", "L095", "L096", "L097", + "L098", "L099", "L100", "L101", "L102", "L103", "L104", "L105", + "L106", "L107", "L108", "L109", "L110", "L111", "L112", "L113", + "L114", "L115", "L116", "L117", "L118", "L119", "L120", "L121", + "L122", "L123", "L124", "L125")), pop.nall = structure(c(203L, + 255L, 232L, 262L, 240L), .Names = c("1", "2", "3", "4", "5" + )), NA.perc = 2.36342616920652, Hobs = 0, Xexp = 0), .Names = c("N", +"pop.eff", "loc.nall", "pop.nall", "NA.perc", "Hobs", "Xexp")) +listlab <- c("# Total number of genotypes: ", "# Population sample sizes: ", + "# Number of alleles per locus: ", "# Number of alleles per population: ", + "# Percentage of missing data: ", "# Observed heterozygosity: ", + "# Expected heterozygosity: ") +cat("\n", listlab[1], res[[1]], "\n") +for (i in 2:7) { + cat("\n", listlab[i], "\n") + print(res[[i]]) +} @ <>= library(vegan) -H.year <- mlg.table(H.year, bar=FALSE) -rarecurve(H.year, ylab="Multilocus genotypes expected", sample=min(rowSums(H.year))) +H.year <- mlg.table(virus, bar=FALSE) +rarecurve(H.year, ylab="Number of expected MLGs", sample=min(rowSums(H.year)), border = NA, fill = NA, font = 2, cex = 1, col = "blue") @ \begin{figure}[h!] \centering \caption{\footnotesize An example of a rarefaction curve produced using a MLG table.} \label{rarecurve} -<>= -library(vegan) -H.year <- mlg.table(H.year, bar=FALSE) -rarecurve(H.year, ylab="Multilocus Genotypes Expected", sample=min(rowSums(H.year))) +<>= +# library(vegan) +H.year <- mlg.table(virus, bar=FALSE) +vegan::rarecurve(H.year, ylab="Number of expected MLGs", sample=min(rowSums(H.year)), border = NA, fill = NA, font = 2, cex = 1, col = "blue") @ \end{figure} \newpage -The minimum value from the \textit{base} function \texttt{rowSums()} of the table represents the minimum common sample size of all populations. Setting the ``sample" flag draws the horizontal and vertical lines you see on the graph. The intersections of these lines correspond to the numbers you would find if you ran the function \texttt{poppr} on this data set (under the column ``\texttt{eMLG}"). -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Getting into the mix \{combining MLG functions\}}\label{mlg:mix} +The minimum value from the \textit{base} function \texttt{rowSums()} of the +table represents the minimum common sample size of all populations defined in +the table. Setting the ``sample" flag draws the horizontal and vertical lines +you see on the graph. The intersections of these lines correspond to the numbers +you would find if you ran the function \texttt{poppr} on this data +set (under the column ``\texttt{eMLG}"). -\tab\tab Alone, the different functionalities are neat. Combined, we can create interesting data sets. Let's say we wanted to know which MLGs were duplicated across the regions of the United Kingdom, Germany, Netherlands, and Norway. All we have to do is use the \texttt{sublist} flag in the function: +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Combining MLG functions \{getting into the mix\}} +\label{mlg:mix} + +\tab\tab Alone, the different functionalities are neat. Combined, we can create +interesting data sets. Let's say we wanted to know which MLGs were duplicated +across the regions of the United Kingdom, Germany, Netherlands, and Norway. All +we have to do is use the \texttt{sublist} flag in the function: <>= +setpop(virus) <- ~country UGNN.list <- c("United Kingdom", "Germany", "Netherlands", "Norway") -UGNN <- mlg.crosspop(H3N2, sublist=UGNN.list, indexreturn=TRUE) +UGNN <- mlg.crosspop(virus, sublist=UGNN.list, indexreturn=TRUE) @ -OK, the output tells us that there are three MLGs that are crossing between these -populations, but we do not know how many are in each. We can easily find that out if we subset our original table, \texttt{H.tab}. +OK, the output tells us that there are three MLGs that are crossing between +these populations, but we do not know how many are in each. We can easily find +that out if we subset our original table, \texttt{v.tab}. <>= UGNN # Note that we have three numbers here. This will index the columns for us. UGNN.list # And let's not forget that we have the population names. -H.tab[UGNN.list, UGNN] +v.tab[UGNN.list, UGNN] @ -Now we can see that Norway has a higher incidence of nearly all of these MLGs. -We can go even further and subset the original data set to only give us those MLGs by utilizing the function \texttt{mlg.vector}: -\subsubsection{Function: mlg.vector}\label{mlg:mix:mlg.vector} +Now we can see that Norway has a higher incidence of nearly all of these MLGs. +We can investigate the incidence of these MLGs throughout our data set. One +thing that the \seclink{intro:genclone}{genclone} object keeps track of is a +single vector defining the unique multilocus genotypes within the data. These +are represented as integers and can be accessed with \texttt{mlg.vector}. This +is useful for finding MLGs that correspond to certain individuals or +populations. Let's use \texttt{mlg.vector} to find individuals corresponding to +the MLGs. First we'll investigate what the output of this function looks like. -\tab\tab This function is the backbone for \texttt{mlg.table} and \texttt{mlg.crosspop}, and is The function that determines what your MLGs are. This is quite useful for sub-setting the data set to only contain the MLGs of interest. The numbers in the vector correspond to the number of columns in a matrix produced by \texttt{mlg.table}. It is important to remember that this is also sensitive to missing data and will treat it as a single extra allele. -\begin{quote} -Default Command:\\ -\texttt{mlg.vector(pop)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \end{itemize} -<>= -H.vec <- mlg.vector(H3N2) -H.sub <- H3N2[H.vec %in% UGNN, ] -mlg.table(H.sub, bar=FALSE) -@ -You can also do the same thing using the mlgsub flag. -<>= -mlg.table(H3N2, mlgsub=UGNN, bar=TRUE) -@ -<>= -mlg.table(H3N2, mlgsub=UGNN, bar=FALSE) +<>= +v.vec <- mlg.vector(virus) +str(v.vec) # Analyze the structure. @ -And we can see where exactly these three MLGs fall within our data set. -\begin{figure}[h!] - \centering - \caption{\footnotesize An example of the same bar-chart as \textit{Figure 1}, but focusing on three MLGs.} - \label{nortable2} -<>= -mlg.table(H3N2, sublist="Norway", mlgsub=UGNN) +\noindent +The integers produced are the MLG assigment of each individual in the same order +as the data set. This means that the first two individuals have the exact same +set of alleles at each locus, so they have the same MLG: \Sexpr{v.vec[1]}. If +we look at the number of unique integers in the vector, it corresponds to the +number of observed multilocus genotypes: +<>= +length(unique(v.vec)) # count the number of MLGs +virus # equal to the first number in this output. @ -\end{figure} -\\ -\newpage -Now, you might notice that the MLG vector no longer matches up with our data after we subset it. -<<>>= -H.vec[1:22] -mlg.vector(H.sub) -@ -Well, this is unfortunate because it means that we can't compare any subsetted data with non-subsetted data. Luckily, there's a little trick we can do using our old friend, the \texttt{@other} slot. -If we place the MLG vector in the \texttt{@other} slot of our original data set, it will be subsetted along with the data. -<<>>= -H3N2@other$MLG.vector <- H.vec -H.sub <- H3N2[H.vec %in% UGNN, ] -H.sub@other$MLG.vector -@ -Magic!\\ -%\newpage +\label{mlg:mix:mlg.vector} +\noindent +We will take \texttt{UGNN} (MLGs crossing UK, Germany, Netherlands, and Norway) +and compare its elements to the MLG vector (\texttt{v.vec}) to see where else +they occur. -So, we've gotten this far, yet we haven't actually seen what the genotypes look like! For analyses where the genotypic signature is important, this is a crucial identification step. Lucky for us, the \texttt{genind} object retains all of the genotypic information and can be accessed using the \texttt{genind2df} function. Let's take a look at the three genotypes we specified above utilizing the vector of MLGs we created above, \texttt{H.vec}. -<>= -H.df <- genind2df(H3N2) -H.df[H.vec %in% UGNN, 1:15] # Showing only 15 columns becaus it is a large dataset. -@ -<>= -H.df <- genind2df(H3N2[, loc=names(H3N2@loc.names)[1:15]]) -H.df[H.vec %in% UGNN, 1:15] # Showing only 15 columns becaus it is a large dataset. +<>= +UGNN # Show what we are looking for +UGNN_match <- v.vec %in% UGNN +table(UGNN_match) # How many individuals matched to those three MLGs? @ -Notice that there seems to be a clear separation between the SNPs of the first 10 isolates and the rest? This is no coincidence. Take a look at the output of our sub-setting. -<>= -UGNN -H.vec[H.vec %in% UGNN] -@ - -We have the MLGs 315, 317, and 620, and the result of the sub-setting shows us that 620 occurs earlier in our data set, and that MLGs 315 and 317 are mixed in together. The reason why we do not see a mixture of three different sets of SNP calls in our little window is because \texttt{mlg.vector} creates the MLGs by first concatenating and then sorting the genotypes. This way, the closer two MLG indexes are to each other, the fewer differences they will have between one another. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Do you see what I see? \{alternative data visualization\}}\label{mlg:alt.vis} +\noindent +22 individuals matched to those three MLGs. We can use this vector to show us +the 22 individuals. -\tab\tab The graphs that are output by \textit{poppr} are simply aids for the user to make data analysis easier. We want to better visualize how these MLGs cross populations by MLG or population. We also want to see exactly what MLGs are in which populations, and how prevalent they are. As the package \textit{ggplot2} is based on data frames, we have to give ourselves a data frame to work with. We can do this using the \texttt{df = TRUE} flag. -<>= -df <- mlg.crosspop(H3N2, df=TRUE, quiet=TRUE) -names(df) -@ -Now that we have our data frame, we can do a couple of things. We can first see where the most omnipresent MLG occurs. After that, we will plot the top ten MLGs using ggplot2. -<<>>= -H.max <- names(sort(H.num, decreasing=TRUE)[1:10]) -# Showing the data frame by the largest MLG complex. -df[df$MLG %in% H.max[1], ] -@ -And now we can visualize the largest ten MLG complexes using \textit{ggplot2}'s \texttt{qplot} function. -\begin{figure}[h] - \centering - \caption{\footnotesize An example of the versatility of the MLG information.} - \label{mlgtable2} -<>= -df2 <- df[df$MLG %in% H.max, ] -library(ggplot2) -qplot(y=MLG, x=Population, data=df2, color=Count, size=Count) + - theme(axis.text.x = element_text(size = 10, angle = -45, hjust = 0)) +<>= +indNames(virus)[UGNN_match] @ -\end{figure} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Index and Distance Calculations}\label{index} -\subsection{The missing linkage disequilibrium \{calculating the index of association, $I_A$ and $\bar r_d$\}}\label{index:iard} - -\tab\tab The index of association was originally developed as a measure of multilocus linkage disequilibrium \cite{Brown:1980} and was found to be able to detect signatures of sexual reproduction and population structure \cite{Brown:1980, Smith:1993}. Unfotunately, $I_A$ was found to increase with the number of loci, and was not suitable to comparisons across studies \cite{Agapow:2001}. To remedy this, $\bar{r}_d$ was developed that corrects for this scaling and forces the index to lie between 0 (linkage equilibrium) and 1 (full disequilibrium). $I_A$ has previously been implemented in a couple of programs including \textit{multilocus} \cite{Agapow:2001} and \textit{LIAN} \cite{Haubold:2000}. While both of these programs are still available for download, \textit{multilocus} is no longer actively supported, and \textit{LIAN}, despite its speed, is only appropriate for haplotypic data. Both of these programs each require one specific file format, and, until recently\footnote{LIAN 3.6 allows the user to run multiple contiguous data sets within a single file or across multiple files. It is impossible to run MULTILOCUS in batch.}, neither of these programs had an internal ability to run in batch across multiple populations within a file or multiple files within a directory in the same way that poppr can (see footnote). - -It is important to note that for this algorithm, all missing values are treated in the same way as \textit{multilocus} in that all missing alleles are imputed to be the same as the alleles they are being compared to. Depending on the percent missing data in your data set, this might influence the statistic. If you have a lot of missing data, consider using the \texttt{missing} flag in this function. -\subsubsection{Function: ia}\label{index:iard:ia} -\tab\tab This function is a quick look at a single data set. It can do almost everything that \texttt{poppr} can do except for sorting through populations. -\begin{quote} -Default Command:\\ -\texttt{ia(pop, sample = 0, method = 1, quiet = FALSE, missing = "ignore", - hist = TRUE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sample -} You should use this flag whenever you want to reshuffle your data set. Indicate how many times you want to reshuffle your data set to obtain a P-value. - \item \texttt{method -} a number from 1 to 4 indicating the sampling method: - \begin{enumerate} - \item permutation over alleles. - \item parametric bootstrap. - \item non-parametric bootstrap. - \item \textit{multilocus} style permutation \cite{Agapow:2001}. - \end{enumerate} - The methods are detailed in section \ref{data.manip:shuffle:shufflepop} of this manual. - \item \texttt{quiet -} If set to \texttt{TRUE}, nothing will be printed to the screen as the sampling progresses. If \texttt{FALSE} will produce a progress bar. - \item \texttt{missing -} This will preprocess your missing values. It is set to ignore missing data, so that they do not contribute to the distance measure. It can also be set to \texttt{"loci", "geno", "zero", or "mean"}. For details, see section \ref{data.manip:missing:missingno} of this manual. - \item \texttt{hist -} This will produce a pair of histograms for each population showing the distribution of $I_A$ and $\bar r_d$ across the sampled data sets, and plot the observed value as a single vertical line. -\end{itemize} -Running the analysis is as simple as this: -<>= -ia(nancycats) -@ -We can use \texttt{popsub} to subset for specific populations. Here, we'll also demonstrate the sampling flag and show you what the histogram looks like. -<>= -set.seed(1009) -ia(popsub(nancycats, 5), sample=999) -@ -<>= -set.seed(1009) -simplenan <- ia(popsub(nancycats, 5)) -cat("|================================================================================| 100%\n") -c(simplenan[1], p.Ia = 0.572, simplenan[2], p.rD = 0.572) +Note that there is an alternative way to list individuals matching specific MLGs using the function \texttt{mlg.id}. This function will return a list where each element represents a unique MLG. You can use this data to find out which individuals correspond to specific MLGs. Each element in the list is named with the MLG, but the index does not necessarily match up, so it is important to convert your query MLGs to strings: +\label{mlg:mix:mlg.id} +<>= +virus.id <- mlg.id(virus) +virus.id[as.character(UGNN)] @ -This analysis produced the histograms you see below. What these histograms represent are 999 resamplings of the data under the null hypothesis ($H_0$) of sexual reproduction. The way that $H_0$ is created is determined by the sampling method chosen. In this case, the method was to shuffle genotypes at each locus to simulate unlinked loci. Since the P = 0.572, we would fail to reject $H_0$ and we therefore might conclude that this population is sexually reproducing \cite{Brown:1980} \cite{Smith:1993} \cite{Agapow:2001}. -\begin{figure}[h!] - \centering - \caption{\footnotesize Histograms of 999 values of $I_A$ and $\bar{r}_d$ calculated from 999 resamplings of population 5 from the data set ``nancycats". The observed values of $I_A$ and $\bar r_d$ are represented as vertical blue lines overlaid on the distributions. The ticks at the bottom of each histogram represent individual observations.} - \label{ia_demo_fig} -<>= -set.seed(1009) -nan5 <- ia(popsub(nancycats, 5), sample=999, quiet=TRUE) -@ -\end{figure} -%\newpage -There, are, of course a couple of caveats that need to be mentioned regarding our P-values. First, while we have equivalent P-values for $I_A$ and $\bar{r}_d$, they might not always be equal due to the difference in calculation. Details about that can be found in the Appendix section \ref{appendix:algorithm:iard}. Second, the P-values are calculated by comparing how many permuted values are greater than or equal to the observed value. This includes the observed value (which is why setting the randomizations to 999 will give you a round P-value) which means that the lowest P-value you will ever have is $1/(n+1)$ where $n$ is the number of permutations you select. Take for example this population of a clonal root rot pathogen, \textit{Aphanomyces euteiches}: -<>= -data(Aeut) -set.seed(1001) -ia(popsub(Aeut, 1), sample=999, method=2, quiet=TRUE, hist=FALSE) -@ -<>= -data(Aeut) -set.seed(1001) -A.dum <- ia(popsub(Aeut, 1)) -c(A.dum[1], p.Ia = 0.001, A.dum[2], p.rD = 0.001) +We can also use the vector of MLGs to +subset \cmdlink{mlg:table:mlg.table}{mlg.table} with the \texttt{mlgsub} flag. +<>= +mlg.table(virus, mlgsub = UGNN, bar = TRUE) @ -If you want to be able to report $P < 0.001$ in this situation, then you can simply increase the number in sample: \texttt{sample = 1999} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Going the distance \{dissimilarity distance\}}\label{index:dist} -\tab\tab Since \textit{poppr} is still in its infancy, the number of distance measures it can offer are few. Bruvo's distance is well supported and allows you to quickly visualize your data, but it only allows for microsatellites. The index of association, above, utilizes a discreet dissimilarity distance matrix. It is with this matrix that we have constructed a relative dissimilarity distance where the distance is the ratio of the number of dissimilarities to the number of dissimilarities possible. The number of dissimilarities possible is the number of loci multiplied by the ploidy, so if you have 10 loci from a diploid population, then there are 20 dissimilarities possible. For details, see equations (\ref{eq:ia_d}) and (\ref{eq:ia_D}) in section \ref{appendix:algorithm:iard}. - -\subsubsection{Function: diss.dist}\label{index:dist:diss.dist} - -\tab\tab Use this function to calculate relative dissimilarity between individuals and return a distance matrix for use in creating cladograms or minimum spanning networks. A note: missing alleles will be imputed to be the same as the challenging allele, decreasing the distance between some individuals. If you want to consider all missing data as special alleles, treat your data with \texttt{missingno(pop, type = "zero")} beforehand. - -\begin{quote} -Default Command:\\ -\texttt{diss.dist(pop)} -\end{quote} - -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. -\end{itemize} -Since we have a data set that we know is very clonal, let's analyze the \textit{A. euteiches} data set \cite{Grunwald:2006} and create a heatmap to visualize the degree of difference between populations. -<>= -data(Aeut) -A.dist <- diss.dist(Aeut) -heatmap(as.matrix(A.dist), symm=TRUE) +<>= +mlg.table(virus, mlgsub = UGNN, bar = FALSE) @ +\noindent +That showed us exactly which populations these three MLGs came from in our data set. \begin{figure}[h!] \centering - \caption{\footnotesize Heatmap representation of a dissimilarity distance for the data set ``Aeut"} - \label{diss_heat_map} -<>= -data(Aeut) -A.dist <- diss.dist(Aeut) -heatmap(as.matrix(A.dist), symm=TRUE) + \caption{\footnotesize An example of the same bar-chart as \textit{Figure 1}, + but focusing on three MLGs.} + \label{nortable2} +<>= +mlg.table(virus, sublist = "Norway", mlgsub = UGNN) @ \end{figure} \newpage -\subsection{Step by stepwise mutation \{Bruvo's distance\}}\label{index:bruvo} - -\tab\tab Bruvo's distance is a genetic distance measure for microsatellite markers utilizing a stepwise mutation model that allows for differing ploidy levels \cite{Bruvo:2004}. As adegenet's genind object has an all or none approach to missing data, any genotypes not exhibiting full ploidy will be treated as missing. This means that only non-special cases will be considered for the calculation and missing data will be ignored \cite{Bruvo:2004}. It is important to note that this is a distance between individuals, not populations, unlike Nei's 1978 distance \cite{Nei:1978}. For distances between populations, see the \textit{adegenet} function \texttt{dist.genpop} - -\subsubsection{Function: bruvo.dist}\label{index:bruvo:bruvo.dist} -\tab\tab Bruvo's distance requires knowledge of the repeat lengths of each locus, so take care to read the description below. -\begin{quote} -Default Command:\\ -\texttt{bruvo.dist(pop, replen = c(2))} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{replen -} This is a vector of numbers indicating the repeat length for each locus in your sample. If you have two dinucleotide repeats and five tetranucleotide repeats, you would put \texttt{c(2,2,4,4,4,4,4)} in this field. If you have imported data where that represents the raw number of steps, all you would have to type is \texttt{rep(1, n)}, replacing $n$ with the number of loci in your sample. It is important that you place something in this field because this function will attempt to estimate the repeat length based on the minimum difference of the alleles represented; with variability of position calls, relying on this estimation is NOT recommended. -\end{itemize} - -To illustrate why it is important to specify the repeat lengths, let's imagine a locus that contains 5 alleles and the true repeat length is 4. Note that Bruvo's distance between alleles is calculated as $1 - 2^{-[x]}$, where $x$ is the difference in repeat lengths: -<>= -locus1 <- c(244, 248, 256, 240, 236) -locus1/4 -1 - 2^-dist(locus1/4) -@ -We can see that the distance between them ranges from 1 to 5. Let's say, that we accidentally wrote 2 or 8 instead of 4: -<>= -locus1/2 -1 - 2^-dist(locus1/2) # Distance increase - -locus1/8 -1 - 2^-dist(locus1/8) # Distance decrease -@ -While we will still get results from this analysis with the incorrect repeat length, they will be inherently wrong as they do not represent the true distance. That being said, it's important to note that the repeat lengths we represent for the rest of the manual are not known by the authors, but are used as a simple example. - -This function will return a distance matrix (displaying the smallest population in the data set ``nancycats"): -<>= -dist9 <- bruvo.dist(popsub(nancycats, 9), replen=rep(1,9)) -dist9 -@ -You can visualize this better with a simple heatmap: -%\newpage - -<>= -heatmap(as.matrix(dist9), symm=TRUE) -@ -\begin{figure}[h!] - \centering - \caption{\footnotesize Heatmap representation of Bruvo's distance for population 9 of the data set ``nancycats"} - \label{bruvo_heat_map} -<>= -heatmap(as.matrix(dist9), symm=TRUE) -@ -\end{figure} -%\newpage - -Let's take a closer look at the two individuals, N113 and N111. They seem to have large distances between everyone else and themselves. The names and columns of the matrix contain the names of individuals, but not the population information. We can make a comparison of Bruvo's distance across populations easier by editing the ``Labels" attribute of the distance object. Let's take a look at the labels attribute using the \texttt{attr()} command. -<>= -attr(dist9, "Labels") -@ -Remember that they all came from population 9, so let's append that to each label using the \texttt{paste()} command. -<>= -dist9.attr <- attr(dist9, "Labels") -attr(dist9, "Labels") <- paste(rep("P09", 9), dist9.attr) -dist9 -@ -Now we can see that all of the labels are corresponding to population 9. Let's calculate Bruvo's distance between populations 8 and 9. -<>= -dist9to8 <- bruvo.dist(popsub(nancycats, 8:9), replen=rep(1,9)) -dist9to8.attr <- attr(dist9to8, "Labels") -nan9to8pop <- nancycats@pop[nancycats@pop %in% c("P08", "P09")] -attr(dist9to8, "Labels") <- paste(nan9to8pop, dist9to8.attr) -heatmap(as.matrix(dist9to8), symm=TRUE) -@ -\begin{figure}[h!] - \centering - \caption{\footnotesize Heatmap representation of Bruvo's distance for populations 8 and 9 of the data set ``nancycats"} - \label{bruvo_heat_map_8to9} -<>= -dist9to8 <- bruvo.dist(popsub(nancycats, 8:9), replen=rep(1,9)) -dist9to8.attr <- attr(dist9to8, "Labels") -nan9to8pop <- nancycats@pop[nancycats@pop %in% c("P08", "P09")] -attr(dist9to8, "Labels") <- paste(nan9to8pop, dist9to8.attr) -heatmap(as.matrix(dist9to8), symm=TRUE) -@ -\end{figure} - -Remember N113 and N111? Take a look at where they fall on the heatmap. They don't cluster together with population 9 anymore, but somewhere in population 8. -% \newpage - -\subsection{See the forest for the trees \{visualizing distances with dendrograms and networks\}}\label{index:trees} - -\tab\tab Staring at a raw distance matrix might be able to tell you something about your data, but it also might be able to ruin your eyesight. In this section, we present functions to display this data in trees and networks. - -\subsubsection{Function: bruvo.boot}\label{index:trees:bruvo.boot} - -\tab\tab This function provides the ability to draw a dendrogram based on Bruvo's distance including bootstrap support. -\begin{quote} -Default Command:\\ -\texttt{bruvo.boot(pop, replen = c(2), B = 100, tree = "upgma", showtree = TRUE, cutoff = NULL, quiet = FALSE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{replen -} see \texttt{bruvo.dist}, above. - \item \texttt{sample -} How many bootstraps do you want to perform? - \item \texttt{tree -} Two trees are available, Neighbor-Joining \texttt{"nj"} or UPGMA \texttt{"upgma"}. - \item \texttt{showtree -} if \texttt{TRUE}, a tree will be plotted automatically. - \item \texttt{cutoff -} This is a number between 0 and 100 indicating the cutoff value for the bootstrap nodelables. If you only wanted to see the the boostrap values for nodes that were present more than 75\% of the time, you would use \texttt{cutoff = 75}. If you don't put anything for this parameter, all values will be shown. - \item \texttt{quiet -} if \texttt{quiet = TRUE}, no standard messages will be printed to screen. If \texttt{quiet = FALSE} (default), then a progress bar and standard message will be printed to the screen. -\end{itemize} - -For this example, let's set the cutoff to 50\%. -<>= -set.seed(1001) -nan9tree <- bruvo.boot(popsub(nancycats, 8:9), replen=rep(1,9), sample=1000, cutoff=50) -@ -<>= -cat("\nBootstrapping...\n(note: calculation of node labels can take a while even after the progress bar is full)\n\n") -cat("|================================================================================| 100%\n") -@ -\begin{figure}[h!] - \centering - \caption{\footnotesize UPGMA Tree of Bruvo's distance for population 9 of the data set ``nancycats" with 1000 Bootstrap Replicates. Node labels represent percentage of bootstrap replicates that contained that node.} - \label{bruvo_upgma} -% \includegraphics{bruvo_upgma.png} - -<>= - set.seed(1001) - nan9tree <- phangorn::upgma(bruvo.dist(popsub(nancycats, 8:9), replen = rep(1, 9))) - nan9tree$node.labels <- c(100, NA, NA, NA, NA, 69, NA, NA, NA, NA, NA, 79, NA, 66, NA, 52, NA, NA) - nan9tree$tip.label <- indNames(popsub(nancycats, 8:9)) - ape::plot.phylo(nan9tree, show.node.label=TRUE) - ape::axisPhylo(3) -@ - -\end{figure} - -\subsubsection{Function: greycurve}\label{index:trees:greycurve} -\tab\tab Use this function to display a gradient of grey values based on user-defined parameters. The following functions will display a minimum spanning network that utilize a grey scale to display the weight of the lines (referred to as ``edges") that connect two or more individuals. The darker the line the closer the distance. Since this is based off of a linear grey scale, what happens when you have a distance matrix comprised of values all below 0.2 or all above 0.8? +%%%% Removing this section since it's a bit confusing.%%%% + +% So, we've gotten this far, yet we haven't actually seen what the genotypes look +% like! For analyses where the genotypic signature is important, this is a crucial +% identification step. Lucky for us, the \texttt{genind} object retains all of the +% genotypic information and can be accessed using the \texttt{genind2df} function. +% Let's take a look at the three genotypes we specified above utilizing the vector +% of MLGs we created above, \texttt{H.vec}. +% <>= +% v.df <- genind2df(virus) +% v.df[v.vec %in% UGNN, 1:15] # Showing only 15 columns becaus it is a large dataset. +% @ +% <>= +% structure(list(pop = structure(c(15L, 15L, 15L, 15L, 15L, 15L, +% 15L, 21L, 25L, 6L, 1L, 1L, 6L, 6L, 6L, 6L, 21L, 1L, 1L, 13L, +% 1L, 6L), .Label = c("Japan", "USA", "Finland", "China", "South Korea", +% "Norway", "Taiwan", "France", "Latvia", "Netherlands", "Bulgaria", +% "Turkey", "United Kingdom", "Denmark", "Austria", "Canada", "Italy", +% "Russia", "Bangladesh", "Egypt", "Germany", "Romania", "Ukraine", +% "Czech Republic", "Greece", "Iceland", "Ireland", "Sweden", "Nepal", +% "Saudi Arabia", "Switzerland", "Iran", "Mongolia", "Spain", "Slovenia", +% "Croatia", "Algeria"), class = "factor"), `6` = c("a", "a", "a", +% "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", +% "a", "a", "a", "a", "a", "a"), `17` = c("a", "a", "a", "a", "a", +% "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", +% "a", "a", "a", "a"), `39` = c("g", "g", "g", "g", "g", "g", "g", +% "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g", "g"), `42` = c("c", "c", "c", "c", "c", "c", "c", "c", "c", +% "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c" +% ), `45` = c("g", "g", "g", "g", "g", "g", "g", "g", "g", "g", +% "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t"), +% `51` = c("c", "c", "c", "c", "c", "c", "c", "c", "c", "c", +% "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c" +% ), `60` = c("g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", +% "a"), `72` = c("g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g"), `73` = c("c", "c", "c", "c", "c", "c", "c", "c", "c", +% "c", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", +% "a"), `90` = c("g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g"), `108` = c("a", "a", "a", "a", "a", "a", "a", "a", "a", +% "a", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", +% "t"), `123` = c("g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g"), `129` = c("t", "t", "t", "t", "t", "t", "t", "t", "t", +% "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", +% "t"), `134` = c("g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", "g", +% "g")), .Names = c("pop", "6", "17", "39", "42", "45", "51", +% "60", "72", "73", "90", "108", "123", "129", "134"), row.names = c("CY026119", +% "CY026120", "CY026121", "CY026122", "CY026131", "CY026132", "CY026135", +% "EU502462", "EU502463", "EU502464", "EU501513", "AB243868", "DQ883618", +% "DQ883619", "DQ883620", "DQ883628", "EU501609", "EU501642", "EU501643", +% "EU501735", "EU501742", "EU502513"), class = "data.frame") +% @ +% Notice that there seems to be a clear separation between the SNPs of the first +% 10 isolates and the rest? This is no coincidence. Take a look at the output of +% our earlier sub-setting. +% <>= +% v.vec[UGNN_match] +% @ -With linear grey scaling, it becomes very difficult to detect the differences in these ranges. The following function allows you to visualize and manipulate a gradient from black to white so that you can use it in \textit{poppr}'s msn functions below to maximize the visual differences in your data. +% We have the MLGs \Sexpr{UGNN[1]}, \Sexpr{UGNN[2]}, and \Sexpr{UGNN[3]}, and the +% result of the sub-setting shows us that \Sexpr{UGNN[3]} occurs earlier in our +% data set, and that MLGs \Sexpr{UGNN[1]} and \Sexpr{UGNN[2]} are mixed in +% together. The reason why we do not see a mixture of three different sets of SNP +% calls in our little window is because \texttt{mlg.vector} creates the MLGs by +% first concatenating and then sorting the genotypes. This way, the closer two MLG +% indexes are to each other, the fewer differences they will have between one +% another. -\begin{quote} -Default Command:\\ -\texttt{greycurve(glim = c(0, 0.8), gadj = 3, gweight = 1)} -\end{quote} +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +% \subsection{Alternative data visualization \{do you see what I see?\}} +% \label{mlg:alt.vis} + +% \tab\tab The graphs that are output by \poppr{} are simply aids for the user to +% make data analysis easier. We want to better visualize how these MLGs cross +% populations by MLG or population. We also want to see exactly what MLGs are in +% which populations, and how prevalent they are. As the package \textit{ggplot2} +% is based on data frames, we have to give ourselves a data frame to work with. We +% can do this using the \texttt{df = TRUE} flag. +% <>= +% df <- mlg.crosspop(virus, df=TRUE, quiet=TRUE) +% names(df) +% @ +% Now that we have our data frame, we can do a couple of things. We can first see +% where the most omnipresent MLG occurs. After that, we will plot the top ten MLGs +% using ggplot2. Remember that \texttt{v.num} is the number of populations per +% multilocus genotype. +% <<>>= +% v.max <- names(sort(v.num, decreasing=TRUE)[1:10]) +% # Showing the data frame by the largest MLG complex. +% df[df$MLG %in% v.max[1], ] +% @ +% And now we can visualize the largest ten MLG complexes using \textit{ggplot2}'s +% \texttt{qplot} function. +% \begin{figure}[h] +% \centering +% \caption{\footnotesize An example of the versatility of the MLG information.} +% \label{mlgtable2} +% <>= +% df2 <- df[df$MLG %in% v.max, ] +% library(ggplot2) +% qplot(y=MLG, x=Population, data=df2, color=Count, size=Count) + +% theme(axis.text.x = element_text(size = 10, angle = -45, hjust = 0)) +% @ +% \end{figure} +%=============================================================================% +%=============================================================================% +% +% +% +%=============================================================================% +%=============================================================================% +% \section{Data Analysis} +% \label{index} +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +% \subsection{Calculating the index of association, $I_A$ and $\bar{r} _d$ \{the missing linkage disequilibrium\}} +% \label{index:iard} + +% \tab\tab The index of association was originally developed as a measure of +% multilocus linkage disequilibrium \cite{Brown:1980} and was found to be able to +% detect signatures of sexual reproduction and population structure +% \cite{Brown:1980, Smith:1993}. Unfotunately, $I_A$ was found to increase with +% the number of loci, and was not suitable to comparisons across studies +% \cite{Agapow:2001}. To remedy this, $\bar{r} _d$ was developed that corrects for +% this scaling and forces the index to lie between 0 (linkage equilibrium) and 1 +% (full disequilibrium). $I_A$ has previously been implemented in a couple of +% programs including \textit{multilocus} \cite{Agapow:2001} and \textit{LIAN} +% \cite{Haubold:2000}. While both of these programs are still available for +% download, \textit{multilocus} is no longer actively supported, and +% \textit{LIAN}, despite its speed, is only appropriate for haplotypic data. Both +% of these programs each require one specific file format, and, until +% recently\footnote{LIAN 3.6 allows the user to run multiple contiguous data sets +% within a single file or across multiple files. It is impossible to run +% MULTILOCUS in batch.}, neither of these programs had an internal ability to run +% in batch across multiple populations within a file or multiple files within a +% directory in the same way that poppr can (see footnote). + +% It is important to note that for this algorithm, all missing values are treated +% in the same way as \textit{multilocus} in that all missing alleles are imputed +% to be the same as the alleles they are being compared to. Depending on the +% percent missing data in your data set, this might influence the statistic. If +% you have a lot of missing data, consider using the \texttt{missing} flag in this +% function. +% \subsubsection{Function: ia} +% \label{index:iard:ia} + +% \tab\tab This function is a quick look at a single data set. It can do almost +% everything that \texttt{poppr} can do except for sorting through populations. +% \begin{quote} +% Default Command: +% <>= +% funk <- "ia" +% print_command(funk) +% @ +% % \texttt{ia(pop, sample = 0, method = 1, quiet = FALSE, missing = "ignore", +% % hist = TRUE)} +% \end{quote} +% \begin{itemize} +% \item \texttt{pop -} a \texttt{genind} object. +% \item \texttt{sample -} You should use this flag whenever you want to +% reshuffle your data set. Indicate how many times you want to reshuffle your +% data set to obtain a P-value. +% \item \texttt{method -} a number from 1 to 4 indicating the sampling method: +% \begin{enumerate} +% \item permutation over alleles. +% \item parametric bootstrap. +% \item non-parametric bootstrap. +% \item \textit{multilocus} style permutation \cite{Agapow:2001}. +% \end{enumerate} +% The methods are detailed +% in \cmdlink{data.manip:shuffle:shufflepop}{shufflepop.} +% \item \texttt{quiet -} If set to \texttt{TRUE}, nothing will be printed to the +% screen as the sampling progresses. If \texttt{FALSE} will produce a progress +% bar. +% \item \texttt{missing -} This will preprocess your missing values. It is set +% to ignore missing data, so that they do not contribute to the distance +% measure. It can also be set to \texttt{"loci", "geno", "zero", or "mean"}. For +% details, see \seclink{data.manip:missing:missingno}{missingno.} +% \item \texttt{hist -} This will produce a pair of histograms for each +% population showing the distribution of $I_A$ and $\bar{r} _d$ across the +% sampled data sets, and plot the observed value as a single vertical line. +% \item \texttt{valuereturn -} If set to \texttt{TRUE} and the number of samples +% is greater than zero, it will return all values generated from the permuted +% data. +% \end{itemize} +% Running the analysis is as simple as this: +% <>= +% ia(nancycats) +% @ +% We can use \texttt{popsub} to subset for specific populations. Here, we'll also +% demonstrate the sampling flag and show you what the histogram looks like. +% <>= +% set.seed(1009) +% ia(popsub(nancycats, 5), sample=999) +% @ +% <>= +% set.seed(1009) +% simplenan <- ia(popsub(nancycats, 5)) +% cat("|================================================================================| 100%\n") +% c(simplenan[1], p.Ia = 0.572, simplenan[2], p.rD = 0.572) +% @ +% This analysis produced the histograms you see below. What these histograms +% represent are 999 resamplings of the data under the null hypothesis ($H_0$) of +% sexual reproduction. The way that $H_0$ is created is determined by the sampling +% method chosen. In this case, the method was to shuffle genotypes at each locus +% to simulate unlinked loci. Since the P = 0.572, we would fail to reject $H_0$ +% and we therefore might conclude that this population is sexually reproducing +% \cite{Brown:1980} \cite{Smith:1993} \cite{Agapow:2001}. +% \begin{figure}[h!] +% \centering +% \caption{\footnotesize Histograms of 999 values of $I_A$ and $\bar{r} _d$ +% calculated from 999 resamplings of population 5 from the data set +% ``nancycats". The observed values of $I_A$ and $\bar r_d$ are represented as +% vertical blue lines overlaid on the distributions. The ticks at the bottom of +% each histogram represent individual observations.} +% \label{ia_demo_fig} +% <>= +% set.seed(1009) +% nan5 <- ia(popsub(nancycats, 5), sample=999, quiet=TRUE) +% @ +% \end{figure} +% %\newpage -This function does not return any values. It will print a visual gradient from black to white horizontally. On this gradient, it will plot the adjustment curve (in opposing grey values), yellow horizontal lines bounding the maximum and minimum values, and the equation used to calculate the correction in red. Keep in mind that this is plotting values from zero to one. +% If we wanted to, we could also have set the flag \texttt{valuereturn = TRUE} to +% get back our permuted data if we wanted to make our own histograms (we'll set +% the number of samples to 9 for demonstrative purposes): -%\newpage -First, we'll see what happens when we change the weight parameter. -\setkeys{Gin}{width=\textwidth} -\begin{figure}[h!] -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize Default for \texttt{greycurve()}, weighted for small values.} -<>= -greycurve() -@ -\end{minipage} -\hspace{0.5cm} -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize weighting for large values.} -<>= -greycurve(gweight = 2) -@ -\end{minipage} -\end{figure} +% <>= +% set.seed(1009) +% ia(popsub(nancycats, 5), sample = 9, hist = FALSE, valuereturn = TRUE) +% @ -Now, we'll see what happens when we change the adjustment parameter (affects the shape of the curve) and the upper and lower limits of the grey scale. +% <>= +% cat("|================================================================================| 100%\n") +% structure(list(index = structure(c(-0.0475399533129828, 0.5, +% -0.00600425397248417, 0.5), .Names = c("Ia", "p.Ia", "rbarD", +% "p.rD")), samples = structure(list(Ia = c(-0.174180206794681, +% 0.110998135909168, -0.0516575036742681, -0.227057548622598, -0.290794613187196, +% 0.17142682029909, 0.127426390403491, -0.0489769895090723, -0.0100523130577507 +% ), rbarD = c(-0.022056780840299, 0.0142082562146926, -0.00661071136361462, +% -0.0289477478354951, -0.0366851757977678, 0.0217644448059182, +% 0.0160542108673502, -0.00622397887147145, -0.00126986051456087 +% )), .Names = c("Ia", "rbarD"), row.names = c(NA, -9L), class = "data.frame")), .Names = c("index", +% "samples")) +% @ -\begin{figure}[h!] -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize Setting the lower and upper limits and weighting the curve heavily toward smaller values.} -<>= -greycurve(glim = c(0.2, 0.9), gadj=15) -@ -\end{minipage} -\hspace{0.5cm} -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize Same as the figure on the left, but weighting heavily toward larger values.} -<>= -greycurve(glim = c(0.2, 0.9), gadj=15, gweight=2) -@ -\end{minipage} -\end{figure} -\newpage -\subsubsection{Function: bruvo.msn}\label{index:trees:bruvo.msn} -\tab\tab This function will automatically draw a minimum spanning network of MLGs based on Bruvo's distance. It's important to note that this will recalculate Bruvo's distance each time it is run, but the amount of time it takes to run is on the order of seconds. It will return a list containing the network, the populations and the related colors in the network so you can export or redraw it with the legend if you wanted to using the package \textit{igraph} (type \texttt{help("plot.igraph")} for details). -\begin{quote} -Default Command:\\ -\texttt{bruvo.msn(pop, replen = c(2), palette = topo.colors, sublist = "All",\\ - \tab blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8),\\ - \tab gadj = 3, gweight = 1, wscale = TRUE, ...) -} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{replen -} see \texttt{bruvo.dist}, above. - \item \texttt{palette -} this is a \textbf{function} definining a color palette to use. The default is \texttt{topo.colors}. There are different palettes, which you can search by typing \texttt{?rainbow}. If you want a custom color palette, an easy way is to use the function \texttt{colorRampPalette}. - \item \texttt{sublist -} The populations you wish to analyze. This defaults to ``All". See section \ref{data.manip:divide:popsub} for details. - \item \texttt{blacklist -} Populations you do not want to include in the graph. See section \ref{data.manip:divide:popsub} for details. - \item \texttt{vertex.label -} This is an option that is passed on to \textit{igraph}'s \texttt{plot} function. \textit{Poppr} has added two arguments specific to \textit{poppr}. If you want to label the graph with the multilocus genotypes from the whole data set, use the argument \texttt{vertex.label = "mlg"}. If you want to display the representative individual names, you can use the argument \texttt{vertex.label = "inds"}. I say representative individual names because, only one representative from each MLG will be present in the clone corrected data set used to calculate the distance. For no labels, you can choose \texttt{vertex.label = NA}. - \item \texttt{gscale -} If this is set to \texttt{TRUE}, the edge color will be converted to greyscale based on Bruvo's distance. If two nodes are closely related, the edge will appear darker. The limits of the scale can be set by the argument \texttt{glim}. If this is set to \texttt{FALSE}, all edge colors will be black. - \item \texttt{glim -} This is a vector of numbers between 0 and 1. This lets you set the limits of the grey scaling based on R's internal \texttt{grey} function. For example, if you wanted a maximum of 50\% white saturation (for use if you have distantly related nodes) and a minimum of 1\%, you would use \texttt{glim = c(0.01, 0.5)}. - \item \texttt{gadj -} This is an integer greater than zero used to adjust the scaling factor for the grey curve. Since very small changes in the grey scale are not easily precieved, it's useful to be able to adjust the grey scale to be able to show you the weights of each edge. For example, a population with most weights less than 0.3, you might want to set \texttt{gadj = 10} to exaggerate the grey scale. - \item \texttt{gweight -} If \texttt{gweight = 1}, the grey scale adjustment will be weighted towards separating out smaller values of Bruvo's distance. If \texttt{gweight = 2}, the grey scale ajustment will be weighted towards separating out larger values of Bruvo's distance. - \item \texttt{wscale -} If this is set to \texttt{TRUE}, edge widths will be displayed corresponding to Bruvo's distance in that thicker edges will represent a smaller distance between nodes. If this is set to \texttt{FALSE}, all edges will be set to a width of 2. - \item \texttt{... -} This is a placeholder for any other arguments that you want to supply to \textit{igraph}. Useful arguments are \texttt{vertex.label.cex} to adjust the size of the labels, \texttt{vertex.label.dist} to adjust the position of the labels, and \texttt{vertex.label.color} to adjust the color of the labels. -\end{itemize} +% There, are, of course a couple of caveats that need to be mentioned regarding +% our P-values. First, while we have equivalent P-values for $I_A$ and +% $\bar{r} _d$, they might not always be equal due to the difference in +% calculation. Details about that can be found in the Appendix +% section \seclink{appendix:algorithm:iard}{$I_A$ and $\bar{r} _d$.} Second, the +% P-values are calculated by comparing how many permuted values are greater than +% or equal to the observed value. This includes the observed value (which is why +% setting the randomizations to 999 will give you a round P-value) which means +% that the lowest P-value you will ever have is $1/(n+1)$ where $n$ is the number +% of permutations you select. Take for example this population of a clonal root +% rot pathogen, \textit{Aphanomyces euteiches}: +% <>= +% data(Aeut) +% set.seed(1001) +% ia(popsub(Aeut, 1), sample=999, method=2, quiet=TRUE, hist=FALSE) +% @ +% <>= +% data(Aeut) +% set.seed(1001) +% A.dum <- ia(popsub(Aeut, 1)) +% c(A.dum[1], p.Ia = 0.001, A.dum[2], p.rD = 0.001) +% @ +% If you want to be able to report $P < 0.001$ in this situation, then you can +% simply increase the number in sample: \texttt{sample = 1999} +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% %-----------------------------------------------------------------------------% +% % +% %-----------------------------------------------------------------------------% +% \subsection{Genetic distances \{may I have distance?\}} +% \label{index:gendist} + +% \tab\tab Genetic distances are great tools for analyzing diversity in +% populations as they are the basis for creating dendrograms with bootstrap +% support and also for AMOVA. This section will simply present different genetic +% distances along with a few notes about them. Most of these distances are derived +% from the \textit{ade4} and \adegenet packages, where they were implemented as +% distances between populations. \Poppr extends the implementation to individuals +% as well (with the exception of Bruvo's distance). + +% \begin{table}[ht] +% \centering +% \begin{tabular}{llll} +% \hline +% Method & Function & Assumption & Euclidean \\ +% \hline +% \seclink{distance:dissimilarity}{Provesti} & \texttt{provesti.dist} & - & No \\ +% & \texttt{diss.dist} & & \\ +% \seclink{distance:nei}{Nei} & \texttt{nei.dist} & Infinite Alleles & No \\ +% & & Genetic Drift & \\ +% \seclink{distance:edwards}{Edwards} & \texttt{edwards.dist} & Genetic Drift & Yes \\ +% \seclink{distance:reynolds}{Reynolds} & \texttt{reynolds.dist} & Genetic Drift & Yes \\ +% \seclink{distance:rogers}{Rogers} & \texttt{rogers.dist} & - & Yes \\ +% \seclink{distance:bruvo}{Bruvo} & \texttt{bruvo.dist} & Stepwise Mutation & No \\ +% \hline +% \end{tabular} +% \end{table} + +% Note on these distances: It is valuable for the user to know which distance is +% appropriate for use within his/her system. All of these distances except for +% Bruvo's distance are based on allele frequencies. Bruvo's distance takes into +% account the value of the allele, and thus may be more appropriate for questions +% of finding clonal descendents as opposed to a distance that will estimate the +% number of mutations that occured. + +% Currently only Bruvo's distance and Provesti's distance perform as expected with +% polyploid data. The other distances appear to give spurious results. + +% All of the distances are based on the table found in +% a \seclink{intro:genind}{genind object.} I have translated the following from math +% jargon into English with a few minor modifications: + +% \begin{quote} +% Let \textbf{A} a table containing allelic frequencies with $t$ populations or +% individuals (rows) and $m$ alleles (columns).\\ +% Let $\nu$ be the number of loci. The locus $j$ gets $m(j)$ alleles. + + +% \begin{equation} +% m=\sum_{j=1}^{\nu} m(j) +% \end{equation} +% \end{quote} + + +% Again, this is describing the table present in the \seclink{intro:genind}{genind +% object} where, instead of having the number of columns equal the number of loci, +% the number of columns equals the number of observed alleles in the entire data +% set. So, if you had a data set with 5 loci that had 2 alleles each, your table +% would have ten columns. Of course, codominant loci like microsatellites have +% varying numbers of alleles. + + +% \begin{quote} +% For the row $i$ and the modality $k$ of the variable $j$, notice the value + +% \begin{equation} +% a_{ijk}\\ (1 \leq i \leq t,\\ 1 \leq j \leq \nu,\\ 1 \leq k \leq m(j)) +% \end{equation} + +% the value of the initial table. + +% \begin{equation} +% a_{ij\cdot}=\sum_{k=1}^{m(j)} a_{ijk} +% \end{equation} + +% and + +% \begin{equation} +% p_{ijk}=\frac{a_{ijk}}{a_{ij\cdot}} +% \end{equation} +% \end{quote} + +% The above couple of equations are basically defining the allele frequency +% ($p_{ijk}$). Remember that $i$ is individual, $j$ is locus, and $k$ is allele. +% The following continutes to describe properties of the table: + +% \begin{equation} +% p_{ij\cdot}=\sum_{k=1}^{m(j)} p_{ijk}=1 +% \end{equation} +% The sum of all allele frequencies for a single population (or individual) at a +% single locus is one. + +% \begin{equation} +% p_{i{\cdot}\cdot}=\sum_{j=1}^{\nu} p_{ij\cdot}=\nu +% \end{equation} +% The sum of all allele frequences over all loci is equal to the number of loci. + +% \begin{equation} +% p_{{\cdot}{\cdot}\cdot}=\sum_{j=1}^{\nu} p_{i{\cdot}\cdot}=t\nu +% \end{equation} +% The the sum of the entire table is the sum of all loci multiplied by the number +% of populations (or individuals). + + +% % +% %-----------------------------------------------------------------------------% +% % + +% \subsubsection{Non-Euclidean distances} +% \label{index:gendist:neu} +% Dissimilarity distance +% \label{distance:dissimilarity} + +% One of the least known distances and yet more conceptually satisfying is +% Provesti's (Also spelled Prevosti) distance. It's conceptually satisfying +% because it can be thought of as a percentage of dissimilar sites between two +% individuals or populations or the absolute genetic distance. It is implemented +% in two functions, \texttt{provesti.dist} and \texttt{diss.dist}. + +% \label{distance:provesti} +% \begin{equation} +% D_{P}(a,b)=\frac{1}{2{\nu}} \sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} +% |p_{ajk} - p_{bjk}| +% \end{equation} +% Note: for AFLP data, the $2$ is dropped. + + +% Nei's distance +% \label{distance:nei} +% \begin{equation} +% D_1(a,b)= -\ln(\frac{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} +% p_{ajk} p_{bjk}}{\sqrt{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} +% {(p_{ajk}) }^2}\sqrt{\sum_{k=1}^{\nu} \sum_{j=1}^{m(k)} +% {(p_{bjk})}^2}}) +% \end{equation} + +% \subsubsection{Euclidean distance} +% \label{index:gendist:eu} + +% Edward's Angular distance +% \label{distance:edwards} +% \begin{equation} +% D_2(a,b)=\sqrt{1-\frac{1}{\nu} \sum_{k=1}^{\nu} +% \sum_{j=1}^{m(k)} \sqrt{p_{ajk} p_{bjk}}} +% \end{equation} + +% Reynold's Coancestry distance +% \label{distance:reynolds} +% \begin{equation} +% D_3(a,b)=\sqrt{\frac{\sum_{k=1}^{\nu} +% \sum_{j=1}^{m(k)}{(p_{ajk} - p_{bjk})}^2}{2 \sum_{k=1}^{\nu} (1- +% \sum_{j=1}^{m(k)} p_{ajk} p_{bjk})}} +% \end{equation} + + +% Roger's distance +% \label{distance:rogers} +% \begin{equation} +% D_4(a,b)=\frac{1}{\nu} \sum_{k=1}^{\nu} \sqrt{\frac{1}{2} +% \sum_{j=1}^{m(k)}{(p_{ajk} - p_{bjk})}^2} +% \end{equation} + +% \subsubsection{Bruvo's distance for microsatellites} +% \label{index:gendist:bruvo} + +% This is based off of a specific model where it is necessary to specify the +% repeat lengths of the markers at each locus. Eg a repeat of $(CAT)^n$ would have +% a repeat length of 3. +% \label{distance:bruvo} + + +% Comparing all distances (WIP) + +% <>= +% data(nancycats) +% nancy9 <- popsub(nancycats, 9) +% repeats <- rep(2, 9) +% pro_nan <- provesti.dist(nancy9) +% nei_nan <- nei.dist(nancy9) +% edw_nan <- edwards.dist(nancy9) +% rey_nan <- reynolds.dist(nancy9) +% rog_nan <- rogers.dist(nancy9) +% bru_nan <- bruvo.dist(nancy9, replen = repeats) +% distlist <- list(Provesti = pro_nan, +% Nei = nei_nan, +% Edwards = edw_nan, +% Reynolds = rey_nan, +% Rogers = rog_nan, +% Bruvo = bru_nan) +% combs <- combn(x = 1:length(distlist), m = 2) +% mtest <- apply(combs, 2, function(x) mantel.randtest(distlist[[x[1]]], distlist[[x[2]]])$obs) +% attr(mtest, "class") <- "dist" +% attr(mtest, "Size") <- 6 +% attr(mtest, "Labels") <- names(distlist) +% mtest <- as.matrix(mtest) +% diag(mtest) <- 1 +% library(reshape2) +% library(ggplot2) +% ggplot(melt(mtest)) + geom_tile(aes_string(x = "Var1", y = "Var2", fill = "value")) +% @ -Often, minimum spanning networks are the preferred way to visualize Bruvo's distance. \textit{Poppr} offers an easy way to plot these. For a demonstration, let's analyze a simulated data set of 50 individuals from populations that reproduce at a 99.9\% rate of clonal reproduction. -<>= -data(partial_clone) -set.seed(9005) -pc.msn <- bruvo.msn(partial_clone, replen=rep(1, 10), vertex.label.cex=0.7, - vertex.label.dist=-0.5, palette=colorRampPalette(c("blue", "yellow"))) -@ -\setkeys{Gin}{width=0.8\textwidth} -\begin{figure}[ht!] - \centering - \caption{\footnotesize Minimum Spanning Network representing 4 simulated populations. Each node represents a different multi locus genotype (MLG). Node sizes and colors correspond to the number of individuals and population membership, respectively. Edge thickness and color are proportional to Bruvo's distance. Edge lengths are arbitrary.} - \label{mst_bruvo} -<>= -data(partial_clone) -set.seed(9005) -pc.msn <- bruvo.msn(partial_clone, replen=rep(1, 10), vertex.label.cex=0.7, - vertex.label.dist=-0.5, palette=colorRampPalette(c("blue", "yellow"))) -@ -\end{figure} -\setkeys{Gin}{width=0.5\textwidth} +% \subsection{Bootstrap analysis \{give `em the boot(strap)\}} +% \label{index:boot} +% \subsubsection{Bootstrapping any distance} +% \label{index:boot:any} +% \subsubsection{Bootstrapping Bruvo's Distance} +% \label{index:boot:bruvo} +% This is optimized for Bruvo's distance. + +% \subsection{Minimum spanning networks \{nothing but net(works)\}} + +% \subsection{Analysis of MOlecular VAriance \{Come here! AMOVA here!\}} +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% % +% %-----------------------------------------------------------------------------% +% % +% %-----------------------------------------------------------------------------% +% \subsection{Going the distance \{dissimilarity distance\}} +% \label{index:dist} +% \tab\tab Since \poppr{} is still in its infancy, the number of distance measures +% it can offer are few. Bruvo's distance is well supported and allows you to +% quickly visualize your data, but it only allows for microsatellites. The index +% of association, above, utilizes a discreet dissimilarity distance matrix. It is +% with this matrix that we have constructed a relative dissimilarity distance +% where the distance is the ratio of the number of dissimilarities to the number +% of dissimilarities possible. The number of dissimilarities possible is the +% number of loci multiplied by the ploidy, so if you have 10 loci from a diploid +% population, then there are 20 dissimilarities possible. For details, see +% equations (\ref{eq:ia_d}) and (\ref{eq:ia_D}) +% in \seclink{appendix:algorithm:iard}{$I_A$ and $\bar{r} _d$.} + +% \subsubsection{Function: diss.dist} +% \label{index:dist:diss.dist} + +% \tab\tab Use this function to calculate relative dissimilarity between +% individuals and return a distance matrix for use in creating cladograms or +% minimum spanning networks. A note: missing alleles will be imputed to be the +% same as the challenging allele, decreasing the distance between some +% individuals. If you want to consider all missing data as special alleles, treat +% your data with \texttt{missingno(pop, type = "zero")} beforehand. + +% \begin{quote} +% Default Command: +% <>= +% funk <- "diss.dist" +% print_command(funk) +% @ +% % \texttt{diss.dist(pop)} +% \end{quote} + +% \begin{itemize} +% \item \texttt{pop -} a \texttt{genind} object. +% \end{itemize} +% Since we have a data set that we know is very clonal, let's analyze the +% \textit{A. euteiches} data set \cite{Grunwald:2006} and create a heatmap to +% visualize the degree of difference between populations. +% <>= +% data(Aeut) +% A.dist <- diss.dist(Aeut) +% heatmap(as.matrix(A.dist), symm=TRUE) +% @ +% \begin{figure}[h!] +% \centering +% \caption{\footnotesize Heatmap representation of a dissimilarity distance for +% the data set ``Aeut"} +% \label{diss_heat_map} +% <>= +% data(Aeut) +% A.dist <- diss.dist(Aeut) +% heatmap(as.matrix(A.dist), symm=TRUE) +% @ +% \end{figure} +% \newpage +% %-----------------------------------------------------------------------------% +% % +% %-----------------------------------------------------------------------------% +% \subsection{Bruvo's distance \{step by stepwise mutation\}} +% \label{index:bruvo} + +% \tab\tab Bruvo's distance is a genetic distance measure for microsatellite +% markers utilizing a stepwise mutation model that allows for differing ploidy +% levels \cite{Bruvo:2004}. As \textit{adegenet's} \texttt{genind} object has an +% all or none approach to missing data, any genotypes not exhibiting full ploidy +% will be treated as missing. This means that only non-special cases will be +% considered for the calculation and missing data will be ignored +% \cite{Bruvo:2004}. + +% This is true only if missing data is treated as missing when importing to +% \adegenet{}. If missing data in the initial data frame has missing +% microsatellite alleles coded as 0, then the genind object will not treat them as +% missing (unless the genotype has no alleles in it). An example adapted from the +% \adegenet{} ``Getting Started" vignette shows how this can be done: + +% <>= +% set.seed(5001) +% temp <- lapply(1:30, function(i) sample(0:9, 4, replace=TRUE)) +% temp <- sapply(temp, paste, collapse = "/") +% temp <- matrix(temp, nrow=10, dimnames=list(paste("ind",1:10), paste("loc",1:3))) +% temp +% obj <- df2genind(temp, ploidy=4, sep="/") +% pop(obj) <- paste("ind", 1:10) +% @ -The output, as mentioned earlier, is a list containing the graph constructed via the \textit{igraph} package, a vector of the population names and a vector of colors representing the populations. -<>= -library(igraph) -pc.msn -@ +% We will save this object for a later demonstration of the addition and loss +% models of Bruvo's distance. It is important to note that this is a distance +% between individuals, not populations, unlike Nei's 1978 distance +% \cite{Nei:1978}. For distances between populations, see the \adegenet{} function +% \texttt{dist.genpop} + +% \subsubsection{Function: bruvo.dist} +% \label{index:bruvo:bruvo.dist} + +% \tab\tab Bruvo's distance requires knowledge of the repeat lengths of each +% locus, so take care to read the description below. +% \begin{quote} +% Default Command: +% <>= +% funk <- "bruvo.dist" +% print_command(funk) +% @ +% % \texttt{bruvo.dist(pop, replen = 1, add = TRUE, loss = TRUE)} +% \end{quote} +% \begin{itemize} +% \item \texttt{pop -} a \texttt{genind} object. +% \item \texttt{replen -} This is a vector of numbers indicating the repeat +% length for each locus in your sample. If you have two dinucleotide repeats and +% five tetranucleotide repeats, you would put \texttt{c(2,2,4,4,4,4,4)} in this +% field. If you have imported data where that represents the raw number of +% steps, all you would have to type is \texttt{rep(1, n)}, replacing $n$ with +% the number of loci in your sample. It is important that you place something in +% this field because this function will attempt to estimate the repeat length +% based on the minimum difference of the alleles represented; with variability +% of position calls, relying on this estimation is NOT recommended. +% \item \texttt{add -} For missing data: use the genome addition model +% (see \seclink{appendix:algorithm:bruvospecial}{Special cases of Bruvo's +% distance.}) +% \item \texttt{loss -} For missing data: use the genome loss model +% (see \seclink{appendix:algorithm:bruvospecial}{Special cases of Bruvo's +% distance.}) +% \end{itemize} + +% To illustrate why it is important to specify the repeat lengths, let's imagine a +% locus that contains 5 alleles and the true repeat length is 4. Note that Bruvo's +% distance between alleles is calculated as $1 - 2^{-[x]}$, where $x$ is the +% difference in repeat lengths: +% <>= +% locus1 <- c(244, 248, 256, 240, 236) +% locus1/4 +% 1 - 2^-dist(locus1/4) +% @ +% We can see that the distance between them ranges from 1 to 5. Let's say, that we +% accidentally wrote 2 or 8 instead of 4: +% <>= +% locus1/2 +% 1 - 2^-dist(locus1/2) # Distance increase + +% locus1/8 +% 1 - 2^-dist(locus1/8) # Distance decrease +% @ +% While we will still get results from this analysis with the incorrect repeat +% length, they will be inherently wrong as they do not represent the true +% distance. That being said, it's important to note that the repeat lengths we +% represent for the rest of the manual are not known by the authors, but are used +% as a simple example. + +% This function will return a distance matrix (displaying the smallest population +% in the data set ``nancycats"): +% <>= +% dist9 <- bruvo.dist(popsub(nancycats, 9), replen=rep(1,9)) +% dist9 +% @ -Note that the thickness of the edges (the lines that are connecting the dots) is representative of relatedness between individuals, but the lengths do not necessarily mean anything due to the fact that with a larger data sets, displaying lengths proportional to relatedness would be impossible to draw on a 2D surface. Interpreting these data would show that MLG 9 has 5 individuals from all four populations and that it is most closely related to MLG 7, whereas the most distantly related connection exists between MLG 25 and MLG 26. -\newpage -Since a graph can be represented in many ways, you might want to play around with different layouts using the \texttt{layout()} function in \textit{igraph}. Type \texttt{help("layout", package = igraph)} for details. Below is the code for reconstructing the previous graph using the output: -<>= -set.seed(9005) -library(igraph) -plot(pc.msn$graph, vertex.size = V(pc.msn$graph)$size * 3, vertex.label.cex=0.7, - vertex.label.dist=-0.5,) -legend(-1.55, 1, bty = "n", cex = 0.75, legend = pc.msn$populations, - title = "Populations", fill = pc.msn$colors, border = NULL) -@ +% You can visualize this better with a simple heatmap: +% %\newpage -\subsubsection{Function: poppr.msn}\label{index:trees:poppr.msn} +% <>= +% heatmap(as.matrix(dist9), symm=TRUE) +% @ +% \begin{figure}[h!] +% \centering +% \caption{\footnotesize Heatmap representation of Bruvo's distance for +% population 9 of the data set ``nancycats"} +% \label{bruvo_heat_map} +% <>= +% heatmap(as.matrix(dist9), symm=TRUE) +% @ +% \end{figure} +% %\newpage + +% Let's take a closer look at the two individuals, N113 and N111. They seem to +% have large distances between everyone else and themselves. The names and columns +% of the matrix contain the names of individuals, but not the population +% information. We can make a comparison of Bruvo's distance across populations +% easier by editing the ``Labels" attribute of the distance object. Let's take a +% look at the labels attribute using the \texttt{attr()} command. <>= +% attr(dist9, "Labels") +% @ +% Remember that they all came from population 9, so let's append that to each +% label using the \texttt{paste()} command. +% <>= +% dist9.attr <- attr(dist9, "Labels") +% attr(dist9, "Labels") <- paste(rep("P09", 9), dist9.attr) +% dist9 +% @ +% Now we can see that all of the labels are corresponding to population 9. Let's +% calculate Bruvo's distance between populations 8 and 9. +% <>= +% dist9to8 <- bruvo.dist(popsub(nancycats, 8:9), replen=rep(1,9)) +% dist9to8.attr <- attr(dist9to8, "Labels") +% nan9to8pop <- nancycats@pop[nancycats@pop %in% c("P08", "P09")] +% attr(dist9to8, "Labels") <- paste(nan9to8pop, dist9to8.attr) +% heatmap(as.matrix(dist9to8), symm=TRUE) +% @ +% \begin{figure}[h!] +% \centering +% \caption{\footnotesize Heatmap representation of Bruvo's distance for +% populations 8 and 9 of the data set ``nancycats"} +% \label{bruvo_heat_map_8to9} +% <>= +% dist9to8 <- bruvo.dist(popsub(nancycats, 8:9), replen=rep(1,9)) +% dist9to8.attr <- attr(dist9to8, "Labels") +% nan9to8pop <- nancycats@pop[nancycats@pop %in% c("P08", "P09")] +% attr(dist9to8, "Labels") <- paste(nan9to8pop, dist9to8.attr) +% heatmap(as.matrix(dist9to8), symm=TRUE) +% @ +% \end{figure} + +% Remember N113 and N111? Take a look at where they fall on the heatmap. They +% don't cluster together with population 9 anymore, but somewhere in population 8. +% % \newpage + +% %-----------------------------------------------------------------------------% +% % +% %-----------------------------------------------------------------------------% +% \subsection{See the forest for the trees \{visualizing distances with dendrograms and networks\}} +% \label{index:trees} + +% \tab\tab Staring at a raw distance matrix might be able to tell you something +% about your data, but it also might be able to ruin your eyesight. In this +% section, we present functions to display this data in trees and networks. + +% \subsubsection{Function: bruvo.boot} +% \label{index:trees:bruvo.boot} + +% \tab\tab This function provides the ability to draw a dendrogram based on +% Bruvo's distance including bootstrap support. +% \begin{quote} +% Default Command: +% <>= +% funk <- "bruvo.boot" +% print_command(funk) +% @ +% % \texttt{bruvo.boot(pop, replen = 1, add = TRUE, loss = TRUE, sample = 100, tree = "upgma", showtree = TRUE, cutoff = NULL, quiet = FALSE)} +% \end{quote} +% \begin{itemize} +% \item \texttt{pop -} a \texttt{genind} object. +% \item \texttt{replen -} see \texttt{bruvo.dist}, above. +% \item \texttt{add -} For missing data: use the genome addition model +% (see \seclink{appendix:algorithm:bruvospecial}{Special cases of Bruvo's +% distance.}) +% \item \texttt{loss -} For missing data: use the genome loss model +% (see \seclink{appendix:algorithm:bruvospecial}{Special cases of Bruvo's +% distance.}) +% \item \texttt{sample -} How many bootstraps do you want to perform? +% \item \texttt{tree -} Two trees are available, Neighbor-Joining \texttt{"nj"} +% or UPGMA \texttt{"upgma"}. +% \item \texttt{showtree -} if \texttt{TRUE}, a tree will be plotted +% automatically. +% \item \texttt{cutoff -} This is a number between 0 and 100 indicating the +% cutoff value for the bootstrap nodelables. If you only wanted to see the the +% boostrap values for nodes that were present more than 75\% of the time, you +% would use \texttt{cutoff = 75}. If you don't put anything for this parameter, +% all values will be shown. +% \item \texttt{quiet -} if \texttt{quiet = TRUE}, no standard messages will be +% printed to screen. If \texttt{quiet = FALSE} (default), then a progress bar +% and standard message will be printed to the screen. +% \end{itemize} + +% For this example, let's set the cutoff to 50\%. +% <>= +% set.seed(410) +% nan9tree <- bruvo.boot(popsub(nancycats, 8:9), replen=rep(1,9), sample=1000, cutoff=50) +% @ +% <>= +% cat("\nBootstrapping...\n(note: calculation of node labels can take a while even after the progress bar is full)\n\n") +% cat("|================================================================================| 100%\n") +% @ +% \begin{figure}[h!] +% \centering +% \caption{\footnotesize UPGMA Tree of Bruvo's distance for population 9 of the +% data set ``nancycats" with 1000 Bootstrap Replicates. Node labels represent +% percentage of bootstrap replicates that contained that node.} +% \label{bruvo_upgma} +% % \includegraphics{bruvo_upgma.png} + +% <>= +% set.seed(410) +% nan9tree <- phangorn::upgma(bruvo.dist(popsub(nancycats, 8:9), replen = rep(1, 9))) +% nan9tree$node.labels <- c(100, NA, NA, NA, NA, 64, NA, NA, NA, NA, NA, 80, NA, +% 62, NA, 52, NA, NA) +% nan9tree$tip.label <- indNames(popsub(nancycats, 8:9)) +% # ape::plot.phylo(nan9tree, show.node.label=TRUE) +% # ape::axisPhylo(3) +% poppr:::poppr.plot.phylo(nan9tree, "upgma") +% @ -\tab\tab Use this function to draw a minimum spanning network from your data set and a distance matrix derived from your data set. Since there are hundreds of distances that can be calculated for genetic data, and since I want to be able to graduate at some point in this decade, functions to automatically calculate distances and draw the minimum spanning networks will be few and far between. This function is an attempt to meet the user halfway and draw a minimum spanning network provided that the user has supplied two things: -\begin{enumerate} - \item A distance matrix over all individuals. - \item The original data set containing demographic information. -\end{enumerate} +% \end{figure} +% \newpage +% \subsubsection{Function: greycurve} +% \label{index:trees:greycurve} +% \tab\tab Use this function to display a gradient of grey values based on user- +% defined parameters. The following functions will display a minimum spanning +% network that utilize a grey scale to display the weight of the lines (referred +% to as ``edges") that connect two or more individuals. The darker the line the +% closer the distance. Since this is based off of a linear grey scale, what +% happens when you have a distance matrix comprised of values all below 0.2 or all +% above 0.8? + +% With linear grey scaling, it becomes very difficult to detect the differences in +% these ranges. The following function allows you to visualize and manipulate a +% gradient from black to white so that you can use it in \poppr{}'s msn functions +% below to maximize the visual differences in your data. + +% \begin{quote} +% Default Command: +% <>= +% funk <- "greycurve" +% print_command(funk) +% @ +% % \texttt{greycurve(glim = c(0, 0.8), gadj = 3, gweight = 1)} +% \end{quote} -That's it. For the most part, this function is functionally the same as \texttt{bruvo.msn}, except that instead of being exclusive to microsatellite markers, you can now visualize distances in any marker type provided that you have the two items listed above. -\begin{quote} -Default Command:\\ -\texttt{poppr.msn(pop, distmat, palette = topo.colors, sublist = "All",\\ - \tab blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8),\\ - \tab gadj = 3, gweight = 1, wscale = TRUE, ...) -} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{distmat -} a dissimilarity distance matrix derived from your data with distances between zero and one. - \item \texttt{palette -} this is a \textbf{function} definining a color palette to use. The default is \texttt{topo.colors}. There are different palettes, which you can search by typing \texttt{?rainbow}. If you want a custom color palette, an easy way is to use the function \texttt{colorRampPalette}. - \item \texttt{sublist -} The populations you wish to analyze. This defaults to ``All". - \item \texttt{blacklist -} Populations you do not want to include in the graph. - \item \texttt{vertex.label -} This is an option that is passed on to \textit{igraph}'s \texttt{plot} function. \textit{Poppr} has added two arguments specific to \textit{poppr}. If you want to label the graph with the multilocus genotypes from the whole data set, use the argument \texttt{vertex.label = "mlg"}. If you want to display the representative individual names, you can use the argument \texttt{vertex.label = "inds"}. I say representative individual names because, only one representative from each MLG will be present in the clone corrected data set used to calculate the distance. For no labels, you can choose \texttt{vertex.label = NA}. - \item \texttt{gscale -} If this is set to \texttt{TRUE}, the edge color will be converted to greyscale based on the distance. If two nodes are closely related, the edge will appear darker. The limits of the scale can be set by the argument \texttt{glim}. If this is set to \texttt{FALSE}, all edge colors will be black. - \item \texttt{glim -} This is a vector of numbers between 0 and 1. This lets you set the limits of the grey scaling based on R's internal \texttt{grey} function. For example, if you wanted a maximum of 50\% white saturation (for use if you have distantly related nodes) and a minimum of 1\%, you would use \texttt{glim = c(0.01, 0.5)}. - \item \texttt{gadj -} This is an integer greater than zero used to adjust the scaling factor for the grey curve. Since very small changes in the grey scale are not easily precieved, it's useful to be able to adjust the grey scale to be able to show you the weights of each edge. For example, a population with most weights less than 0.3, you might want to set \texttt{gadj = 10} to exaggerate the grey scale. - \item \texttt{gweight -} If \texttt{gweight = 1}, the grey scale adjustment will be weighted towards separating out smaller values of the distance. If \texttt{gweight = 2}, the grey scale ajustment will be weighted towards separating out larger values of Bruvo's distance. - \item \texttt{wscale -} If this is set to \texttt{TRUE}, edge widths will be displayed corresponding to Bruvo's distance in that thicker edges will represent a smaller distance between nodes. If this is set to \texttt{FALSE}, all edges will be set to a width of 2. - \item \texttt{... -} This is a placeholder for any other arguments that you want to supply to \textit{igraph}. Useful arguments are \texttt{vertex.label.cex} to adjust the size of the labels, \texttt{vertex.label.dist} to adjust the position of the labels, and \texttt{vertex.label.color} to adjust the color of the labels. -\end{itemize} +% This function does not return any values. It will print a visual gradient from +% black to white horizontally. On this gradient, it will plot the adjustment curve +% (in opposing grey values), yellow horizontal lines bounding the maximum and +% minimum values, and the equation used to calculate the correction in red. By +% default it will use a sequence from 1 to 0 to plot the curve, but you can use +% any sequence. Let's take 1000 random normal draws: -Since we have the ability, let's visualize the \textit{A. euteiches} data set \cite{Grunwald:2006}. -<>= -data(Aeut) -A.dist <- diss.dist(Aeut) -set.seed(9005) -A.msn <- poppr.msn(Aeut, A.dist, vertex.label=NA, palette=rainbow, gadj=15) -@ +% <>= +% set.seed(9999) +% xnorm <- rnorm(1000) +% @ -\setkeys{Gin}{width=0.8\textwidth} -\begin{figure}[ht!] - \centering - \caption{\footnotesize Minimum Spanning Network representing 4 simulated populations. Each node represents a different multi locus genotype (MLG). Node sizes and colors correspond to the number of individuals and population membership, respectively. Edge thickness and color are proportional to Bruvo's distance. Edge lengths are arbitrary.} - \label{mst_poppr} -<>= -data(Aeut) -A.dist <- diss.dist(Aeut) -set.seed(9005) -A.msn <- poppr.msn(Aeut, A.dist, vertex.label=NA, palette=rainbow, gadj=15) -@ -\end{figure} -\setkeys{Gin}{width=0.5\textwidth} -\newpage -%\subsubsection{Gory details} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{I know what you did last summary table \{diversity table\}}\label{summary} -\tab\tab Remember the summary function that you used to get all the diversity statistics in section \ref{intro:qstart}? In this section, we will flesh out all that you can do with this function. This was the very first function that was written for \textit{poppr} to make it easy for the user to manipulate and summarize the data in one function. -\subsection{Function: poppr}\label{summary:poppr} +% %\newpage +% First, we'll see what happens when we change the weight parameter. +% \begin{figure}[h!] +% \begin{minipage}[b]{0.45\linewidth} +% \centering +% \caption{\footnotesize Default for \texttt{greycurve()}, weighted for small +% values.} +% <>= +% greycurve(xnorm) +% @ +% \end{minipage} +% \hspace{0.5cm} +% \begin{minipage}[b]{0.45\linewidth} +% \centering +% \caption{\footnotesize weighting for large values.} +% <>= +% greycurve(xnorm, gweight = 2) +% @ +% \end{minipage} +% \end{figure} + +% If you change the adjustment parameter, \texttt{gadj}, the shape of the curve +% will change, and if you change the limit parameter, \texttt{glim}, the bounds of +% the curve will change. + +% This function also will plot scalebars for use with minimum spanning networks +% produced in previous versions of poppr. Just use the scalebar argument and you +% will get a scalebar using the quantiles from the data and a scalebar with the +% data smoothed from the minimum to the maximum. + +% \begin{figure}[h!] +% \begin{minipage}[b]{0.45\linewidth} +% \centering +% \caption{\footnotesize We can see that more data lies around zero (as expected +% with a random normal distribution)} +% <>= +% greycurve(xnorm, scalebar = TRUE) +% @ +% \end{minipage} +% \hspace{0.5cm} +% \begin{minipage}[b]{0.45\linewidth} +% \centering +% \caption{\footnotesize Same as the figure on the left, but weighting heavily +% toward larger values.} +% <>= +% greycurve(xnorm, gweight=2, scalebar = TRUE) +% @ +% \end{minipage} +% \end{figure} +% \newpage -\tab\tab This function is quite daunting with all its possibilities. You have the option to subset your data for specific populations, correct for missing data, and clone correct. With each of these possibilities, comes the need to provide all the arguments for their various functions. -\begin{quote} -Default Command:\\ -\texttt{poppr(pop, total = TRUE, sublist = c("ALL"), blacklist = c(NULL), sample = 0,\\ - \tab method = 1, missing = "ignore", cutoff = 0.05, quiet = FALSE, \\ - \tab clonecorrect = FALSE, hier = c(1), dfname = "population\_hierarchy", \\ - \tab hist = TRUE, minsamp = 10)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} A \texttt{genind} object. - \item \texttt{total -} This is also a synonym for ``pooled". This will calculate all diversity statistics on the entire data set if set to \texttt{TRUE} or if there is no population structure. - \item \emph{popsub functions:} See section \ref{data.manip:divide} - \begin{description} - \item[sublist -] A list of populations you want to include in your analysis. - \item[blacklist -] A list of populations you want to exclude from your analysis. - \end{description} - \item \emph{shufflepop functions:} See section \ref{data.manip:shuffle} \\ Note that this only affects the calculation for $I_A$ and $\bar r_d$. - \begin{description} - \item[sample -] The number of samples you desire (eg. 999) - \item[method -] Which sampling method? 1: permute, 2: parametric bootstrap, 3: non-parametric bootstrap, 4: multilocus. - \end{description} - \item \emph{missingno functions:} See Section \ref{data.manip:missing} \\ Note that all analyses in this function ignore/impute missing data by default. - \begin{description} - \item[missing -] How to deal with missing data. This feeds into the \texttt{type} flag of \texttt{missingno}. - \item[cutoff -] Allowable percentage of missing data per genotype or locus. - \end{description} - \item \texttt{quiet -} If set to \texttt{TRUE}, nothing will be printed to the screen as the sampling progresses. If \texttt{FALSE} (default) a progress bar will be produced. - \item \emph{clonecorrect functions:} See section \ref{data.manip:cc} - \begin{description} - \item[clonecorrect -] if this is set to \texttt{TRUE}, then you will need to set the next two parameters. - \item[hier -] A list of the population hierarchy, or names of columns in the data frame noted below. - \item[dfname -] A data frame in the \texttt{@other} slot of the \texttt{genind} object containing all of the population factors in different columns. For an example, see sections \ref{data.manip:hier} and \ref{data.manip:cc}. - \item[keep -] A vector of integers as indexes for the \texttt{hier} flag indicating which levels of the hierarchy you want to analyze. See section \ref{data.manip:cc} for details. - \end{description} - \item \texttt{hist -} if \texttt{TRUE}, a histogram of distributions of $I_A$ and $\bar r_d$ will be displayed with each population if there is sampling. - \item \texttt{minsamp -} The minimum number of individuals you want to use to calculate the expected number of MLGs. The default is set to 10. -\end{itemize} +% \subsubsection{Function: bruvo.msn} +% \label{index:trees:bruvo.msn} +% \tab\tab This function will automatically draw a minimum spanning network of +% MLGs based on Bruvo's distance. It's important to note that this will +% recalculate Bruvo's distance each time it is run, but the amount of time it +% takes to run is on the order of seconds. It will return a list containing the +% network, the populations and the related colors in the network so you can export +% or redraw it with the legend if you wanted to using the package \textit{igraph} +% (type \texttt{help("plot.igraph")} for details). +% \begin{quote} +% Default Command: +% <>= +% funk <- "bruvo.msn" +% print_command(funk) +% @ +% % \texttt{bruvo.msn(pop, replen = 1, add = TRUE, loss = TRUE, palette = topo.colors,\\ +% % \tab sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, \\ +% % \tab glim = c(0, 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) +% % } +% \end{quote} +% \begin{itemize} +% \item \texttt{pop -} a \texttt{genind} object. +% \item \texttt{replen -} see \texttt{bruvo.dist}, above. +% \item \texttt{add -} For missing data: use the genome addition model +% (see \seclink{appendix:algorithm:bruvospecial}{Special cases of Bruvo's +% distance.}) +% \item \texttt{loss -} For missing data: use the genome loss model +% (see \seclink{appendix:algorithm:bruvospecial}{Special cases of Bruvo's +% distance.}) +% \item \texttt{palette -} this is a \textbf{function} definining a color +% palette to use. The default is \texttt{topo.colors}. There are different +% palettes, which you can search by typing \texttt{?rainbow}. If you want a +% custom color palette, an easy way is to use the function +% \texttt{colorRampPalette}. +% \item \texttt{sublist -} The populations you wish to analyze. This defaults to +% ``All". See section \ref{data.manip:divide:popsub} for details. +% \item \texttt{blacklist -} Populations you do not want to include in the +% graph. See section \ref{data.manip:divide:popsub} for details. +% \item \texttt{vertex.label -} This is an option that is passed on to +% \textit{igraph}'s \texttt{plot} function. \Poppr{} has added two arguments +% specific to \poppr{}. If you want to label the graph with the multilocus +% genotypes from the whole data set, use the argument \texttt{vertex.label = +% "mlg"}. If you want to display the representative individual names, you can +% use the argument \texttt{vertex.label = "inds"}. I say representative +% individual names because, only one representative from each MLG will be +% present in the clone corrected data set used to calculate the distance. For no +% labels, you can choose \texttt{vertex.label = NA}. +% \item \texttt{gscale -} If this is set to \texttt{TRUE}, the edge color will +% be converted to greyscale based on Bruvo's distance. If two nodes are closely +% related, the edge will appear darker. The limits of the scale can be set by +% the argument \texttt{glim}. If this is set to \texttt{FALSE}, all edge colors +% will be black. +% \item \texttt{glim -} This is a vector of numbers between 0 and 1. This lets +% you set the limits of the grey scaling based on R's internal \texttt{grey} +% function. For example, if you wanted a maximum of 50\% white saturation (for +% use if you have distantly related nodes) and a minimum of 1\%, you would use +% \texttt{glim = c(0.01, 0.5)}. +% \item \texttt{gadj -} This is an integer greater than zero used to adjust the +% scaling factor for the grey curve. Since very small changes in the grey scale +% are not easily precieved, it's useful to be able to adjust the grey scale to +% be able to show you the weights of each edge. For example, a population with +% most weights less than 0.3, you might want to set \texttt{gadj = 10} to +% exaggerate the grey scale. +% \item \texttt{gweight -} If \texttt{gweight = 1}, the grey scale adjustment +% will be weighted towards separating out smaller values of Bruvo's distance. If +% \texttt{gweight = 2}, the grey scale ajustment will be weighted towards +% separating out larger values of Bruvo's distance. +% \item \texttt{wscale -} If this is set to \texttt{TRUE}, edge widths will be +% displayed corresponding to Bruvo's distance in that thicker edges will +% represent a smaller distance between nodes. If this is set to \texttt{FALSE}, +% all edges will be set to a width of 2. +% \item \texttt{... -} This is a placeholder for any other arguments that you +% want to supply to \textit{igraph}. Useful arguments are +% \texttt{vertex.label.cex} to adjust the size of the labels, +% \texttt{vertex.label.dist} to adjust the position of the labels, and +% \texttt{vertex.label.color} to adjust the color of the labels. +% \end{itemize} + +% Often, minimum spanning networks are the preferred way to visualize Bruvo's +% distance. \Poppr{} offers an easy way to plot these. For a demonstration, let's +% analyze a simulated data set of 50 individuals from populations that reproduce +% at a 99.9\% rate of clonal reproduction. + +% <>= +% data(partial_clone) +% set.seed(9005) +% pc.msn <- bruvo.msn(partial_clone, replen=rep(1, 10), vertex.label.cex=0.7, +% vertex.label.dist=-0.5, palette=colorRampPalette(c("blue", "yellow"))) +% @ +% \begin{figure}[ht!] +% \centering +% \caption{\footnotesize Minimum Spanning Network representing 4 simulated +% populations. Each node represents a different multi locus genotype (MLG). Node +% sizes and colors correspond to the number of individuals and population +% membership, respectively. Edge thickness and color are proportional to Bruvo's +% distance. Edge lengths are arbitrary.} +% \label{mst_bruvo} +% <>= +% data(partial_clone) +% set.seed(9005) +% pc.msn <- bruvo.msn(partial_clone, replen=rep(1, 10), vertex.label.cex=0.7, +% vertex.label.dist=-0.5, palette=colorRampPalette(c("blue", "yellow"))) +% @ +% \end{figure} +% \setkeys{Gin}{width=0.5\textwidth} + +% The output, as mentioned earlier, is a list containing the graph constructed via +% the \textit{igraph} package, a vector of the population names and a vector of +% colors representing the populations. +% <>= +% library(igraph) +% pc.msn +% @ -This function produces a table that contains the population name, number of individuals observed, number of MLGs observed, number of MLGs expected at the lowest common sampling size within the data set \cite{Hurlbert:1971} \cite{Heck:1975}, the Shannon-Wiener index \cite{Shannon:1948}, Stoddart and Taylor's index for expected MLGs \cite{Stoddart:1988}, Nei's 1987 genotypic diversity \cite{Nei:1978}, evenness \cite{Pielou:1975}\cite{Ludwig:1988}\cite{Grunwald:2003}, the index of association \cite{Brown:1980}\cite{Smith:1993}, the standardized index of association \cite{Agapow:2001}, and the file name. Most of these indices are calculated by converting the population into an MLG table with \texttt{mlg.table} (see section \ref{mlg:table}) and using the \textit{vegan} package's \texttt{diversity} function (To see details, type \texttt{?vegan::diversity} into the R console). +% Note that the thickness of the edges (the lines that are connecting the dots) is +% representative of relatedness between individuals, but the lengths do not +% necessarily mean anything due to the fact that with a larger data sets, +% displaying lengths proportional to relatedness would be impossible to draw on a +% 2D surface. Interpreting these data would show that MLG 9 has 5 individuals from +% all four populations and that it is most closely related to MLG 7, whereas the +% most distantly related connection exists between MLG 25 and MLG 26. + +% %\newpage +% Since a graph can be represented in many ways, you might want to play around +% with different layouts using the \texttt{layout()} function in \textit{igraph}. +% Type \texttt{help("layout", package = igraph)} for details. Below is the code +% for reconstructing the previous graph using the output: +% <>= +% set.seed(9005) +% library(igraph) +% plot(pc.msn$graph, vertex.size = V(pc.msn$graph)$size * 3, +% vertex.label.cex=0.7, vertex.label.dist=-0.5,) +% legend(-1.55, 1, bty = "n", cex = 0.75, legend = pc.msn$populations, +% title = "Populations", fill = pc.msn$colors, border = NULL) +% @ -To begin, let's revisit our example data set of \textit{Aphanomyces euteiches} \cite{Grunwald:2006}. -<>= -data(Aeut) -poppr(Aeut) -@ -OK, so we were able to get a table out of this. Now let's see what happens when we do some sampling to see if this is reproducing clonally or not. We will turn quiet on and the histogram off to save space. -<>= -poppr(Aeut, sample=999, hist=FALSE, quiet=TRUE) -@ -<>= -none1 <- poppr(Aeut, hist=FALSE, quiet=TRUE) -cbind(none1[1:10],list(p.Ia = rep(0.001, 3)), none1[11], list(p.rD = rep(0.001, 3)), none1[12]) -@ -From now on, we'll set \texttt{quiet = TRUE} to save space on our vignette. Let's clone correct at different levels to see if that affects the index of association. First, we'll clone correct at the sub population level. -<>= -poppr(Aeut, sample=999, clonecorrect=TRUE, hier=c("Pop","Subpop"), - dfname="population_hierarchy", quiet=TRUE, hist=FALSE) -@ -<>= -sub1 <- poppr(Aeut, clonecorrect=TRUE, hier=c("Pop","Subpop"), dfname="population_hierarchy", quiet=TRUE, hist=FALSE) -cbind(sub1[1:10],list(p.Ia = rep(0.001, 3)), sub1[11], list(p.rD = rep(0.001, 3)), sub1[12]) -@ -And at the population level. -<>= -poppr(Aeut, sample=999, clonecorrect=TRUE, hier="Pop", - dfname="population_hierarchy", quiet=TRUE, hist=FALSE) -@ -<>= -pop1 <- poppr(Aeut, sample=0, clonecorrect=TRUE, hier="Pop", dfname="population_hierarchy", quiet=TRUE, hist=FALSE) -cbind(pop1[1:10],list(p.Ia = rep(0.001, 3)), pop1[11], list(p.rD = rep(0.001, 3)), pop1[12]) -@ -As you can see, clone correction doesn't always have to involve creation of new data sets! +% \subsubsection{Function: poppr.msn} +% \label{index:trees:poppr.msn} + +% \tab\tab Use this function to draw a minimum spanning network from your data set +% and a distance matrix derived from your data set. Since there are hundreds of +% distances that can be calculated for genetic data, and since I want to be able +% to graduate at some point in this decade, functions to automatically calculate +% distances and draw the minimum spanning networks will be few and far between. +% This function is an attempt to meet the user halfway and draw a minimum spanning +% network provided that the user has supplied two things: +% \begin{enumerate} +% \item A distance matrix over all individuals. +% \item The original data set containing demographic information. +% \end{enumerate} + +% That's it. For the most part, this function is functionally the same as +% \texttt{bruvo.msn}, except that instead of being exclusive to microsatellite +% markers, you can now visualize distances in any marker type provided that you +% have the two items listed above. + +% \begin{quote} +% Default Command: +% <>= +% funk <- "poppr.msn" +% print_command(funk) +% @ +% % \texttt{poppr.msn(pop, distmat, palette = topo.colors, sublist = "All",\\ +% % \tab blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8),\\ +% % \tab gadj = 3, gweight = 1, wscale = TRUE, ...) +% % } +% \end{quote} +% \begin{itemize} +% \item \texttt{pop -} a \texttt{genind} object. +% \item \texttt{distmat -} a dissimilarity distance matrix derived from your data with distances between zero and one. +% \item \texttt{palette -} this is a \textbf{function} definining a color +% palette to use. The default is \texttt{topo.colors}. There are different +% palettes, which you can search by typing \texttt{?rainbow}. If you want a +% custom color palette, an easy way is to use the function +% \texttt{colorRampPalette}. +% \item \texttt{sublist -} The populations you wish to analyze. This defaults to +% ``All". +% \item \texttt{blacklist -} Populations you do not want to include in the +% graph. +% \item \texttt{vertex.label -} This is an option that is passed on to +% \textit{igraph}'s \texttt{plot} function. \Poppr{} has added two arguments +% specific to \poppr{}. If you want to label the graph with the multilocus +% genotypes from the whole data set, use the argument \texttt{vertex.label = +% "mlg"}. If you want to display the representative individual names, you can +% use the argument \texttt{vertex.label = "inds"}. I say representative +% individual names because, only one representative from each MLG will be +% present in the clone corrected data set used to calculate the distance. For no +% labels, you can choose \texttt{vertex.label = NA}. +% \item \texttt{gscale -} If this is set to \texttt{TRUE}, the edge color will +% be converted to greyscale based on the distance. If two nodes are closely +% related, the edge will appear darker. The limits of the scale can be set by +% the argument \texttt{glim}. If this is set to \texttt{FALSE}, all edge colors +% will be black. +% \item \texttt{glim -} This is a vector of numbers between 0 and 1. This lets +% you set the limits of the grey scaling based on R's internal \texttt{grey} +% function. For example, if you wanted a maximum of 50\% white saturation (for +% use if you have distantly related nodes) and a minimum of 1\%, you would use +% \texttt{glim = c(0.01, 0.5)}. +% \item \texttt{gadj -} This is an integer greater than zero used to adjust the +% scaling factor for the grey curve. Since very small changes in the grey scale +% are not easily precieved, it's useful to be able to adjust the grey scale to +% be able to show you the weights of each edge. For example, a population with +% most weights less than 0.3, you might want to set \texttt{gadj = 10} to +% exaggerate the grey scale. +% \item \texttt{gweight -} If \texttt{gweight = 1}, the grey scale adjustment +% will be weighted towards separating out smaller values of the distance. If +% \texttt{gweight = 2}, the grey scale ajustment will be weighted towards +% separating out larger values of Bruvo's distance. +% \item \texttt{wscale -} If this is set to \texttt{TRUE}, edge widths will be +% displayed corresponding to Bruvo's distance in that thicker edges will +% represent a smaller distance between nodes. If this is set to \texttt{FALSE}, +% all edges will be set to a width of 2. +% \item \texttt{... -} This is a placeholder for any other arguments that you +% want to supply to \textit{igraph}. Useful arguments are +% \texttt{vertex.label.cex} to adjust the size of the labels, +% \texttt{vertex.label.dist} to adjust the position of the labels, and +% \texttt{vertex.label.color} to adjust the color of the labels. +% \end{itemize} + +% Since we have the ability, let's visualize the \textit{A. euteiches} data set +% \cite{Grunwald:2006}. +% <>= +% data(Aeut) +% A.dist <- diss.dist(Aeut) +% set.seed(9005) +% A.msn <- poppr.msn(Aeut, A.dist, vertex.label=NA, palette=rainbow, gadj=15) +% @ -You might notice that the P-values for both $I_A$ and $\bar r_d$ are often equal to each other. This will always be the case with the sampling method utilized in method 4 \cite{Agapow:2001}. Here, we show examples where they are not equal and why it's okay. -<>= -set.seed(2002) -poppr(nancycats, sublist=5:6, total=FALSE, sample=999, method=2, quiet=TRUE, hist=FALSE) -@ -<>= -set.seed(2001) -nan_ex <- poppr(nancycats, sublist=5:6, total=FALSE, sample=0, method=2, quiet=TRUE, hist=FALSE) -cbind(nan_ex[1:10],list(p.Ia = c(0.576, 0.070), nan_ex[11], list(p.rD = c(0.576, 0.071), nan_ex[12]))) -@ -The reason why the P-values would be different is described at the end of section \ref{index:iard:ia}. The differences in P-values are normally not very far off. It's important to note this because of what can happen in extremely clonal populations. You can end up with a large enough sample size consisting of very few MLGs. Upon shuffling using method 4, you find that there are very few values of $I_A$ and $\bar r_d$ that can be obtained. Observe with this simulated data set: -<>= -set.seed(2004) -poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=4, quiet=TRUE) -@ -<>= -set.seed(2004) -sim_ex <- poppr(system.file("files/simulated.dat", package="poppr"), sample=0, method=4, quiet=TRUE, hist=FALSE) -cbind(sim_ex[1:10],list(p.Ia = c(0.09), sim_ex[11], list(p.rD = c(0.09), sim_ex[12]))) -@ -\begin{figure}[h!] - \centering - \caption{\footnotesize Output of multilocus-style sampling. Note the multi-modal distribution.} - \label{simulated_dist_fig} - \includegraphics{simulated_dist_fig.png} -% <>= +% \begin{figure}[ht!] +% \centering +% \caption{\footnotesize Minimum Spanning Network representing 4 simulated +% populations. Each node represents a different multi locus genotype (MLG). Node +% sizes and colors correspond to the number of individuals and population +% membership, respectively. Edge thickness and color are proportional to Bruvo's +% distance. Edge lengths are arbitrary.} +% \label{mst_poppr} +% <>= +% data(Aeut) +% A.dist <- diss.dist(Aeut) +% set.seed(9005) +% A.msn <- poppr.msn(Aeut, A.dist, vertex.label=NA, palette=rainbow, gadj=15) +% @ +% \end{figure} +% \newpage +% %\subsubsection{Gory details} +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% %=============================================================================% +% %=============================================================================% +% % +% % +% % +% %=============================================================================% +% %=============================================================================% +% \section{Diversity table \{I know what you did last summary table\}} +% \label{summary} + +% \tab\tab Remember the summary function that you used to get all the diversity +% statistics in section \ref{intro:qstart}? In this section, we will flesh out all +% that you can do with this function. This was the very first function that was +% written for \poppr{} to make it easy for the user to manipulate and summarize +% the data in one function. +% %-----------------------------------------------------------------------------% +% % +% %-----------------------------------------------------------------------------% +% \subsection{Function: poppr} +% \label{summary:poppr} + +% \tab\tab This function is quite daunting with all its possibilities. You have +% the option to subset your data for specific populations, correct for missing +% data, and clone correct. With each of these possibilities, comes the need to +% provide all the arguments for their various functions. +% \begin{quote} +% Default Command: +% <>= +% funk <- "poppr" +% print_command(funk) +% @ +% % \texttt{poppr(pop, total = TRUE, sublist = c("ALL"), blacklist = c(NULL), sample = 0,\\ +% % \tab method = 1, missing = "ignore", cutoff = 0.05, quiet = FALSE, \\ +% % \tab clonecorrect = FALSE, hier = c(1), dfname = "population\_hierarchy", \\ +% % \tab hist = TRUE, minsamp = 10)} +% \end{quote} +% \begin{itemize} +% \item \texttt{dat -} A \texttt{genind} object, \texttt{genclone} object, or a +% path to a file on your machine that contains genetix, structure, fstat, +% genpop, or genalex formatted data. +% \item \texttt{total -} This is also a synonym for ``pooled". This will +% calculate all diversity statistics on the entire data set if set to +% \texttt{TRUE} or if there is no population structure. +% \item \emph{popsub functions:} See section \ref{data.manip:divide} +% \begin{description} +% \item[sublist -] A list of populations you want to include in your analysis. +% \item[blacklist -] A list of populations you want to exclude from your +% analysis. +% \end{description} +% \item \emph{shufflepop functions:} See section \ref{data.manip:shuffle} \\ +% Note that this only affects the calculation for $I_A$ and $\bar r_d$. +% \begin{description} +% \item[sample -] The number of samples you desire (eg. 999) +% \item[method -] Which sampling method? 1: permute, 2: parametric bootstrap, +% 3: non-parametric bootstrap, 4: multilocus. +% \end{description} +% \item \emph{missingno functions:} See Section \ref{data.manip:missing} \\ Note +% that all analyses in this function ignore/impute missing data by default. +% \begin{description} +% \item[missing -] How to deal with missing data. This feeds into the +% \texttt{type} flag of \texttt{missingno}. +% \item[cutoff -] Allowable percentage of missing data per genotype or +% locus. +% \end{description} +% \item \texttt{quiet -} If set to \texttt{TRUE}, nothing will be printed to the +% screen as the sampling progresses. If \texttt{FALSE} (default) a progress bar +% will be produced. +% \item \emph{clonecorrect functions:} See section \ref{data.manip:cc} +% \begin{description} +% \item[clonecorrect -] if this is set to \texttt{TRUE}, then you will need to +% set the next two parameters. +% \item[hier -] A list of the population hierarchy, or names of columns in the +% data frame noted below. +% \item[dfname -] A data frame in the \texttt{@other} slot of the +% \texttt{genind} object containing all of the population factors in different +% columns. For an example, see sections \ref{data.manip:hier} and +% \ref{data.manip:cc}. +% \item[keep -] A vector of integers as indexes for the \texttt{hier} flag +% indicating which levels of the hierarchy you want to analyze. See section +% \ref{data.manip:cc} for details. +% \end{description} +% \item \texttt{hist -} if \texttt{TRUE}, a histogram of distributions of $I_A$ +% and $\bar r_d$ will be displayed with each population if there is sampling. +% \item \texttt{minsamp -} The minimum number of individuals you want to use to +% calculate the expected number of MLGs. The default is set to 10. +% \end{itemize} + +% This function produces a table that contains the population name, number of +% individuals observed, number of MLGs observed, number of MLGs expected at the +% lowest common sampling size within the data set \cite{Hurlbert:1971} +% \cite{Heck:1975}, the Shannon-Wiener index \cite{Shannon:1948}, Stoddart and +% Taylor's index for expected MLGs \cite{Stoddart:1988}, Nei's 1987 genotypic +% diversity \cite{Nei:1978}, evenness +% \cite{Pielou:1975}\cite{Ludwig:1988}\cite{Grunwald:2003}, the index of +% association \cite{Brown:1980}\cite{Smith:1993}, the standardized index of +% association \cite{Agapow:2001}, and the file name. Most of these indices are +% calculated by converting the population into an MLG table with +% \texttt{mlg.table} (see section \ref{mlg:table}) and using the \textit{vegan} +% package's \texttt{diversity} function (To see details, type +% \texttt{?vegan::diversity} into the R console). + +% To begin, let's revisit our example data set of \textit{Aphanomyces euteiches} +% \cite{Grunwald:2006}. +% <>= +% data(Aeut) +% poppr(Aeut) +% @ +% OK, so we were able to get a table out of this. Now let's see what happens when +% we do some sampling to see if this is reproducing clonally or not. We will turn +% quiet on and the histogram off to save space. +% <>= +% poppr(Aeut, sample=999, hist=FALSE, quiet=TRUE) +% @ +% <>= +% none1 <- poppr(Aeut, hist=FALSE, quiet=TRUE) +% cbind(none1[1:10],list(p.Ia = rep(0.001, 3)), none1[11], list(p.rD = rep(0.001, 3)), none1[12]) +% @ +% From now on, we'll set \texttt{quiet = TRUE} to save space on our vignette. +% Let's clone correct at different levels to see if that affects the index of +% association. First, we'll clone correct at the sub population level. +% <>= +% poppr(Aeut, sample=999, clonecorrect=TRUE, hier=c("Pop","Subpop"), +% dfname="population_hierarchy", quiet=TRUE, hist=FALSE) +% @ +% <>= +% sub1 <- poppr(Aeut, clonecorrect=TRUE, hier=c("Pop","Subpop"), dfname="population_hierarchy", quiet=TRUE, hist=FALSE) +% cbind(sub1[1:10],list(p.Ia = rep(0.001, 3)), sub1[11], list(p.rD = rep(0.001, 3)), sub1[12]) +% @ +% And at the population level. +% <>= +% poppr(Aeut, sample=999, clonecorrect=TRUE, hier="Pop", +% dfname="population_hierarchy", quiet=TRUE, hist=FALSE) +% @ +% <>= +% pop1 <- poppr(Aeut, sample=0, clonecorrect=TRUE, hier="Pop", dfname="population_hierarchy", quiet=TRUE, hist=FALSE) +% cbind(pop1[1:10],list(p.Ia = rep(0.001, 3)), pop1[11], list(p.rD = rep(0.001, 3)), pop1[12]) +% @ +% As you can see, clone correction doesn't always have to involve creation of new +% data sets! + +% You might notice that the P-values for both $I_A$ and $\bar r_d$ are often equal +% to each other. This will always be the case with the sampling method utilized in +% method 4 \cite{Agapow:2001}. Here, we show examples where they are not equal and +% why it's okay. +% <>= +% set.seed(2002) +% poppr(nancycats, sublist=5:6, total=FALSE, sample=999, method=2, quiet=TRUE, hist=FALSE) +% @ +% <>= +% set.seed(2001) +% nan_ex <- poppr(nancycats, sublist=5:6, total=FALSE, sample=0, method=2, quiet=TRUE, hist=FALSE) +% cbind(nan_ex[1:10],list(p.Ia = c(0.576, 0.070), nan_ex[11], list(p.rD = c(0.576, 0.071), nan_ex[12]))) +% @ +% The reason why the P-values would be different is described at the end of +% section \ref{index:iard:ia}. The differences in P-values are normally not very +% far off. It's important to note this because of what can happen in extremely +% clonal populations. You can end up with a large enough sample size consisting of +% very few MLGs. Upon shuffling using method 4, you find that there are very few +% values of $I_A$ and $\bar r_d$ that can be obtained. Observe with this simulated +% data set: +% <>= % set.seed(2004) -% der <- poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=1, quiet=TRUE, hist=TRUE) +% poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=4, quiet=TRUE) % @ -\end{figure} -\newpage - -Take a look a these two histograms. The number of ways you can recombine the data with the default sampling method is very small. Other sampling methods could give a more theoretical distribution. Let's try the parametric bootstrap (For details, see section \ref{data.manip:shuffle}). -\begin{figure}[h!] - \centering - \caption{\footnotesize Output for parametric bootstrap sampling.} - \label{simulated_param_fig} - \includegraphics{simulated_param_fig.png} -% <>= +% <>= % set.seed(2004) -% der <- poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=3, quiet=TRUE) +% sim_ex <- poppr(system.file("files/simulated.dat", package="poppr"), sample=0, method=4, quiet=TRUE, hist=FALSE) +% cbind(sim_ex[1:10],list(p.Ia = c(0.09), sim_ex[11], list(p.rD = c(0.09), sim_ex[12]))) % @ -\end{figure} - -As you can see, the distribution is much closer to a distribution we would expect if this were a small sample of a larger population. - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Appendix}\label{appendix} -\subsection{Algorithmic Details}\label{appendix:algorithm} -\subsubsection{$I_A$ and $\bar r_d$}\label{appendix:algorithm:iard} -\tab \tab The index of association was originally developed by A.H.D. Brown analyzing population structure of wheat \cite{Brown:1980}. It has been widely used as a tool to detect clonal reproduction within populations \cite{Smith:1993}. Populations whose members are undergoing sexual reproduction, whether it be selfing or out-crossing, will produce gametes via meiosis, and thus have a chance to shuffle alleles in the next generation. Populations whose members are undergoing clonal reproduction, however, generally do so via mitosis. This means that the most likely mechanism for a change in genotype is via mutation. The rate of mutation varies from species to species, but it is rarely sufficiently high to approximate a random shuffling of alleles. The index of association is a calculation based on the ratio of the variance of the raw number of differences between individuals and the sum of those variances over each locus \cite{Smith:1993}. You can also think of it as the observed variance over the expected variance. If they are the same, then the index is zero after subtracting one (from Maynard-Smith, 1993 \cite{Smith:1993}): -\beq -\label{eq:I_A} -I_A = \frac{V_O}{V_E}-1 -\eeq -Since the distance is more or less a binary distance, any sort of marker can be used for this analysis. In the calculation, phase is not considered, and any difference increases the distance between two individuals. Consider the genotypes of the dummy data frame we created earlier: -<>= -df <- data.frame(list(locus1=c("101/101", "102/103", "102/102"), - locus2=c("201/201","202/203","203/204"), - locus3=c("301/302", "301/303", "304/305"))) -df -@ -Now, consider the first locus represented in the genind object: -<>= -dfg@tab[, 1:3] -@ -Remember that each column represents a different allele and that each entry in the table represents the fraction of the genotype made up by that allele at that locus. Notice also that the sum of the rows all equal one. \textit{Poppr} uses this to calculate distances by simply taking the sum of the absolute values of the differences between rows. - -The calculation for the distance between two individuals at a single locus with $a$ allelic states and a ploidy of $k$ is as follows\footnote{Individuals with Presence / Absence data will have the $k/2$ term dropped.}: -\beq -\label{eq:ia_d} -d = \displaystyle \frac{k}{2}\sum_{i=1}^{a} \mid ind_{Ai} - ind_{Bi}\mid -\eeq -<<>>= -abs(dfg@tab[1, 1:3] - dfg@tab[2, 1:3]) -abs(dfg@tab[1, 1:3] - dfg@tab[3, 1:3]) -abs(dfg@tab[2, 1:3] - dfg@tab[3, 1:3]) +% Take a look a these two histograms. The number of ways you can recombine the +% data with the default sampling method is very small. Other sampling methods +% could give a more theoretical distribution. Let's try the parametric bootstrap +% (For details, see section \ref{data.manip:shuffle}). + +% \begin{figure}[h!] +% \begin{minipage}[b]{0.5\linewidth} +% \centering +% \caption{\footnotesize Output of multilocus-style sampling. Note the multimodal distribution.} +% \label{simulated_dist_fig} +% \includegraphics{simulated_dist_fig.pdf} +% \end{minipage} +% \hspace{0.5cm} +% \begin{minipage}[b]{0.5\linewidth} +% \centering +% \caption{\footnotesize Output for parametric bootstrap sampling.} +% \label{simulated_param_fig} +% \includegraphics{simulated_param_fig.pdf} +% \end{minipage} +% \end{figure} + + + + + +% % \begin{figure}[h!] +% % \centering +% % \caption{\footnotesize Output of multilocus-style sampling. Note the multi- +% % modal distribution.} +% % \label{simulated_dist_fig} +% % \includegraphics{simulated_dist_fig.png} +% % % <>= +% % % set.seed(2004) +% % % der <- poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=1, quiet=TRUE, hist=TRUE) +% % % @ +% % \end{figure} +% % \newpage + + +% % \begin{figure}[h!] +% % \centering +% % \caption{\footnotesize Output for parametric bootstrap sampling.} +% % \label{simulated_param_fig} +% % \includegraphics{simulated_param_fig.png} +% % % <>= +% % % set.seed(2004) +% % % der <- poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=3, quiet=TRUE) +% % % @ +% % \end{figure} + +% As you can see, the distribution is much closer to a distribution we would +% expect if this were a small sample of a larger population. + + +%=============================================================================% +%=============================================================================% +% +% +% +%=============================================================================% +%=============================================================================% +\section{Appendix} +\label{appendix} + +\subsection{General hierarchy method use} +\label{appendix:hier:method} + +\tab\tab To reiterate, there are currently 5 methods that manipulate population +hierarchies in genclone objects: +% latex table generated in R 3.0.3 by xtable 1.7-3 package +% Sat Mar 22 20:24:23 2014 +\begin{table}[ht] +\centering +\begin{tabular}{llll} + \hline + Method & Function & Input & Result\\ + \hline + split & \cmdlink{data.manip:hier:define:split}{splithierarchy} & formula & defined hierarchical levels\\ + set & \cmdlink{data.manip:hier:define:set}{sethierarchy} & data frame & new hierarchy\\ + get & \cmdlink{data.manip:hier:view}{gethierarchy} & formula & data frame\\ + name & \cmdlink{data.manip:hier:manip:name}{namehierarchy} & formula & new hierarchy names\\ + add & \cmdlink{data.manip:hier:manip:add}{addhierarchy} & vector or data frame & new hierarchical level\\ + \hline +\end{tabular} +\end{table} +\noindent +NOTE: Refer to \seclink{intro:genclone:access}{Accessing hierarchies} for more +details on how to access hierarchies.\\ + +These functions all have a syntax that looks like this: +<>= +newobject <- FUNCTION(object, input) +@ +Let's take a data set of \textit{Phytophthora infestans} collected from North +America and South America and use that as an example. It has two population +hierarchies defined, Continent and Country: +<>= +data(Pinf) +Pinf +@ +\noindent +Let's say I wanted to change the hierarchy names to Spanish. I can do that using +\texttt{namehierarchy}. +<>= +elPinf <- namehierarchy(Pinf, ~continente/pais) # Don't forget the formula syntax! +elPinf +Pinf +@ +\noindent +The original data set has stayed the same and we now have a new data set with +the names we want. + +Of course, it would be silly to create a new data set every time we wanted to do +something like change names. This is why all of the above functions (with the +exception of \texttt{gethierarchy}) all have the replacement syntax of: +<>= +FUNCTION(object) <- input +@ +\noindent +\textbf{NOTE: This is the preferred syntax}.\\ +\noindent +This allows the object to be edited \textit{in place} and makes things generally +easier. Let's revisit our previous example: +<>= +Pinf +namehierarchy(Pinf) <- ~continente/pais +Pinf +@ +While we will mainly be using the replacement syntax in this vignette, the +advantage to having both systems is that with the +\texttt{function(object, input)} syntax, you can test manipulation without +affecting your object. + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +% \subsection{Algorithmic Details} +% \label{appendix:algorithm} +% \subsubsection{$I_A$ and $\bar r_d$} +% \label{appendix:algorithm:iard} + +% \tab \tab The index of association was originally developed by A.H.D. Brown +% analyzing population structure of wheat \cite{Brown:1980}. It has been widely +% used as a tool to detect clonal reproduction within populations +% \cite{Smith:1993}. Populations whose members are undergoing sexual reproduction, +% whether it be selfing or out-crossing, will produce gametes via meiosis, and +% thus have a chance to shuffle alleles in the next generation. Populations whose +% members are undergoing clonal reproduction, however, generally do so via +% mitosis. This means that the most likely mechanism for a change in genotype is +% via mutation. The rate of mutation varies from species to species, but it is +% rarely sufficiently high to approximate a random shuffling of alleles. The index +% of association is a calculation based on the ratio of the variance of the raw +% number of differences between individuals and the sum of those variances over +% each locus \cite{Smith:1993}. You can also think of it as the observed variance +% over the expected variance. If they are the same, then the index is zero after +% subtracting one (from Maynard-Smith, 1993 \cite{Smith:1993}): +% \begin{equation} +% \label{eq:I_A} +% I_A = \frac{V_O}{V_E}-1 +% \end{equation} +% Since the distance is more or less a binary distance, any sort of marker can be +% used for this analysis. In the calculation, phase is not considered, and any +% difference increases the distance between two individuals. Consider the +% genotypes of the dummy data frame we created earlier: +% <>= +% df <- data.frame(list(locus1=c("101/101", "102/103", "102/102"), +% locus2=c("201/201", "202/203", "203/204"), +% locus3=c("301/302", "301/303", "304/305"))) +% df +% @ +% Now, consider the first locus represented in the genind object: +% <>= +% dfg <- df2genind(df, sep="/") +% dfg@tab[, 1:3] +% @ +% Remember that each column represents a different allele and that each entry in +% the table represents the fraction of the genotype made up by that allele at that +% locus. Notice also that the sum of the rows all equal one. \Poppr{} uses this to +% calculate distances by simply taking the sum of the absolute values of the +% differences between rows. + +% The calculation for the distance between two individuals at a single locus with +% $a$ allelic states and a ploidy of $k$ is as follows\footnote{Individuals with +% Presence / Absence data will have the $k/2$ term dropped.}: +% \begin{equation} +% \label{eq:ia_d} +% d = \displaystyle \frac{k}{2}\sum_{i=1}^{a} \mid ind_{Ai} - ind_{Bi}\mid +% \end{equation} +% <<>>= +% abs(dfg@tab[1, 1:3] - dfg@tab[2, 1:3]) +% abs(dfg@tab[1, 1:3] - dfg@tab[3, 1:3]) +% abs(dfg@tab[2, 1:3] - dfg@tab[3, 1:3]) +% @ +% As you can see, these values of $d$ at locus one add up to 2, 2, and 1, respectively. + +% To find the total number of differences between two individuals over all loci, +% you just take $d$ over $m$ loci, a value we'll call $D$: + +% \begin{equation} +% \label{eq:ia_D} +% D = \displaystyle \sum_{i=1}^{m} d_i +% \end{equation} + +% These values are calculated over all possible combinations of individuals in the +% data set, ${n \choose 2}$ after which you end up with ${n \choose 2}\cdot{} m$ +% values of $d$ and ${n \choose 2}$ values of $D$. Calculating the observed +% variances is fairly straightforward (modified from Agapow and Burt, 2001) +% \cite{Agapow:2001}: + +% \begin{equation} +% \label{eq:V_O} +% V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - \frac{(\displaystyle\sum_{i=1}^{n \choose 2} D_{i})^2}{{n \choose 2}}}{{n \choose 2}} +% \end{equation} + +% Calculating the expected variance is the sum of each of the variances of the +% individual loci. The calculation at a single locus, $j$ is the same as the +% previous equation, substituting values of $D$ for $d$ \cite{Agapow:2001}: + +% \begin{equation} +% \label{eq:var_j} +% var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - \frac{(\displaystyle\sum_{i=1}^{n \choose 2} d_i)^2}{{n \choose 2}}}{{n \choose 2}} +% \end{equation} + +% The expected variance is then the sum of all the variances over all $m$ loci +% \cite{Agapow:2001}: + +% \begin{equation} +% \label{eq:V_E} +% V_E = \displaystyle \sum_{j=1}^{m} var_j +% \end{equation} + +% Now you can plug the sums of equations (\ref{eq:V_O}) and (\ref{eq:V_E}) into +% equation (\ref{eq:I_A}) to get the index of association. Of course, Agapow and +% Burt showed that this index increases steadily with the number of loci, so they +% came up with an approximation that is widely used, $\bar r_d$ +% \cite{Agapow:2001}. For the derivation, see the manual for \textit{multilocus}. +% The equation is as follows, utilizing equations (\ref{eq:V_O}), +% (\ref{eq:var_j}), and (\ref{eq:V_E}) \cite{Agapow:2001}: + +% \begin{equation} +% \label{eq:r_d} +% \bar{r_d} = \frac{V_O - V_E} +% {2\displaystyle \sum_{j=1}^{m}\displaystyle \sum_{k \neq j}^{m}\sqrt{var_j\cdot{} var_k}} +% \end{equation} + +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \subsubsection{Bruvo's distance} +% \label{appendix:algorithm:bruvo} + +% \tab \tab Bruvo's distance between two individuals calculates the minimum +% distance across all combinations of possible pairs of alleles at a single locus +% and then averaging that distance across all loci \cite{Bruvo:2004}. The distance +% between each pair of alleles is calculated as \cite{Bruvo:2004}: + +% \begin{equation} +% \label{eq:m_x} +% m_x = 2^{-\mid x \mid} +% \end{equation} + +% \begin{equation} +% \label{eq:d_a} +% d_a = 1 - m_x +% \end{equation} + +% Where $x$ is the number of steps between each allele. So, let's say we were +% comparing two haploid $(k = 1)$ individuals with alleles 228 and 244 at a locus +% that had a tetranucleotide repeat pattern (CATG$)^n$. The number of steps for +% each of these alleles would be $228/4 = 57$ and $244/4 =61$, respectively. The +% number of steps between them is then $\mid 57 - 61 \mid = 4$. Bruvo's distance +% at this locus between these two individuals is then $1-2^{-4} = 0.9375$. For +% samples with higher ploidy ($k$), there would be $k$ such distances of which we +% would need to take the sum \cite{Bruvo:2004}. + +% \begin{equation} +% \label{eq:s_i} +% s_i = \displaystyle \sum_{a=1}^{k} d_a +% \end{equation} + +% Unfortunately, it's not as simple as that since we do not assume to know phase. +% Because of this, we need to take all possible combinations of alleles into +% account. This means that we will have $k^2$ values of $d_a$, when we only want +% $k$. How do we know which $k$ distances we want? We will have to invoke +% parsimony for this and attempt to take the minimum sum of the alleles, of which +% there are $k!$ possibilities \cite{Bruvo:2004}: + +% \begin{equation} +% \label{eq:d_l} +% d_l = \frac{\left(\displaystyle \min_{i \dotsc k!} s_i\right)}{k} +% \end{equation} + +% Finally, after all of this, we can get the average distance over all loci +% \cite{Bruvo:2004}. + +% \begin{equation} +% \label{eq:D} +% D = \frac{\displaystyle \sum_{i=1}^l d_i}{l} +% \end{equation} + +% This is calculated over all possible combinations of individuals and results in +% a lower triangle distance matrix over all individuals. + +% \subsubsection{Special Cases of Bruvo's distance} +% \label{appendix:algorithm:bruvospecial} +% \tab\tab As shown in the above section, ploidy is irrelevant with respect to +% calculation of Bruvo's distance. However, since it makes a comparison between +% all alleles at a locus, it only makes sense that the two loci need to have the +% same ploidy level. Unfortunately for polyploids, it's often difficult to fully +% separate distinct alleles at each locus, so you end up with genotypes that +% appear to have a lower ploidy level than the organism \cite{Bruvo:2004}. + +% To help deal with these situatons, Bruvo has suggested three methods for dealing +% with these differences in ploidy levels \cite{Bruvo:2004}: +% \begin{itemize} +% \item{Infinite Model -} The simplest way to deal with it is to count all +% missing alleles as infinitely large so that the distance between it and +% anything else is 1. Aside from this being computationally simple, it will tend +% to inflate distances between individuals. +% \item{Genome Addition Model -} If it is suspected that the organism has gone +% through a recent genome expansion, the missing alleles will be replace with +% all possible combinations of the observed alleles in the shorter genotype. For +% example, if there is a genotype of [69, 70, 0, 0] where 0 is a missing allele, +% the possible combinations are: [69, 70, 69, 69], [69, 70, 69, 70], and [69, +% 70, 70, 70]. The resulting distances are then averaged over the number of +% comparisons. +% \item{Genome Loss Model -} This is similar to the genome addition model, +% except that it assumes that there was a recent genome reduction event and uses +% the observed values in the full genotype to fill the missing values in the +% short genotype. As with the Genome Addition Model, the resulting distances are +% averaged over the number of comparisons. +% \item{Combination Model -} Combine and average the genome addition and loss +% models. +% \end{itemize} + +% As mentioned above, the infinite model is biased, but it is not nearly as +% computationally intensive as either of the other models. The reason for this is +% that both of the addition and loss models requires replacement of alleles and +% recalculation of Bruvo's distance. The number of replacements required is equal +% to the multiset coefficient: $\left({n \choose k}\right) == {(n-k+1) \choose k}$ +% where $n$ is the number of potential replacements and $k$ is the number of +% alleles to be replaced. So, for the example given above, The genome addition +% model would require $\left({2 \choose 2}\right) = 3$ calculations of Bruvo's +% distance, whereas the genome loss model would require $\left({4 \choose +% 2}\right) = 10$ calculations. + +% To reduce the number of calcuations and assumptions otherwise, Bruvo's distance +% will be calculated using the largest observed ploidy. This means that when +% comparing [69,70,71,0] and [59,60,0,0], they will be treated as triploids. + +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Manipulating Graphics} + +\Poppr{} utilizes \textit{ggplot2} to produce many of its graphs. One advantage +it gives the user is the ability to manipulate these graphs. With base R graphs, +the only manipulation that can be performed is by adding elements to the graph. +It is a static image. The ggplot graphs are actually represented as objects in +your R environment. We can use the function \texttt{last\_plot()} from +\textit{ggplot2} to be able to grab the plot that was plotted last in our +window. Let's illustrate this using a MLG bar graph from the Athena population +of the Aeut data set. + +<>= +library(poppr) +library(ggplot2) +data(Aeut) +Athena.tab <- mlg.table(Aeut, sublist = "Athena") +p <- last_plot() @ -As you can see, these values of $d$ at locus one add up to 2, 2, and 1, respectively. - -To find the total number of differences between two individuals over all loci, you just take $d$ over $m$ loci, a value we'll call $D$: - -\beq -\label{eq:ia_D} -D = \displaystyle \sum_{i=1}^{m} d_i -\eeq - -These values are calculated over all possible combinations of individuals in the data set, ${n \choose 2}$ after which you end up with ${n \choose 2}\cdot{}m$ values of $d$ and ${n \choose 2}$ values of $D$. -Calculating the observed variances is fairly straightforward (modified from Agapow and Burt, 2001) \cite{Agapow:2001}: - -\beq -\label{eq:V_O} -V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - \frac{(\displaystyle\sum_{i=1}^{n \choose 2} D_{i})^2}{{n \choose 2}}}{{n \choose 2}} -\eeq - -Calculating the expected variance is the sum of each of the variances of the individual loci. The calculation at a single locus, $j$ is the same as the previous equation, substituting values of $D$ for $d$ \cite{Agapow:2001}: - -\beq -\label{eq:var_j} -var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - \frac{(\displaystyle\sum_{i=1}^{n \choose 2} d_i)^2}{{n \choose 2}}}{{n \choose 2}} -\eeq - -The expected variance is then the sum of all the variances over all $m$ loci \cite{Agapow:2001}: - -\beq -\label{eq:V_E} -V_E = \displaystyle \sum_{j=1}^{m} var_j -\eeq - -Now you can plug the sums of equations (\ref{eq:V_O}) and (\ref{eq:V_E}) into equation (\ref{eq:I_A}) to get the index of association. -Of course, Agapow and Burt showed that this index increases steadily with the number of loci, so they came up with an approximation that is widely used, $\bar r_d$ \cite{Agapow:2001}. For the derivation, see the manual for \textit{multilocus}. The equation is as follows, utilizing equations (\ref{eq:V_O}), (\ref{eq:var_j}), and (\ref{eq:V_E}) \cite{Agapow:2001}: -\beq -\label{eq:r_d} -\bar{r_d} = \frac{V_O - V_E} -{2\displaystyle \sum_{j=1}^{m}\displaystyle \sum_{k \neq j}^{m}\sqrt{var_j\cdot{}var_k}} -\eeq +We've captured our plot using \texttt{last\_plot()} and now we can manipulate +it. Let's say we didn't like the scale going from blue to black and we wanted +make it monotone without a color guide. (NOTE: the () around +the call allows us to show the result immediately.) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsubsection{Bruvo's distance}\label{appendix:algorithm:bruvo} - -\tab \tab Bruvo's distance between two individuals calculates the minimum distance across all combinations of possible pairs of alleles at a single locus and then averaging that distance across all loci \cite{Bruvo:2004}. The distance between each pair of alleles is calculated as \cite{Bruvo:2004}: - -\beq -\label{eq:m_x} -m_x = 2^{-\mid x \mid} -\eeq - -\beq -\label{eq:d_a} -d_a = 1 - m_x -\eeq - -Where $x$ is the number of steps between each allele. So, let's say we were comparing two haploid $(k = 1)$ individuals with alleles 228 and 244 at a locus that had a tetranucleotide repeat pattern (CATG$)^n$. The number of steps for each of these alleles would be $228/4 = 57$ and $244/4 =61$, respectively. The number of steps between them is then $\mid 57 - 61 \mid = 4$. Bruvo's distance at this locus between these two individuals is then $1-2^{-4} = 0.9375$. For samples with higher ploidy ($k$), there would be $k$ such distances of which we would need to take the sum \cite{Bruvo:2004}. - -\beq -\label{eq:s_i} -s_i = \displaystyle \sum_{a=1}^{k} d_a -\eeq - -Unfortunately, it's not as simple as that since we do not assume to know phase. Because of this, we need to take all possible combinations of alleles into account. This means that we will have $k^2$ values of $d_a$, when we only want $k$. How do we know which $k$ distances we want? We will have to invoke parsimony for this and attempt to take the minimum sum of the alleles, of which there are $k!$ possibilities \cite{Bruvo:2004}: +<>= +(pb <- p + scale_fill_continuous(low = "black", high = "black", guide = "none")) +@ -\beq -\label{eq:d_l} -d_l = \frac{\left(\displaystyle \min_{i \dotsc k!} s_i\right)}{k} -\eeq +We could also change the title -Finally, after all of this, we can get the average distance over all loci \cite{Bruvo:2004}. +<>= +(pbt <- pb + ggtitle("Distribution of multilocus genotypes for the Athena population")) + xlab("Multilocus genotype") +@ -\beq -\label{eq:D} -D = \frac{\displaystyle \sum_{i=1}^l d_i}{l} -\eeq +This allows you to produce publication quality graphs directly in R. Please see +Hadley Wickham's \textit{ggplot2} package for more details \cite{ggplot2}. Note +that if you don't like using \textit{ggplot2}, you can access the data in the +ggplot2 object and plot the data yourself: -This is calculated over all possible combinations of individuals and results in a lower triangle distance matrix over all individuals. +<>= +head(p$data) +@ -\subsection{Exporting Graphics}\label{appendix:graphics} -\tab\tab R has the ability to produce nice graphics from most any type of data, but to get these graphics into a report, presentation, or manuscript can be a bit challenging. It's no secret that the R Documentation pages are a little difficult to interpret, so I will give the reader here a short example on how to export graphics from R. Note that any code here that will produce images will also be present in other places in this vignette. The default installation of the R GUI is quite minimal, and for an easy way to manage your plots and code, I strongly encourage the user to use Rstudio \url{http://www.rstudio.com/}. -\subsubsection{Basics}\label{appendix:graphics:basics} -\tab\tab Before you export graphics, you have to ask yourself what they will be used for. If you want to use the graphic for a website, you might want to opt for a low-resolution image so that it can load quickly. With printing, you'll want to make sure that you have a scalable or at least a very high resolution image. Here, I will give some general guidelines for graphics (note that these are merely suggestions, not defined rules). +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Exporting Graphics} +\label{appendix:graphics} +\tab\tab R has the ability to produce nice graphics from most any type of data, +but to get these graphics into a report, presentation, or manuscript can be a +bit challenging. It's no secret that the R Documentation pages are a little +difficult to interpret, so I will give the reader here a short example on how to +export graphics from R. Note that any code here that will produce images will +also be present in other places in this vignette. The default installation of +the R GUI is quite minimal, and for an easy way to manage your plots and code, I +strongly encourage the user to use Rstudio \url{http://www.rstudio.com/}. + +\subsubsection{Basics} +\label{appendix:graphics:basics} +\tab\tab Before you export graphics, you have to ask yourself what they will be +used for. If you want to use the graphic for a website, you might want to opt +for a low-resolution image so that it can load quickly. With printing, you'll +want to make sure that you have a scalable or at least a very high resolution +image. Here, I will give some general guidelines for graphics (note that these +are merely suggestions, not defined rules). \begin{itemize} - \item \textbf{What you see is not always what you get} I have often seen presentations where the colors were too light or posters with painfully pixellated graphs. Think about what you are going to be using a graphic for and how it will appear to the intended audience given the media type. - \item \textbf{$\geq$ 300 dpi unless its for a web page} For any sort of printed material that requires a raster based image, 300dpi (dots per inch) is the absolute minimum resolution you should use. For simple black and white line images, 1200dpi is better. This will leave you with crisp, professional looking images. - \item \textbf{If possible, save to SVG, then rasterize} Raster images (bmp, png, jpg, etc...) are based off of the number of pixels or dots per inch it takes to render the image. This means that the raster image is more or less a very fine mosaic. Vector images (SVG) are built upon several interconnected polygons, arcs, and lines that scale relative to one another to create your graphic. With vector graphics, you can produce a plot and scale it to the size of a building if you wanted to. When you save to an SVG file first, you can also manipulate it in programs such as Adobe Illustrator or Inkscape. - \item \textbf{Before saving, make sure the units and dimensions are correct} Unless you really wanted to save a graph that's over 6 feet wide. + \item \textbf{What you see is not always what you get} I have often seen + presentations where the colors were too light or posters with painfully + pixellated graphs. Think about what you are going to be using a graphic for + and how it will appear to the intended audience given the media type. + \item \textbf{$\geq$ 300 dpi unless its for a web page} For any sort of + printed material that requires a raster based image, 300dpi (dots per inch) is + the absolute minimum resolution you should use. For simple black and white + line images, 1200dpi is better. This will leave you with crisp, professional + looking images. + \item \textbf{If possible, save to SVG, then rasterize} Raster images (bmp, + png, jpg, etc...) are based off of the number of pixels or dots per inch it + takes to render the image. This means that the raster image is more or less a + very fine mosaic. Vector images (SVG) are built upon several interconnected + polygons, arcs, and lines that scale relative to one another to create your + graphic. With vector graphics, you can produce a plot and scale it to the size + of a building if you wanted to. When you save to an SVG file first, you can + also manipulate it in programs such as Adobe Illustrator or Inkscape. + \item \textbf{Before saving, make sure the units and dimensions are correct} + Unless you really wanted to save a graph that's over 6 feet wide. \end{itemize} -\subsubsection{Image Editors}\label{appendix:graphics:editors} -\tab\tab Often times, fine details such as labels on networks need to be tweaked by hand. Luckily, there are a wide variety of programs that can help you do that. Here is a short list of image editors (both free and for a price) that you can use to edit your graphics. +\subsubsection{Image Editors} +\label{appendix:graphics:editors} +\tab\tab Often times, fine details such as labels on networks need to be tweaked +by hand. Luckily, there are a wide variety of programs that can help you do +that. Here is a short list of image editors (both free and for a price) that you +can use to edit your graphics. \begin{itemize} \item Bitmap based editors (for jpeg, bmp, png, etc...) \begin{quote} \begin{itemize} \item[ \scshape The GIMP ] Free, cross-platform. \url{http://www.gimp.org} - \item[ \scshape Paint.net ] Free, Windows only. \url{http://www.getpaint.net} - \item[ \scshape Adobe Photoshop ] Pricey, Windows and Mac. \url{http://www.adobe.com/products/photoshop.html} + \item[ \scshape Paint.net ] Free, Windows only. + \url{http://www.getpaint.net} + \item[ \scshape Adobe Photoshop ] Pricey, Windows and Mac. + \url{http://www.adobe.com/products/photoshop.html} \end{itemize} \end{quote} \item Scalable Vector Graphics based editors (for svg, pdf) \begin{quote} \begin{itemize} \item[ \scshape Inkscape ] Free, cross-platform \url{http://inkscape.org} - \item[ \scshape Adobe Illustrator ] Pricey, Windows and Mac. \url{http://www.adobe.com/products/illustrator.html} + \item[ \scshape Adobe Illustrator ] Pricey, Windows and Mac. + \url{http://www.adobe.com/products/illustrator.html} \end{itemize} \end{quote} \end{itemize} -\subsubsection{Exporting ggplot2 graphics}\label{appendix:graphics:ggplot2} -\tab\tab \textit{ggplot2} is a fantastic package that \textit{poppr} uses to produce graphs for the \texttt{mlg.table}, \texttt{poppr}, and \texttt{ia} functions. Saving a plot with \textit{ggplot2} is performed with one command after your plot has rendered: +\subsubsection{Exporting ggplot2 graphics} +\label{appendix:graphics:ggplot2} +\tab\tab \textit{ggplot2} is a fantastic package that \poppr{} uses to produce +graphs for the \texttt{mlg.table}, \texttt{poppr}, and \texttt{ia} functions. +Saving a plot with \textit{ggplot2} is performed with one command after your +plot has rendered: <>= data(nancycats) # Load the data set. poppr(nancycats, sublist=5, sample=999) # Produce a single plot. ggsave("nancy5.pdf") @ -Note that you can name the file anything, and \texttt{ggsave} will save it in that format for you. The details are in the documentation and you can access it by typing \texttt{help("ggsave")} in your R console. The important things to note are that you can set a \texttt{width}, \texttt{height}, and \texttt{unit}. The only downside to this function is that you can only save one plot at a time. If you want to be able to save multiple plots, read on to the next section. - -\subsubsection{Exporting any graphics}\label{appendix:graphics:export} -\tab\tab Some of the functions that \textit{poppr} offers will give you multiple plots, and if you want to save them all, using \texttt{ggsave} will require a lot of tedious typing and clicking. Luckily, R has Functions that will save any plot you generate in nearly any image format you want. You can save in raster images such as png, bpm, and jpeg. You can also save in vector based images such as svg, pdf, and postscript. The important thing to remember is that when you are saving in a raster format, the default units of measurement are ``pixels", but you can change that by specifying your unit of choice and a resolution. - -For raster images and svg files, you can only save your plots in multiple files, but pdf and postscript plots can be saved in one file as multiple pages. All of these functions have the same basic form. You call the function to specify the file type you want (eg. \texttt{pdf("myfile.pdf")}), create any graphs that you want to create, and then make sure to close the session with the function \texttt{dev.off()}. Let's give an example saving to pdf and png files. +Note that you can name the file anything, and \texttt{ggsave} will save it in +that format for you. The details are in the documentation and you can access it +by typing \texttt{help("ggsave")} in your R console. The important things to +note are that you can set a \texttt{width}, \texttt{height}, and \texttt{unit}. +The only downside to this function is that you can only save one plot at a time. +If you want to be able to save multiple plots, read on to the next section. + +\subsubsection{Exporting any graphics} +\label{appendix:graphics:export} +\tab\tab Some of the functions that \poppr{} offers will give you multiple +plots, and if you want to save them all, using \texttt{ggsave} will require a +lot of tedious typing and clicking. Luckily, R has Functions that will save any +plot you generate in nearly any image format you want. You can save in raster +images such as png, bpm, and jpeg. You can also save in vector based images such +as svg, pdf, and postscript. The important thing to remember is that when you +are saving in a raster format, the default units of measurement are ``pixels", +but you can change that by specifying your unit of choice and a resolution. + +For raster images and svg files, you can only save your plots in multiple files, +but pdf and postscript plots can be saved in one file as multiple pages. All of +these functions have the same basic form. You call the function to specify the +file type you want (eg. \texttt{pdf("myfile.pdf")}), create any graphs that you +want to create, and then make sure to close the session with the function +\texttt{dev.off()}. Let's give an example saving to pdf and png files. <>= data(H3N2) @@ -1877,7 +4594,14 @@ dev.off() #### @ -Since this data set is made up of 30 populations with more than 1 individual, this will save 30 files to your working directory named ``H3N2\_barchart01.png...H3N2\_barchart30.png". The way R knows how to number these files is because of the \texttt{\%02d} part of the command. That's telling R to use a number that is two digits long in place of that expression. All of these files will be 14x14" and will have a resolution of 300 dots per inch. If you wanted to do the same thing, but place them all in one file, you should use the pdf option. +Since this data set is made up of 30 populations with more than 1 individual, +this will save 30 files to your working directory named +``H3N2\_barchart01.png...H3N2\_barchart30.png". The way R knows how to number +these files is because of the \texttt{\%02d} part of the command. That's telling +R to use a number that is two digits long in place of that expression. All of +these files will be 14x14" and will have a resolution of 300 dots per inch. If +you wanted to do the same thing, but place them all in one file, you should use +the pdf option. <>= pdf("H3N2_barcharts.png", width = 14, height = 14, compress = FALSE) @@ -1885,38 +4609,104 @@ H.tab <- mlg.table(H3N2) dev.off() @ -Remember, it is important not to forget to type \texttt{dev.off()} when you are done making graphs. Note that I did not have to specify a resolution for this image since it is based off of vector graphics. +Remember, it is important not to forget to type \texttt{dev.off()} when you are +done making graphs. Note that I did not have to specify a resolution for this +image since it is based off of vector graphics. -\subsection{Function calls}\label{appendix:funk} +%-----------------------------------------------------------------------------% +% +%-----------------------------------------------------------------------------% +\subsection{Table of Functions} +\label{appendix:funk} -\tab\tab Here is a list of all the default function calls for \textit{poppr}. Details can be found in the above sections.\\ -\tt \small -\begin{itemize} -\item getfile(multi = FALSE, pattern = NULL, combine = TRUE) (Section \ref{intro:import:getfile}) -\item read.genalex(genalex, ploidy = 2, geo = FALSE, region = FALSE) (Section \ref{intro:import:read.genalex}) -\item genind2genalex(pop, filename = "genalex.csv", quiet = FALSE, geo = FALSE, geodf = "xy") (Section \ref{intro:import:genind2genalex}) -\item missingno(pop, type = "loci", cutoff = 0.05, quiet = FALSE) (Section \ref{data.manip:missing:missingno}) -\item splitcombine(pop, method = 1, dfname = "population\_hierarchy", sep = "\_", hier = c(1), setpopulation = TRUE, fixed = TRUE) (Section \ref{data.manip:hier:splitcombine}) -\item popsub(pop, sublist = "ALL", blacklist = NULL, mat = NULL) (Section \ref{data.manip:divide:popsub}) -\item clonecorrect(pop, hier = c(1), dfname = "population\_hierarchy", combine = FALSE, keep = 1) (Section \ref{data.manip:cc:clonecorrect}) -\item shufflepop(pop, method = 1) (Section \ref{data.manip:shuffle:shufflepop}) -\item informloci(pop, cutoff = 2/nInd(pop), quiet = FALSE) (Section \ref{data.manip:informloci}) -\item mlg(pop, quiet = FALSE) (Section \ref{mlg:mlg:mlg}) -\item mlg.crosspop(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, indexreturn = FALSE, df = FALSE, quiet = FALSE) (Section \ref{mlg:cross:mlg.crosspop}) -\item mlg.table(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, bar = TRUE, total = FALSE, quiet = FALSE) (Section \ref{mlg:table:mlg.table}) -\item mlg.vector(pop) (Section \ref{mlg:mix:mlg.vector}) -\item ia(pop, sample = 0, method = 1, quiet = FALSE, missing = "ignore", hist = TRUE) (Section \ref{index:iard:ia}) -\item diss.dist(pop) (Section \ref{index:dist:diss.dist}) -\item bruvo.dist(pop, replen = c(2)) (Section \ref{index:bruvo:bruvo.dist}) -\item bruvo.boot(pop, replen = c(2), sample = 100, tree = "upgma", showtree = TRUE, -cutoff = NULL, quiet = FALSE, ...) (Section \ref{index:trees:bruvo.boot}) -\item greycurve(glim = c(0, 0.8), gadj = 3, gweight = 1) (Section \ref{index:trees:greycurve}) -\item bruvo.msn(pop, replen = c(2), palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) (Section \ref{index:trees:bruvo.msn}) -\item poppr.msn(pop, distmat, palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) (Section \ref{index:trees:poppr.msn}) -\item poppr(pop, total = TRUE, sublist = c("ALL"), blacklist = c(NULL), sample = 0, method = 1, missing = "ignore", cutoff=0.05, quiet = FALSE, clonecorrect = FALSE, hier = c(1), keep = 1, dfname = "population\_hierarchy", hist = TRUE, minsamp = 10) (Section \ref{summary:poppr}) -\item poppr.all(filelist, ...) (Sections \ref{intro:import:getfile} and \ref{summary:poppr}) -\end{itemize} -\normalsize -\bibliographystyle{plain} +\tab\tab Below is a table of functions found in \poppr{}. These functions are +linked within the document. If a function name is blue, simply click on it to go +to its definition and description. + +\begin{center} +\begin{longtable}{ll} +\caption{Functions available in \poppr{}} \label{function_table} \\ + +\hline \\[-1.5ex] \textbf{\large Function} & \textbf{\large Description} \\ \hline \\[-1.5ex] +\endfirsthead + +\multicolumn{2}{c}% +{{\bfseries \tablename\ \thetable{} -- continued from previous page}} \\ +\hline \\[-1.5ex] \textbf{\large Function} & \textbf{\large Description} \\ \hline \\[-1.5ex] +\endhead + +\hline \multicolumn{2}{r}{\textbf{Continued on next page...}} \\ +\endfoot + +\hline \hline +\endlastfoot + +\multicolumn{2}{l}{\Large\textsc{Import/Export}}\\ +\cmdlink{intro:import:getfile}{getfile} & Provides a quick GUI to grab files for import \\ +\cmdlink{intro:import:read.genalex}{read.genalex} & Read \textit{GenAlEx} formatted csv files to a genind object \\ +\cmdlink{intro:import:genind2genalex}{genind2genalex} & Converts genind objects to \textit{GenAlEx} formatted csv files \\ +\cmdlink{intro:genclone:as.genclone}{as.genclone} & Converts genind objects to genclone objects \\ +\hline \\[-1.5ex] +\multicolumn{2}{l}{\Large\textsc{Manipulation}}\\ +\cmdlink{data.manip:hier:setpop}{setpop} & Set the population using defined hierarchies\\ +\cmdlink{data.manip:hier:define}{splithierarchy} & Split a concatenated hierarchy imported as a population\\ +\cmdlink{data.manip:hier:define}{sethierarchy} & Define a population hierarchy of a genclone object\\ +\cmdlink{data.manip:hier:view}{gethierarchy} & Extract the hierarchy data frame\\ +\cmdlink{data.manip:hier:manip}{addhierarchy} & Add a vector or data frame to an existing hierarchy \\ +\cmdlink{data.manip:hier:manip}{namehierarchy} & Rename a population hierarchy\\ +\cmdlink{data.manip:missing:missingno}{missingno} & Handles missing data \\ +\cmdlink{data.manip:cc:clonecorrect}{clonecorrect} & Clone censors at a specified population hierarchy \\ +\cmdlink{data.manip:informloci}{informloci} & Detects and removes phylogenetically uninformative loci \\ +\cmdlink{data.manip:divide:popsub}{popsub} & Subsets genind objects by population \\ +\cmdlink{data.manip:shuffle:shufflepop}{shufflepop} & Shuffles genotypes at each locus using four different shuffling algorithms \\ +\texttt{splitcombine*} & Manipulates population hierarchy *Deprecated\\ +\cmdlink{intro:import:polyploid}{recode\_polyploids} & recode polyploid data sets with missing alleles imported as ``0''\\ +\hline \\[-1.5ex] +\multicolumn{2}{l}{\Large\textsc{Distances}}\\ +\texttt{bruvo.dist} & Bruvo's distance \\ +\texttt{diss.dist} & Absolute genetic distance (see \texttt{provesti.dist}) \\ +\texttt{nei.dist} & Nei's 1978 genetic distance \\ +\texttt{rogers.dist} & Rogers' euclidean distance \\ +\texttt{reynolds.dist} & Reynolds' coancestry distance \\ +\texttt{edwards.dist} & Edwards' angular distance \\ +\texttt{provesti.dist} & Provesti's absolute genetic distance \\ +\hline \\[-1.5ex] +\multicolumn{2}{l}{\Large\textsc{Bootstrapping}}\\ +\texttt{aboot} & Creates a bootstrapped dendrogram for any distance measure \\ +\texttt{bruvo.boot} & Produces dendrograms with bootstrap support\\ + & based on Bruvo's distance \\ +\hline \\[-1.5ex] +\multicolumn{2}{l}{\Large\textsc{Analysis}}\\ +\texttt{poppr.amova} & Analysis of Molecular Variance (as implemented in ade4) \\ +\texttt{ia} & Calculates the index of association \\ +\cmdlink{mlg:mlg:mlg}{mlg} & Calculates the number of multilocus genotypes \\ +\cmdlink{mlg:cross:mlg.crosspop}{mlg.crosspop} & Finds all multilocus genotypes that cross populations \\ +\cmdlink{mlg:table:mlg.table}{mlg.table} & Returns a table of populations by multilocus genotypes \\ +\cmdlink{mlg:mix:mlg.vector}{mlg.vector} & Returns a vector of a numeric multilocus genotype assignment for \\ + & each individual \\ +\cmdlink{mlg:mix:mlg.id}{mlg.id} & Identifies individuals associated with each MLG \\ +\texttt{poppr} & Returns a diversity table by population \\ +\texttt{poppr.all} & Returns a diversity table by population for all compatible files specified \\ +\texttt{private\_alleles} & Tabulates the occurrences of alleles that only occur in one population\\ +\texttt{locus\_table} & Creates a table of summary statistics per locus\\ +\hline \\[-1.5ex] +\multicolumn{2}{l}{\Large\textsc{Visualization}}\\ +\texttt{plot\_poppr\_msn} & Plots minimum spanning networks produced in poppr with\\ + & scale bar and legend \\ +\texttt{greycurve} & Helper to determine the appropriate parameters for adjusting the\\ + & grey level for msn functions \\ +\texttt{bruvo.msn} & Produces minimum spanning networks based off Bruvo's distance\\ + & colored by population \\ +\texttt{poppr.msn} & Produces a minimum spanning network for any pairwise distance\\ + & matrix related to the data \\ +\texttt{info\_table} & Creates a heatmap representing missing data or observed ploidy\\ +\texttt{genotype\_curve} & Creates a series of boxplots demonstrating how many loci are\\ + & needed to represent the diversity of your data.\\ +\hline + +\end{longtable} +\end{center} + +\bibliographystyle{pnas.bst} \bibliography{poppr_man} \end{document} diff --git a/vignettes/poppr_manual.pdf b/vignettes/poppr_manual.pdf index 8081dba4..46379579 100644 Binary files a/vignettes/poppr_manual.pdf and b/vignettes/poppr_manual.pdf differ diff --git a/vignettes/poppr_manual.tex b/vignettes/poppr_manual.tex deleted file mode 100755 index 6e5d700b..00000000 --- a/vignettes/poppr_manual.tex +++ /dev/null @@ -1,3166 +0,0 @@ -\documentclass[letterpaper]{article} -%\VignetteIndexEntry{Poppr User Manual} -\usepackage{graphicx} -\usepackage[colorlinks=true,urlcolor=blue]{hyperref} -\usepackage{array} -\usepackage{color} -\usepackage[usenames,dvipsnames,svgnames,table]{xcolor} -\usepackage[utf8]{inputenc} % for UTF-8/single quotes from sQuote() -\usepackage{fullpage} -\usepackage{mathtools} -\usepackage{makeidx} -% \usepackage{lineno} -% Doublespacing. -% \usepackage{setspace} -% \setstretch{2} - -% for bold symbols in mathmode -\usepackage{bm} -\newcommand{\R}{\mathbb{R}} -\newcommand{\beq}{\begin{equation}} -\newcommand{\eeq}{\end{equation}} -\newcommand{\m}[1]{\mathbf{#1}} -\newcommand{\tab}{\hspace*{1em}} - -\title{Poppr 1.0.5: An R package for genetic analysis of populations with mixed (clonal/sexual) reproduction} -\author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) Department of Botany and Plant Pathology, Oregon State University, Corvallis, OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, Corvallis, OR}} - -\usepackage{Sweave} -\begin{document} -% Set the width of figures. -\setkeys{Gin}{width=0.5\textwidth} -\input{poppr_manual-concordance} -\definecolor{Sinput}{rgb}{0.75,0.19,0.19} -\definecolor{Soutput}{rgb}{0,0,0} -\definecolor{Scode}{rgb}{0.75,0.19,0.19} -\definecolor{light-gray}{gray}{0.95} -\definecolor{salmon}{HTML}{F0AAAA} -\DefineVerbatimEnvironment{Sinput}{Verbatim} -{formatcom={\color{Sinput}},fontsize=\footnotesize, baselinestretch=0.75} -\DefineVerbatimEnvironment{Soutput}{Verbatim} -{formatcom={\color{Soutput}},fontsize=\footnotesize, baselinestretch=0.75} -% The first page will have the title, abstract, and then the \textit{Poppr} logo at the bottom. -\maketitle -\begin{abstract} -\textit{Poppr} provides open-source, cross-platform tools for quick analysis of population genetic data enabling focus on data analysis and interpretation. While there are a plethora of packages for population genetic analysis, few are able to offer quick and easy analysis of populations with mixed reproductive modes. \textit{Poppr}'s main advantage is the ease of use and integration with other packages such as \textit{adegenet} and \textit{vegan}, including support for novel methods such as clone correction, multilocus genotype analysis, calculation of Bruvo's distance and the index of association. -\end{abstract} -% Inserting the \textit{Poppr} logo here -\begin{figure}[b] - \centering - \label{logo} - \includegraphics{popprlogo} -\end{figure} -\newpage -\begingroup -\hypersetup{linkcolor=black} -\tableofcontents -\endgroup - -%\linenumbers -\section{Introduction}\label{intro} -\subsection{Purpose}\label{intro:purpose} - -\tab\tab\textit{Poppr} is an R package with convenient functions for analysis of genetic data with mixed modes of reproduction including sexual and clonal reproduction. While there are many R packages in CRAN and other repositories with tools for population genetic analyses, few are appropriate for populations with mixed modes of reproduction. There are several stand alone programs that can handle these types of data sets, but they are often platform specific and often only accept specific data types. Furthermore, a typical analysis often involves switching between many programs, and converting data to each specific format. - -\textit{Poppr} is designed to make analysis of populations with mixed reproductive modes more streamlined and user friendly so that the researcher using it can focus on data analysis and interpretation. \textit{Poppr} allows analysis of haploid and diploid dominant/co-dominant marker data including microsattelites, Single Nucleotide Polymorphisms (SNP), and Amplified Fragment Length Polymorphisms (AFLP). To avoid creating yet another file format that is specific to a program, \textit{poppr} was created on the backbone of the popular R package \textit{adegenet} and can take all the file formats that \textit{adegenet} can take (Genpop, Genetix, Fstat, and Structure) and newly introduces compatibility with GenAlEx formatted files (exported to CSV). This means that anything you can analyze in \textit{adegenet} can be further analyzed with \textit{poppr}. - -The real power of \textit{poppr} is in the data manipulation and analytic tools. \textit{Poppr} has the ability to define multiple population hierarchies, clone-censor, and subset data sets. With \textit{poppr} you can also quickly calculate Bruvo's distance, the index of association, and easily determine which multilocus genotypes are shared across populations. -\subsection{Installation}\label{intro:install} - -\tab\tab This manual assumes that you have already installed R. If you have not, please refer to The CRAN home page at \url{http://cran.r-project.org/}. The author also recommends utilizing an R gui such as Rstudio (\url{http://www.rstudio.com/}) for a better R experience. - -\subsubsection{From CRAN} -\tab\tab To install \textit{poppr} from CRAN is as simple as selecting ``Package Installer" from the menu ``Packages \& Data" in the gui or by typing in your command line: -\begin{Schunk} -\begin{Sinput} -> install.packages("poppr", dependencies=TRUE) -\end{Sinput} -\end{Schunk} -If everything is working perfectly, all the dependencies (\textit{adegenet, pegas, vegan, ggplot2, phangorn, ape} and \textit{igraph}) should be installed. In the unfortunate case this does not work, consult \url{http://cran.r-project.org/doc/manuals/R-admin.html#Installing-packages}. - -\subsubsection{From Source} -\tab\tab The tarball for \textit{poppr} can be from CRAN: \url{http://cran.r-project.org/package=poppr}, the Gr\"unwald Lab website: \url{http://http://grunwaldlab.cgrb.oregonstate.edu/} under the \textsc{Resources} tab, or github at \url{https://github.com/grunwaldlab/poppr}. - -Since \textit{poppr} contains C code, it needs to be compiled, which means that you need a working C compiler. If you are on Linux, you shouldn't have to worry too much about that, but if you are on Windows or OSX, you might need to download some special tools: - -\begin{description} - \item[Windows] Download Rtools: \url{http://cran.r-project.org/bin/windows/Rtools/} - \item[OSX] Download Xcode: \url{https://developer.apple.com/xcode/} -\end{description} - -If you choose to install \textit{poppr} from a source file, you should first make sure to install all of the dependencies with the following command: -\begin{Schunk} -\begin{Sinput} -> install.packages(c("adegenet", "pegas", "vegan", "ggplot2", "phangorn", "ape", "igraph")) -\end{Sinput} -\end{Schunk} -\textbf{If you want to install from github, you may skip to the next section.} - -After installing dependencies, download the package to your computer and then you can install it with: -\begin{Schunk} -\begin{Sinput} -> install.packages("/path/to/poppr.tar.gz", type="source", repos=NULL) -\end{Sinput} -\end{Schunk} - -\subsubsection{From github} -\tab\tab Github is a repository where you can find all stable and development versions of \textit{poppr}. Installing from github requires a C compiler, so be sure to read the section above for instructions on how to obtain that if you aren't on a Linux system. - -To install from github, you do not need to actually download the tarball since there is a package called \textit{devtools} that will download and install the package for you directly from github. After you have installed all dependencies (see above section), you should download \textit{devtools}: -\begin{Schunk} -\begin{Sinput} -> install.packages("devtools") -\end{Sinput} -\end{Schunk} -Now you can execute the command \texttt{install\_github} with the user and repository name: -\begin{Schunk} -\begin{Sinput} -> library(devtools) -> install_github(repo = "grunwaldlab/poppr") -\end{Sinput} -\end{Schunk} - -If you are the adventurous type and are willing to test out unreleased versions of the package, you can also install the development version: -\begin{Schunk} -\begin{Sinput} -> library(devtools) -> install_github(repo = "grunwaldlab/poppr", ref = "devel") -\end{Sinput} -\end{Schunk} -Users who install this version do so at their own risk. Since it is a development version, documentation may be rough or nonexistant for new functions. - -\subsection{Quick start}\label{intro:qstart} - -\tab\tab The author assumes that if you have reached this point in the manual, then you have successfully installed R and \textit{poppr}. Before proceeding, you should be aware that R is case sensitive. This means that the words ``Case" and ``case" are different from R's perspective. You should also know where your R package Library is located. -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{What or where is my R package library?}}\\ -R is as powerful as it is through a community of people who submit extra code called ``Packages" to help it do specific things. These packages live in a certain place on your computer called an R library. You can find out where this library is by typing -.libPaths() - \end{minipage} - } -\end{center} -Importing a file into R involves you knowing the path to your file and then typing that into R's console. \texttt{getfile()} will help provide a point and click interface for selecting a file. There are two steps: -Before you do anything, you'll want to tell your computer to search R's library to find the \textit{poppr} and load the package: -\begin{Schunk} -\begin{Sinput} -> library(poppr) -\end{Sinput} -\end{Schunk} -After that, you can use \texttt{getfile()} -\begin{Schunk} -\begin{Sinput} -> x <- getfile() -\end{Sinput} -\end{Schunk} -At this point, a pop up window will appear like this\footnote{This window sometimes appears behind your current session of R, depending on the GUI and you will have to toggle to this window}: -\begin{figure}[h!] - \centering - \caption{\footnotesize \footnotesize A popup window as it appears in OSX (Mountain Lion).} - \label{getfile window} -\includegraphics{getfile.png} -\end{figure} -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Hey! My window doesn't look like that!}}\\ -Now, this window will not match up to your window on your computer because you will probably not be in the right directory. Remember the first path in \texttt{.libPaths()}? Move to a folder called \textbf{poppr} in that path. In that folder, you will find another folder called \textbf{files}. Move there and your window will match the one displayed. - \end{minipage} - } -\end{center} -\newpage - -We can navigate throughout your entire computer through this little window and tell R where to go. The example I'm using goes to your R library directory. If you don't know where that is, you can find it by typing \texttt{.libPaths()} into the R command line. -Once we select a file, the file name and its path will be stored in the variable, x. We can confirm what we selected by simply typing \texttt{x} into R's command line. -\begin{Schunk} -\begin{Sinput} -> x -\end{Sinput} -\begin{Soutput} -$files -[1] "/path/to/R/poppr/files/rootrot.csv" - -$path -[1] "/path/to/R/poppr/files" -\end{Soutput} -\end{Schunk} -Here we can see that \texttt{x} is a list with two entries: \texttt{\$files} giving you the files you selected and \texttt{\$path} giving you the path to those files. -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Not sure what I mean by path or working directory?}}\\ -For anyone who has never used a command line, this is a new concept. -You can think of the path as an address. So instead of \texttt{"/path/to/R"}, you could have \texttt{"/USA/Oregon/Corvallis"}. Or on your computer, it could be \texttt{"C:/users/poppr-user/R/win-library/2.15"} on Windows (where "poppr-user" is your username) or \texttt{"/Library/Frameworks/R.framework/Versions/2.15/Resources/library"} on OSX. Each slash represents a folder that you would click through when you are using the mouse. - -A working directory is simply the folder that R is working in. It is where you can access and write files. When you tell R to read a file, it will only look for that file in your working directory. Note that you will not endanger your files by reading them into R. R works by making a copy of the file into memory. This means that you can manipulate the data in any way that you want without ever losing the content.\\ - -To find out your current working directory, type \texttt{getwd()} into the R console. Usually, you will start off a session in your "home" directory, which will look like this: \texttt{"$\sim$/"}. -The command \texttt{setwd()} will change your working directory to any place of your choice on your computer as indicated by the path that you provide. -For more information, see Quick R at \url{http://www.statmethods.net}. - \end{minipage} - } -\end{center} - -% Now we can set our working directory. That is, we can tell R to go to the folder that contains all of our data with the \texttt{setwd} command. -% <>= -% setwd(x$path) -% @ -We will use \texttt{x\$files} to access the file. The \texttt{poppr()} function provides a quick and convenient first analysis of your data directly from the file on the your disk (For information on importing your data into R, see section \ref{intro:import}, \textit{Get out of my dreams and into my R}). -\begin{Schunk} -\begin{Sinput} -> popdata <- poppr(x$files) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -| Athena_1 -| Athena_2 -| Athena_3 -| Athena_4 -| Athena_5 -| Athena_6 -| Athena_7 -| Athena_8 -| Athena_9 -| Athena_10 -| Mt. Vernon_1 -| Mt. Vernon_2 -| Mt. Vernon_3 -| Mt. Vernon_4 -| Mt. Vernon_5 -| Mt. Vernon_6 -| Mt. Vernon_7 -| Mt. Vernon_8 -| Total -\end{Soutput} -\end{Schunk} -The output of \texttt{poppr()} was assigned to the variable \texttt{popdata}, so let's look at the data. -\begin{Schunk} -\begin{Sinput} -> popdata -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia rbarD File -1 Athena_1 9 7 7.000 0.000 1.889 6.231 0.944 0.932 2.925 0.210 rootrot.csv -2 Athena_2 12 12 10.000 NaN 2.485 12.000 1.000 1.000 4.160 0.128 rootrot.csv -3 Athena_3 10 2 2.000 0.000 0.325 1.220 0.200 0.571 2.000 1.000 rootrot.csv -4 Athena_4 13 9 7.154 0.769 1.946 5.121 0.872 0.687 5.495 0.372 rootrot.csv -5 Athena_5 10 7 7.000 0.000 1.834 5.556 0.911 0.866 4.532 0.353 rootrot.csv -6 Athena_6 5 5 5.000 0.000 1.609 5.000 1.000 1.000 2.464 0.190 rootrot.csv -7 Athena_7 11 10 9.182 0.386 2.272 9.308 0.982 0.955 2.129 0.086 rootrot.csv -8 Athena_8 8 6 6.000 0.000 1.667 4.571 0.893 0.831 3.857 0.323 rootrot.csv -9 Athena_9 10 10 10.000 0.000 2.303 10.000 1.000 1.000 2.815 0.118 rootrot.csv -10 Athena_10 9 8 8.000 0.000 2.043 7.364 0.972 0.948 2.849 0.137 rootrot.csv -11 Mt. Vernon_1 10 9 9.000 0.000 2.164 8.333 0.978 0.952 7.132 0.276 rootrot.csv -12 Mt. Vernon_2 6 6 6.000 0.000 1.792 6.000 1.000 1.000 20.649 0.492 rootrot.csv -13 Mt. Vernon_3 8 6 6.000 0.000 1.667 4.571 0.893 0.831 2.117 0.106 rootrot.csv -14 Mt. Vernon_4 12 8 6.833 0.665 1.814 4.500 0.848 0.681 3.008 0.255 rootrot.csv -15 Mt. Vernon_5 17 7 5.541 0.828 1.758 5.070 0.853 0.848 2.677 0.340 rootrot.csv -16 Mt. Vernon_6 12 11 9.318 0.466 2.369 10.286 0.985 0.958 19.498 0.467 rootrot.csv -17 Mt. Vernon_7 12 9 7.818 0.649 2.095 7.200 0.939 0.870 1.208 0.153 rootrot.csv -18 Mt. Vernon_8 13 9 7.346 0.764 2.032 6.259 0.910 0.794 1.153 0.169 rootrot.csv -19 Total 187 119 9.612 0.612 4.558 68.972 0.991 0.720 14.371 0.271 rootrot.csv -\end{Soutput} -\end{Schunk} -One thing to note about this output is the \texttt{NaN} in the column labeled \texttt{SE}. This is produced from calculation of a standard error based on rarefaction analysis. Occasionally, this calculation will encounter a situation in which it must attempt to take a square root of a negative number. As you no doubt have learned in high school mathematics, the root of any negative number is not defined in the set of real numbers, and must have an imaginary component, $i$. Unfortunately, R is a computer program without any imagination and, thus imaginary numbers cannot be represented. To account for this, R represents the square roots of negatives as ``not a number" or \texttt{NaN}.\\ -The fields you see in the output include: -\begin{itemize} - \item \texttt{Pop -} Population name (Note that ``Total" also means ``Pooled"). - \item \texttt{N -} Number of individuals observed. - \item \texttt{MLG -} Number of multilocus genotypes (MLG) observed. - \item \texttt{eMLG -} The number of expected MLG at the smallest sample size $\geq 10$ based on rarefaction. \cite{Hurlbert:1971} - \item \texttt{SE -} Standard error based on \texttt{eMLG} \cite{Heck:1975} - \item \texttt{H -} Shannon-Wiener Index of MLG diversity. \cite{Shannon:1948} - \item \texttt{G -} Stoddart and Taylor's Index of MLG diversity. \cite{Stoddart:1988} - \item \texttt{Hexp -} Nei's 1978 genotypic diversity (corrected for sample size), or Expected Heterozygosity. \cite{Nei:1978} - \item \texttt{E.5 -} Evenness, $E_5$. \cite{Pielou:1975}\cite{Ludwig:1988}\cite{Grunwald:2003} - \item \texttt{Ia -} The index of association, $I_A$. \cite{Brown:1980} \cite{Smith:1993} \cite{Agapow:2001} - \item \texttt{rbarD -} The standardized index of association, $\bar r_d$. \cite{Agapow:2001} -\end{itemize} - -These fields are further described in section \ref{summary}, \textit{I know what you did last summary table} at the end of this vignette. - -\subsection{Get out of my dreams and into my R \{importing data into poppr\}}\label{intro:import} -There are several ways of reading data into R. -\subsubsection{Function: getfile}\label{intro:import:getfile} -\tab\tab \texttt{getfile} gives the user an easy way to point R to the directory in which your data is stored. It is only meant for R GUIs such as Rstudio. Using this on the command line has very little advantage over setting the working directory manually. -\begin{quote} -Default Command:\\ -\texttt{getfile(multi = FALSE, pattern = NULL, combine = TRUE)} -\end{quote} -\begin{itemize} - \item \texttt{multi -} This is normally set to \texttt{FALSE}, meaning that it will only grab the file you selected. If it's \texttt{TRUE}, it will grab all files within the directory, constrained only by what you type into the \texttt{pattern} field. - \item \texttt{pattern -} A pattern that you want to filter the files you get. This accepts regular expressions, so you must be careful with anything that is not an alphanumeric character. - \item \texttt{combine -} This tells \texttt{getfile} to combine the path and all the files. This is set to \texttt{TRUE} by default so that you can access your files no matter what working directory you are in. -\end{itemize} - -This method works for a single file, but let's say you had a lot of data sets you wanted to import. You would have to do all of these one by one, right? Not so. \texttt{getfile} has a nice little flag called \texttt{multi} telling the computer that you want to grab multiple files in the folder. You would use this with \texttt{poppr.all} to produce a summary table for all of your files\footnote{These files do not need to be similar in any way to do this analysis}: -\begin{Schunk} -\begin{Sinput} -> x <- getfile(multi=TRUE) -\end{Sinput} -\end{Schunk} -A window would pop up again, and you should navigate to the same directory as you had before, and select any of the files in that directory. -\begin{Schunk} -\begin{Sinput} -> x -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -$files -[1] "/path/to/R/poppr/files/rootrot.csv" "/path/to/R/poppr/files/rootrot2.csv" -[3] "/path/to/R/poppr/files/simulated.dat" - -$path -[1] "/path/to/R/poppr/files" -\end{Soutput} -\end{Schunk} -As you can see, now all of the files that existed in that directory are there! Now you can look at all those files at once! -\begin{Schunk} -\begin{Sinput} -> poppr.all(x$files) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -| File: rootrot.csv -| Athena_1 -| Athena_2 -| Athena_3 -| Athena_4 -| Athena_5 -| Athena_6 -| Athena_7 -| Athena_8 -| Athena_9 -| Athena_10 -| Mt. Vernon_1 -| Mt. Vernon_2 -| Mt. Vernon_3 -| Mt. Vernon_4 -| Mt. Vernon_5 -| Mt. Vernon_6 -| Mt. Vernon_7 -| Mt. Vernon_8 -| Total -| File: rootrot2.csv -| 1 -| 2 -| 3 -| 4 -| 5 -| 6 -| 7 -| 8 -| 9 -| 10 -| Total -| File: simulated.dat -| Total - Pop N MLG eMLG SE H G Hexp E.5 Ia rbarD File -1 Athena_1 9 7 7.000 0.000 1.889 6.231 0.944 0.932 2.925 0.210 rootrot.csv -2 Athena_2 12 12 10.000 NaN 2.485 12.000 1.000 1.000 4.160 0.128 rootrot.csv -3 Athena_3 10 2 2.000 0.000 0.325 1.220 0.200 0.571 2.000 1.000 rootrot.csv -4 Athena_4 13 9 7.154 0.769 1.946 5.121 0.872 0.687 5.495 0.372 rootrot.csv -5 Athena_5 10 7 7.000 0.000 1.834 5.556 0.911 0.866 4.532 0.353 rootrot.csv -6 Athena_6 5 5 5.000 0.000 1.609 5.000 1.000 1.000 2.464 0.190 rootrot.csv -7 Athena_7 11 10 9.182 0.386 2.272 9.308 0.982 0.955 2.129 0.086 rootrot.csv -8 Athena_8 8 6 6.000 0.000 1.667 4.571 0.893 0.831 3.857 0.323 rootrot.csv -9 Athena_9 10 10 10.000 0.000 2.303 10.000 1.000 1.000 2.815 0.118 rootrot.csv -10 Athena_10 9 8 8.000 0.000 2.043 7.364 0.972 0.948 2.849 0.137 rootrot.csv -11 Mt. Vernon_1 10 9 9.000 0.000 2.164 8.333 0.978 0.952 7.132 0.276 rootrot.csv -12 Mt. Vernon_2 6 6 6.000 0.000 1.792 6.000 1.000 1.000 20.649 0.492 rootrot.csv -13 Mt. Vernon_3 8 6 6.000 0.000 1.667 4.571 0.893 0.831 2.117 0.106 rootrot.csv -14 Mt. Vernon_4 12 8 6.833 0.665 1.814 4.500 0.848 0.681 3.008 0.255 rootrot.csv -15 Mt. Vernon_5 17 7 5.541 0.828 1.758 5.070 0.853 0.848 2.677 0.340 rootrot.csv -16 Mt. Vernon_6 12 11 9.318 0.466 2.369 10.286 0.985 0.958 19.498 0.467 rootrot.csv -17 Mt. Vernon_7 12 9 7.818 0.649 2.095 7.200 0.939 0.870 1.208 0.153 rootrot.csv -18 Mt. Vernon_8 13 9 7.346 0.764 2.032 6.259 0.910 0.794 1.153 0.169 rootrot.csv -19 Total 187 119 9.612 0.612 4.558 68.972 0.991 0.720 14.371 0.271 rootrot.csv -20 1 19 16 9.211 0.701 2.726 14.440 0.982 0.942 14.229 0.313 rootrot2.csv -21 2 18 18 10.000 0.000 2.890 18.000 1.000 1.000 9.143 0.194 rootrot2.csv -22 3 18 8 5.265 1.009 1.609 3.375 0.745 0.594 22.843 0.573 rootrot2.csv -23 4 25 17 7.887 1.124 2.575 9.615 0.933 0.710 18.488 0.415 rootrot2.csv -24 5 27 14 7.511 1.057 2.446 9.720 0.932 0.828 23.002 0.520 rootrot2.csv -25 6 17 15 9.338 0.633 2.670 13.762 0.985 0.949 17.778 0.410 rootrot2.csv -26 7 23 19 9.178 0.764 2.872 16.030 0.980 0.902 19.162 0.405 rootrot2.csv -27 8 21 15 8.273 0.981 2.558 10.756 0.952 0.820 24.313 0.543 rootrot2.csv -28 9 10 10 10.000 0.000 2.303 10.000 1.000 1.000 2.815 0.118 rootrot2.csv -29 10 9 8 8.000 0.000 2.043 7.364 0.972 0.948 2.849 0.137 rootrot2.csv -30 Total 187 119 9.612 0.612 4.558 68.972 0.991 0.720 14.371 0.271 rootrot2.csv -31 Total 100 6 6.000 0.000 1.235 2.790 0.648 0.735 0.050 0.061 simulated.dat -\end{Soutput} -\end{Schunk} - -You've seen examples of how to use \texttt{getfile} to extract a single file and all the files in a directory, but what if you wanted many files, but only wanted ones that were of a certain type or had a certain name? This is what you would use the \texttt{pattern} argument for. -A perfect use would be the example data contained in the \textit{adegenet} package. Let's take a look at the names of these files. -\begin{center} - \fcolorbox{black}{salmon}{ - \begin{minipage}[t]{0.8\textwidth} - For the rest of this section, remember that every time you invoke \texttt{getfile()}, a window will pop up and you should select a file before hitting enter. - \end{minipage} - } -\end{center} -\begin{Schunk} -\begin{Sinput} -> getfile(multi=TRUE) -\end{Sinput} -\end{Schunk} -Navigate to the \textit{adegenet} folder in your R library. -\begin{Schunk} -\begin{Soutput} -$files - [1] "/path/to/R/adegenet/files/AFLP.txt" - [2] "/path/to/R/adegenet/files/exampleSnpDat.snp" - [3] "/path/to/R/adegenet/files/mondata1.rda" - [4] "/path/to/R/adegenet/files/mondata2.rda" - [5] "/path/to/R/adegenet/files/nancycats.dat" - [6] "/path/to/R/adegenet/files/nancycats.gen" - [7] "/path/to/R/adegenet/files/nancycats.gtx" - [8] "/path/to/R/adegenet/files/nancycats.str" - [9] "/path/to/R/adegenet/files/pdH1N1-data.csv" -[10] "/path/to/R/adegenet/files/pdH1N1-HA.fasta" -[11] "/path/to/R/adegenet/files/pdH1N1-NA.fasta" -[12] "/path/to/R/adegenet/files/usflu.fasta" - -$path -[1] "/path/to/R/adegenet/files" -\end{Soutput} -\end{Schunk} - -We can see that we have a mix of files with different formats. If we tried to run all of these files using poppr, we would have a problem because some of the file formats have no direct import into a \texttt{genind} object (*.fasta, or *.snp), or just simply are not supported (eg. *.rda files). We want to be able to filter these files out, and we will do so with the \texttt{pattern} argument. Let's say we only wanted the files that have the word "nancy" in them. -\begin{Schunk} -\begin{Sinput} -> getfile(multi=TRUE, pattern="nancy") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -$files -[1] "/path/to/R/adegenet/files/nancycats.dat" "/path/to/R/adegenet/files/nancycats.gen" -[3] "/path/to/R/adegenet/files/nancycats.gtx" "/path/to/R/adegenet/files/nancycats.str" - -$path -[1] "/path/to/R/adegenet/files" -\end{Soutput} -\end{Schunk} -Now, let's exclude everything but genetix files (*.gtx). -\begin{Schunk} -\begin{Sinput} -> getfile(multi=TRUE, pattern="gtx") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -$files -[1] "/path/to/R/adegenet/files/nancycats.gtx" - -$path -[1] "/path/to/R/adegenet/files" -\end{Soutput} -\end{Schunk} -Now, let's only get FSTAT files (*.dat) -\begin{Schunk} -\begin{Sinput} -> getfile(multi=TRUE, pattern="dat") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -$files -[1] "/path/to/R/adegenet/files/mondata1.rda" -[2] "/path/to/R/adegenet/files/mondata2.rda" -[3] "/path/to/R/adegenet/files/nancycats.dat" -[4] "/path/to/R/adegenet/files/pdH1N1-data.csv" - -$path -[1] "/path/to/R/adegenet/files" -\end{Soutput} -\end{Schunk} - -Uh-oh. We've run into a problem. Three out of our four files are not FSTAT files. Why did this happen? It happened because they happen to have \texttt{"dat"} within their name. This problem can be solved, by using regular expressions. If you are unfamiliar with regular expressions, you can think of them as special characters that you can use to make your search pattern more strict or more flexible. Since the topic of regular expressions can take up several lectures, I will spare you the gory details. For this situation, the only one you need to know is ``\texttt{\$}". The dollar sign indicates the end of a word or string. If we want specific file extensions all we have to do is add this to the end of the search term like so: -\begin{Schunk} -\begin{Sinput} -> getfile(multi=TRUE, pattern="dat$") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -$files -[1] "/path/to/R/adegenet/files/nancycats.dat" - -$path -[1] "/path/to/R/adegenet/files" -\end{Soutput} -\end{Schunk} -Now we have our FSTAT file! -\subsubsection{Function: read.genalex}\label{intro:import:read.genalex} - -\tab\tab A very popular program for population genetics is GenAlEx (\url{http://biology.anu.edu.au/GenAlEx/Welcome.html}) \cite{Peakall:2012, Peakall:2006}. GenAlEx runs within the Excel environment and can be very powerful in its analyses. \textit{Poppr} has added the ability to read *.CSV files\footnote{*.CSV files are comma separated files that are easily machine readable.} produced in the GenAlEx format. It can handle data types containing regions and geographic coordinates, but currently it cannot import allelic frequency data from GenAlEx. All the user has to do is to export a single sheet of GenAlEx data from Excel into a *.CSV file, and the \textit{poppr} function \texttt{read.genalex} will import it into \textit{adegenet}'s \texttt{genind} object (more information on that below). For ways of formatting a GenAlEx file, see the manual here: \url{http://biology.anu.edu.au/GenAlEx/Download_files/GenAlEx\%206.5\%20Guide.pdf} -\begin{quote} -Default Command:\\ -\texttt{read.genalex(genalex, ploidy = 2, geo = FALSE, region = FALSE)} -\end{quote} -\begin{itemize} - \item \texttt{genalex -} a *.CSV file exported from GenAlEx on your disk (For example: \texttt{"my\_genalex\_file.csv"}). - \item \texttt{ploidy -} a number indicating the ploidy for the data set (eg 2 for diploids, 1 for haploids). - \item \texttt{geo -} GenAlEx allows you to have geographic data within your file. To do this for \textit{poppr}, you will need to follow the first format outlined in the GenAlEx manual and place the geographic data AFTER all genetic and demographic data with one blank column separating it (See the GenAlEx Manual for details). If you have geographic information in your file, set this flag to \texttt{TRUE} and it will be included within the resulting genind object in the \texttt{@other} slot. (If you don't know what that is, don't worry. It will be explained later in section \ref{intro:genind:other}). - \item \texttt{region -} To format your GenAlEx file to include regions along with your populations, You can choose to include a separate column for regional data, or, since regional data must be in contiguous blocks, you can simply format it in the same way you would any other data (see the GenAlEx manual for details). If you have your file organized in this manner, select this option and the regional information will be stored in the resulting genind object in the \texttt{@other} slot. -\end{itemize} - -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{If you are unfamiliar with exporting data from Excel}}\\ -\begin{enumerate} - \item Click the Microsoft Office Button in the top left corner of Excel. (Or go to the File menu if you have an older version) - \item Click Save As... - \item In the ``Save as type" drop down box, select CSV (comma delimited). -\end{enumerate} - \end{minipage} - } -\end{center} - -Note that regional data and geographic data are not mutually exclusive. You can have both in one file, just make sure that they are on the same sheet and that the geographic data is always placed after all genetic and demographic data. - -We have a short example of genalex formatted data with no geographic or regional formatting. We will first see where the data is using the command \texttt{system.file()} -\begin{Schunk} -\begin{Sinput} -> system.file("files/rootrot.csv", package="poppr") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -[1] "/path/to/R/library/poppr/files/rootrot.csv" -\end{Soutput} -\end{Schunk} -Now import the data into \textit{poppr} like so: -\begin{Schunk} -\begin{Sinput} -> rootrot <- read.genalex(system.file("files/rootrot.csv", package="poppr")) -\end{Sinput} -\end{Schunk} -Executing rootrot shows that this file is now in genind format (ie. the format required by \textit{poppr} and \textit{adegenet}). -\begin{Schunk} -\begin{Sinput} -> rootrot -\end{Sinput} -\begin{Soutput} - ##################### - ### Genind object ### - ##################### -- genotypes of individuals - - -S4 class: genind -@call: read.genalex(genalex = "rootrot.csv") - -@tab: 187 x 56 matrix of genotypes - -@ind.names: vector of 187 individual names -@loc.names: vector of 56 locus names -@loc.nall: NULL -@loc.fac: NULL -@all.names: NULL -@ploidy: 2 -@type: PA - -Optionnal contents: -@pop: factor giving the population of each individual -@pop.names: factor giving the population of each individual - -@other: a list containing: population_hierarchy -\end{Soutput} -\end{Schunk} - -\subsubsection{Genalex formatting shortcuts}\label{intro:import:genalex.short} - -\tab\tab The GenAlEx format is a nice way to import data because it allows you to have geographic coordinates and two hierarchical levels of sampling (Region and population). If you have multiple levels of hierarchy, you will need to code them so that you combine multiple columns of hierarchy into one using a common separator (For an example, see section \ref{data.manip:hier:splitcombine} of this manual). A problem arises when it becomes more work than it's worth to do that since, for the GenAlEx format, you must provide the sizes of each population in the header. Here, I'll show you a simple way to circumvent that. First, let's use the microbov data set from \textit{adegenet} (for details, type \texttt{help("microbov")} into your R console). It contains three demographic factors: Country, Species and Breed contained within the \texttt{@other} slot (detailed in section \ref{intro:genind:other}). We will combine these and save the file to our desktop. We will cover these functions later in this manual. For now, just know they exist. - -\begin{Schunk} -\begin{Sinput} -> library(poppr) -> data(microbov) -> microbov@other$population_hierarchy <- data.frame(list(Country = microbov@other$coun, -+ Species = microbov@other$spe, Breed = microbov@other$breed)) -> microbov <- splitcombine(microbov, method=2, hier=c("Country", "Species", "Breed")) -> genind2genalex(microbov, file="~/Desktop/microbov.csv") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -Extracting the table ... Writing the table to ~/Desktop/microbov.csv ... Done. -\end{Soutput} -\end{Schunk} -After we do this, we can open the file in our favorite spreadsheet editor and see the following image. - -\setkeys{Gin}{width=\textwidth} - -\begin{figure}[h!] - \centering - \caption{\footnotesize \footnotesize The first 15 individuals and 4 loci of the microbov data set. The first column contains the individual names, the second column contains the population names, and each subsequent column represents microsatellite genetic data. Highlighted in red is a list of populations and their relative sizes.} - \label{microbov unmodified} -\includegraphics{unmod_dat} -\end{figure} - -\newpage -All that \textit{poppr} needs from the first header row are the first three numbers (unless you are including regional data, but it's not terribly necessary with the hierarchical support \textit{poppr} provides.), which represent the number of loci, individuals, and populations, respectively. After that, you have counts of individuals per population in each subsequent cell. For \textit{poppr}, These cells don't matter because we already have that information in column 2. - -If you have a large data set with many population levels, you can use the following shortcut by setting the number in the third cell to 1. The number in cell 4 is arbitrary (but must be there). In the following figure, it is set to the number of individuals in your data set, but can easily be replaced with any other number (perhaps your favorite number?). - -\begin{figure}[h!] - \centering - \caption{\footnotesize \footnotesize The first 15 individuals and 4 loci of the microbov data set. This is the same figure as above, however the populations and counts have been removed from the header row and the third number in the header has been replaced by 1.} -\includegraphics{mod_dat} -\end{figure} - -\setkeys{Gin}{width=0.5\textwidth} - -\subsubsection{Other ways of importing data}\label{intro:import:other} - -\tab\tab \textit{Adegenet} already supports the import of FSTAT, Structure, Genpop, and Genetix formatted files, so if you have those formats, you can import them using the function \texttt{import2genind}. For sequence data, check if you can use \texttt{read.dna} from the \textit{ape} package to import your data. If you can, then you can use the \textit{adegenet} function \texttt{DNAbin2genind}. If you don't have any of these formats handy, you can still import your data using R's \texttt{read.table} along with \texttt{df2genind} from \textit{adegenet}. For more information, see \textit{adegenet}'s ``Getting Started" vignette. - -\subsubsection{Function: genind2genalex}\label{intro:import:genind2genalex} - -\tab\tab Of course, being able to export data is just as useful as being able to import it, so we have this handy little function that will write a GenAlEx formatted file to wherever you desire.\\ -WARNING: This will overwrite any file that exists with the same name. -\begin{quote} -Default Command:\\ -\texttt{genind2genalex(pop, filename = "genalex.csv", quiet = FALSE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{filename -} This is where you specify where you want the file to go. If you simply type the file name, it will deposit the file in the directory R is currently in. If you don't know what directory you are in, you can type \texttt{getwd()} to find out. - \item \texttt{quiet -} If this is set to \texttt{FALSE}, a message will be printed to the screen. - \item \texttt{geo -} This is set to \texttt{FALSE} by default. If it is set to \texttt{TRUE}, then that means you have a data frame or matrix in the \texttt{@other} slot of your genind object that contains geographic coordinates for all individuals or all populations. Setting this to \texttt{TRUE} means that you want the resulting file to have two extra columns at the end of your file with geographic coordinates. - \item \texttt{geodf -} The name of the data frame or matrix containing the geographic coordinates. The default is \texttt{geodf = "xy"}. -\end{itemize} - -First, a simple example for the rootrot data we demonstrated in section 1.4.2: -\begin{Schunk} -\begin{Sinput} -> genind2genalex(rootrot, "~/Desktop/rootrot.csv") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -Extracting the table ... Writing the table to ~/Desktop/rootrot.csv ... Done. -\end{Soutput} -\end{Schunk} - -Now here's an example of exporting the nancycats data set into GenAlEx format with geographic information. If we look at the nancycats geographic information, we can see it's coordinates for each population, but not each individual: -\begin{Schunk} -\begin{Sinput} -> data(nancycats) -> nancycats@other$xy -\end{Sinput} -\begin{Soutput} - x y -P01 263.3498 171.10939 -P02 183.5028 122.40790 -P03 391.1050 254.70148 -P04 458.6121 41.72336 -P05 182.7769 219.08398 -P06 335.2121 344.83557 -P07 359.1662 375.36486 -P08 271.3345 67.89132 -P09 256.8169 150.02964 -P10 270.6086 17.00917 -P11 493.4544 237.25618 -P12 305.4510 85.33663 -P13 462.9674 86.79040 -P14 429.5768 291.04587 -P15 531.2003 115.13903 -P16 407.8003 99.87438 -P17 345.3745 251.79393 -\end{Soutput} -\end{Schunk} -And we can export it easily: -\begin{Schunk} -\begin{Sinput} -> genind2genalex(nancycats, "~/Desktop/nancycats_pop_xy.csv") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -Extracting the table ... Writing the table to ~/Desktop/nancycats_pop_xy.csv ... Done. -\end{Soutput} -\end{Schunk} -If we wanted to assign a geographic coordinate to each individual, we can simply use this little repetition trick knowing that there are 17 populations in the data set: -\begin{Schunk} -\begin{Sinput} -> nan2 <- nancycats -> nan2@other$xy <- nan2@other$xy[rep(1:17, table(pop(nan2))), ] -> head(nan2@other$xy) -\end{Sinput} -\begin{Soutput} - x y -P01 263.3498 171.1094 -P01 263.3498 171.1094 -P01 263.3498 171.1094 -P01 263.3498 171.1094 -P01 263.3498 171.1094 -P01 263.3498 171.1094 -\end{Soutput} -\end{Schunk} -Now we can export it to a different file. -\begin{Schunk} -\begin{Sinput} -> genind2genalex(nan2, "~/Desktop/nancycats_inds_xy.csv") -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -Extracting the table ... Writing the table to ~/Desktop/nancycats_inds_xy.csv ... Done. -\end{Soutput} -\end{Schunk} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Getting to know adegenet's genind object}\label{intro:genind} - -\tab\tab Since \textit{poppr} was built around adegenet's framework, it is important to know how \textit{adegenet} stores data in the genind object, as that is the object used by \textit{poppr}. To create a genind object, \textit{adegenet} takes a data frame of genotypes (rows) across multiple loci (columns) and converts them into a matrix of individual allelic frequencies at each locus \cite{Jombart:2008}. - -For example, if you had a data frame with 3 diploid individuals each with 3 loci that had 3, 4, and 5 allelic states respectively, the resulting \texttt{genind} object would contain a matrix that has 3 rows and 12 columns. -\noindent Here's the example data frame: -\begin{Schunk} -\begin{Soutput} - locus1 locus2 locus3 -1 101/101 201/201 301/302 -2 102/103 202/203 301/303 -3 102/102 203/204 304/305 -\end{Soutput} -\end{Schunk} -And the resulting matrix after importing to genind. -\begin{Schunk} -\begin{Soutput} - L1.1 L1.2 L1.3 L2.1 L2.2 L2.3 L2.4 L3.1 L3.2 L3.3 L3.4 L3.5 -1 1 0.0 0.0 1 0.0 0.0 0.0 0.5 0.5 0.0 0.0 0.0 -2 0 0.5 0.5 0 0.5 0.5 0.0 0.5 0.0 0.5 0.0 0.0 -3 0 1.0 0.0 0 0.0 0.5 0.5 0.0 0.0 0.0 0.5 0.5 -\end{Soutput} -\end{Schunk} -The first three columns represent the alleles of locus 1, the next four represent locus 2, and the last five represent locus 3. - -Do you see what I mean when I say individual allele frequencies at each locus? For a diploid individual, you only have three possible allele frequencies at each locus: 1, 0.5, or 0. Now, this is not the entire genind object, but it is the main feature. The object also has various elements associated with it that give you information about the population membership, the names of loci, individuals, and alleles among other things that \textit{poppr} uses to work \cite{Jombart:2008}. If you wish to know more, see the \textit{adegenet} ``Getting Started" manual. - -\subsubsection{The other slot}\label{intro:genind:other} - -\tab\tab The element that you as a \textit{poppr} user needs to be concerned with is the ``other" slot. No, I'm not trying to be cryptic. If you look at an \textit{adegenet} object, you will see that it has several ``slots" (starting with ``@"). \cite{Jombart:2008} Let's start by recreating that data frame I showed you earlier. -\begin{Schunk} -\begin{Sinput} -> df <- data.frame(list(locus1=c("101/101", "102/103", "102/102"), -+ locus2=c("201/201", "202/203", "203/204"), -+ locus3=c("301/302", "301/303", "304/305") -+ ) -+ ) -> dfg <- df2genind(df, sep="/") -\end{Sinput} -\end{Schunk} -Next we will display the contents of the \texttt{genind} object \texttt{dfg} -\begin{Schunk} -\begin{Sinput} -> dfg -\end{Sinput} -\begin{Soutput} - ##################### - ### Genind object ### - ##################### -- genotypes of individuals - - -S4 class: genind -@call: df2genind(X = df, sep = "/") - -@tab: 3 x 12 matrix of genotypes - -@ind.names: vector of 3 individual names -@loc.names: vector of 3 locus names -@loc.nall: number of alleles per locus -@loc.fac: locus factor for the 12 columns of @tab -@all.names: list of 3 components yielding allele names for each locus -@ploidy: 2 -@type: codom - -Optionnal contents: -@pop: - empty - -@pop.names: - empty - - -@other: - empty - -\end{Soutput} -\end{Schunk} - -The matrix containing our allelic frequencies is located in the \texttt{@tab} slot. All of the slots below that have very specific properties related to the matrix in \texttt{@tab}, but the \texttt{@other} slot is more or less a grab bag, where you can place anything you want, even if it doesn't make sense! - -Here, I'll give you an example of placing the genind object inside itself. Notice first, that the \texttt{@other} slot is empty and pay attention to the commands I use, noting that you can use either ``\$" or ``@" to access the slots. -\begin{Schunk} -\begin{Sinput} -> # First off, how big is the object? -> print(object.size(dfg), units="auto") -\end{Sinput} -\begin{Soutput} -8.5 Kb -\end{Soutput} -\begin{Sinput} -> dfg$other$dfg <- dfg -> dfg # we can now see that the @other slot is now filled. -\end{Sinput} -\begin{Soutput} - ##################### - ### Genind object ### - ##################### -- genotypes of individuals - - -S4 class: genind -@call: df2genind(X = df, sep = "/") - -@tab: 3 x 12 matrix of genotypes - -@ind.names: vector of 3 individual names -@loc.names: vector of 3 locus names -@loc.nall: number of alleles per locus -@loc.fac: locus factor for the 12 columns of @tab -@all.names: list of 3 components yielding allele names for each locus -@ploidy: 2 -@type: codom - -Optionnal contents: -@pop: - empty - -@pop.names: - empty - - -@other: a list containing: dfg -\end{Soutput} -\begin{Sinput} -> dfg$other$dfg -\end{Sinput} -\begin{Soutput} - ##################### - ### Genind object ### - ##################### -- genotypes of individuals - - -S4 class: genind -@call: df2genind(X = df, sep = "/") - -@tab: 3 x 12 matrix of genotypes - -@ind.names: vector of 3 individual names -@loc.names: vector of 3 locus names -@loc.nall: number of alleles per locus -@loc.fac: locus factor for the 12 columns of @tab -@all.names: list of 3 components yielding allele names for each locus -@ploidy: 2 -@type: codom - -Optionnal contents: -@pop: - empty - -@pop.names: - empty - - -@other: - empty - -\end{Soutput} -\begin{Sinput} -> print(object.size(dfg), units="auto") # How big is it now? -\end{Sinput} -\begin{Soutput} -17.2 Kb -\end{Soutput} -\end{Schunk} -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{What is the \# sign for?}}\\ -This is called a comment. If you type something in R with the ``\#" sign in front of it, R will not interpret it. - \end{minipage} - } -\end{center} - -And we can continue to do this until we reach the limit of our available memory. Why am I showing this silliness to you? For one thing I want to show you that you can stick anything you want into that slot and the object will not be hurt in any way. It's also important when considering how you are going to deal with the population structure of your genind object. For the \textit{poppr} functions \texttt{clonecorrect} (Section \ref{data.manip:cc:clonecorrect}) and \texttt{splitcombine} (Section \ref{data.manip:hier:splitcombine}) to work, a data frame of the population hierarchy must be present in the \texttt{@other} slot and it must have the same number of rows as individuals in the data set. There are several ways to go about this. If you know how to create a data frame or import data into R, the command is no more difficult than \texttt{obj\$other\$population\_hierarchy <- df}. If you do not know how to create a data frame or import data into R, you can visit Quick R at \url{http://www.statmethods.net/input/importingdata.html}. -\subsubsection{Setting the population factor \{adegenet's function: pop\}}\label{intro:genind:pop} - -\tab\tab A genind object can contain several populations, and, if you have differing population structures, you might want to switch among them for different analyses. The tools you as the user would need, are the slot \texttt{@pop.names} and the \textit{adegenet} function \texttt{pop()}. I'll use the H3N2 data set packaged with \textit{adegenet} as an example. -\begin{Schunk} -\begin{Sinput} -> data(H3N2) -> H3N2 -\end{Sinput} -\begin{Soutput} - ##################### - ### Genind object ### - ##################### -- genotypes of individuals - - -S4 class: genind -@call: .local(x = x, i = i, j = j, drop = drop) - -@tab: 1903 x 334 matrix of genotypes - -@ind.names: vector of 1903 individual names -@loc.names: vector of 125 locus names -@loc.nall: number of alleles per locus -@loc.fac: locus factor for the 334 columns of @tab -@all.names: list of 125 components yielding allele names for each locus -@ploidy: 1 -@type: codom - -Optionnal contents: -@pop: - empty - -@pop.names: - empty - - -@other: a list containing: x xy epid -\end{Soutput} -\begin{Sinput} -> pop(H3N2) -\end{Sinput} -\begin{Soutput} -NULL -\end{Soutput} -\begin{Sinput} -> H3N2$pop.names -\end{Sinput} -\begin{Soutput} -NULL -\end{Soutput} -\end{Schunk} - -Notice how both the pop and pop.names are empty. This means that the population information needs to be set. Notice, however that there are 1903 individuals in the data set and that the \texttt{@other} slot is not empty. Let's investigate an object in this slot. -\begin{Schunk} -\begin{Sinput} -> head(H3N2$other$x) -\end{Sinput} -\begin{Soutput} - accession length host segment subtype country year Virus name -AB434107 AB434107 1701 Human 4 (HA) H3N2 Japan 2002 Influenza Avirus -AB434108 AB434108 1701 Human 4 (HA) H3N2 Japan 2002 Influenza Avirus -AB438242 AB438242 827 Human 4 (HA) H3N2 Japan 2002 Influenza Avirus -AB438243 AB438243 827 Human 4 (HA) H3N2 Japan 2002 Influenza Avirus -AB438244 AB438244 827 Human 4 (HA) H3N2 Japan 2002 Influenza Avirus -AB438245 AB438245 827 Human 4 (HA) H3N2 Japan 2002 Influenza Avirus - Misc info Age Gender date usePreciseLoc localisation -AB434107 (A/Morioka/34/2002(H3N2)) 2002/02/25 FALSE japan -AB434108 (A/Morioka/52/2002(H3N2)) 2002/03/01 FALSE japan -AB438242 (A/Niigata/F11/2002(H3N2)) 2002/01/22 FALSE japan -AB438243 (A/Niigata/F100/2002(H3N2)) 2002/02/18 FALSE japan -AB438244 (A/Niigata/F175/2002(H3N2)) 2002/02/25 FALSE japan -AB438245 (A/Niigata/F245/2002(H3N2)) 2002/03/04 FALSE japan - lon lat month -AB434107 137.2155 35.58418 2 -AB434108 137.2155 35.58418 3 -AB438242 137.2155 35.58418 1 -AB438243 137.2155 35.58418 2 -AB438244 137.2155 35.58418 2 -AB438245 137.2155 35.58418 3 -\end{Soutput} -\begin{Sinput} -> nrow(H3N2$other$x) -\end{Sinput} -\begin{Soutput} -[1] 1903 -\end{Soutput} -\end{Schunk} -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{What is head()?}}\\ -\texttt{head()} is a command that will show you only the top portion of an R object. By default it will show you the first six elements (or rows of a data frame or matrix). This is so that you can quickly check the contents of an object. - \end{minipage} - } -\end{center} - -We can see that it's a data frame containing a wealth of information that we could use to subset our data. So, let's start by setting the population structure by country. How do we do that? Well, the function \texttt{pop()} will allow us to set that structure using a vector that is the same length as the number of individuals in the data set. Since the number of rows in the data frame \texttt{x} meets that criteria, we can use any item in that data frame. Let's take a look. -\begin{Schunk} -\begin{Sinput} -> pop(H3N2) <- H3N2$other$x$country -> head(pop(H3N2)) -\end{Sinput} -\begin{Soutput} -[1] Japan Japan Japan Japan Japan Japan -37 Levels: Japan USA Finland China South Korea Norway Taiwan France ... Algeria -\end{Soutput} -\begin{Sinput} -> H3N2$pop.names -\end{Sinput} -\begin{Soutput} - [1] "Japan" "USA" "Finland" "China" "South Korea" - [6] "Norway" "Taiwan" "France" "Latvia" "Netherlands" -[11] "Bulgaria" "Turkey" "United Kingdom" "Denmark" "Austria" -[16] "Canada" "Italy" "Russia" "Bangladesh" "Egypt" -[21] "Germany" "Romania" "Ukraine" "Czech Republic" "Greece" -[26] "Iceland" "Ireland" "Sweden" "Nepal" "Saudi Arabia" -[31] "Switzerland" "Iran" "Mongolia" "Spain" "Slovenia" -[36] "Croatia" "Algeria" -\end{Soutput} -\end{Schunk} -Notice how useful the \texttt{@other} slot can be. We now have population structure in the data set and you now know how to set the population factor. The other slot will become useful later on when we are talking about multilocus genotypes. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Data Manipulation}\label{data.manip} - -\tab\tab One tedious aspect of population genetic analysis is the need for repeated data manipulation. \textit{Adegenet} has some functions for manipulating data that are limited to replacing missing data and dividing data into populations, loci, or by sample size \cite{Jombart:2008}. \textit{Poppr} includes novel functions for clone-censoring your data sets or sub-setting a genind object by specific populations. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Inside the golden days of missing data \{replace or remove missing data\}}\label{data.manip:missing} - -\tab\tab A data set without missing data is always ideal, but often not achievable. Many functions in \textit{adegenet} cannot handle missing data and thus the function \texttt{na.replace} exists \cite{Jombart:2008}. It will replace missing data with with either ``0" representing a mysterious extra allele in the data set resulting in more diversity or the mean of allelic frequencies at the locus. There is no set method, however, for simply removing missing data from analyses, which is why the \textit{poppr} function \texttt{missingno} (see below) exists. If the name makes you uneasy it's because it should. Missing data can mean different things based on your data type. For microsatellites, missing data might represent any source of error that could cause a PCR product to not amplify in gel electrophoresis, which may or may not be biologically relevant. For a DNA alignment, missing data could mean something as simple as an insertion or deletion, which is biologically relevant. The choice to exclude or estimate data has very different implications for the type of data you have. -\subsubsection{Function: missingno}\label{data.manip:missing:missingno} - -\tab\tab \texttt{missingno} is a function that serves partially as a wrapper for adegenet's \texttt{na.replace} to replace missing data and as a way to exclude specific areas that contain systematic missing data. -\begin{quote} -Default Command:\\ -\texttt{missingno(pop, type = "loci", cutoff = 0.05, quiet = FALSE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{type -} This could be one of four options: - \begin{description} - \item[ ``mean"] This replaces missing data with the mean allele frequencies in the entire data set. - \item[ ``zero" or ``0"] This replaces missing data with zero, signifying a new allele. - \item[ ``loci"] This is to be used for a data set that has systematic problems with certain loci that contain null alleles or simply failed to amplify. This will remove loci with a defined threshold of missing data from the data set. - \item[ ``geno"] This is to be used for genotypes (individuals) in your data set where many null alleles are present. Individuals with a defined threshold missing data will be removed. - \end{description} - \item \texttt{cutoff -} This is a numeric value from 0 to 1 indicating the percent allowable missing data for either loci or genotypes. If you have, for example, two loci containing missing 5\% and 10\% missing data, respectively and you set \texttt{cutoff = 0.05}, \texttt{missingno} will remove the second locus. Percent missing data for genotypes is considered the percent missing loci over number of total loci. - \item \texttt{quiet -} When this is set to \texttt{FALSE}, the number of missing values replaced will be printed to screen if the method is ``zero" or ``mean". It will print the number of loci or individuals removed if the method is ``loci" or ``geno". -\end{itemize} - -Of course, seeing is believing. Let's take a look at what this does by focusing in on areas with missing data. Note that I will be using some sub-setting functions here that are described in adegenet's \textit{Getting Started} vignette. First, let's take a look at what the missing data in R looks like as well as how many loci and individuals the data set nancycats contains. We need to first tell R to look in its library for the package \textit{poppr}. -\begin{Schunk} -\begin{Sinput} -> library(poppr) -\end{Sinput} -\end{Schunk} -Next, we'll initialize the \textit{adegenet} data set nancycats and load it into memory. -\begin{Schunk} -\begin{Sinput} -> data(nancycats) -\end{Sinput} -\end{Schunk} -Now, we'll take a quick look at the nancycats data set using \textit{adegenet}'s \texttt{summary()} \index{summary} function: -\begin{Schunk} -\begin{Sinput} -> summary(nancycats) -\end{Sinput} -\begin{Soutput} - # Total number of genotypes: 237 - - # Population sample sizes: - 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 -10 22 12 23 15 11 14 10 9 11 20 14 13 17 11 12 13 - - # Number of alleles per locus: -L1 L2 L3 L4 L5 L6 L7 L8 L9 -16 11 10 9 12 8 12 12 18 - - # Number of alleles per population: -01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 -36 53 50 67 48 56 42 54 43 46 70 52 44 61 42 40 35 - - # Percentage of missing data: -[1] 2.344116 - - # Observed heterozygosity: - L1 L2 L3 L4 L5 L6 L7 L8 L9 -0.6682028 0.6666667 0.6793249 0.7083333 0.6329114 0.5654008 0.6497890 0.6184211 0.4514768 - - # Expected heterozygosity: - L1 L2 L3 L4 L5 L6 L7 L8 L9 -0.8657224 0.7928751 0.7953319 0.7603095 0.8702576 0.6884669 0.8157881 0.7603493 0.6062686 -\end{Soutput} -\end{Schunk} - -We can see here a lot of summary statistics about nancycats. Here we can see that there are 17 populations, 237 individuals, and 9 loci. Nancycats also has a little over 2.3\% missing data. Let's take a look at the names of the loci and the structure of the data. In order to save space, I will only show you the first five individuals (rows) and a portion of the alleles in the first locus (columns). -\begin{Schunk} -\begin{Sinput} -> nancycats$loc.names # Names of the loci -\end{Sinput} -\begin{Soutput} - L1 L2 L3 L4 L5 L6 L7 L8 L9 - "fca8" "fca23" "fca43" "fca45" "fca77" "fca78" "fca90" "fca96" "fca37" -\end{Soutput} -\begin{Sinput} -> nancycats$tab[1:5, 8:13] -\end{Sinput} -\begin{Soutput} - L1.08 L1.09 L1.10 L1.11 L1.12 L1.13 -001 NA NA NA NA NA NA -002 NA NA NA NA NA NA -003 0.0 0.5 0 0 0 0.5 -004 0.5 0.5 0 0 0 0.0 -005 0.5 0.5 0 0 0 0.0 -\end{Soutput} -\end{Schunk} - -When looking at this data set, recall how a \texttt{genind} object is formatted. You have a matrix of 0's, 1's and 0.5's. For diploids, if you see 0.5, that means it is heterozygous at that allele, and a 1 means it's homozygous. Here we see three heterozygotes and two individuals with missing data (indicated by NA). Now, there are more places with missing data in the data set, but I'm only showing a little bit at one locus so it's easier to digest. Let's first replace it by zero and mean, respectively. -\begin{Schunk} -\begin{Sinput} -> nanzero <- missingno(nancycats, type = "zero") -\end{Sinput} -\begin{Soutput} - Replaced 617 missing values -\end{Soutput} -\begin{Sinput} -> nanmean <- missingno(nancycats, type = "mean") -\end{Sinput} -\begin{Soutput} - Replaced 617 missing values -\end{Soutput} -\begin{Sinput} -> nanzero$tab[1:5, 8:13] -\end{Sinput} -\begin{Soutput} - L1.08 L1.09 L1.10 L1.11 L1.12 L1.13 -001 0.0 0.0 0 0 0 0.0 -002 0.0 0.0 0 0 0 0.0 -003 0.0 0.5 0 0 0 0.5 -004 0.5 0.5 0 0 0 0.0 -005 0.5 0.5 0 0 0 0.0 -\end{Soutput} -\begin{Sinput} -> nanmean$tab[1:5, 8:13] -\end{Sinput} -\begin{Soutput} - L1.08 L1.09 L1.10 L1.11 L1.12 L1.13 -001 0.07603687 0.2419355 0.1912442 0.06221198 0.09447005 0.1013825 -002 0.07603687 0.2419355 0.1912442 0.06221198 0.09447005 0.1013825 -003 0.00000000 0.5000000 0.0000000 0.00000000 0.00000000 0.5000000 -004 0.50000000 0.5000000 0.0000000 0.00000000 0.00000000 0.0000000 -005 0.50000000 0.5000000 0.0000000 0.00000000 0.00000000 0.0000000 -\end{Soutput} -\end{Schunk} -You notice how the values of NA changed, yet the basic structure stayed the same. These are the replacement options from adegenet. Let's look at the same example with the exclusion options (set to the default cutoff of 5\%). -\begin{Schunk} -\begin{Sinput} -> nanloci <- missingno(nancycats, "loci") -\end{Sinput} -\begin{Soutput} - Found 617 missing values. - 2 loci contained missing values greater than 5%. - Removing 2 loci : fca8 fca45 -\end{Soutput} -\begin{Sinput} -> nangeno <- missingno(nancycats, "geno") -\end{Sinput} -\begin{Soutput} - Found 617 missing values. - 38 genotypes contained missing values greater than 5%. - Removing 38 genotypes : N215 N216 N188 N189 N190 N191 N192 N302 N304 N310 -N195 N197 N198 N199 N200 N201 N206 N182 N184 N186 N298 N299 N300 N301 N303 N282 -N283 N288 N291 N292 N293 N294 N295 N296 N297 N281 N289 N290 -\end{Soutput} -\begin{Sinput} -> nanloci$tab[1:5, 8:13] -\end{Sinput} -\begin{Soutput} - L1.08 L1.09 L1.10 L1.11 L2.01 L2.02 -001 0 0.5 0 0 0 0 -002 0 1.0 0 0 0 0 -003 0 0.5 0 0 0 0 -004 0 0.0 0 0 0 0 -005 0 0.5 0 0 0 0 -\end{Soutput} -\end{Schunk} - -Notice how we now see columns named ``L2.01" and ``L2.02". This is showing us another locus because we have removed the first. Recall from the summary table that the first locus had 16 alleles, and the second had 11. Now that we've removed loci containing missing data, all others have shifted over.\\ -Let's look at the loci names and number of individuals. -\begin{Schunk} -\begin{Sinput} -> length(nanloci$ind.names) # Individuals -\end{Sinput} -\begin{Soutput} -[1] 237 -\end{Soutput} -\begin{Sinput} -> nanloci$loc.names # Names of the loci -\end{Sinput} -\begin{Soutput} - L1 L2 L3 L4 L5 L6 L7 -"fca23" "fca43" "fca77" "fca78" "fca90" "fca96" "fca37" -\end{Soutput} -\end{Schunk} -You can see that the number of individuals stayed the same but the loci ``fca8", ``fca45", and ``fca96" were removed.\\ -Let's look at what happened when we removed individuals. -\begin{Schunk} -\begin{Sinput} -> nangeno$tab[1:5, 8:13] -\end{Sinput} -\begin{Soutput} - L1.08 L1.09 L1.10 L1.11 L1.12 L1.13 -001 0.0 0.5 0 0 0 0.5 -002 0.5 0.5 0 0 0 0.0 -003 0.5 0.5 0 0 0 0.0 -004 0.0 0.5 0 0 0 0.5 -005 0.0 1.0 0 0 0 0.0 -\end{Soutput} -\begin{Sinput} -> length(nangeno$ind.names) # Individuals -\end{Sinput} -\begin{Soutput} -[1] 199 -\end{Soutput} -\begin{Sinput} -> nangeno$loc.names # Names of the loci -\end{Sinput} -\begin{Soutput} - L1 L2 L3 L4 L5 L6 L7 L8 L9 - "fca8" "fca23" "fca43" "fca45" "fca77" "fca78" "fca90" "fca96" "fca37" -\end{Soutput} -\end{Schunk} - -We can see here that the number of individuals decreased, yet we have the same number of loci. Notice how the frequency matrix changes in both scenarios? In the scenario with ``loci", we removed several columns of the data set, and so with our sub-setting, we see alleles from the second locus. In the scenario with ``geno", we removed several rows of the data set so we see other individuals in our sub-setting. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Can you take me hier(archy)? \{population hierarchy construction\}}\label{data.manip:hier} - -\tab\tab Remember all that fuss we made about the \texttt{@other} slot above in section \ref{intro:genind:other}? The way you can achieve hierarchical analysis in \textit{poppr} is through a data frame in that slot. Many of the file formats that \textit{adegenet} and \textit{poppr} can import do not allow for more than two hierarchies. If you need more levels, you have a couple of choices: -\begin{enumerate} - \item Import them into R as a data frame with each column being a separate hierarchical element. - \item Collapse them into a single population factor so that you can trick these file formats into taking multiple population hierarchies (eg. instead of ``Pop1", ``Subpop1", ``Subsubpop1", you would have ``Pop1\_Subpop1\_Subsubpop1"). -\end{enumerate} -Whichever choice you make, The \textit{poppr} function \texttt{splitcombine} can help you divide and combine those factors in any way you can think of. -\subsubsection{Function: splitcombine}\label{data.manip:hier:splitcombine} - -\tab\tab This function will allow you to combine your population hierarchies in ways meaningful to your data without needing to know R programming. It can either split a vector of combined population hierarchies or it can combine columns of a data frame containing population hierarchies (Note that it will only split the first column of the data frame if you choose \texttt{method = 1}). -\begin{quote} -Default Command:\\ -\texttt{splitcombine(pop, method = 1, dfname = "population\_hierarchy", sep = "\_", hier = c(1), setpopulation = TRUE, fixed = TRUE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object with a data frame in the \texttt{@other} slot. - \item \texttt{method -} An integer indicating what you want to do on your data frame: - \begin{enumerate} - \item \textbf{split} Any populations combined using a common separator in your data frame. So, a population hierarchy of ``Pop1\_Subpop1\_Subsubpop1" would be split into a data frame containing the columns ``Pop1", ``Subpop1", ``Subsubpop1". Since it will split the population factor, it needs only to be used once. - \item \textbf{combine} If you have your population hierarchy split into a data frame, you can do the exact opposite of method 1 and combine separate elements into one. - \end{enumerate} - \item \texttt{dfname -} This is the name of the data frame containing your population factor. Note that you are not limited to one data frame in your genind object. If you do not have that data frame in the \texttt{@other} slot, a warning will be returned and nothing will happen. - \item \texttt{sep -} A separation factor you want to separate your populations with. Note, that you can choose whatever you want, but be careful because some characters have special meanings (regular expressions) in R and could give you incorrect results (``\_" is the suggested default). - \item \texttt{hier -} This can be a vector of words or numbers referring to what you want to name your population hierarchies in \texttt{method = 1}, or specific column names in your data frame in \texttt{method = 2}. - \item \texttt{setpopulation -} if \texttt{TRUE} (default), this will automatically set the population factor to either the highest population factor (with \texttt{method = 1}, split) or the combined population hierarchy (with \texttt{method = 2}, combine). if this is set to \texttt{FALSE}, the population factor will not be set. - \item \texttt{fixed -} This is an option to be passed onto the \textit{base} function \texttt{strsplit}. For those not familiar with regular expressions, it will tell R whether or not the character in \texttt{split} should be treated as a special character or not. If you don't know regular expressions, don't touch it. -\end{itemize} - -Let's give an example using AFLP data of different populations of \textit{A. euteiches} collected in Washington and Oregon. \cite{Grunwald:2006} -\begin{Schunk} -\begin{Sinput} -> Aeut <- read.genalex(system.file("files/rootrot.csv", package="poppr")) -> summary(Aeut) -\end{Sinput} -\begin{Soutput} - # Total number of genotypes: 187 - - # Population sample sizes: - Athena_1 Athena_2 Athena_3 Athena_4 Athena_5 Athena_6 - 9 12 10 13 10 5 - Athena_7 Athena_8 Athena_9 Athena_10 Mt. Vernon_1 Mt. Vernon_2 - 11 8 10 9 10 6 -Mt. Vernon_3 Mt. Vernon_4 Mt. Vernon_5 Mt. Vernon_6 Mt. Vernon_7 Mt. Vernon_8 - 8 12 17 12 12 13 - - # Percentage of missing data: -[1] 0 -\end{Soutput} -\end{Schunk} -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Does this summary seem a little lacking?}}\\ -The data that we have here is presence absence data. This means that many of the functions that \textit{adegenet} uses to calculate heterozygosity and number of alleles are slightly useless in this regard. - \end{minipage} - } -\end{center} -Notice that we have 18 different ``populations" here, but they are really a hierarchy. Let's say we want to analyze the diversity statistics of the two overall populations. Take a look at how the combined population factor is kept in the data frame. -\begin{Schunk} -\begin{Sinput} -> head(Aeut$other$population_hierarchy) -\end{Sinput} -\begin{Soutput} - Pop -1 Athena_1 -2 Athena_1 -3 Athena_1 -4 Athena_1 -5 Athena_1 -6 Athena_1 -\end{Soutput} -\end{Schunk} -We'll use \texttt{splitcombine} to split that into a population and sub-population and set the population factor to the population. -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{Important point about splitcombine}}\\ -Ideally, method split should only be used once after you read in your data. The reason for this is that when you select this method, it will look in the first column of your data frame to choose the combined population factor to split. -If you do not name your hierarchy or if you attempt to give your hierarchy too many names, it will automatically name the columns ``h1", ``h2", etc. - \end{minipage} - } -\end{center} -\begin{Schunk} -\begin{Sinput} -> Aeut.pop <- splitcombine(Aeut, method=1, dfname="population_hierarchy", hier=c("Pop", "Subpop"), setpopulation=TRUE) -> head(Aeut.pop$other$population_hierarchy) -\end{Sinput} -\begin{Soutput} - Pop_Subpop Pop Subpop -1 Athena_1 Athena 1 -2 Athena_1 Athena 1 -3 Athena_1 Athena 1 -4 Athena_1 Athena 1 -5 Athena_1 Athena 1 -6 Athena_1 Athena 1 -\end{Soutput} -\begin{Sinput} -> summary(Aeut.pop) -\end{Sinput} -\begin{Soutput} - # Total number of genotypes: 187 - - # Population sample sizes: - Athena Mt. Vernon - 97 90 - - # Percentage of missing data: -[1] 0 -\end{Soutput} -\end{Schunk} - -Now we can see that we have a data frame with all of our population factors separated, and we still have our original combined hierarchy, but it is now called ``Pop\_Subpop". This allows you to keep track of what you named your population hierarchies. We can now run the function \texttt{poppr} to get a diversity analysis. -\begin{Schunk} -\begin{Sinput} -> poppr(Aeut.pop, quiet=TRUE) -\end{Sinput} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia rbarD File -1 Athena 97 70 65.981 1.246 4.063 42.193 0.986 0.721 2.906 0.072 rootrot.csv -2 Mt. Vernon 90 50 50.000 0.000 3.668 28.723 0.976 0.726 13.302 0.282 rootrot.csv -3 Total 187 119 68.453 2.989 4.558 68.972 0.991 0.720 14.371 0.271 rootrot.csv -\end{Soutput} -\end{Schunk} -%\newpage -It's as simple as that. Now, let's take a look at the same data set, except the input file is a GenAlEx file that has been formatted with Regional data (See section \ref{intro:import:read.genalex} for details). First, let's see how the data set is laid out: - -\setkeys{Gin}{width=0.9\textwidth} - -\begin{figure}[h!] - \centering - \caption{\footnotesize \footnotesize Part of the rootrot2.csv data set. Note the last two columns denoting the Regions and the number of individuals per region.} - \label{rootrot2_csv} -\includegraphics{rootrot2} -\end{figure} -\setkeys{Gin}{width=0.5\textwidth} - -% We can reverse it by simply changing the method. -\begin{center} - \fcolorbox{black}{light-gray}{ - \begin{minipage}[t]{0.8\textwidth} -{\large \textsc{The amazing disappearing options!}}\\ -Notice that I'm not writing in many of the options? This is because they have defaults. Since the data frame in my \texttt{@other} slot is called ``population hierarchy", I don't have to specify that every time I do the function call, and that saves a lot of typing! -% Also, since the hierarchy has already been split into its components, I can now index the data frame by column number. So, since I want to combine the ``Pop" and ``Subpop" components, I will tell the function to use columns 2 through 3. - \end{minipage} - } -\end{center} - -We'll import our data using \texttt{read.genalex} and take a look at the population hierarchy. -\begin{Schunk} -\begin{Sinput} -> Aeut2 <- read.genalex(system.file("files/rootrot2.csv", package="poppr"), region=TRUE) -> head(Aeut2@other$population_hierarchy) -\end{Sinput} -\begin{Soutput} - Pop Region -1 1 Athena -2 1 Athena -3 1 Athena -4 1 Athena -5 1 Athena -6 1 Athena -\end{Soutput} -\begin{Sinput} -> summary(Aeut2) -\end{Sinput} -\begin{Soutput} - # Total number of genotypes: 187 - - # Population sample sizes: - 1 2 3 4 5 6 7 8 9 10 -19 18 18 25 27 17 23 21 10 9 - - # Percentage of missing data: -[1] 0 -\end{Soutput} -\end{Schunk} -What we see is that we have both of the population factors, but the names have changed and they are not combined. Note that since we specified ``Athena" and ``Mt. Vernon" as regions, the other level of the hierarchy was set as the population factor. We'll use \texttt{splitcombine} to combine both of these in the proper order. Note that we can use the indexes of the data frame columns to index these. - -\begin{Schunk} -\begin{Sinput} -> Aeut2.combine <- splitcombine(Aeut2, method=2, hier=2:1) -> head(Aeut2.combine@other$population_hierarchy) -\end{Sinput} -\begin{Soutput} - Pop Region Region_Pop -1 1 Athena Athena_1 -2 1 Athena Athena_1 -3 1 Athena Athena_1 -4 1 Athena Athena_1 -5 1 Athena Athena_1 -6 1 Athena Athena_1 -\end{Soutput} -\begin{Sinput} -> summary(Aeut2.combine) -\end{Sinput} -\begin{Soutput} - # Total number of genotypes: 187 - - # Population sample sizes: - Athena_1 Athena_2 Athena_3 Athena_4 Athena_5 Athena_6 - 9 12 10 13 10 5 - Athena_7 Athena_8 Athena_9 Athena_10 Mt. Vernon_1 Mt. Vernon_2 - 11 8 10 9 10 6 -Mt. Vernon_3 Mt. Vernon_4 Mt. Vernon_5 Mt. Vernon_6 Mt. Vernon_7 Mt. Vernon_8 - 8 12 17 12 12 13 - - # Percentage of missing data: -[1] 0 -\end{Soutput} -\end{Schunk} -% <>= -% Aeut.combine <- splitcombine(Aeut.pop, method=2, hier=2:3) -% head(Aeut.combine$other$population_hierarchy) -% summary(Aeut.combine) -% @ -% -% Let's imagine for a second that these ``hierarchies" are not actually hierarchical, but rather they represent independent variables (For example: the ``Subpop" factor could represent months of the year). In this case, we would want to only analyze the ``Subpop" factor. We can do this by using \texttt{splitcombine} with only one hierarchical level. -% <>= -% Aeut.subpop <- splitcombine(Aeut.pop, method=2, hier="Subpop") -% summary(Aeut.subpop) -% poppr(Aeut.subpop, quiet=TRUE) -% @ -Having these hierarchies in your data set is important when it comes to clone-censoring your data set (see section \ref{data.manip:cc} \textit{Attack of the Clone Correction}). -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Divide (populations) and conquer (your analysis) \{extract populations\}}\label{data.manip:divide} - -\tab\tab As I've mentioned before, \textit{adegenet} has many ways of sub-setting the data, but you cannot easily subset a \texttt{genind} object by population in an efficient way. \textit{Poppr} allows sub-setting a population from a \texttt{genind} object with one command. -\subsubsection{Function: popsub}\label{data.manip:divide:popsub} - -\tab\tab The command \texttt{popsub} is powerful in that it allows you to choose exactly what populations you choose to include or exclude from your analyses. As with many R functions, you can easily use this within a function to avoid creating a new variable to keep track of. -\begin{quote} -Default Command:\\ -\texttt{popsub(pop, sublist = "ALL", blacklist = NULL, mat = NULL)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sublist -} The vector of populations or integers representing the populations in your data set you wish to retain. For example: \texttt{sublist = c("pop\_z", "pop\_y")} or \texttt{sublist = 1:2}. - \item \texttt{blacklist -} The vector of populations or integers representing the populations in your data set you wish to exclude. This can take the same type of arguments as sublist, and can be used in conjunction with sublist for when you want a range of populations, but know that there is one in there that you do not want to analyze. For example: \texttt{sublist = 1:15, blacklist = "pop\_x"}. - One very useful thing about the blacklist is that it allows the user to be extremely paranoid about the data. You can set the blacklist to contain populations that are not even in your data set and it will still work! - \item \texttt{mat -} (see section \ref{mlg}, \textit{Multilocus Genotype Analysis} for more information) This is where you would put a matrix that's produced by \texttt{mlg.table} to be subsetted instead of the genind object. If you do this, the matrix will return with only the rows equal to your populations and only the multilocus genotypes (columns) pertaining to those populations. -\end{itemize} - -To demonstrate this tool, let's revisit the H3N2 data set. Let's say we wanted to analyze only the data in North America. To make sure we are all on the same page, we will reset the population factor to ``country". Remember that this is located in a data frame in the \texttt{@other} slot called ``x". -\begin{Schunk} -\begin{Sinput} -> data(H3N2) -> pop(H3N2) <- H3N2$other$x$country -> H3N2$pop.names # Only two countries from North America. -\end{Sinput} -\begin{Soutput} - [1] "Japan" "USA" "Finland" "China" "South Korea" - [6] "Norway" "Taiwan" "France" "Latvia" "Netherlands" -[11] "Bulgaria" "Turkey" "United Kingdom" "Denmark" "Austria" -[16] "Canada" "Italy" "Russia" "Bangladesh" "Egypt" -[21] "Germany" "Romania" "Ukraine" "Czech Republic" "Greece" -[26] "Iceland" "Ireland" "Sweden" "Nepal" "Saudi Arabia" -[31] "Switzerland" "Iran" "Mongolia" "Spain" "Slovenia" -[36] "Croatia" "Algeria" -\end{Soutput} -\begin{Sinput} -> H.na <- popsub(H3N2, sublist=c("USA", "Canada")) -> H.na$pop.names -\end{Sinput} -\begin{Soutput} - P1 P2 - "USA" "Canada" -\end{Soutput} -\end{Schunk} -Since this is a larger data set, running the \texttt{summary} function might take a few seconds longer than we want it to. If we want to see the population size, we can use the \textit{adegenet} function \texttt{nInd()}: -\begin{Schunk} -\begin{Sinput} -> nInd(H.na) -\end{Sinput} -\begin{Soutput} -[1] 665 -\end{Soutput} -\begin{Sinput} -> nInd(H3N2) -\end{Sinput} -\begin{Soutput} -[1] 1903 -\end{Soutput} -\end{Schunk} -You can see that the population factors are correct and that the size of the data set is considerably smaller. Let's see the data set without the North American countries. -\begin{Schunk} -\begin{Sinput} -> H.minus.na <- popsub(H3N2, blacklist=c("USA", "Canada")) -> H.minus.na$pop.names -\end{Sinput} -\begin{Soutput} - P01 P02 P03 P04 P05 - "Japan" "Finland" "China" "South Korea" "Norway" - P06 P07 P08 P09 P10 - "Taiwan" "France" "Latvia" "Netherlands" "Bulgaria" - P11 P12 P13 P14 P15 - "Turkey" "United Kingdom" "Denmark" "Austria" "Italy" - P16 P17 P18 P19 P20 - "Russia" "Bangladesh" "Egypt" "Germany" "Romania" - P21 P22 P23 P24 P25 - "Ukraine" "Czech Republic" "Greece" "Iceland" "Ireland" - P26 P27 P28 P29 P30 - "Sweden" "Nepal" "Saudi Arabia" "Switzerland" "Iran" - P31 P32 P33 P34 P35 - "Mongolia" "Spain" "Slovenia" "Croatia" "Algeria" -\end{Soutput} -\end{Schunk} -Let's make sure that the number of individuals in both data sets added up equals the number of individuals in our original data set: -\begin{Schunk} -\begin{Sinput} -> (nInd(H.minus.na) + nInd(H.na)) == nInd(H3N2) -\end{Sinput} -\begin{Soutput} -[1] TRUE -\end{Soutput} -\end{Schunk} - -Now we have data sets with and without North America. Let's try something a bit more challenging. Let's say that we want The first 10 populations in alphabetical order, but we know that we still don't want any countries in North America. We can easily do this by using the \textit{base} function \texttt{sort}. -\begin{Schunk} -\begin{Sinput} -> Hsort <- sort(H3N2$pop.names)[1:10] -> Hsort -\end{Sinput} -\begin{Soutput} - [1] "Algeria" "Austria" "Bangladesh" "Bulgaria" "Canada" - [6] "China" "Croatia" "Czech Republic" "Denmark" "Egypt" -\end{Soutput} -\begin{Sinput} -> H.alph <- popsub(H3N2, sublist=Hsort, blacklist=c("USA", "Canada")) -> H.alph$pop.names -\end{Sinput} -\begin{Soutput} - P1 P2 P3 P4 P5 - "China" "Bulgaria" "Denmark" "Austria" "Bangladesh" - P6 P7 P8 P9 - "Egypt" "Czech Republic" "Croatia" "Algeria" -\end{Soutput} -\end{Schunk} -And that, is how you subset your data with poppr! -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Attack of the clone correction \{clone-censor data sets\}}\label{data.manip:cc} - -\tab\tab Clone correction refers to the ability of keeping one observation per clone in a given population (or sub-population). Clone correcting can be hazardous if its done by hand (even on small data sets) and it requires a defined population hierarchy to get relevant results. \textit{Poppr} has a clone correcting function that is able to correct at the lowest level of any defined population hierarchy. Note that clone correction in \textit{poppr} is sensitive to missing data, as it treats all missing data as a single extra allele. -\subsubsection{Function: clonecorrect}\label{data.manip:cc:clonecorrect} - -\tab\tab This function will return a clone corrected data set corrected for the lowest population level. Population levels are specified with the \texttt{hier} flag. You can choose to combine the population hierarchy to analyze at the lowest population level by choosing \texttt{combine = TRUE}. -\begin{quote} -Default Command:\\ -\texttt{clonecorrect(pop, hier = c(1), dfname = "population\_hierarchy", combine = FALSE, keep = 1)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object that has a population hierarchy data frame in the \texttt{@other} slot. Note, the \texttt{genind} object does not necessarily require a population factor to begin with. - \item \texttt{hier -} This can be a vector of words or numbers referring to specific column names in your data frame in the \texttt{@other} slot. - \item \texttt{dfname -} The name of a data frame you have in the \texttt{@other} slot with the population factors. - \item \texttt{combine -} Do you want to combine the population hierarchy? If it's set to \texttt{FALSE} (default), you will be returned a genind object with the top most hierarchical level as a population factor. - \item \texttt{keep -} This flag is to be used if you set \texttt{combine = FALSE}. This will tell clone correct to return a specific combination of your hierarchy. For example, imagine a hierarchy that needs to be clone corrected at three levels: \textit{Population} by \textit{Year} by \textit{Month}. If you wanted to only run an analysis on the \textit{Population} level, you would set \texttt{keep = 1} since \textit{Population} is the first level of the hierarchy. On the other hand, if you wanted to run analysis on \textit{Year} by \textit{Month}, you would set \texttt{keep = 2:3} since those are the second and third levels of the hierarchy. -\end{itemize} - -Let's look at ways to clone-correct our data. We'll look at our \textit{A. euteichies} data since that data set is known to include clonal populations \cite{Grunwald:2006}. Notice that I am not including the options \texttt{dfname} and \texttt{combine} because the default arguments suit my needs. -\begin{Schunk} -\begin{Sinput} -> data(Aeut) -> A.cc <- clonecorrect(Aeut, hier=c("Pop", "Subpop"), keep=1) -> poppr(A.cc, quiet=TRUE) -\end{Sinput} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia rbarD File -1 Athena 76 70 60.621 1.017 4.221 65.636 0.998 0.963 2.535 0.062 rootrot.csv -2 Mt. Vernon 65 50 50.000 0.000 3.796 36.739 0.988 0.821 14.310 0.298 rootrot.csv -3 Total 141 119 59.629 1.854 4.705 96.980 0.997 0.876 13.802 0.260 rootrot.csv -\end{Soutput} -\end{Schunk} -Now let's compare the clone corrected analysis to the uncorrected data set: -\begin{Schunk} -\begin{Sinput} -> poppr(Aeut, quiet=TRUE) -\end{Sinput} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia rbarD File -1 Athena 97 70 65.981 1.246 4.063 42.193 0.986 0.721 2.906 0.072 rootrot.csv -2 Mt. Vernon 90 50 50.000 0.000 3.668 28.723 0.976 0.726 13.302 0.282 rootrot.csv -3 Total 187 119 68.453 2.989 4.558 68.972 0.991 0.720 14.371 0.271 rootrot.csv -\end{Soutput} -\end{Schunk} - -As you can see from the summary tables, everything all sub-populations have been clone censored to the sub population level with respect to the population hierarchy. Notice how the observed number of individuals \texttt{(N)} decreases in the clone corrected data set. - -\subsection{Every day I'm shuffling (data sets) \{permutations and bootstrap resampling\}}\label{data.manip:shuffle} - -\tab\tab A common null hypothesis for populations with mixed reproductive modes is panmixia, or to put it simply: lots of sex. A handy way to test for that is permutation analysis to assess random linkage among loci whereupon you randomly shuffle your data. \textit{Poppr} uses randomly shuffled data sets in order to calculate P-values for the index of association ($I_A$ and $\bar r_d$) \cite{Agapow:2001}. Since there might be other tests where a permutation analysis would be pertinent, a shuffler for \texttt{genind} objects was created with four shuffling schemes: two schemes shuffling without replacement and two shuffling with replacement. Details below. -\subsubsection{Function: shufflepop}\label{data.manip:shuffle:shufflepop} -\begin{quote} -Default Command:\\ -\texttt{shufflepop(pop, method = 1)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{method -} a number indicating the method of sampling you wish to use. - The following methods are available for use: - \begin{enumerate} - \item \textbf{Permute Alleles (default)} This is a sampling scheme that will permute alleles within the locus. For example, a single diploid locus with four alleles (1, 2, 3, 4) with the frequencies of 0.1, 0.2, 0.3, and 0.4, respectively: -\begin{Schunk} -\begin{Soutput} - [,1] [,2] -[1,] 4 4 -[2,] 4 1 -[3,] 4 3 -[4,] 2 2 -[5,] 3 3 -\end{Soutput} -\end{Schunk} -might become: -\begin{Schunk} -\begin{Soutput} - [,1] [,2] -[1,] 3 2 -[2,] 2 4 -[3,] 1 3 -[4,] 4 3 -[5,] 4 4 -\end{Soutput} -\end{Schunk} -As you can see, The heterozygosity has changed, yet the allelic frequencies remain the same. Overall this would show you, for example, what would happen if the sample you had underwent panmixis within this sample itself. - \item \textbf{Parametric Bootstrap} The previous scheme reshuffled the observed sample, but the parametric bootstrap uses the allelic frequencies as estimates of what the true allelic frequencies are and uses those as probabilities for each allele when resampling the data with replacement. Here are two samples to show you what I mean. -\begin{Schunk} -\begin{Soutput} -First Sample -\end{Soutput} -\begin{Soutput} - [,1] [,2] -[1,] 1 3 -[2,] 3 3 -[3,] 3 2 -[4,] 4 4 -[5,] 4 2 -\end{Soutput} -\begin{Soutput} -Second Sample -\end{Soutput} -\begin{Soutput} - [,1] [,2] -[1,] 3 4 -[2,] 2 3 -[3,] 4 2 -[4,] 4 4 -[5,] 4 2 -\end{Soutput} -\end{Schunk} - -Notice how the heterozygosity has changed along with the allelic frequencies. The frequencies for alleles 3 and 4 have switched in the first data set, and we've lost allele 1 in the second data set purely by chance! This type of sampling scheme attempts to show you what the true population would look like if it were panmictic and your original sample gave you a basis for estimating expected allele frequencies. Since estimates are made from the observed allele frequencies, small samples will produce skewed results. - \item \textbf{Non-Parametric Bootstrap} The third method is sampling with replacement, but with no assumption about the distribution of the alleles. -\begin{Schunk} -\begin{Soutput} - [,1] [,2] -[1,] 1 3 -[2,] 3 3 -[3,] 3 1 -[4,] 2 2 -[5,] 3 1 -\end{Soutput} -\end{Schunk} -Again, heterozygosity and allele frequencies are not maintained, but now all of the alleles have a 1 in 4 chance of being chosen. - \item \textbf{Multilocus permutation} This is called Multilocus permutation because it does the same thing as the permutation analysis in the program \textit{multilocus} by Paul Agapow and Austin Burt \cite{Agapow:2001}. This will shuffle the genotypes at each locus. Using our example above, here it is shuffled with method 4: -\begin{Schunk} -\begin{Soutput} - [,1] [,2] -[1,] 3 3 -[2,] 4 1 -[3,] 2 2 -[4,] 4 4 -[5,] 4 3 -\end{Soutput} -\end{Schunk} - -Note that you have the same genotypes after shuffling, so at each locus, you will maintain the same allelic frequencies and heterozygosity. So, in this sample, you will only see a homozygote with allele 2. This also ensures that the P-values associated with $I_A$ and $\bar r_d$ are exactly the same (for an explanation, see the end of section \ref{index:iard:ia} of this manual). Unfortunately, if you are trying to simulate a sexual population, this does not make much biological sense as it assumes that alleles are not independently assorting within individuals. - \end{enumerate} -\end{itemize} -These shuffling schemes have been implemented for the index of association, but there may be other summary statistics you can use \texttt{shufflepop} for. All you have to do is use the function \texttt{replicate}. Let's use $I_A$ as an example: -\begin{Schunk} -\begin{Sinput} -> data(nancycats) -> nan1 <- popsub(nancycats, 1) -> ia(nan1) -\end{Sinput} -\begin{Soutput} - Ia rbarD -0.16564272 0.02105965 -\end{Soutput} -\begin{Sinput} -> replicate(10, ia(shufflepop(nan1, method = 2), quiet=TRUE)) -\end{Sinput} -\begin{Soutput} - [,1] [,2] [,3] [,4] [,5] [,6] [,7] -Ia -0.35202232 0.057744361 0.032000000 -0.37774086 -0.21588402 -0.25668449 0.25675321 -rbarD -0.04475446 0.007344014 0.004056698 -0.04803454 -0.02751686 -0.03298552 0.03244442 - [,8] [,9] [,10] -Ia -0.28757799 -0.10313192 -0.071845794 -rbarD -0.03681791 -0.01306466 -0.009271283 -\end{Soutput} -\end{Schunk} -You could use this method to replicate the resampling 999 times and then create a histogram to visualize a distribution of what would happen under different assumptions of panmixia. - -\subsection{Cut It Out! \{removing uninformative loci\}}\label{data.manip:informloci} - \tab\tab Phylogenetically uninformative loci are those that have only one sample differentiating from the rest. This can lead to biased results when using multilocus analyses such as the index of association (See \ref{index:iard} and \ref{summary}). These nuisance loci can be removed with the following function. -\subsubsection{Function: informloci}\label{data.manip:informloci:informloci} -\begin{quote} -Default Command:\\ -\texttt{informloci(pop, cutoff = 2/nInd(pop), quiet = FALSE)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{cutoff -} this represents the minimum fraction of individuals needed for a locus to be considered informative. The default is set to $2/n$ with $n$ being the number of individuals in the data set (represented by the \textit{adegenet} function \texttt{nInd}). Essentially, this means that any locus with fewer than 2 observations differing will be removed. The user can also specify a fraction of observations for the cutoff (eg. 0.05). - \item \texttt{quiet -} if \texttt{TRUE}, nothing will be printed to the screen, if \texttt{FALSE}, the cutoff value in percentage and number of individuals will be printed as well as the names of the uninfomrative loci found. - \end{itemize} - -Here's a quick example. -\begin{Schunk} -\begin{Sinput} -> data(H3N2) -> H.five <- informloci(H3N2, cutoff = 0.05) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -cutoff value: 5 percent ( 95 individuals ). - 47 uninfomative loci found: 157 -177 233 243 262 267 280 303 313 327 357 382 384 399 412 418 424 425 429 433 451 -470 529 546 555 557 564 576 592 595 597 602 612 627 642 647 648 654 658 663 667 -681 717 806 824 837 882 -\end{Soutput} -\end{Schunk} -Now what happens when you have all informative loci: -\begin{Schunk} -\begin{Sinput} -> data(nancycats) -> naninform <- informloci(nancycats, cutoff = 0.05) -\end{Sinput} -\begin{Soutput} -cutoff value: 5 percent ( 12 individuals ). -No sites found with fewer than 12 different individuals. -\end{Soutput} -\end{Schunk} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Multilocus Genotype Analysis}\label{mlg} - -\tab\tab In populations with mixed sexual and clonal reproduction, it is not uncommon to have multiple samples from the same population have the same genotype across multiple loci (multilocus genotype, MLG). Here, we introduce tools for tracking MLGs within and across populations in \texttt{genind} objects from the \textit{adegenet} package. We will be using SNP data from isolates of the H3N2 virus from 2002 to 2006. -\subsection{Just a peek \{How many multilocus genotypes are in our data set?\}}\label{mlg:mlg} - -\tab\tab First, let's take a quick look at how many Multilocus Genotypes are present within the H3N2 data set using the \texttt{mlg} function. This will tell us if any MLG analysis is needed. -\subsubsection{Function: mlg}\label{mlg:mlg:mlg} - -\tab\tab The function \texttt{mlg} allows for the counting of the number of MLGs in a \texttt{genind} object. This is a very simple command for quick reference to determine if your data set needs further multilocus genotype analysis. -\begin{quote} -Default Command:\\ -\texttt{mlg(pop, quiet = FALSE)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{quiet -} if \texttt{TRUE}, the number of individuals and multilocus genotypes will be printed to the screen, if \texttt{FALSE}, nothing will be printed to the screen and the number of multilocus genotypes will be reported. - \end{itemize} -\begin{Schunk} -\begin{Sinput} -> data(H3N2) -> mlg(H3N2, quiet=FALSE) -\end{Sinput} -\begin{Soutput} -############################# -# Number of Individuals: 1903 -# Number of MLG: 752 -############################# -[1] 752 -\end{Soutput} -\end{Schunk} -We can see that since the number of individuals exceeds the number of multilocus genotypes, this data set contains clones. Let's take a look at where those clones are with respect to populations. -\subsection{Clone-ing around \{MLGs across populations\}}\label{mlg:cross} - -\tab\tab Since you have the ability to change the population structure of your data set freely, it is quite possible to see some of the same MLGs across different populations. Tracking them by hand can be a nightmare with large data sets. Luckily, \texttt{mlg.crosspop} has you covered in that regard. -\subsubsection{Function: mlg.crosspop}\label{mlg:cross:mlg.crosspop} - -\tab\tab Analyze the MLGs that cross populations within your data set. This has three output modes. The default one gives a list of MLGs, and for each MLG, it gives a named numeric vector indicating the abundance of that MLG in each population. Alternate outputs are described with \texttt{indexreturn} and \texttt{df}. -\begin{quote} -Default Command:\\ -\texttt{mlg.crosspop(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, indexreturn = FALSE, df = FALSE, quiet = FALSE)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sublist -} see \texttt{mlg.table}, Section \ref{mlg:table:mlg.table}. Analyze specified populations. - \item \texttt{blacklist -} see \texttt{mlg.table}, Section \ref{mlg:table:mlg.table}. Do not include specified populations. - \item \texttt{mlgsub -} see \texttt{mlg.table}, Section \ref{mlg:table:mlg.table}. Only analyze specified MLGs. The vector for this flag can be produced by this function as you will see later in this vignette. - \item \texttt{indexreturn -} return a vector of indices of MLGs. (You can use these in the \texttt{mlgsub} flag, or you can use them to subset the columns of an MLG table). - \item \texttt{df -} return a data frame containing the MLGs, the populations they cross, and the number of copies you find in each population. This is useful for making graphs in \textit{ggplot2}. - \item \texttt{quiet -} \texttt{TRUE} or \texttt{FALSE}. Should the populations be printed to screen as they are processed? (will print nothing if \texttt{indexreturn} is \texttt{TRUE}) - \end{itemize} - -We can see what Multilocus Genotypes cross different populations and then give a vector that shows how many populations each multi-population MLG crosses. -\begin{Schunk} -\begin{Sinput} -> pop(H3N2) <- H3N2$other$x$country -> H.dup <- mlg.crosspop(H3N2, quiet=TRUE) -\end{Sinput} -\end{Schunk} -Here is a snippet of what the output looks like when \texttt{quiet} is \texttt{FALSE}. It will print out the MLG name, the total number of individuals that make up that MLG, and the populations where that MLG can be found. -\begin{Schunk} -\begin{Soutput} -MLG.3: (12 inds) USA Denmark -MLG.9: (16 inds) Japan USA Finland Denmark -MLG.31: (9 inds) Japan Canada -MLG.75: (23 inds) Japan USA Finland Norway Denmark Austria Russia Ireland -MLG.80: (2 inds) USA Denmark -MLG.86: (7 inds) Denmark Austria -MLG.95: (2 inds) USA Bangladesh -MLG.97: (8 inds) USA Austria Bangladesh Romania -MLG.104: (3 inds) USA France -MLG.110: (16 inds) Japan USA China -\end{Soutput} -\end{Schunk} -The output of this function is a list of MLGs, each containing a vector indicating the number of copies in each population. We'll count the number of populations each MLG crosses using the function \texttt{sapply} with \texttt{length}. -\begin{Schunk} -\begin{Sinput} -> head(H.dup) -\end{Sinput} -\begin{Soutput} -$MLG.3 - USA Denmark - 4 8 - -$MLG.9 - Japan USA Finland Denmark - 1 13 1 1 - -$MLG.31 - Japan Canada - 2 7 - -$MLG.75 - Japan USA Finland Norway Denmark Austria Russia Ireland - 2 8 2 1 6 2 1 1 - -$MLG.80 - USA Denmark - 1 1 - -$MLG.86 -Denmark Austria - 3 4 -\end{Soutput} -\begin{Sinput} -> H.num <- sapply(H.dup, length) # count the number of populations each MLG crosses. -> H.num -\end{Sinput} -\begin{Soutput} - MLG.3 MLG.9 MLG.31 MLG.75 MLG.80 MLG.86 MLG.95 MLG.97 MLG.104 MLG.110 MLG.119 - 2 4 2 8 2 2 2 4 2 3 2 -MLG.149 MLG.158 MLG.163 MLG.205 MLG.206 MLG.207 MLG.210 MLG.213 MLG.221 MLG.224 MLG.227 - 2 6 2 2 3 2 2 4 2 3 3 -MLG.234 MLG.241 MLG.244 MLG.246 MLG.252 MLG.253 MLG.258 MLG.274 MLG.277 MLG.283 MLG.285 - 6 3 2 10 2 9 2 5 3 3 2 -MLG.290 MLG.291 MLG.314 MLG.315 MLG.317 MLG.321 MLG.325 MLG.326 MLG.334 MLG.344 MLG.350 - 3 2 2 3 3 2 2 2 2 2 2 -MLG.368 MLG.370 MLG.381 MLG.401 MLG.405 MLG.417 MLG.439 MLG.453 MLG.461 MLG.471 MLG.508 - 3 2 3 3 3 5 2 2 3 2 3 -MLG.529 MLG.530 MLG.540 MLG.548 MLG.552 MLG.556 MLG.570 MLG.578 MLG.580 MLG.582 MLG.589 - 5 3 3 2 2 2 2 2 2 2 2 -MLG.597 MLG.605 MLG.611 MLG.615 MLG.619 MLG.620 MLG.621 - 2 3 2 2 2 4 2 -\end{Soutput} -\end{Schunk} -%\newpage -\subsection{Bringing something to the table \{producing MLG tables and graphs\}}\label{mlg:table} - -We can also create a table of multilocus genotypes per population as well as bar graphs to give us a visual representation of the data. This is achieved through the function \texttt{mlg.table} -\subsubsection{Function: mlg.table}\label{mlg:table:mlg.table} - -\tab\tab Produce a matrix containing counts of MLGs (columns) per population (rows). If there is no population structure to your data set, a vector will be produced instead. -\begin{quote} -Default Command:\\ -\texttt{mlg.table(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, bar = TRUE, total = FALSE, quiet = FALSE)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sublist -} a vector indicating which specific populations you want to produce a table for. This can be a numeric or character vector. See section \ref{data.manip:divide:popsub} for details. - \item \texttt{blacklist -} a vector indicating which specific populations you do not want to include in your table. This can be a numeric or character vector, and does not necessarily have to be the same type as \texttt{sublist}. eg. \texttt{sublist=1:10, blacklist="USA"}. See section \ref{data.manip:divide:popsub} for details. - \item \texttt{mlgsub -} a vector containing the indices of MLGs you wish to subset your table with. - \item \texttt{bar -} \texttt{TRUE} or \texttt{FALSE}. If \texttt{TRUE}, a bar plot will be printed for each population with more than one individual. - \item \texttt{total -} \texttt{TRUE} or \texttt{FALSE}. Should the entire data set be included in the table? This is equivalent to evoking \texttt{colSums} on the table. - \item \texttt{quiet -} \texttt{TRUE} or \texttt{FALSE}. When \texttt{bar} is \texttt{TRUE}, should the populations be printed to screen as they are processed? - \end{itemize} -\begin{Schunk} -\begin{Sinput} -> H.tab <- mlg.table(H3N2, quiet=TRUE, bar=TRUE) -> H.tab[1:10, 1:10] # Showing the first 10 columns and rows of the table. -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - MLG.1 MLG.2 MLG.3 MLG.4 MLG.5 MLG.6 MLG.7 MLG.8 MLG.9 MLG.10 -Japan 0 0 0 0 0 0 1 2 1 0 -USA 0 2 4 1 1 0 0 0 13 0 -Finland 0 0 0 0 0 0 0 0 1 0 -China 0 0 0 0 0 0 0 0 0 0 -South Korea 0 0 0 0 0 1 0 0 0 0 -Norway 1 0 0 0 0 0 0 0 0 0 -Taiwan 0 0 0 0 0 0 0 0 0 0 -France 0 0 0 0 0 0 0 0 0 0 -Latvia 0 0 0 0 0 0 0 0 0 0 -Netherlands 0 0 0 0 0 0 0 0 0 0 -\end{Soutput} -\end{Schunk} -\begin{figure}[h!] - \centering - \caption{\footnotesize An example of a bar-chart produced by \texttt{mlg.table}. Note that this data set would produce several such charts.} - \label{nortable} -\includegraphics{poppr_manual-mlgbarplot} -\end{figure} -\newpage - -The MLG table is not restricted for use with just \textit{Poppr}. One of the main advantages of the function \texttt{mlg.table} is that it allows easy access to diversity functions present in the package \textit{vegan} \cite{vegan}. One very simple example is to create a rarefaction curve for each population in your data set giving the number of expected MLGs for a given sample size. For more information, type \texttt{help("diversity", package="vegan")} in your R console. - -For the sake of a simple example, instead of drawing a curve for each of the 37 countries represented in this sample, let's change the population structure to be the different years of the epidemics. -\begin{Schunk} -\begin{Sinput} -> H.year <- H3N2 -> pop(H.year) <- H.year$other$x$year -> summary(H.year) # Check the data to make sure it's correct. -\end{Sinput} -\begin{Soutput} - # Total number of genotypes: 1903 - - # Population sample sizes: -2002 2003 2004 2005 2006 - 158 415 399 469 462 - - # Number of alleles per locus: -L001 L002 L003 L004 L005 L006 L007 L008 L009 L010 L011 L012 L013 L014 L015 L016 L017 L018 - 3 3 4 2 4 2 3 2 4 3 4 2 4 3 2 2 3 3 -L019 L020 L021 L022 L023 L024 L025 L026 L027 L028 L029 L030 L031 L032 L033 L034 L035 L036 - 2 2 3 3 3 2 2 2 2 2 2 2 2 2 2 4 4 3 -L037 L038 L039 L040 L041 L042 L043 L044 L045 L046 L047 L048 L049 L050 L051 L052 L053 L054 - 3 3 4 2 2 2 4 3 2 3 4 2 3 2 3 2 2 2 -L055 L056 L057 L058 L059 L060 L061 L062 L063 L064 L065 L066 L067 L068 L069 L070 L071 L072 - 4 2 2 2 2 2 2 2 4 4 4 3 3 2 3 4 3 2 -L073 L074 L075 L076 L077 L078 L079 L080 L081 L082 L083 L084 L085 L086 L087 L088 L089 L090 - 3 3 3 3 2 3 2 4 2 3 2 2 3 3 3 3 2 2 -L091 L092 L093 L094 L095 L096 L097 L098 L099 L100 L101 L102 L103 L104 L105 L106 L107 L108 - 2 2 3 2 3 2 3 2 3 2 3 3 2 2 2 3 2 2 -L109 L110 L111 L112 L113 L114 L115 L116 L117 L118 L119 L120 L121 L122 L123 L124 L125 - 2 3 3 3 2 2 3 3 3 3 4 2 3 3 4 3 2 - - # Number of alleles per population: - 1 2 3 4 5 -203 255 232 262 240 - - # Percentage of missing data: -[1] 2.363426 - - # Observed heterozygosity: -[1] 0 - - # Expected heterozygosity: -[1] 0 -\end{Soutput} -\end{Schunk} -\begin{Schunk} -\begin{Sinput} -> library(vegan) -> H.year <- mlg.table(H.year, bar=FALSE) -> rarecurve(H.year, ylab="Multilocus genotypes expected", sample=min(rowSums(H.year))) -\end{Sinput} -\end{Schunk} -\begin{figure}[h!] - \centering - \caption{\footnotesize An example of a rarefaction curve produced using a MLG table.} - \label{rarecurve} -\includegraphics{poppr_manual-mlgrareplot} -\end{figure} -\newpage - -The minimum value from the \textit{base} function \texttt{rowSums()} of the table represents the minimum common sample size of all populations. Setting the ``sample" flag draws the horizontal and vertical lines you see on the graph. The intersections of these lines correspond to the numbers you would find if you ran the function \texttt{poppr} on this data set (under the column ``\texttt{eMLG}"). -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Getting into the mix \{combining MLG functions\}}\label{mlg:mix} - -\tab\tab Alone, the different functionalities are neat. Combined, we can create interesting data sets. Let's say we wanted to know which MLGs were duplicated across the regions of the United Kingdom, Germany, Netherlands, and Norway. All we have to do is use the \texttt{sublist} flag in the function: -\begin{Schunk} -\begin{Sinput} -> UGNN.list <- c("United Kingdom", "Germany", "Netherlands", "Norway") -> UGNN <- mlg.crosspop(H3N2, sublist=UGNN.list, indexreturn=TRUE) -\end{Sinput} -\end{Schunk} -OK, the output tells us that there are three MLGs that are crossing between these -populations, but we do not know how many are in each. We can easily find that out if we subset our original table, \texttt{H.tab}. - -\begin{Schunk} -\begin{Sinput} -> UGNN # Note that we have three numbers here. This will index the columns for us. -\end{Sinput} -\begin{Soutput} -MLG.315 MLG.317 MLG.620 - 315 317 620 -\end{Soutput} -\begin{Sinput} -> UGNN.list # And let's not forget that we have the population names. -\end{Sinput} -\begin{Soutput} -[1] "United Kingdom" "Germany" "Netherlands" "Norway" -\end{Soutput} -\begin{Sinput} -> H.tab[UGNN.list, UGNN] -\end{Sinput} -\begin{Soutput} - MLG.315 MLG.317 MLG.620 -United Kingdom 1 0 0 -Germany 0 1 1 -Netherlands 0 0 0 -Norway 2 3 1 -\end{Soutput} -\end{Schunk} - -Now we can see that Norway has a higher incidence of nearly all of these MLGs. -We can go even further and subset the original data set to only give us those MLGs by utilizing the function \texttt{mlg.vector}: -\subsubsection{Function: mlg.vector}\label{mlg:mix:mlg.vector} - -\tab\tab This function is the backbone for \texttt{mlg.table} and \texttt{mlg.crosspop}, and is The function that determines what your MLGs are. This is quite useful for sub-setting the data set to only contain the MLGs of interest. The numbers in the vector correspond to the number of columns in a matrix produced by \texttt{mlg.table}. It is important to remember that this is also sensitive to missing data and will treat it as a single extra allele. -\begin{quote} -Default Command:\\ -\texttt{mlg.vector(pop)} -\end{quote} - \begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \end{itemize} -\begin{Schunk} -\begin{Sinput} -> H.vec <- mlg.vector(H3N2) -> H.sub <- H3N2[H.vec %in% UGNN, ] -> mlg.table(H.sub, bar=FALSE) -\end{Sinput} -\begin{Soutput} - MLG.1 MLG.2 MLG.3 -Austria 0 0 7 -Germany 0 1 1 -Greece 0 0 1 -Norway 2 3 1 -Japan 4 1 0 -United Kingdom 1 0 0 -\end{Soutput} -\end{Schunk} -You can also do the same thing using the mlgsub flag. -\begin{Schunk} -\begin{Sinput} -> mlg.table(H3N2, mlgsub=UGNN, bar=TRUE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - MLG.315 MLG.317 MLG.620 -Japan 4 1 0 -Norway 2 3 1 -United Kingdom 1 0 0 -Austria 0 0 7 -Germany 0 1 1 -Greece 0 0 1 -\end{Soutput} -\end{Schunk} -And we can see where exactly these three MLGs fall within our data set. -\begin{figure}[h!] - \centering - \caption{\footnotesize An example of the same bar-chart as \textit{Figure 1}, but focusing on three MLGs.} - \label{nortable2} -\includegraphics{poppr_manual-subnor} -\end{figure} -\\ -\newpage -Now, you might notice that the MLG vector no longer matches up with our data after we subset it. -\begin{Schunk} -\begin{Sinput} -> H.vec[1:22] -\end{Sinput} -\begin{Soutput} - [1] 605 605 672 675 674 673 670 671 670 678 678 678 678 582 615 580 581 570 615 582 582 -[22] 592 -\end{Soutput} -\begin{Sinput} -> mlg.vector(H.sub) -\end{Sinput} -\begin{Soutput} - [1] 3 3 3 3 3 3 3 3 3 3 2 1 1 2 2 1 2 1 1 1 1 2 -\end{Soutput} -\end{Schunk} -Well, this is unfortunate because it means that we can't compare any subsetted data with non-subsetted data. Luckily, there's a little trick we can do using our old friend, the \texttt{@other} slot. -If we place the MLG vector in the \texttt{@other} slot of our original data set, it will be subsetted along with the data. -\begin{Schunk} -\begin{Sinput} -> H3N2@other$MLG.vector <- H.vec -> H.sub <- H3N2[H.vec %in% UGNN, ] -> H.sub@other$MLG.vector -\end{Sinput} -\begin{Soutput} - [1] 620 620 620 620 620 620 620 620 620 620 317 315 315 317 317 315 317 315 315 315 315 -[22] 317 -\end{Soutput} -\end{Schunk} -Magic!\\ -%\newpage - -So, we've gotten this far, yet we haven't actually seen what the genotypes look like! For analyses where the genotypic signature is important, this is a crucial identification step. Lucky for us, the \texttt{genind} object retains all of the genotypic information and can be accessed using the \texttt{genind2df} function. Let's take a look at the three genotypes we specified above utilizing the vector of MLGs we created above, \texttt{H.vec}. -\begin{Schunk} -\begin{Sinput} -> H.df <- genind2df(H3N2) -> H.df[H.vec %in% UGNN, 1:15] # Showing only 15 columns becaus it is a large dataset. -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - pop 6 17 39 42 45 51 60 72 73 90 108 123 129 134 -CY026119 Austria a a g c g c g g c g a g t g -CY026120 Austria a a g c g c g g c g a g t g -CY026121 Austria a a g c g c g g c g a g t g -CY026122 Austria a a g c g c g g c g a g t g -CY026131 Austria a a g c g c g g c g a g t g -CY026132 Austria a a g c g c g g c g a g t g -CY026135 Austria a a g c g c g g c g a g t g -EU502462 Germany a a g c g c g g c g a g t g -EU502463 Greece a a g c g c g g c g a g t g -EU502464 Norway a a g c g c g g c g a g t g -EU501513 Japan a a g c t c a g a g t g t g -AB243868 Japan a a g c t c a g a g t g t g -DQ883618 Norway a a g c t c a g a g t g t g -DQ883619 Norway a a g c t c a g a g t g t g -DQ883620 Norway a a g c t c a g a g t g t g -DQ883628 Norway a a g c t c a g a g t g t g -EU501609 Germany a a g c t c a g a g t g t g -EU501642 Japan a a g c t c a g a g t g t g -EU501643 Japan a a g c t c a g a g t g t g -EU501735 United Kingdom a a g c t c a g a g t g t g -EU501742 Japan a a g c t c a g a g t g t g -EU502513 Norway a a g c t c a g a g t g t g -\end{Soutput} -\end{Schunk} -Notice that there seems to be a clear separation between the SNPs of the first 10 isolates and the rest? This is no coincidence. Take a look at the output of our sub-setting. -\begin{Schunk} -\begin{Sinput} -> UGNN -\end{Sinput} -\begin{Soutput} -MLG.315 MLG.317 MLG.620 - 315 317 620 -\end{Soutput} -\begin{Sinput} -> H.vec[H.vec %in% UGNN] -\end{Sinput} -\begin{Soutput} - [1] 620 620 620 620 620 620 620 620 620 620 317 315 315 317 317 315 317 315 315 315 315 -[22] 317 -\end{Soutput} -\end{Schunk} - -We have the MLGs 315, 317, and 620, and the result of the sub-setting shows us that 620 occurs earlier in our data set, and that MLGs 315 and 317 are mixed in together. The reason why we do not see a mixture of three different sets of SNP calls in our little window is because \texttt{mlg.vector} creates the MLGs by first concatenating and then sorting the genotypes. This way, the closer two MLG indexes are to each other, the fewer differences they will have between one another. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Do you see what I see? \{alternative data visualization\}}\label{mlg:alt.vis} - -\tab\tab The graphs that are output by \textit{poppr} are simply aids for the user to make data analysis easier. We want to better visualize how these MLGs cross populations by MLG or population. We also want to see exactly what MLGs are in which populations, and how prevalent they are. As the package \textit{ggplot2} is based on data frames, we have to give ourselves a data frame to work with. We can do this using the \texttt{df = TRUE} flag. -\begin{Schunk} -\begin{Sinput} -> df <- mlg.crosspop(H3N2, df=TRUE, quiet=TRUE) -> names(df) -\end{Sinput} -\begin{Soutput} -[1] "MLG" "Population" "Count" -\end{Soutput} -\end{Schunk} -Now that we have our data frame, we can do a couple of things. We can first see where the most omnipresent MLG occurs. After that, we will plot the top ten MLGs using ggplot2. -\begin{Schunk} -\begin{Sinput} -> H.max <- names(sort(H.num, decreasing=TRUE)[1:10]) -> # Showing the data frame by the largest MLG complex. -> df[df$MLG %in% H.max[1], ] -\end{Sinput} -\begin{Soutput} - MLG Population Count -76 MLG.246 Japan 3 -77 MLG.246 USA 8 -78 MLG.246 China 4 -79 MLG.246 Norway 1 -80 MLG.246 Austria 6 -81 MLG.246 Russia 1 -82 MLG.246 Egypt 1 -83 MLG.246 Iceland 1 -84 MLG.246 Nepal 15 -85 MLG.246 Switzerland 1 -\end{Soutput} -\end{Schunk} -And now we can visualize the largest ten MLG complexes using \textit{ggplot2}'s \texttt{qplot} function. -\begin{figure}[h] - \centering - \caption{\footnotesize An example of the versatility of the MLG information.} - \label{mlgtable2} -\begin{Schunk} -\begin{Sinput} -> df2 <- df[df$MLG %in% H.max, ] -> library(ggplot2) -> qplot(y=MLG, x=Population, data=df2, color=Count, size=Count) + -+ theme(axis.text.x = element_text(size = 10, angle = -45, hjust = 0)) -\end{Sinput} -\end{Schunk} -\includegraphics{poppr_manual-ggplotchart} -\end{figure} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Index and Distance Calculations}\label{index} -\subsection{The missing linkage disequilibrium \{calculating the index of association, $I_A$ and $\bar r_d$\}}\label{index:iard} - -\tab\tab The index of association was originally developed as a measure of multilocus linkage disequilibrium \cite{Brown:1980} and was found to be able to detect signatures of sexual reproduction and population structure \cite{Brown:1980, Smith:1993}. Unfotunately, $I_A$ was found to increase with the number of loci, and was not suitable to comparisons across studies \cite{Agapow:2001}. To remedy this, $\bar{r}_d$ was developed that corrects for this scaling and forces the index to lie between 0 (linkage equilibrium) and 1 (full disequilibrium). $I_A$ has previously been implemented in a couple of programs including \textit{multilocus} \cite{Agapow:2001} and \textit{LIAN} \cite{Haubold:2000}. While both of these programs are still available for download, \textit{multilocus} is no longer actively supported, and \textit{LIAN}, despite its speed, is only appropriate for haplotypic data. Both of these programs each require one specific file format, and, until recently\footnote{LIAN 3.6 allows the user to run multiple contiguous data sets within a single file or across multiple files. It is impossible to run MULTILOCUS in batch.}, neither of these programs had an internal ability to run in batch across multiple populations within a file or multiple files within a directory in the same way that poppr can (see footnote). - -It is important to note that for this algorithm, all missing values are treated in the same way as \textit{multilocus} in that all missing alleles are imputed to be the same as the alleles they are being compared to. Depending on the percent missing data in your data set, this might influence the statistic. If you have a lot of missing data, consider using the \texttt{missing} flag in this function. -\subsubsection{Function: ia}\label{index:iard:ia} - -\tab\tab This function is a quick look at a single data set. It can do almost everything that \texttt{poppr} can do except for sorting through populations. -\begin{quote} -Default Command:\\ -\texttt{ia(pop, sample = 0, method = 1, quiet = FALSE, missing = "ignore", - hist = TRUE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{sample -} You should use this flag whenever you want to reshuffle your data set. Indicate how many times you want to reshuffle your data set to obtain a P-value. - \item \texttt{method -} a number from 1 to 4 indicating the sampling method: - \begin{enumerate} - \item permutation over alleles. - \item parametric bootstrap. - \item non-parametric bootstrap. - \item \textit{multilocus} style permutation \cite{Agapow:2001}. - \end{enumerate} - The methods are detailed in section \ref{data.manip:shuffle:shufflepop} of this manual. - \item \texttt{quiet -} If set to \texttt{TRUE}, nothing will be printed to the screen as the sampling progresses. If \texttt{FALSE} will produce a progress bar. - \item \texttt{missing -} This will preprocess your missing values. It is set to ignore missing data, so that they do not contribute to the distance measure. It can also be set to \texttt{"loci", "geno", "zero", or "mean"}. For details, see section \ref{data.manip:missing:missingno} of this manual. - \item \texttt{hist -} This will produce a pair of histograms for each population showing the distribution of $I_A$ and $\bar r_d$ across the sampled data sets, and plot the observed value as a single vertical line. -\end{itemize} -Running the analysis is as simple as this: -\begin{Schunk} -\begin{Sinput} -> ia(nancycats) -\end{Sinput} -\begin{Soutput} - Ia rbarD -0.17207262 0.02178965 -\end{Soutput} -\end{Schunk} -We can use \texttt{popsub} to subset for specific populations. Here, we'll also demonstrate the sampling flag and show you what the histogram looks like. -\begin{Schunk} -\begin{Sinput} -> set.seed(1009) -> ia(popsub(nancycats, 5), sample=999) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -|================================================================================| 100% -\end{Soutput} -\begin{Soutput} - Ia p.Ia rbarD p.rD --0.047539953 0.572000000 -0.006004254 0.572000000 -\end{Soutput} -\end{Schunk} - -This analysis produced the histograms you see below. What these histograms represent are 999 resamplings of the data under the null hypothesis ($H_0$) of sexual reproduction. The way that $H_0$ is created is determined by the sampling method chosen. In this case, the method was to shuffle genotypes at each locus to simulate unlinked loci. Since the P = 0.572, we would fail to reject $H_0$ and we therefore might conclude that this population is sexually reproducing \cite{Brown:1980} \cite{Smith:1993} \cite{Agapow:2001}. -\begin{figure}[h!] - \centering - \caption{\footnotesize Histograms of 999 values of $I_A$ and $\bar{r}_d$ calculated from 999 resamplings of population 5 from the data set ``nancycats". The observed values of $I_A$ and $\bar r_d$ are represented as vertical blue lines overlaid on the distributions. The ticks at the bottom of each histogram represent individual observations.} - \label{ia_demo_fig} -\includegraphics{poppr_manual-109} -\end{figure} -%\newpage - -There, are, of course a couple of caveats that need to be mentioned regarding our P-values. First, while we have equivalent P-values for $I_A$ and $\bar{r}_d$, they might not always be equal due to the difference in calculation. Details about that can be found in the Appendix section \ref{appendix:algorithm:iard}. Second, the P-values are calculated by comparing how many permuted values are greater than or equal to the observed value. This includes the observed value (which is why setting the randomizations to 999 will give you a round P-value) which means that the lowest P-value you will ever have is $1/(n+1)$ where $n$ is the number of permutations you select. Take for example this population of a clonal root rot pathogen, \textit{Aphanomyces euteiches}: -\begin{Schunk} -\begin{Sinput} -> data(Aeut) -> set.seed(1001) -> ia(popsub(Aeut, 1), sample=999, method=2, quiet=TRUE, hist=FALSE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Ia p.Ia rbarD p.rD -2.90602921 0.00100000 0.07237008 0.00100000 -\end{Soutput} -\end{Schunk} -If you want to be able to report $P < 0.001$ in this situation, then you can simply increase the number in sample: \texttt{sample = 1999} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Going the distance \{dissimilarity distance\}}\label{index:dist} -\tab\tab Since \textit{poppr} is still in its infancy, the number of distance measures it can offer are few. Bruvo's distance is well supported and allows you to quickly visualize your data, but it only allows for microsatellites. The index of association, above, utilizes a discreet dissimilarity distance matrix. It is with this matrix that we have constructed a relative dissimilarity distance where the distance is the ratio of the number of dissimilarities to the number of dissimilarities possible. The number of dissimilarities possible is the number of loci multiplied by the ploidy, so if you have 10 loci from a diploid population, then there are 20 dissimilarities possible. For details, see equations (\ref{eq:ia_d}) and (\ref{eq:ia_D}) in section \ref{appendix:algorithm:iard}. - -\subsubsection{Function: diss.dist}\label{index:dist:diss.dist} - -\tab\tab Use this function to calculate relative dissimilarity between individuals and return a distance matrix for use in creating cladograms or minimum spanning networks. A note: missing alleles will be imputed to be the same as the challenging allele, decreasing the distance between some individuals. If you want to consider all missing data as special alleles, treat your data with \texttt{missingno(pop, type = "zero")} beforehand. - -\begin{quote} -Default Command:\\ -\texttt{diss.dist(pop)} -\end{quote} - -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. -\end{itemize} -Since we have a data set that we know is very clonal, let's analyze the \textit{A. euteiches} data set \cite{Grunwald:2006} and create a heatmap to visualize the degree of difference between populations. -\begin{Schunk} -\begin{Sinput} -> data(Aeut) -> A.dist <- diss.dist(Aeut) -> heatmap(as.matrix(A.dist), symm=TRUE) -\end{Sinput} -\end{Schunk} -\begin{figure}[h!] - \centering - \caption{\footnotesize Heatmap representation of a dissimilarity distance for the data set ``Aeut"} - \label{diss_heat_map} -\includegraphics{poppr_manual-113} -\end{figure} -\newpage -\subsection{Step by stepwise mutation \{Bruvo's distance\}}\label{index:bruvo} - -\tab\tab Bruvo's distance is a genetic distance measure for microsatellite markers utilizing a stepwise mutation model that allows for differing ploidy levels \cite{Bruvo:2004}. As adegenet's genind object has an all or none approach to missing data, any genotypes not exhibiting full ploidy will be treated as missing. This means that only non-special cases will be considered for the calculation and missing data will be ignored \cite{Bruvo:2004}. It is important to note that this is a distance between individuals, not populations, unlike Nei's 1978 distance \cite{Nei:1978}. For distances between populations, see the \textit{adegenet} function \texttt{dist.genpop} - -\subsubsection{Function: bruvo.dist}\label{index:bruvo:bruvo.dist} - -\tab\tab Bruvo's distance requires knowledge of the repeat lengths of each locus, so take care to read the description below. -\begin{quote} -Default Command:\\ -\texttt{bruvo.dist(pop, replen = c(2))} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{replen -} This is a vector of numbers indicating the repeat length for each locus in your sample. If you have two dinucleotide repeats and five tetranucleotide repeats, you would put \texttt{c(2,2,4,4,4,4,4)} in this field. If you have imported data where that represents the raw number of steps, all you would have to type is \texttt{rep(1, n)}, replacing $n$ with the number of loci in your sample. It is important that you place something in this field because this function will attempt to estimate the repeat length based on the minimum difference of the alleles represented; with variability of position calls, relying on this estimation is NOT recommended. -\end{itemize} - -To illustrate why it is important to specify the repeat lengths, let's imagine a locus that contains 5 alleles and the true repeat length is 4. Note that Bruvo's distance between alleles is calculated as $1 - 2^{-[x]}$, where $x$ is the difference in repeat lengths: -\begin{Schunk} -\begin{Sinput} -> locus1 <- c(244, 248, 256, 240, 236) -> locus1/4 -\end{Sinput} -\begin{Soutput} -[1] 61 62 64 60 59 -\end{Soutput} -\begin{Sinput} -> 1 - 2^-dist(locus1/4) -\end{Sinput} -\begin{Soutput} - 1 2 3 4 -2 0.50000 -3 0.87500 0.75000 -4 0.50000 0.75000 0.93750 -5 0.75000 0.87500 0.96875 0.50000 -\end{Soutput} -\end{Schunk} -We can see that the distance between them ranges from 1 to 5. Let's say, that we accidentally wrote 2 or 8 instead of 4: -\begin{Schunk} -\begin{Sinput} -> locus1/2 -\end{Sinput} -\begin{Soutput} -[1] 122 124 128 120 118 -\end{Soutput} -\begin{Sinput} -> 1 - 2^-dist(locus1/2) # Distance increase -\end{Sinput} -\begin{Soutput} - 1 2 3 4 -2 0.7500000 -3 0.9843750 0.9375000 -4 0.7500000 0.9375000 0.9960938 -5 0.9375000 0.9843750 0.9990234 0.7500000 -\end{Soutput} -\begin{Sinput} -> locus1/8 -\end{Sinput} -\begin{Soutput} -[1] 30.5 31.0 32.0 30.0 29.5 -\end{Soutput} -\begin{Sinput} -> 1 - 2^-dist(locus1/8) # Distance decrease -\end{Sinput} -\begin{Soutput} - 1 2 3 4 -2 0.2928932 -3 0.6464466 0.5000000 -4 0.2928932 0.5000000 0.7500000 -5 0.5000000 0.6464466 0.8232233 0.2928932 -\end{Soutput} -\end{Schunk} -While we will still get results from this analysis with the incorrect repeat length, they will be inherently wrong as they do not represent the true distance. That being said, it's important to note that the repeat lengths we represent for the rest of the manual are not known by the authors, but are used as a simple example. - -This function will return a distance matrix (displaying the smallest population in the data set ``nancycats"): -\begin{Schunk} -\begin{Sinput} -> dist9 <- bruvo.dist(popsub(nancycats, 9), replen=rep(1,9)) -> dist9 -\end{Sinput} -\begin{Soutput} - 1 2 3 4 5 6 7 8 -2 0.5778800 -3 0.4008213 0.4563124 -4 0.2202691 0.5093036 0.1805522 -5 0.3270365 0.5533719 0.2352431 0.2178786 -6 0.4016376 0.2760247 0.3192139 0.3330612 0.4294671 -7 0.6150004 0.8707648 0.5533278 0.6167331 0.5219014 0.7330560 -8 0.5492079 0.4086363 0.4391785 0.3289388 0.3886142 0.4528936 0.8037448 -9 0.5925835 0.6227587 0.6203071 0.6585558 0.6186252 0.6099514 0.5060450 0.7026198 -\end{Soutput} -\end{Schunk} -You can visualize this better with a simple heatmap: -%\newpage - -\begin{figure}[h!] - \centering - \caption{\footnotesize Heatmap representation of Bruvo's distance for population 9 of the data set ``nancycats"} - \label{bruvo_heat_map} -\begin{Schunk} -\begin{Sinput} -> heatmap(as.matrix(dist9), symm=TRUE) -\end{Sinput} -\end{Schunk} -\includegraphics{poppr_manual-118} -\end{figure} -%\newpage - -Let's take a closer look at the two individuals, N113 and N111. They seem to have large distances between everyone else and themselves. The names and columns of the matrix contain the names of individuals, but not the population information. We can make a comparison of Bruvo's distance across populations easier by editing the ``Labels" attribute of the distance object. Let's take a look at the labels attribute using the \texttt{attr()} command. -\begin{Schunk} -\begin{Sinput} -> attr(dist9, "Labels") -\end{Sinput} -\begin{Soutput} -NULL -\end{Soutput} -\end{Schunk} -Remember that they all came from population 9, so let's append that to each label using the \texttt{paste()} command. -\begin{Schunk} -\begin{Sinput} -> dist9.attr <- attr(dist9, "Labels") -> attr(dist9, "Labels") <- paste(rep("P09", 9), dist9.attr) -> dist9 -\end{Sinput} -\begin{Soutput} - P09 P09 P09 P09 P09 P09 P09 P09 -P09 0.5778800 -P09 0.4008213 0.4563124 -P09 0.2202691 0.5093036 0.1805522 -P09 0.3270365 0.5533719 0.2352431 0.2178786 -P09 0.4016376 0.2760247 0.3192139 0.3330612 0.4294671 -P09 0.6150004 0.8707648 0.5533278 0.6167331 0.5219014 0.7330560 -P09 0.5492079 0.4086363 0.4391785 0.3289388 0.3886142 0.4528936 0.8037448 -P09 0.5925835 0.6227587 0.6203071 0.6585558 0.6186252 0.6099514 0.5060450 0.7026198 -\end{Soutput} -\end{Schunk} -Now we can see that all of the labels are corresponding to population 9. Let's calculate Bruvo's distance between populations 8 and 9. -\begin{Schunk} -\begin{Sinput} -> dist9to8 <- bruvo.dist(popsub(nancycats, 8:9), replen=rep(1,9)) -> dist9to8.attr <- attr(dist9to8, "Labels") -> nan9to8pop <- nancycats@pop[nancycats@pop %in% c("P08", "P09")] -> attr(dist9to8, "Labels") <- paste(nan9to8pop, dist9to8.attr) -> heatmap(as.matrix(dist9to8), symm=TRUE) -\end{Sinput} -\end{Schunk} -\begin{figure}[h!] - \centering - \caption{\footnotesize Heatmap representation of Bruvo's distance for populations 8 and 9 of the data set ``nancycats"} - \label{bruvo_heat_map_8to9} -\includegraphics{poppr_manual-popcompare_bruvo2} -\end{figure} - -Remember N113 and N111? Take a look at where they fall on the heatmap. They don't cluster together with population 9 anymore, but somewhere in population 8. -% \newpage - -\subsection{See the forest for the trees \{visualizing distances with dendrograms and networks\}}\label{index:trees} - -\tab\tab Staring at a raw distance matrix might be able to tell you something about your data, but it also might be able to ruin your eyesight. In this section, we present functions to display this data in trees and networks. - -\subsubsection{Function: bruvo.boot}\label{index:trees:bruvo.boot} - -\tab\tab This function provides the ability to draw a dendrogram based on Bruvo's distance including bootstrap support. -\begin{quote} -Default Command:\\ -\texttt{bruvo.boot(pop, replen = c(2), B = 100, tree = "upgma", showtree = TRUE, cutoff = NULL, quiet = FALSE)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{replen -} see \texttt{bruvo.dist}, above. - \item \texttt{sample -} How many bootstraps do you want to perform? - \item \texttt{tree -} Two trees are available, Neighbor-Joining \texttt{"nj"} or UPGMA \texttt{"upgma"}. - \item \texttt{showtree -} if \texttt{TRUE}, a tree will be plotted automatically. - \item \texttt{cutoff -} This is a number between 0 and 100 indicating the cutoff value for the bootstrap nodelables. If you only wanted to see the the boostrap values for nodes that were present more than 75\% of the time, you would use \texttt{cutoff = 75}. If you don't put anything for this parameter, all values will be shown. - \item \texttt{quiet -} if \texttt{quiet = TRUE}, no standard messages will be printed to screen. If \texttt{quiet = FALSE} (default), then a progress bar and standard message will be printed to the screen. -\end{itemize} - -For this example, let's set the cutoff to 50\%. -\begin{Schunk} -\begin{Sinput} -> set.seed(1001) -> nan9tree <- bruvo.boot(popsub(nancycats, 8:9), replen=rep(1,9), sample=1000, cutoff=50) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} -Bootstrapping... -(note: calculation of node labels can take a while even after the progress bar is full) -\end{Soutput} -\begin{Soutput} -|================================================================================| 100% -\end{Soutput} -\end{Schunk} -\begin{figure}[h!] - \centering - \caption{\footnotesize UPGMA Tree of Bruvo's distance for population 9 of the data set ``nancycats" with 1000 Bootstrap Replicates. Node labels represent percentage of bootstrap replicates that contained that node.} - \label{bruvo_upgma} -% \includegraphics{bruvo_upgma.png} - -\includegraphics{poppr_manual-bruvo_tree} - -\end{figure} - -\subsubsection{Function: greycurve}\label{index:trees:greycurve} -\tab\tab Use this function to display a gradient of grey values based on user-defined parameters. The following functions will display a minimum spanning network that utilize a grey scale to display the weight of the lines (referred to as ``edges") that connect two or more individuals. The darker the line the closer the distance. Since this is based off of a linear grey scale, what happens when you have a distance matrix comprised of values all below 0.2 or all above 0.8? - -With linear grey scaling, it becomes very difficult to detect the differences in these ranges. The following function allows you to visualize and manipulate a gradient from black to white so that you can use it in \textit{poppr}'s msn functions below to maximize the visual differences in your data. - -\begin{quote} -Default Command:\\ -\texttt{greycurve(glim = c(0, 0.8), gadj = 3, gweight = 1)} -\end{quote} - - -This function does not return any values. It will print a visual gradient from black to white horizontally. On this gradient, it will plot the adjustment curve (in opposing grey values), yellow horizontal lines bounding the maximum and minimum values, and the equation used to calculate the correction in red. Keep in mind that this is plotting values from zero to one. - -%\newpage -First, we'll see what happens when we change the weight parameter. -\setkeys{Gin}{width=\textwidth} -\begin{figure}[h!] -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize Default for \texttt{greycurve()}, weighted for small values.} -\begin{Schunk} -\begin{Sinput} -> greycurve() -\end{Sinput} -\end{Schunk} -\includegraphics{poppr_manual-greycurve_normal} -\end{minipage} -\hspace{0.5cm} -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize weighting for large values.} -\begin{Schunk} -\begin{Sinput} -> greycurve(gweight = 2) -\end{Sinput} -\end{Schunk} -\includegraphics{poppr_manual-greywidth_inverse} -\end{minipage} -\end{figure} - -Now, we'll see what happens when we change the adjustment parameter (affects the shape of the curve) and the upper and lower limits of the grey scale. - -\begin{figure}[h!] -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize Setting the lower and upper limits and weighting the curve heavily toward smaller values.} -\begin{Schunk} -\begin{Sinput} -> greycurve(glim = c(0.2, 0.9), gadj=15) -\end{Sinput} -\end{Schunk} -\includegraphics{poppr_manual-greycurve_small_heavy} -\end{minipage} -\hspace{0.5cm} -\begin{minipage}[b]{0.45\linewidth} -\centering -\caption{\footnotesize Same as the figure on the left, but weighting heavily toward larger values.} -\begin{Schunk} -\begin{Sinput} -> greycurve(glim = c(0.2, 0.9), gadj=15, gweight=2) -\end{Sinput} -\end{Schunk} -\includegraphics{poppr_manual-greywidth_large_heavy} -\end{minipage} -\end{figure} -\newpage - -\subsubsection{Function: bruvo.msn}\label{index:trees:bruvo.msn} -\tab\tab This function will automatically draw a minimum spanning network of MLGs based on Bruvo's distance. It's important to note that this will recalculate Bruvo's distance each time it is run, but the amount of time it takes to run is on the order of seconds. It will return a list containing the network, the populations and the related colors in the network so you can export or redraw it with the legend if you wanted to using the package \textit{igraph} (type \texttt{help("plot.igraph")} for details). -\begin{quote} -Default Command:\\ -\texttt{bruvo.msn(pop, replen = c(2), palette = topo.colors, sublist = "All",\\ - \tab blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8),\\ - \tab gadj = 3, gweight = 1, wscale = TRUE, ...) -} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{replen -} see \texttt{bruvo.dist}, above. - \item \texttt{palette -} this is a \textbf{function} definining a color palette to use. The default is \texttt{topo.colors}. There are different palettes, which you can search by typing \texttt{?rainbow}. If you want a custom color palette, an easy way is to use the function \texttt{colorRampPalette}. - \item \texttt{sublist -} The populations you wish to analyze. This defaults to ``All". See section \ref{data.manip:divide:popsub} for details. - \item \texttt{blacklist -} Populations you do not want to include in the graph. See section \ref{data.manip:divide:popsub} for details. - \item \texttt{vertex.label -} This is an option that is passed on to \textit{igraph}'s \texttt{plot} function. \textit{Poppr} has added two arguments specific to \textit{poppr}. If you want to label the graph with the multilocus genotypes from the whole data set, use the argument \texttt{vertex.label = "mlg"}. If you want to display the representative individual names, you can use the argument \texttt{vertex.label = "inds"}. I say representative individual names because, only one representative from each MLG will be present in the clone corrected data set used to calculate the distance. For no labels, you can choose \texttt{vertex.label = NA}. - \item \texttt{gscale -} If this is set to \texttt{TRUE}, the edge color will be converted to greyscale based on Bruvo's distance. If two nodes are closely related, the edge will appear darker. The limits of the scale can be set by the argument \texttt{glim}. If this is set to \texttt{FALSE}, all edge colors will be black. - \item \texttt{glim -} This is a vector of numbers between 0 and 1. This lets you set the limits of the grey scaling based on R's internal \texttt{grey} function. For example, if you wanted a maximum of 50\% white saturation (for use if you have distantly related nodes) and a minimum of 1\%, you would use \texttt{glim = c(0.01, 0.5)}. - \item \texttt{gadj -} This is an integer greater than zero used to adjust the scaling factor for the grey curve. Since very small changes in the grey scale are not easily precieved, it's useful to be able to adjust the grey scale to be able to show you the weights of each edge. For example, a population with most weights less than 0.3, you might want to set \texttt{gadj = 10} to exaggerate the grey scale. - \item \texttt{gweight -} If \texttt{gweight = 1}, the grey scale adjustment will be weighted towards separating out smaller values of Bruvo's distance. If \texttt{gweight = 2}, the grey scale ajustment will be weighted towards separating out larger values of Bruvo's distance. - \item \texttt{wscale -} If this is set to \texttt{TRUE}, edge widths will be displayed corresponding to Bruvo's distance in that thicker edges will represent a smaller distance between nodes. If this is set to \texttt{FALSE}, all edges will be set to a width of 2. - \item \texttt{... -} This is a placeholder for any other arguments that you want to supply to \textit{igraph}. Useful arguments are \texttt{vertex.label.cex} to adjust the size of the labels, \texttt{vertex.label.dist} to adjust the position of the labels, and \texttt{vertex.label.color} to adjust the color of the labels. -\end{itemize} - -Often, minimum spanning networks are the preferred way to visualize Bruvo's distance. \textit{Poppr} offers an easy way to plot these. For a demonstration, let's analyze a simulated data set of 50 individuals from populations that reproduce at a 99.9\% rate of clonal reproduction. - -\begin{Schunk} -\begin{Sinput} -> data(partial_clone) -> set.seed(9005) -> pc.msn <- bruvo.msn(partial_clone, replen=rep(1, 10), vertex.label.cex=0.7, -+ vertex.label.dist=-0.5, palette=colorRampPalette(c("blue", "yellow"))) -\end{Sinput} -\end{Schunk} -\setkeys{Gin}{width=0.8\textwidth} -\begin{figure}[ht!] - \centering - \caption{\footnotesize Minimum Spanning Network representing 4 simulated populations. Each node represents a different multi locus genotype (MLG). Node sizes and colors correspond to the number of individuals and population membership, respectively. Edge thickness and color are proportional to Bruvo's distance. Edge lengths are arbitrary.} - \label{mst_bruvo} -\includegraphics{poppr_manual-bruvo_msn} -\end{figure} -\setkeys{Gin}{width=0.5\textwidth} - -The output, as mentioned earlier, is a list containing the graph constructed via the \textit{igraph} package, a vector of the population names and a vector of colors representing the populations. -\begin{Schunk} -\begin{Sinput} -> library(igraph) -> pc.msn -\end{Sinput} -\begin{Soutput} -$graph -IGRAPH UNW- 26 25 -- -+ attr: name (v/c), size (v/n), shape (v/c), pie (v/x), pie.color (v/x), label - (v/c), weight (e/n), color (e/c), width (e/n) - -$populations -[1] "1" "2" "3" "4" - -$colors -[1] "#0000FF" "#5555AA" "#AAAA55" "#FFFF00" -\end{Soutput} -\end{Schunk} - -Note that the thickness of the edges (the lines that are connecting the dots) is representative of relatedness between individuals, but the lengths do not necessarily mean anything due to the fact that with a larger data sets, displaying lengths proportional to relatedness would be impossible to draw on a 2D surface. Interpreting these data would show that MLG 9 has 5 individuals from all four populations and that it is most closely related to MLG 7, whereas the most distantly related connection exists between MLG 25 and MLG 26. -\newpage -Since a graph can be represented in many ways, you might want to play around with different layouts using the \texttt{layout()} function in \textit{igraph}. Type \texttt{help("layout", package = igraph)} for details. Below is the code for reconstructing the previous graph using the output: -\begin{Schunk} -\begin{Sinput} -> set.seed(9005) -> library(igraph) -> plot(pc.msn$graph, vertex.size = V(pc.msn$graph)$size * 3, vertex.label.cex=0.7, -+ vertex.label.dist=-0.5,) -> legend(-1.55, 1, bty = "n", cex = 0.75, legend = pc.msn$populations, -+ title = "Populations", fill = pc.msn$colors, border = NULL) -\end{Sinput} -\end{Schunk} - -\subsubsection{Function: poppr.msn}\label{index:trees:poppr.msn} - -\tab\tab Use this function to draw a minimum spanning network from your data set and a distance matrix derived from your data set. Since there are hundreds of distances that can be calculated for genetic data, and since I want to be able to graduate at some point in this decade, functions to automatically calculate distances and draw the minimum spanning networks will be few and far between. This function is an attempt to meet the user halfway and draw a minimum spanning network provided that the user has supplied two things: -\begin{enumerate} - \item A distance matrix over all individuals. - \item The original data set containing demographic information. -\end{enumerate} - -That's it. For the most part, this function is functionally the same as \texttt{bruvo.msn}, except that instead of being exclusive to microsatellite markers, you can now visualize distances in any marker type provided that you have the two items listed above. - -\begin{quote} -Default Command:\\ -\texttt{poppr.msn(pop, distmat, palette = topo.colors, sublist = "All",\\ - \tab blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8),\\ - \tab gadj = 3, gweight = 1, wscale = TRUE, ...) -} -\end{quote} -\begin{itemize} - \item \texttt{pop -} a \texttt{genind} object. - \item \texttt{distmat -} a dissimilarity distance matrix derived from your data with distances between zero and one. - \item \texttt{palette -} this is a \textbf{function} definining a color palette to use. The default is \texttt{topo.colors}. There are different palettes, which you can search by typing \texttt{?rainbow}. If you want a custom color palette, an easy way is to use the function \texttt{colorRampPalette}. - \item \texttt{sublist -} The populations you wish to analyze. This defaults to ``All". - \item \texttt{blacklist -} Populations you do not want to include in the graph. - \item \texttt{vertex.label -} This is an option that is passed on to \textit{igraph}'s \texttt{plot} function. \textit{Poppr} has added two arguments specific to \textit{poppr}. If you want to label the graph with the multilocus genotypes from the whole data set, use the argument \texttt{vertex.label = "mlg"}. If you want to display the representative individual names, you can use the argument \texttt{vertex.label = "inds"}. I say representative individual names because, only one representative from each MLG will be present in the clone corrected data set used to calculate the distance. For no labels, you can choose \texttt{vertex.label = NA}. - \item \texttt{gscale -} If this is set to \texttt{TRUE}, the edge color will be converted to greyscale based on the distance. If two nodes are closely related, the edge will appear darker. The limits of the scale can be set by the argument \texttt{glim}. If this is set to \texttt{FALSE}, all edge colors will be black. - \item \texttt{glim -} This is a vector of numbers between 0 and 1. This lets you set the limits of the grey scaling based on R's internal \texttt{grey} function. For example, if you wanted a maximum of 50\% white saturation (for use if you have distantly related nodes) and a minimum of 1\%, you would use \texttt{glim = c(0.01, 0.5)}. - \item \texttt{gadj -} This is an integer greater than zero used to adjust the scaling factor for the grey curve. Since very small changes in the grey scale are not easily precieved, it's useful to be able to adjust the grey scale to be able to show you the weights of each edge. For example, a population with most weights less than 0.3, you might want to set \texttt{gadj = 10} to exaggerate the grey scale. - \item \texttt{gweight -} If \texttt{gweight = 1}, the grey scale adjustment will be weighted towards separating out smaller values of the distance. If \texttt{gweight = 2}, the grey scale ajustment will be weighted towards separating out larger values of Bruvo's distance. - \item \texttt{wscale -} If this is set to \texttt{TRUE}, edge widths will be displayed corresponding to Bruvo's distance in that thicker edges will represent a smaller distance between nodes. If this is set to \texttt{FALSE}, all edges will be set to a width of 2. - \item \texttt{... -} This is a placeholder for any other arguments that you want to supply to \textit{igraph}. Useful arguments are \texttt{vertex.label.cex} to adjust the size of the labels, \texttt{vertex.label.dist} to adjust the position of the labels, and \texttt{vertex.label.color} to adjust the color of the labels. -\end{itemize} - -Since we have the ability, let's visualize the \textit{A. euteiches} data set \cite{Grunwald:2006}. -\begin{Schunk} -\begin{Sinput} -> data(Aeut) -> A.dist <- diss.dist(Aeut) -> set.seed(9005) -> A.msn <- poppr.msn(Aeut, A.dist, vertex.label=NA, palette=rainbow, gadj=15) -\end{Sinput} -\end{Schunk} - -\setkeys{Gin}{width=0.8\textwidth} -\begin{figure}[ht!] - \centering - \caption{\footnotesize Minimum Spanning Network representing 4 simulated populations. Each node represents a different multi locus genotype (MLG). Node sizes and colors correspond to the number of individuals and population membership, respectively. Edge thickness and color are proportional to Bruvo's distance. Edge lengths are arbitrary.} - \label{mst_poppr} -\includegraphics{poppr_manual-poppr_msn_fig} -\end{figure} -\setkeys{Gin}{width=0.5\textwidth} -\newpage -%\subsubsection{Gory details} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{I know what you did last summary table \{diversity table\}}\label{summary} - -\tab\tab Remember the summary function that you used to get all the diversity statistics in section \ref{intro:qstart}? In this section, we will flesh out all that you can do with this function. This was the very first function that was written for \textit{poppr} to make it easy for the user to manipulate and summarize the data in one function. -\subsection{Function: poppr}\label{summary:poppr} - -\tab\tab This function is quite daunting with all its possibilities. You have the option to subset your data for specific populations, correct for missing data, and clone correct. With each of these possibilities, comes the need to provide all the arguments for their various functions. -\begin{quote} -Default Command:\\ -\texttt{poppr(pop, total = TRUE, sublist = c("ALL"), blacklist = c(NULL), sample = 0,\\ - \tab method = 1, missing = "ignore", cutoff = 0.05, quiet = FALSE, \\ - \tab clonecorrect = FALSE, hier = c(1), dfname = "population\_hierarchy", \\ - \tab hist = TRUE, minsamp = 10)} -\end{quote} -\begin{itemize} - \item \texttt{pop -} A \texttt{genind} object. - \item \texttt{total -} This is also a synonym for ``pooled". This will calculate all diversity statistics on the entire data set if set to \texttt{TRUE} or if there is no population structure. - \item \emph{popsub functions:} See section \ref{data.manip:divide} - \begin{description} - \item[sublist -] A list of populations you want to include in your analysis. - \item[blacklist -] A list of populations you want to exclude from your analysis. - \end{description} - \item \emph{shufflepop functions:} See section \ref{data.manip:shuffle} \\ Note that this only affects the calculation for $I_A$ and $\bar r_d$. - \begin{description} - \item[sample -] The number of samples you desire (eg. 999) - \item[method -] Which sampling method? 1: permute, 2: parametric bootstrap, 3: non-parametric bootstrap, 4: multilocus. - \end{description} - \item \emph{missingno functions:} See Section \ref{data.manip:missing} \\ Note that all analyses in this function ignore/impute missing data by default. - \begin{description} - \item[missing -] How to deal with missing data. This feeds into the \texttt{type} flag of \texttt{missingno}. - \item[cutoff -] Allowable percentage of missing data per genotype or locus. - \end{description} - \item \texttt{quiet -} If set to \texttt{TRUE}, nothing will be printed to the screen as the sampling progresses. If \texttt{FALSE} (default) a progress bar will be produced. - \item \emph{clonecorrect functions:} See section \ref{data.manip:cc} - \begin{description} - \item[clonecorrect -] if this is set to \texttt{TRUE}, then you will need to set the next two parameters. - \item[hier -] A list of the population hierarchy, or names of columns in the data frame noted below. - \item[dfname -] A data frame in the \texttt{@other} slot of the \texttt{genind} object containing all of the population factors in different columns. For an example, see sections \ref{data.manip:hier} and \ref{data.manip:cc}. - \item[keep -] A vector of integers as indexes for the \texttt{hier} flag indicating which levels of the hierarchy you want to analyze. See section \ref{data.manip:cc} for details. - \end{description} - \item \texttt{hist -} if \texttt{TRUE}, a histogram of distributions of $I_A$ and $\bar r_d$ will be displayed with each population if there is sampling. - \item \texttt{minsamp -} The minimum number of individuals you want to use to calculate the expected number of MLGs. The default is set to 10. -\end{itemize} - -This function produces a table that contains the population name, number of individuals observed, number of MLGs observed, number of MLGs expected at the lowest common sampling size within the data set \cite{Hurlbert:1971} \cite{Heck:1975}, the Shannon-Wiener index \cite{Shannon:1948}, Stoddart and Taylor's index for expected MLGs \cite{Stoddart:1988}, Nei's 1987 genotypic diversity \cite{Nei:1978}, evenness \cite{Pielou:1975}\cite{Ludwig:1988}\cite{Grunwald:2003}, the index of association \cite{Brown:1980}\cite{Smith:1993}, the standardized index of association \cite{Agapow:2001}, and the file name. Most of these indices are calculated by converting the population into an MLG table with \texttt{mlg.table} (see section \ref{mlg:table}) and using the \textit{vegan} package's \texttt{diversity} function (To see details, type \texttt{?vegan::diversity} into the R console). - -To begin, let's revisit our example data set of \textit{Aphanomyces euteiches} \cite{Grunwald:2006}. -\begin{Schunk} -\begin{Sinput} -> data(Aeut) -> poppr(Aeut) -\end{Sinput} -\begin{Soutput} -| Athena -| Mt. Vernon -| Total - Pop N MLG eMLG SE H G Hexp E.5 Ia rbarD File -1 Athena 97 70 65.981 1.246 4.063 42.193 0.986 0.721 2.906 0.072 rootrot.csv -2 Mt. Vernon 90 50 50.000 0.000 3.668 28.723 0.976 0.726 13.302 0.282 rootrot.csv -3 Total 187 119 68.453 2.989 4.558 68.972 0.991 0.720 14.371 0.271 rootrot.csv -\end{Soutput} -\end{Schunk} -OK, so we were able to get a table out of this. Now let's see what happens when we do some sampling to see if this is reproducing clonally or not. We will turn quiet on and the histogram off to save space. -\begin{Schunk} -\begin{Sinput} -> poppr(Aeut, sample=999, hist=FALSE, quiet=TRUE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia p.Ia rbarD p.rD -1 Athena 97 70 65.981 1.246 4.063 42.193 0.986 0.721 2.906 0.001 0.072 0.001 -2 Mt. Vernon 90 50 50.000 0.000 3.668 28.723 0.976 0.726 13.302 0.001 0.282 0.001 -3 Total 187 119 68.453 2.989 4.558 68.972 0.991 0.720 14.371 0.001 0.271 0.001 - File -1 rootrot.csv -2 rootrot.csv -3 rootrot.csv -\end{Soutput} -\end{Schunk} -From now on, we'll set \texttt{quiet = TRUE} to save space on our vignette. Let's clone correct at different levels to see if that affects the index of association. First, we'll clone correct at the sub population level. -\begin{Schunk} -\begin{Sinput} -> poppr(Aeut, sample=999, clonecorrect=TRUE, hier=c("Pop","Subpop"), -+ dfname="population_hierarchy", quiet=TRUE, hist=FALSE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia p.Ia rbarD p.rD -1 Athena 76 70 60.621 1.017 4.221 65.636 0.998 0.963 2.535 0.001 0.062 0.001 -2 Mt. Vernon 65 50 50.000 0.000 3.796 36.739 0.988 0.821 14.310 0.001 0.298 0.001 -3 Total 141 119 59.629 1.854 4.705 96.980 0.997 0.876 13.802 0.001 0.260 0.001 - File -1 rootrot.csv -2 rootrot.csv -3 rootrot.csv -\end{Soutput} -\end{Schunk} -And at the population level. -\begin{Schunk} -\begin{Sinput} -> poppr(Aeut, sample=999, clonecorrect=TRUE, hier="Pop", -+ dfname="population_hierarchy", quiet=TRUE, hist=FALSE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia p.Ia rbarD p.rD -1 Athena 70 70 50.000 0.000 4.248 70.000 1 1.000 2.438 0.001 0.060 0.001 -2 Mt. Vernon 50 50 50.000 0.000 3.912 50.000 1 1.000 13.856 0.001 0.285 0.001 -3 Total 120 119 49.828 0.377 4.776 118.033 1 0.995 12.497 0.001 0.234 0.001 - File -1 rootrot.csv -2 rootrot.csv -3 rootrot.csv -\end{Soutput} -\end{Schunk} -As you can see, clone correction doesn't always have to involve creation of new data sets! - -You might notice that the P-values for both $I_A$ and $\bar r_d$ are often equal to each other. This will always be the case with the sampling method utilized in method 4 \cite{Agapow:2001}. Here, we show examples where they are not equal and why it's okay. -\begin{Schunk} -\begin{Sinput} -> set.seed(2002) -> poppr(nancycats, sublist=5:6, total=FALSE, sample=999, method=2, quiet=TRUE, hist=FALSE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia p.Ia rbarD p.rD File -1 5 15 15 11 0 2.708 15 1 1 -0.048 0.576 -0.006 0.576 truenames(nancycats)$tab -2 6 11 11 11 0 2.398 11 1 1 0.334 0.070 0.043 0.071 truenames(nancycats)$tab -\end{Soutput} -\end{Schunk} -The reason why the P-values would be different is described at the end of section \ref{index:iard:ia}. The differences in P-values are normally not very far off. It's important to note this because of what can happen in extremely clonal populations. You can end up with a large enough sample size consisting of very few MLGs. Upon shuffling using method 4, you find that there are very few values of $I_A$ and $\bar r_d$ that can be obtained. Observe with this simulated data set: -\begin{Schunk} -\begin{Sinput} -> set.seed(2004) -> poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=4, quiet=TRUE) -\end{Sinput} -\end{Schunk} -\begin{Schunk} -\begin{Soutput} - Pop N MLG eMLG SE H G Hexp E.5 Ia p.Ia rbarD p.rD File -1 Total 100 6 6 0 1.235 2.79 0.648 0.735 0.05 0.09 0.061 0.09 simulated.dat -\end{Soutput} -\end{Schunk} -\begin{figure}[h!] - \centering - \caption{\footnotesize Output of multilocus-style sampling. Note the multi-modal distribution.} - \label{simulated_dist_fig} - \includegraphics{simulated_dist_fig.png} -% <>= -% set.seed(2004) -% der <- poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=1, quiet=TRUE, hist=TRUE) -% @ -\end{figure} -\newpage - -Take a look a these two histograms. The number of ways you can recombine the data with the default sampling method is very small. Other sampling methods could give a more theoretical distribution. Let's try the parametric bootstrap (For details, see section \ref{data.manip:shuffle}). -\begin{figure}[h!] - \centering - \caption{\footnotesize Output for parametric bootstrap sampling.} - \label{simulated_param_fig} - \includegraphics{simulated_param_fig.png} -% <>= -% set.seed(2004) -% der <- poppr(system.file("files/simulated.dat", package="poppr"), sample=999, method=3, quiet=TRUE) -% @ -\end{figure} - -As you can see, the distribution is much closer to a distribution we would expect if this were a small sample of a larger population. - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Appendix}\label{appendix} -\subsection{Algorithmic Details}\label{appendix:algorithm} -\subsubsection{$I_A$ and $\bar r_d$}\label{appendix:algorithm:iard} - -\tab \tab The index of association was originally developed by A.H.D. Brown analyzing population structure of wheat \cite{Brown:1980}. It has been widely used as a tool to detect clonal reproduction within populations \cite{Smith:1993}. Populations whose members are undergoing sexual reproduction, whether it be selfing or out-crossing, will produce gametes via meiosis, and thus have a chance to shuffle alleles in the next generation. Populations whose members are undergoing clonal reproduction, however, generally do so via mitosis. This means that the most likely mechanism for a change in genotype is via mutation. The rate of mutation varies from species to species, but it is rarely sufficiently high to approximate a random shuffling of alleles. The index of association is a calculation based on the ratio of the variance of the raw number of differences between individuals and the sum of those variances over each locus \cite{Smith:1993}. You can also think of it as the observed variance over the expected variance. If they are the same, then the index is zero after subtracting one (from Maynard-Smith, 1993 \cite{Smith:1993}): -\beq -\label{eq:I_A} -I_A = \frac{V_O}{V_E}-1 -\eeq -Since the distance is more or less a binary distance, any sort of marker can be used for this analysis. In the calculation, phase is not considered, and any difference increases the distance between two individuals. Consider the genotypes of the dummy data frame we created earlier: -\begin{Schunk} -\begin{Soutput} - locus1 locus2 locus3 -1 101/101 201/201 301/302 -2 102/103 202/203 301/303 -3 102/102 203/204 304/305 -\end{Soutput} -\end{Schunk} -Now, consider the first locus represented in the genind object: -\begin{Schunk} -\begin{Soutput} - L1.1 L1.2 L1.3 -1 1 0.0 0.0 -2 0 0.5 0.5 -3 0 1.0 0.0 -\end{Soutput} -\end{Schunk} -Remember that each column represents a different allele and that each entry in the table represents the fraction of the genotype made up by that allele at that locus. Notice also that the sum of the rows all equal one. \textit{Poppr} uses this to calculate distances by simply taking the sum of the absolute values of the differences between rows. - -The calculation for the distance between two individuals at a single locus with $a$ allelic states and a ploidy of $k$ is as follows\footnote{Individuals with Presence / Absence data will have the $k/2$ term dropped.}: -\beq -\label{eq:ia_d} -d = \displaystyle \frac{k}{2}\sum_{i=1}^{a} \mid ind_{Ai} - ind_{Bi}\mid -\eeq -\begin{Schunk} -\begin{Sinput} -> abs(dfg@tab[1, 1:3] - dfg@tab[2, 1:3]) -\end{Sinput} -\begin{Soutput} -L1.1 L1.2 L1.3 - 1.0 0.5 0.5 -\end{Soutput} -\begin{Sinput} -> abs(dfg@tab[1, 1:3] - dfg@tab[3, 1:3]) -\end{Sinput} -\begin{Soutput} -L1.1 L1.2 L1.3 - 1 1 0 -\end{Soutput} -\begin{Sinput} -> abs(dfg@tab[2, 1:3] - dfg@tab[3, 1:3]) -\end{Sinput} -\begin{Soutput} -L1.1 L1.2 L1.3 - 0.0 0.5 0.5 -\end{Soutput} -\end{Schunk} -As you can see, these values of $d$ at locus one add up to 2, 2, and 1, respectively. - -To find the total number of differences between two individuals over all loci, you just take $d$ over $m$ loci, a value we'll call $D$: - -\beq -\label{eq:ia_D} -D = \displaystyle \sum_{i=1}^{m} d_i -\eeq - -These values are calculated over all possible combinations of individuals in the data set, ${n \choose 2}$ after which you end up with ${n \choose 2}\cdot{}m$ values of $d$ and ${n \choose 2}$ values of $D$. -Calculating the observed variances is fairly straightforward (modified from Agapow and Burt, 2001) \cite{Agapow:2001}: - -\beq -\label{eq:V_O} -V_O = \frac{\displaystyle \sum_{i=1}^{n \choose 2} D_{i}^2 - \frac{(\displaystyle\sum_{i=1}^{n \choose 2} D_{i})^2}{{n \choose 2}}}{{n \choose 2}} -\eeq - -Calculating the expected variance is the sum of each of the variances of the individual loci. The calculation at a single locus, $j$ is the same as the previous equation, substituting values of $D$ for $d$ \cite{Agapow:2001}: - -\beq -\label{eq:var_j} -var_j = \frac{\displaystyle \sum_{i=1}^{n \choose 2} d_{i}^2 - \frac{(\displaystyle\sum_{i=1}^{n \choose 2} d_i)^2}{{n \choose 2}}}{{n \choose 2}} -\eeq - -The expected variance is then the sum of all the variances over all $m$ loci \cite{Agapow:2001}: - -\beq -\label{eq:V_E} -V_E = \displaystyle \sum_{j=1}^{m} var_j -\eeq - -Now you can plug the sums of equations (\ref{eq:V_O}) and (\ref{eq:V_E}) into equation (\ref{eq:I_A}) to get the index of association. -Of course, Agapow and Burt showed that this index increases steadily with the number of loci, so they came up with an approximation that is widely used, $\bar r_d$ \cite{Agapow:2001}. For the derivation, see the manual for \textit{multilocus}. The equation is as follows, utilizing equations (\ref{eq:V_O}), (\ref{eq:var_j}), and (\ref{eq:V_E}) \cite{Agapow:2001}: - -\beq -\label{eq:r_d} -\bar{r_d} = \frac{V_O - V_E} -{2\displaystyle \sum_{j=1}^{m}\displaystyle \sum_{k \neq j}^{m}\sqrt{var_j\cdot{}var_k}} -\eeq - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsubsection{Bruvo's distance}\label{appendix:algorithm:bruvo} - -\tab \tab Bruvo's distance between two individuals calculates the minimum distance across all combinations of possible pairs of alleles at a single locus and then averaging that distance across all loci \cite{Bruvo:2004}. The distance between each pair of alleles is calculated as \cite{Bruvo:2004}: - -\beq -\label{eq:m_x} -m_x = 2^{-\mid x \mid} -\eeq - -\beq -\label{eq:d_a} -d_a = 1 - m_x -\eeq - -Where $x$ is the number of steps between each allele. So, let's say we were comparing two haploid $(k = 1)$ individuals with alleles 228 and 244 at a locus that had a tetranucleotide repeat pattern (CATG$)^n$. The number of steps for each of these alleles would be $228/4 = 57$ and $244/4 =61$, respectively. The number of steps between them is then $\mid 57 - 61 \mid = 4$. Bruvo's distance at this locus between these two individuals is then $1-2^{-4} = 0.9375$. For samples with higher ploidy ($k$), there would be $k$ such distances of which we would need to take the sum \cite{Bruvo:2004}. - -\beq -\label{eq:s_i} -s_i = \displaystyle \sum_{a=1}^{k} d_a -\eeq - -Unfortunately, it's not as simple as that since we do not assume to know phase. Because of this, we need to take all possible combinations of alleles into account. This means that we will have $k^2$ values of $d_a$, when we only want $k$. How do we know which $k$ distances we want? We will have to invoke parsimony for this and attempt to take the minimum sum of the alleles, of which there are $k!$ possibilities \cite{Bruvo:2004}: - -\beq -\label{eq:d_l} -d_l = \frac{\left(\displaystyle \min_{i \dotsc k!} s_i\right)}{k} -\eeq - -Finally, after all of this, we can get the average distance over all loci \cite{Bruvo:2004}. - -\beq -\label{eq:D} -D = \frac{\displaystyle \sum_{i=1}^l d_i}{l} -\eeq - -This is calculated over all possible combinations of individuals and results in a lower triangle distance matrix over all individuals. - -\subsection{Exporting Graphics}\label{appendix:graphics} -\tab\tab R has the ability to produce nice graphics from most any type of data, but to get these graphics into a report, presentation, or manuscript can be a bit challenging. It's no secret that the R Documentation pages are a little difficult to interpret, so I will give the reader here a short example on how to export graphics from R. Note that any code here that will produce images will also be present in other places in this vignette. The default installation of the R GUI is quite minimal, and for an easy way to manage your plots and code, I strongly encourage the user to use Rstudio \url{http://www.rstudio.com/}. - -\subsubsection{Basics}\label{appendix:graphics:basics} -\tab\tab Before you export graphics, you have to ask yourself what they will be used for. If you want to use the graphic for a website, you might want to opt for a low-resolution image so that it can load quickly. With printing, you'll want to make sure that you have a scalable or at least a very high resolution image. Here, I will give some general guidelines for graphics (note that these are merely suggestions, not defined rules). -\begin{itemize} - \item \textbf{What you see is not always what you get} I have often seen presentations where the colors were too light or posters with painfully pixellated graphs. Think about what you are going to be using a graphic for and how it will appear to the intended audience given the media type. - \item \textbf{$\geq$ 300 dpi unless its for a web page} For any sort of printed material that requires a raster based image, 300dpi (dots per inch) is the absolute minimum resolution you should use. For simple black and white line images, 1200dpi is better. This will leave you with crisp, professional looking images. - \item \textbf{If possible, save to SVG, then rasterize} Raster images (bmp, png, jpg, etc...) are based off of the number of pixels or dots per inch it takes to render the image. This means that the raster image is more or less a very fine mosaic. Vector images (SVG) are built upon several interconnected polygons, arcs, and lines that scale relative to one another to create your graphic. With vector graphics, you can produce a plot and scale it to the size of a building if you wanted to. When you save to an SVG file first, you can also manipulate it in programs such as Adobe Illustrator or Inkscape. - \item \textbf{Before saving, make sure the units and dimensions are correct} Unless you really wanted to save a graph that's over 6 feet wide. -\end{itemize} - -\subsubsection{Image Editors}\label{appendix:graphics:editors} -\tab\tab Often times, fine details such as labels on networks need to be tweaked by hand. Luckily, there are a wide variety of programs that can help you do that. Here is a short list of image editors (both free and for a price) that you can use to edit your graphics. -\begin{itemize} - \item Bitmap based editors (for jpeg, bmp, png, etc...) - \begin{quote} - \begin{itemize} - \item[ \scshape The GIMP ] Free, cross-platform. \url{http://www.gimp.org} - \item[ \scshape Paint.net ] Free, Windows only. \url{http://www.getpaint.net} - \item[ \scshape Adobe Photoshop ] Pricey, Windows and Mac. \url{http://www.adobe.com/products/photoshop.html} - \end{itemize} - \end{quote} - \item Scalable Vector Graphics based editors (for svg, pdf) - \begin{quote} - \begin{itemize} - \item[ \scshape Inkscape ] Free, cross-platform \url{http://inkscape.org} - \item[ \scshape Adobe Illustrator ] Pricey, Windows and Mac. \url{http://www.adobe.com/products/illustrator.html} - \end{itemize} - \end{quote} -\end{itemize} - -\subsubsection{Exporting ggplot2 graphics}\label{appendix:graphics:ggplot2} -\tab\tab \textit{ggplot2} is a fantastic package that \textit{poppr} uses to produce graphs for the \texttt{mlg.table}, \texttt{poppr}, and \texttt{ia} functions. Saving a plot with \textit{ggplot2} is performed with one command after your plot has rendered: -\begin{Schunk} -\begin{Sinput} -> data(nancycats) # Load the data set. -> poppr(nancycats, sublist=5, sample=999) # Produce a single plot. -> ggsave("nancy5.pdf") -\end{Sinput} -\end{Schunk} -Note that you can name the file anything, and \texttt{ggsave} will save it in that format for you. The details are in the documentation and you can access it by typing \texttt{help("ggsave")} in your R console. The important things to note are that you can set a \texttt{width}, \texttt{height}, and \texttt{unit}. The only downside to this function is that you can only save one plot at a time. If you want to be able to save multiple plots, read on to the next section. - -\subsubsection{Exporting any graphics}\label{appendix:graphics:export} -\tab\tab Some of the functions that \textit{poppr} offers will give you multiple plots, and if you want to save them all, using \texttt{ggsave} will require a lot of tedious typing and clicking. Luckily, R has Functions that will save any plot you generate in nearly any image format you want. You can save in raster images such as png, bpm, and jpeg. You can also save in vector based images such as svg, pdf, and postscript. The important thing to remember is that when you are saving in a raster format, the default units of measurement are ``pixels", but you can change that by specifying your unit of choice and a resolution. - -For raster images and svg files, you can only save your plots in multiple files, but pdf and postscript plots can be saved in one file as multiple pages. All of these functions have the same basic form. You call the function to specify the file type you want (eg. \texttt{pdf("myfile.pdf")}), create any graphs that you want to create, and then make sure to close the session with the function \texttt{dev.off()}. Let's give an example saving to pdf and png files. - -\begin{Schunk} -\begin{Sinput} -> data(H3N2) -> pop(H3N2) <- H3N2$other$x$country -> #### -> png("H3N2_barchart%02d.png", width = 14, height = 14, units = "in", res = 300) -> H.tab <- mlg.table(H3N2) -> dev.off() -> #### -\end{Sinput} -\end{Schunk} - -Since this data set is made up of 30 populations with more than 1 individual, this will save 30 files to your working directory named ``H3N2\_barchart01.png...H3N2\_barchart30.png". The way R knows how to number these files is because of the \texttt{\%02d} part of the command. That's telling R to use a number that is two digits long in place of that expression. All of these files will be 14x14" and will have a resolution of 300 dots per inch. If you wanted to do the same thing, but place them all in one file, you should use the pdf option. - -\begin{Schunk} -\begin{Sinput} -> pdf("H3N2_barcharts.png", width = 14, height = 14, compress = FALSE) -> H.tab <- mlg.table(H3N2) -> dev.off() -\end{Sinput} -\end{Schunk} - -Remember, it is important not to forget to type \texttt{dev.off()} when you are done making graphs. Note that I did not have to specify a resolution for this image since it is based off of vector graphics. - -\subsection{Function calls}\label{appendix:funk} - -\tab\tab Here is a list of all the default function calls for \textit{poppr}. Details can be found in the above sections.\\ -\tt \small -\begin{itemize} -\item getfile(multi = FALSE, pattern = NULL, combine = TRUE) (Section \ref{intro:import:getfile}) -\item read.genalex(genalex, ploidy = 2, geo = FALSE, region = FALSE) (Section \ref{intro:import:read.genalex}) -\item genind2genalex(pop, filename = "genalex.csv", quiet = FALSE, geo = FALSE, geodf = "xy") (Section \ref{intro:import:genind2genalex}) -\item missingno(pop, type = "loci", cutoff = 0.05, quiet = FALSE) (Section \ref{data.manip:missing:missingno}) -\item splitcombine(pop, method = 1, dfname = "population\_hierarchy", sep = "\_", hier = c(1), setpopulation = TRUE, fixed = TRUE) (Section \ref{data.manip:hier:splitcombine}) -\item popsub(pop, sublist = "ALL", blacklist = NULL, mat = NULL) (Section \ref{data.manip:divide:popsub}) -\item clonecorrect(pop, hier = c(1), dfname = "population\_hierarchy", combine = FALSE, keep = 1) (Section \ref{data.manip:cc:clonecorrect}) -\item shufflepop(pop, method = 1) (Section \ref{data.manip:shuffle:shufflepop}) -\item informloci(pop, cutoff = 2/nInd(pop), quiet = FALSE) (Section \ref{data.manip:informloci}) -\item mlg(pop, quiet = FALSE) (Section \ref{mlg:mlg:mlg}) -\item mlg.crosspop(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, indexreturn = FALSE, df = FALSE, quiet = FALSE) (Section \ref{mlg:cross:mlg.crosspop}) -\item mlg.table(pop, sublist = "ALL", blacklist = NULL, mlgsub = NULL, bar = TRUE, total = FALSE, quiet = FALSE) (Section \ref{mlg:table:mlg.table}) -\item mlg.vector(pop) (Section \ref{mlg:mix:mlg.vector}) -\item ia(pop, sample = 0, method = 1, quiet = FALSE, missing = "ignore", hist = TRUE) (Section \ref{index:iard:ia}) -\item diss.dist(pop) (Section \ref{index:dist:diss.dist}) -\item bruvo.dist(pop, replen = c(2)) (Section \ref{index:bruvo:bruvo.dist}) -\item bruvo.boot(pop, replen = c(2), sample = 100, tree = "upgma", showtree = TRUE, -cutoff = NULL, quiet = FALSE, ...) (Section \ref{index:trees:bruvo.boot}) -\item greycurve(glim = c(0, 0.8), gadj = 3, gweight = 1) (Section \ref{index:trees:greycurve}) -\item bruvo.msn(pop, replen = c(2), palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) (Section \ref{index:trees:bruvo.msn}) -\item poppr.msn(pop, distmat, palette = topo.colors, sublist = "All", blacklist = NULL, vertex.label = "MLG", gscale = TRUE, glim = c(0, 0.8), gadj = 3, gweight = 1, wscale = TRUE, ...) (Section \ref{index:trees:poppr.msn}) -\item poppr(pop, total = TRUE, sublist = c("ALL"), blacklist = c(NULL), sample = 0, method = 1, missing = "ignore", cutoff=0.05, quiet = FALSE, clonecorrect = FALSE, hier = c(1), keep = 1, dfname = "population\_hierarchy", hist = TRUE, minsamp = 10) (Section \ref{summary:poppr}) -\item poppr.all(filelist, ...) (Sections \ref{intro:import:getfile} and \ref{summary:poppr}) -\end{itemize} -\normalsize -\bibliographystyle{plain} -\bibliography{poppr_man} -\end{document} diff --git a/vignettes/simulated_dist_fig.pdf b/vignettes/simulated_dist_fig.pdf new file mode 100644 index 00000000..0e0cadcc Binary files /dev/null and b/vignettes/simulated_dist_fig.pdf differ diff --git a/vignettes/simulated_dist_fig.png b/vignettes/simulated_dist_fig.png deleted file mode 100644 index e6bc6007..00000000 Binary files a/vignettes/simulated_dist_fig.png and /dev/null differ diff --git a/vignettes/simulated_param_fig.pdf b/vignettes/simulated_param_fig.pdf new file mode 100644 index 00000000..b3c01b4b Binary files /dev/null and b/vignettes/simulated_param_fig.pdf differ diff --git a/vignettes/simulated_param_fig.png b/vignettes/simulated_param_fig.png deleted file mode 100644 index 6f65d04e..00000000 Binary files a/vignettes/simulated_param_fig.png and /dev/null differ