diff --git a/DESCRIPTION b/DESCRIPTION index 764aaaff..64bc0886 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: poppr Type: Package Title: an R package for genetic analysis of populations with mixed reproduction -Version: 1.1.0 -Date: 2014-07-23 +Version: 1.1.1 +Date: 2014-07-26 Author: Zhian N. Kamvar , Javier F. Tabima , Niklaus J. Grunwald diff --git a/NEWS b/NEWS index 9a672bab..4b210af5 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +poppr 1.1.1 +=========== +BUG FIX +* Fixed bug where the loss and add options for Bruvo's distance were switched. +* Fixed illegal memory access error by UBSAN. Made memory management of internal C functions more sane. (Addresses issue #2). +* Fixed directional quotes and em-dashes produced by Mavericks (Addresses issue #3). + poppr 1.1.0 =========== NEW FEATURES diff --git a/R/amova.r b/R/amova.r index b950dfe0..ba1ebd30 100644 --- a/R/amova.r +++ b/R/amova.r @@ -172,7 +172,7 @@ #' @references Excoffier, L., Smouse, P.E. and Quattro, J.M. (1992) Analysis of #' molecular variance inferred from metric distances among DNA haplotypes: #' application to human mitochondrial DNA restriction data. \emph{Genetics}, -#' \strong{131}, 479–491. +#' \strong{131}, 479-491. #' #' @seealso \code{\link[ade4]{amova}} \code{\link{clonecorrect}} #' \code{\link{diss.dist}} \code{\link{missingno}} diff --git a/R/distances.r b/R/distances.r index a5f9edc5..52bb67f1 100644 --- a/R/distances.r +++ b/R/distances.r @@ -155,49 +155,49 @@ diss.dist <- function(x, percent=FALSE, mat=FALSE){ #' @keywords nei rogers rodgers reynolds coancestry edwards angular provesti #' @references #' Nei, M. (1972) Genetic distances between populations. American Naturalist, -#' 106, 283–292. +#' 106, 283-292. #' #' Nei M. (1978) Estimation of average heterozygosity and genetic -#' distance from a small number of individuals. Genetics, 23, 341–369. +#' distance from a small number of individuals. Genetics, 23, 341-369. #' #' Avise, J. C. (1994) Molecular markers, natural history and evolution. Chapman & Hall, #' London. #' #' Edwards, A.W.F. (1971) Distance between populations on the basis of gene -#' frequencies. Biometrics, 27, 873–881. +#' frequencies. Biometrics, 27, 873-881. #' #' Cavalli-Sforza L.L. and Edwards A.W.F. #' (1967) Phylogenetic analysis: models and estimation procedures. Evolution, -#' 32, 550–570. +#' 32, 550-570. #' #' Hartl, D.L. and Clark, A.G. (1989) Principles of population #' genetics. Sinauer Associates, Sunderland, Massachussetts (p. 303). #' #' Reynolds, J. B., B. S. Weir, and C. C. Cockerham. (1983) Estimation of the #' coancestry coefficient: basis for a short-term genetic distance. Genetics, -#' 105, 767–779. +#' 105, 767-779. #' #' Rogers, J.S. (1972) Measures of genetic similarity and genetic distances. -#' Studies in Genetics, Univ. Texas Publ., 7213, 145–153. +#' Studies in Genetics, Univ. Texas Publ., 7213, 145-153. #' #' Avise, J. C. (1994) #' Molecular markers, natural history and evolution. Chapman & Hall, London. #' #' Prevosti A. (1974) La distancia genetica entre poblaciones. Miscellanea -#' Alcobe, 68, 109–118. +#' Alcobe, 68, 109-118. #' #' Prevosti A., Oca\~na J. and Alonso G. (1975) Distances #' between populations of Drosophila subobscura, based on chromosome -#' arrangements frequencies. Theoretical and Applied Genetics, 45, 231–241. +#' arrangements frequencies. Theoretical and Applied Genetics, 45, 231-241. #' #' For more information on dissimilarity indexes: #' #' Gower J. and Legendre P. (1986) #' Metric and Euclidean properties of dissimilarity coefficients. Journal of -#' Classification, 3, 5–48 +#' Classification, 3, 5-48 #' #' Legendre P. and Legendre L. (1998) Numerical Ecology, -#' Elsevier Science B.V. 20, pp274–288. +#' Elsevier Science B.V. 20, pp274-288. #' @export #' @examples #' diff --git a/R/internal.r b/R/internal.r index 46aaa2c9..719a24a5 100644 --- a/R/internal.r +++ b/R/internal.r @@ -93,7 +93,7 @@ NULL #' Restrepo, William E. Fry, Gregory A. Forbes, Valerie J. Fieland, Martha #' Cardenas, and Niklaus J. Grünwald. "The Irish potato famine pathogen #' \emph{Phytophthora infestans} originated in central Mexico rather than the Andes." -#' Proceedings of the National Academy of Sciences 111:8791–8796. +#' Proceedings of the National Academy of Sciences 111:8791-8796. #==============================================================================# NULL #==============================================================================# @@ -108,8 +108,8 @@ NULL #' were sampled across 3 years (2009, 2010, and 2011) in a total of four #' trees, where one tree was sampled in all three years, for a total of 6 #' within-tree populations. Within each year, samples in the spring were taken -#' from affected blossoms (termed “BB” for blossom blight) and in late summer -#' from affected fruits (termed “FR” for fruit rot). There are a total of 694 +#' from affected blossoms (termed "BB" for blossom blight) and in late summer +#' from affected fruits (termed "FR" for fruit rot). There are a total of 694 #' isolates with 65 to 173 isolates within each canopy population that were #' characterized using a set of 13 microsatellite markers. #' @format a \code{\linkS4class{genclone}} object with 3 hierarchical levels diff --git a/R/poppr.R b/R/poppr.R index ec60bdbe..4ac5ac87 100644 --- a/R/poppr.R +++ b/R/poppr.R @@ -62,18 +62,18 @@ #' } #' @section Genetic distances: #' \itemize{ -#' \item \code{\link{bruvo.dist}} - Bruvo’s distance +#' \item \code{\link{bruvo.dist}} - Bruvo's distance #' \item \code{\link{diss.dist}} - Absolute genetic distance (see provesti.dist) -#' \item \code{\link{nei.dist}} - Nei’s 1978 genetic distance -#' \item \code{\link{rogers.dist}} - Rogers’ euclidean distance -#' \item \code{\link{reynolds.dist}} - Reynolds’ coancestry distance -#' \item \code{\link{edwards.dist}} - Edwards’ angular distance -#' \item \code{\link{provesti.dist}} - Provesti’s absolute genetic distance +#' \item \code{\link{nei.dist}} - Nei's 1978 genetic distance +#' \item \code{\link{rogers.dist}} - Rogers' euclidean distance +#' \item \code{\link{reynolds.dist}} - Reynolds' coancestry distance +#' \item \code{\link{edwards.dist}} - Edwards' angular distance +#' \item \code{\link{provesti.dist}} - Provesti's absolute genetic distance #' } #' @section Bootstrapping: #' \itemize{ #' \item \code{\link{aboot}} - Creates a bootstrapped dendrogram for any distance measure -#' \item \code{\link{bruvo.boot}} - Produces dendrograms with bootstrap support based on Bruvo’s distance +#' \item \code{\link{bruvo.boot}} - Produces dendrograms with bootstrap support based on Bruvo's distance #' } #' @section Analysis: #' \itemize{ @@ -93,7 +93,7 @@ #' \itemize{ #' \item \code{\link{plot_poppr_msn}} - Plots minimum spanning networks produced in poppr with scale bar and legend #' \item \code{\link{greycurve}} - Helper to determine the appropriate parameters for adjusting the grey level for msn functions -#' \item \code{\link{bruvo.msn}} - Produces minimum spanning networks based off Bruvo’s distance colored by population +#' \item \code{\link{bruvo.msn}} - Produces minimum spanning networks based off Bruvo's distance colored by population #' \item \code{\link{poppr.msn}} - Produces a minimum spanning network for any pairwise distance matrix related to the data #' \item \code{\link{info_table}} - Creates a heatmap representing missing data or observed ploidy #' \item \code{\link{genotype_curve}} - Creates a series of boxplots to demonstrate how many markers are needed to represent the diversity of your data. diff --git a/man/Pinf.Rd b/man/Pinf.Rd index a9d955f2..a220883f 100644 --- a/man/Pinf.Rd +++ b/man/Pinf.Rd @@ -19,6 +19,6 @@ Goss, Erica M., Javier F. Tabima, David EL Cooke, Silvia Restrepo, William E. Fry, Gregory A. Forbes, Valerie J. Fieland, Martha Cardenas, and Niklaus J. Grünwald. "The Irish potato famine pathogen \emph{Phytophthora infestans} originated in central Mexico rather than the Andes." - Proceedings of the National Academy of Sciences 111:8791–8796. + Proceedings of the National Academy of Sciences 111:8791-8796. } diff --git a/man/genetic_distance.Rd b/man/genetic_distance.Rd index 1ffe849b..6dd7df1f 100644 --- a/man/genetic_distance.Rd +++ b/man/genetic_distance.Rd @@ -64,49 +64,49 @@ Daniel Chessel (ade4) } \references{ Nei, M. (1972) Genetic distances between populations. American Naturalist, -106, 283–292. +106, 283-292. Nei M. (1978) Estimation of average heterozygosity and genetic -distance from a small number of individuals. Genetics, 23, 341–369. +distance from a small number of individuals. Genetics, 23, 341-369. Avise, J. C. (1994) Molecular markers, natural history and evolution. Chapman & Hall, London. Edwards, A.W.F. (1971) Distance between populations on the basis of gene -frequencies. Biometrics, 27, 873–881. +frequencies. Biometrics, 27, 873-881. Cavalli-Sforza L.L. and Edwards A.W.F. (1967) Phylogenetic analysis: models and estimation procedures. Evolution, -32, 550–570. +32, 550-570. Hartl, D.L. and Clark, A.G. (1989) Principles of population genetics. Sinauer Associates, Sunderland, Massachussetts (p. 303). Reynolds, J. B., B. S. Weir, and C. C. Cockerham. (1983) Estimation of the coancestry coefficient: basis for a short-term genetic distance. Genetics, -105, 767–779. +105, 767-779. Rogers, J.S. (1972) Measures of genetic similarity and genetic distances. -Studies in Genetics, Univ. Texas Publ., 7213, 145–153. +Studies in Genetics, Univ. Texas Publ., 7213, 145-153. Avise, J. C. (1994) Molecular markers, natural history and evolution. Chapman & Hall, London. Prevosti A. (1974) La distancia genetica entre poblaciones. Miscellanea -Alcobe, 68, 109–118. +Alcobe, 68, 109-118. Prevosti A., Oca\~na J. and Alonso G. (1975) Distances between populations of Drosophila subobscura, based on chromosome -arrangements frequencies. Theoretical and Applied Genetics, 45, 231–241. +arrangements frequencies. Theoretical and Applied Genetics, 45, 231-241. For more information on dissimilarity indexes: Gower J. and Legendre P. (1986) Metric and Euclidean properties of dissimilarity coefficients. Journal of -Classification, 3, 5–48 +Classification, 3, 5-48 Legendre P. and Legendre L. (1998) Numerical Ecology, -Elsevier Science B.V. 20, pp274–288. +Elsevier Science B.V. 20, pp274-288. } \seealso{ \code{\link{aboot}} \code{\link{diss.dist}} \code{\link{poppr.amova}} diff --git a/man/monpop.Rd b/man/monpop.Rd index b7d5f633..7b6eda15 100644 --- a/man/monpop.Rd +++ b/man/monpop.Rd @@ -16,8 +16,8 @@ This is microsatellite data for a population of the haploid were sampled across 3 years (2009, 2010, and 2011) in a total of four trees, where one tree was sampled in all three years, for a total of 6 within-tree populations. Within each year, samples in the spring were taken - from affected blossoms (termed “BB” for blossom blight) and in late summer - from affected fruits (termed “FR” for fruit rot). There are a total of 694 + from affected blossoms (termed "BB" for blossom blight) and in late summer + from affected fruits (termed "FR" for fruit rot). There are a total of 694 isolates with 65 to 173 isolates within each canopy population that were characterized using a set of 13 microsatellite markers. } diff --git a/man/poppr-package.Rd b/man/poppr-package.Rd index b395cd41..d1328b53 100644 --- a/man/poppr-package.Rd +++ b/man/poppr-package.Rd @@ -73,13 +73,13 @@ This package relies on the \pkg{\link[adegenet]{adegenet}} package. \section{Genetic distances}{ \itemize{ -\item \code{\link{bruvo.dist}} - Bruvo’s distance +\item \code{\link{bruvo.dist}} - Bruvo's distance \item \code{\link{diss.dist}} - Absolute genetic distance (see provesti.dist) -\item \code{\link{nei.dist}} - Nei’s 1978 genetic distance -\item \code{\link{rogers.dist}} - Rogers’ euclidean distance -\item \code{\link{reynolds.dist}} - Reynolds’ coancestry distance -\item \code{\link{edwards.dist}} - Edwards’ angular distance -\item \code{\link{provesti.dist}} - Provesti’s absolute genetic distance +\item \code{\link{nei.dist}} - Nei's 1978 genetic distance +\item \code{\link{rogers.dist}} - Rogers' euclidean distance +\item \code{\link{reynolds.dist}} - Reynolds' coancestry distance +\item \code{\link{edwards.dist}} - Edwards' angular distance +\item \code{\link{provesti.dist}} - Provesti's absolute genetic distance } } @@ -87,7 +87,7 @@ This package relies on the \pkg{\link[adegenet]{adegenet}} package. \itemize{ \item \code{\link{aboot}} - Creates a bootstrapped dendrogram for any distance measure -\item \code{\link{bruvo.boot}} - Produces dendrograms with bootstrap support based on Bruvo’s distance +\item \code{\link{bruvo.boot}} - Produces dendrograms with bootstrap support based on Bruvo's distance } } @@ -113,7 +113,7 @@ This package relies on the \pkg{\link[adegenet]{adegenet}} package. \itemize{ \item \code{\link{plot_poppr_msn}} - Plots minimum spanning networks produced in poppr with scale bar and legend \item \code{\link{greycurve}} - Helper to determine the appropriate parameters for adjusting the grey level for msn functions -\item \code{\link{bruvo.msn}} - Produces minimum spanning networks based off Bruvo’s distance colored by population +\item \code{\link{bruvo.msn}} - Produces minimum spanning networks based off Bruvo's distance colored by population \item \code{\link{poppr.msn}} - Produces a minimum spanning network for any pairwise distance matrix related to the data \item \code{\link{info_table}} - Creates a heatmap representing missing data or observed ploidy \item \code{\link{genotype_curve}} - Creates a series of boxplots to demonstrate how many markers are needed to represent the diversity of your data. diff --git a/man/poppr.amova.Rd b/man/poppr.amova.Rd index d10e0db4..a1367b81 100644 --- a/man/poppr.amova.Rd +++ b/man/poppr.amova.Rd @@ -124,7 +124,7 @@ amova.cc.test Excoffier, L., Smouse, P.E. and Quattro, J.M. (1992) Analysis of molecular variance inferred from metric distances among DNA haplotypes: application to human mitochondrial DNA restriction data. \emph{Genetics}, -\strong{131}, 479–491. +\strong{131}, 479-491. } \seealso{ \code{\link[ade4]{amova}} \code{\link{clonecorrect}} diff --git a/src/poppr_distance.c b/src/poppr_distance.c index 687435be..a6379d4f 100755 --- a/src/poppr_distance.c +++ b/src/poppr_distance.c @@ -46,8 +46,8 @@ double bruvo_dist(int *in, int *nall, int *perm, int *woo); double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int *add); void permute(int *a, int i, int n, int *c); int fact(int x); -double mindist(int perms, int alleles, int *perm, double *dist); -void genome_add_calc(int perms, int alleles, int *perm, double *dist, +double mindist(int perms, int alleles, int *perm, double **dist); +void genome_add_calc(int perms, int alleles, int *perm, double **dist, int zeroes, int *zero_ind, int curr_zero, int miss_ind, int *replacement, int inds, int curr_ind, double *genome_add_sum, int *tracker); void genome_loss_calc(int *genos, int nalleles, int *perm_array, int woo, @@ -58,73 +58,10 @@ void fill_short_geno(int *genos, int nalleles, int *perm_array, int *woo, int *loss, int *add, int zeroes, int *zero_ind, int curr_zero, int miss_ind, int *replacement, int inds, int curr_ind, double *res, int *tracker); +void print_distmat(double** dist, int* genos, int p); - -SEXP raw_pairdiffs(SEXP mat, SEXP ploidy) -{ - char binary_diffs, homozygote; - int count, row, col, bitcount, hz, a1, a2, ht;//, derp, i, j, k; - //SEXP Rout; - SEXP Rdim; - //SEXP dvector; - //SEXP Dvector; - Rdim = getAttrib(mat, R_DimSymbol); - row = INTEGER(Rdim)[0]; - col = INTEGER(Rdim)[1]; - /* - ploidy = coerceVector(ploidy, INTSXP); - PROTECT(dvector = allocVector(dvector, col)); - PROTECT(Dvector = allocVector(Dvector, ((row*(row-1)/2)))); - //mat = coerceVector(mat, RAWSXP); - for(i = 0; i < row - 1; i+ploidy) - { - for(j = i+ploidy; j < row; j+ploidy) - { - count = 0; - for(k = 0; k < col; k++) - { - binary_diffs = RAW(mat)[i+col*ploidy+k] ^ RAW(mat)[j+col*ploidy+k]; - for(bitcount = 7; bitcount >= 0; bitcount--) - { - dvector[count] += (binary_diffs >> bitcount) & 0x01; - } - count++; - } - } - } -*/ - for(count = 0; count < row*col; count++) - { - if(count < (row*col)-1 && count % 2 == 0) - { - binary_diffs = RAW(mat)[count] ^ RAW(mat)[count+1]; - homozygote = RAW(mat)[count] & RAW(mat)[count+1]; - } - else - { - goto out; - } - for(bitcount = 7; bitcount >= 0; bitcount--) - { - //printf("Genotype:\t\t%d\n", (RAW(mat)[count] >> bitcount) & 0x01); - if(count < (row*col)-1) - { - hz = (homozygote >> bitcount) & 0x01; - a1 = (RAW(mat)[count] >> bitcount) & 0x01; - a2 = (RAW(mat)[count + 1] >> bitcount) & 0x01; - ht = (binary_diffs >> bitcount) & 0x01; - Rprintf("%d AND %d:\t%d\t\t", a1, a2, hz); - Rprintf("%d XOR %d:\t%d\t\t", a1, a2, ht); - Rprintf("RESULT:\t%d\n", hz+ht); - } - } - out: - Rprintf("\n"); - } - return R_NilValue; -} /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Calculates the root product of pairwise comparisons of each of the variances of each locus. @@ -134,7 +71,10 @@ Output: A vector of length n*(n-1)/2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ SEXP pairwise_covar(SEXP pair_vec) { - int I, i, j, count; + int I; + int i; + int j; + int count; SEXP Rout; I = length(pair_vec); pair_vec = coerceVector(pair_vec, REALSXP); @@ -164,7 +104,12 @@ Output: A vector of length n*(n-1)/2 SEXP pairdiffs(SEXP freq_mat) { - int I, J, i, j, z, count; + int I; + int J; + int i; + int j; + int z; + int count; double P; SEXP Rout; SEXP Rdim; @@ -204,7 +149,9 @@ permuto will return a vector of all permutations needed for bruvo's distance. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ SEXP permuto(SEXP perm) { - int permutations, i, per; + int permutations; + int i; + int per; SEXP Rval; /* IMPORTANT: INITIALIZE THE COUNTER. THE POINTER IS NOT RELEASED FROM @@ -213,7 +160,8 @@ SEXP permuto(SEXP perm) perm_count = 0; perm = coerceVector(perm, INTSXP); per = INTEGER(perm)[0]; - int allele_array[per]; + int *allele_array; + allele_array = R_Calloc(per, int); permutations = fact(per)*per; for(i = 0; i < per; i++) { @@ -222,6 +170,7 @@ SEXP permuto(SEXP perm) PROTECT(Rval = allocVector(INTSXP, permutations)); permute(allele_array, 0, per-1, INTEGER(Rval)); UNPROTECT(1); + R_Free(allele_array); return Rval; } /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -247,7 +196,7 @@ with missing data. In the wrapping R function, 100s will be converted to NAs and then the average over all loci will be taken. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ -SEXP single_bruvo(SEXP b_mat, SEXP permutations, SEXP alleles, SEXP loss, SEXP add) +SEXP single_bruvo(SEXP b_mat, SEXP permutations, SEXP alleles, SEXP add, SEXP loss) { int A, P, *pA, *pP; SEXP Rval; @@ -269,7 +218,7 @@ SEXP single_bruvo(SEXP b_mat, SEXP permutations, SEXP alleles, SEXP loss, SEXP a } -SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles, SEXP m_loss, SEXP m_add) +SEXP bruvo_distance(SEXP bruvo_mat, SEXP permutations, SEXP alleles, SEXP m_add, SEXP m_loss) { /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ I = number of rows in bruvo_mat @@ -408,11 +357,8 @@ int fact(int x) /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - DEPRECATED - - This will calculate bruvo's distance between two individuals. - All that needs to be done from here is to have it do the pairwise - calculations. + Bruvo's Distance implementing Addition, Loss, and Infinite models for + imputation of missing data. NOTE: The input needs to be divided by the repeat length beforehand for this to work. @@ -420,68 +366,13 @@ int fact(int x) in: a matrix of two individuals out: a double value that will be the output from bruvo's distance. n: number of individuals(2) - nall / p: number of alleles + nall: number of alleles perm: a vector from the permn function in R woo: p * p! - minn: is a rolling counter of the minimum between allele compairsons. - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ -double bruvo_dist(int *in, int *nall, int *perm, int *woo) -{ - int i, j, counter=0, n = 2, p = *nall, w = *woo, genos[2][p]; - double dist[p][p], da, res = 0, minn=100; - // reconstruct the genotype table. - for(i=0; i < n; i++) - { - for(j = 0; j < p; j++) - { - // Missing data will return with distance of 100 - if(in[counter] == 0) - { - return minn; - } - else - { - genos[i][j] = in[counter++]; - } - } - } - - // Construct distance matrix of 1 - 2^{-|x|} - for(j = 0; j < p; j++) - { - for(i=0; i < p; i++) - { - da = 1- pow(2 , -abs(genos[0][i] - genos[1][j])); - dist[i][j] = da; - } - } - - // Calculate the smallest s, which is the minimum distance among alleles. - for(i = 0; i < w; i += p) - { - for(j = 0; j < p; j++) - { - if (j == 0) - { - res = dist[*perm++][j]; - } - else - { - res += dist[*perm++][j]; - } - } - /* Checking if the new calculated distance is smaller than the smallest - distance seen. */ - if ( res < minn ) - { - minn = res; - } - } - return minn/p; -} + loss: TRUE/FALSE: impute under genome loss model. + add: TRUE/FALSE: impute under genome addition model. -/* Test code comparing current status to polysat's Bruvo2.distance: + Test code comparing current status to polysat's Bruvo2.distance: ================================================================================ poppr_bruvo <- function(){ return(c(.Call("single_bruvo", c(20,23,24,0,20,24,26,43), .Call("permuto", 4), 4, 0, 0), @@ -493,9 +384,9 @@ poppr_bruvo <- function(){ polysat_bruvo <- function(){ return(c(Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=FALSE, add=FALSE), -Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=T, add=FALSE), -Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=F, add=T), -Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=T, add=T) +Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=FALSE, add=TRUE), +Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=TRUE, add=FALSE), +Bruvo2.distance(c(20,23,24), c(20,24,26,43), usatnt=1, loss=TRUE, add=TRUE) )) } @@ -506,10 +397,40 @@ polysat_bruvo() == poppr_bruvo() ==============================================================================*/ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int *add) { - int i, j, counter = 0, n = 2, p = *nall, w = *woo, loss_indicator = *loss, - add_indicator = *add, genos[2][p], zerocatch[2], zero_ind[2][p], - zerodiff; - double dist[p][p], da, minn = 100, *distp; + int i; + int j; + int counter = 0; // counter used for building arrays + int n = 2; // number of individuals + int p = *nall; // number of alleles + int w = *woo; // number of permutations = p * p! + int loss_indicator = *loss; // 1 if the genome loss model should be used + int add_indicator = *add; // 1 if the genome addition model should be used + + int *genos; // array to store the genotypes + genos = R_Calloc(2*p, int); + + int zerocatch[2]; // 2 element array to store the number of missing + // alleles in each genotype. + + int** zero_ind; // array to store the indices of the missing data for + // each genotype + zero_ind = R_Calloc(2, int*); + zero_ind[0] = R_Calloc(p, int); + zero_ind[1] = R_Calloc(p, int); + + int zerodiff; // used to check the amount of missing data different + // between the two genotypes + + + double** dist; // array to store the distance + dist = R_Calloc(p, double*); + for (i = 0; i < p; i++) + { + dist[i] = R_Calloc(p, double); + } + + double minn = 100; // The minimum distance + // reconstruct the genotype table. zerocatch[0] = 0; zerocatch[1] = 0; @@ -518,22 +439,22 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * for(j = 0; j < p; j++) { // Catch missing data here. - if(in[counter] == 0) + if (in[counter] == 0) { if (zerocatch[i] == p - 1) { return minn; } zerocatch[i] += 1; - //printf("#"); + // Rprintf("[%d]", zerocatch[i] - 1); zero_ind[i][zerocatch[i] - 1] = j; } - genos[i][j] = in[counter++]; - //printf("%d\t", genos[i][j]); + genos[i*p + j] = in[counter++]; + // Rprintf("%d\t", genos[i*p + j]); } - //printf("\n"); + // Rprintf("\n"); } - //printf("\n"); + // Rprintf("\n"); zerodiff = abs(zerocatch[0] - zerocatch[1]); /*========================================================================== * Removing superfluous zeroes from the data. This is in the case that both @@ -543,8 +464,15 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * ==========================================================================*/ if (zerocatch[0] > 0 && zerocatch[1] > 0) { - int smaller = 0, larger = 1, reduction = 0, i, j, - zero_counter, *perm_array, *new_genop; + int smaller = 0; + int larger = 1; + int reduction = 0; + int i; + int j; + int zero_counter; + int *perm_array; + int *new_geno; + int *new_alleles; if (zerodiff == 0) { @@ -552,114 +480,87 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * } else { - if(zerocatch[0] < zerocatch[1]) + if (zerocatch[0] < zerocatch[1]) { smaller = 1; larger = 0; } reduction = p - (zerocatch[smaller] - zerodiff); } - int del = 1; - if (del > 0) - { - int new_alleles[reduction]; - for (i = 0; i < reduction; i++) - { - new_alleles[i] = i; - } - w = fact(reduction) * reduction; - perm_array = (int *) malloc(w * sizeof(int)); - perm_count = 0; - permute(new_alleles, 0, reduction - 1, perm_array); - // rebuild the array and make a pointer. - int new_geno[reduction*n]; - counter = 0; - for (i=0; i < n; i++) - { - zero_counter = zerocatch[larger]; - for (j = 0; j < p; j++) - { - if (genos[i][j] == 0 && zero_counter > 0) - { - zero_counter--; - } - else - { - new_geno[counter++] = genos[i][j]; - } - } - } - new_genop = (int *) &new_geno; - - minn = test_bruvo_dist(new_genop, &reduction, perm_array, &w, - &loss_indicator, &add_indicator); - free(perm_array); + + new_alleles = R_Calloc(reduction, int); + for (i = 0; i < reduction; i++) + { + new_alleles[i] = i; } - - /* Questions of the proper way of permuting this arise: - * 1. If both of the genotypes are of equal length, but the user wants - * a non genome - addition model, how do we undertake that? - * should we simply run all combinations of both genotypes - * separately? - * 2. If one genotype is longer than the other, should we fill the - * larger or smaller genotype, or possibly, should we fill both and - * do a similar procedure as I described above? - - - else + w = fact(reduction) * reduction; + perm_array = R_Calloc(w, int); + perm_count = 0; + permute(new_alleles, 0, reduction - 1, perm_array); + new_geno = R_Calloc(reduction*n, int); + counter = 0; + for (i=0; i < n; i++) { - int fill_tracker = 0, *pzero_ind, large_inds[p - zerocatch[larger]], - large_counter = 0, *plarge_inds; - double res = 0; - pzero_ind = (int *) &zero_ind[larger]; - plarge_inds = (int *) &large_inds; - for (i = 0; i < p; i++) + zero_counter = zerocatch[larger]; + for (j = 0; j < p; j++) { - if (genos[larger][i] > 0) + if (genos[i*p + j] == 0 && zero_counter > 0) { - large_inds[large_counter++] = i; + zero_counter--; + } + else + { + new_geno[counter++] = genos[i*p + j]; } } - for (i = 0; i < reduction; i++) - { - fill_short_geno(in, p, perm, woo, loss, add, zerocatch[larger], - pzero_ind, 0, larger, plarge_inds, reduction, i, &res, - &fill_tracker); - } - minn = res/fill_tracker; } - */ + + minn = test_bruvo_dist(new_geno, &reduction, perm_array, &w, + &loss_indicator, &add_indicator); + R_Free(perm_array); + R_Free(new_geno); + R_Free(new_alleles); return minn; } // Construct distance matrix of 1 - 2^{-|x|}. - // This is constructed column by column. Genotype 1 in the rows. Genotype 2 - // in the columns. - for(j = 0; j < p; j++) + // This is constructed column by column. + // Genotype 1: COLUMNS + // Genotype 2: ROWS + + for(i = 0; i < p; i++) { - for(i = 0; i < p; i++) + for(j = 0; j < p; j++) { - da = 1 - pow(2, -abs(genos[0][i] - genos[1][j])); - dist[i][j] = da; + dist[i][j] = 1 - pow(2, -abs(genos[0*p + j] - genos[1*p + i])); } } - // This avoids warning: assignment from incompatible pointer type - distp = (double *) &dist; - if(zerocatch[0] > 0 || zerocatch[1] > 0) + + // Rprintf("DISTANCE:"); + // print_distmat(dist, genos, p); + + if (zerocatch[0] > 0 || zerocatch[1] > 0) { - int *genop, ind, miss_ind = 1, z, tracker = 0, loss_tracker = 0;// full_ind = 0; - double genome_add_sum = 0, genome_loss_sum = 0; - genop = (int *) &genos; - if (zerocatch[0] > 0) // The rows contain the zero value + int ind; + int miss_ind = 1; + int z; + int tracker = 0; + int loss_tracker = 0; + int comparison_factor = 1; + int* short_inds; + double genome_add_sum = 0; + double genome_loss_sum = 0; + + if (zerocatch[0] > 0) // The columns contain the zero value { miss_ind = 0; - //full_ind = 1; } ind = zero_ind[miss_ind][0]; - int short_inds[zerocatch[miss_ind]], short_counter = 0; + short_inds = R_Calloc(zerocatch[miss_ind], int); + int short_counter = 0; for (i = 0; i < p; i++) { - if (genos[miss_ind][i] > 0) + if (genos[miss_ind*p + i] > 0) { short_inds[short_counter++] = i; } @@ -671,10 +572,11 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * ======================================================================*/ if(loss_indicator != 1 && add_indicator != 1) { + // Rprintf("TO INFINITY!\n"); for (z = 0; z < zerocatch[miss_ind]; z++) { ind = zero_ind[miss_ind][z]; - if (zerocatch[0] > 0) + if (zerocatch[0] < 0) { for (j = 0; j < p; j++) { @@ -689,7 +591,7 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * } } } - return mindist(w, p, perm, distp)/p; + goto finalcalc; } /*====================================================================== * GENOME ADDITION MODEL @@ -700,15 +602,18 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * ======================================================================*/ if (add_indicator == 1) { - int *pzero_ind, *pshort_inds; - pzero_ind = (int *) &zero_ind[miss_ind]; - pshort_inds = (int *) &short_inds; - for (i = 0; i < p - zerocatch[miss_ind]; i++) + // Rprintf("ADD!\n"); + int replacements = 0; + replacements = p - zerocatch[miss_ind]; + // print_distmat(dist, genos, p); + for (i = 0; i < replacements; i++) { - genome_add_calc(w, p, perm, distp, zerocatch[miss_ind], - pzero_ind, 0, miss_ind, pshort_inds, p-zerocatch[miss_ind], + genome_add_calc(w, p, perm, dist, zerocatch[miss_ind], + zero_ind[miss_ind], 0, miss_ind, short_inds, replacements, i, &genome_add_sum, &tracker); + // Rprintf("current add sum = %.6f\n", genome_add_sum); } + R_Free(short_inds); } /*====================================================================== * GENOME LOSS MODEL @@ -718,12 +623,11 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * ======================================================================*/ if (loss_indicator == 1) { - int *pzero_ind; - pzero_ind = (int *) &zero_ind[miss_ind]; + // Rprintf("LOSS!\n"); for (i = 0; i < p; i++) { - genome_loss_calc(genop, p, perm, w, &loss_indicator, - &add_indicator, pzero_ind, 0, zerocatch[miss_ind], + genome_loss_calc(genos, p, perm, w, &loss_indicator, + &add_indicator, zero_ind[miss_ind], 0, zerocatch[miss_ind], miss_ind, i, &genome_loss_sum, &loss_tracker); } } @@ -736,11 +640,26 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * loss_tracker = 1; } genome_loss_sum = genome_loss_sum/loss_tracker; + // Rprintf("LOSS SUM: %.6f\n", genome_loss_sum/p); genome_add_sum = genome_add_sum/tracker; - int comparison_factor = loss_indicator + add_indicator; - return (genome_add_sum + genome_loss_sum)/(p*comparison_factor); + // Rprintf("ADD SUM: %.6f\n", genome_add_sum/p); + comparison_factor = loss_indicator + add_indicator; + minn = (genome_add_sum + genome_loss_sum)/(p*comparison_factor); + } + else + { + finalcalc: minn = mindist(w, p, perm, dist)/p; } - return mindist(w, p, perm, distp)/p; + R_Free(genos); + for (i = 0; i < p; i++) + { + R_Free(dist[i]); + } + R_Free(dist); + R_Free(zero_ind[0]); + R_Free(zero_ind[1]); + R_Free(zero_ind); + return minn; } @@ -755,7 +674,7 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * * perms - number of permutations (passed to mindist) * alleles - number of maximum alleles (also passed to mindist) * *perm - permutation array (passed to mindist) -* *dist - distance matrix to be manipulated (also passed to mindist) +* **dist - distance matrix to be manipulated (also passed to mindist) * * zeroes - number of zeroes present in the shorter genotype. * *zero_ind - array containing indices of the zero values of the short geno @@ -769,31 +688,30 @@ double test_bruvo_dist(int *in, int *nall, int *perm, int *woo, int *loss, int * * *genome_add_sum - pointer to the total number of the genome addition model. * *tracker - pointer to counter for the number of calculations for addition. ==============================================================================*/ -void genome_add_calc(int perms, int alleles, int *perm, double *dist, +void genome_add_calc(int perms, int alleles, int *perm, double **dist, int zeroes, int *zero_ind, int curr_zero, int miss_ind, int *replacement, int inds, int curr_ind, double *genome_add_sum, int *tracker) { - int i,j; + int i; + int j; //========================================================================== // Part 1: fill one row/column of the matrix. // Note that we don't have the format of the 2D array here, so we are // cheating a little bit. Instead of indexing by dist[i][j] over p columns, - // we use dist[i + p*j]. It works. + // we use dist[i][j]. It works. //========================================================================== if(miss_ind > 0) { for (j = 0; j < alleles; j++) { - dist[zero_ind[curr_zero] + alleles*j] = - dist[replacement[curr_ind] + alleles*j]; + dist[zero_ind[curr_zero]][j] = dist[replacement[curr_ind]][j]; } } else { for (j = 0; j < alleles; j++) { - dist[j + alleles*zero_ind[curr_zero]] = - dist[j + alleles*replacement[curr_ind]]; + dist[j][zero_ind[curr_zero]] = dist[j][replacement[curr_ind]]; } } //========================================================================== @@ -864,7 +782,8 @@ void genome_loss_calc(int *genos, int nalleles, int *perm_array, int woo, int miss_ind, int curr_allele, double *genome_loss_sum, int *loss_tracker) { - int i, full_ind; + int i; + int full_ind; full_ind = 1 + (0 - miss_ind); genos[miss_ind*nalleles + zero_ind[curr_zero]] = genos[full_ind*nalleles + curr_allele]; @@ -951,17 +870,23 @@ void fill_short_geno(int *genos, int nalleles, int *perm_array, int *woo, // Multiset coefficient: fact(n+k-1)/(fact(k)*fact(n-1)) */ -double mindist(int perms, int alleles, int *perm, double *dist) +double mindist(int perms, int alleles, int *perm, double **dist) { - int i, j, w = perms, p = alleles, counter = 0; - double res = 0, minn = 100; + int i, j; + int w = perms; + int p = alleles; + int counter = 0; + double res = 0; + double minn = 100; + for(i = 0; i < w; i += p) { for(j = 0; j < p; j++) { if (j == 0) { - res = dist[*(perm + counter++) + p*j]; + res = dist[*(perm + counter)][j]; + counter++; if(res > minn) { j = p; @@ -971,7 +896,8 @@ double mindist(int perms, int alleles, int *perm, double *dist) } else { - res += dist[*(perm + counter++) + p*j]; + res += dist[*(perm + counter)][j]; + counter++; if(j < p-1 && res > minn) { counter += (p-j-1); @@ -988,3 +914,27 @@ double mindist(int perms, int alleles, int *perm, double *dist) } return minn; } + +void print_distmat(double** dist, int* genos, int p) +{ + int i; + int j; + + Rprintf("\n \t"); + for (i = 0; i < p; i++) + { + Rprintf("%d\t", genos[i]); + } + Rprintf("\n"); + for (i = 0; i < p; i++) + { + Rprintf("%d\t", genos[i+p]); + for (j = 0; j < p; j++) + { + Rprintf("%.4f\t", dist[i][j]); + } + Rprintf("\n"); + } + return; +} + diff --git a/tests/testthat/test-values.R b/tests/testthat/test-values.R index 69caf0e3..1f77e00e 100644 --- a/tests/testthat/test-values.R +++ b/tests/testthat/test-values.R @@ -8,8 +8,8 @@ test_that("Bruvo's distance works as expected.", { addLOSS <- as.vector(bruvo.dist(testgid, add = FALSE, loss = TRUE)) ADDLOSS <- as.vector(bruvo.dist(testgid, add = TRUE, loss = TRUE)) expect_that(addloss, equals(0.46875000000000)) - expect_that(ADDloss, equals(0.34374987334013)) - expect_that(addLOSS, equals(0.458333164453506)) + expect_that(addLOSS, equals(0.34374987334013)) + expect_that(ADDloss, equals(0.458333164453506)) expect_that(ADDLOSS, equals(0.401041518896818)) }) diff --git a/vignettes/algo.Rnw b/vignettes/algo.Rnw index 609a7d57..fd3a045b 100644 --- a/vignettes/algo.Rnw +++ b/vignettes/algo.Rnw @@ -51,7 +51,7 @@ \scalebox{-1}[1]{\jala{}} } -\title{Algorithms and equations utilized in poppr version 1.1.0} +\title{Algorithms and equations utilized in poppr version 1.1.1} \author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) Department of Botany and Plant Pathology, Oregon State University, Corvallis, OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, @@ -550,6 +550,6 @@ H_{exp} = \frac{N}{N-1} \sum_{i=1}^{g}{p^{2}_{i}} Where $p_i$ is the frequency of the $i$th genotype, $g$ is the number of genotypes observed and $N$ is the sample size. -\bibliographystyle{pnas.bst} +\bibliographystyle{plain} \bibliography{poppr_man} \end{document} \ No newline at end of file diff --git a/vignettes/algo.pdf b/vignettes/algo.pdf index bfc9939e..1e08dad0 100644 Binary files a/vignettes/algo.pdf and b/vignettes/algo.pdf differ diff --git a/vignettes/algo.tex b/vignettes/algo.tex index ca6c5a7a..afa4cdc1 100644 --- a/vignettes/algo.tex +++ b/vignettes/algo.tex @@ -100,7 +100,7 @@ \scalebox{-1}[1]{\jala{}} } -\title{Algorithms and equations utilized in poppr version 1.1.0} +\title{Algorithms and equations utilized in poppr version 1.1.1} \author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) Department of Botany and Plant Pathology, Oregon State University, Corvallis, OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, @@ -604,6 +604,6 @@ \subsection{Hexp} Where $p_i$ is the frequency of the $i$th genotype, $g$ is the number of genotypes observed and $N$ is the sample size. -\bibliographystyle{pnas.bst} +\bibliographystyle{plain} \bibliography{poppr_man} \end{document} diff --git a/vignettes/poppr_manual.Rnw b/vignettes/poppr_manual.Rnw index 091a06ae..e55da70f 100644 --- a/vignettes/poppr_manual.Rnw +++ b/vignettes/poppr_manual.Rnw @@ -51,7 +51,7 @@ \scalebox{-1}[1]{\jala{}} } -\title{Data import and manipulation in Poppr version 1.1.0} +\title{Data import and manipulation in Poppr version 1.1.1} \author{Zhian N. Kamvar$^{1}$\ and Niklaus J. Gr\"unwald$^{1,2}$\\\scriptsize{1) Department of Botany and Plant Pathology, Oregon State University, Corvallis, OR}\\\scriptsize{2) Horticultural Crops Research Laboratory, USDA-ARS, @@ -197,9 +197,9 @@ The following data sets are included in \poppr{}: \texttt{Pinf} \cite{goss2014ir \subsection{Citation} -To cite \poppr{}, please type: +The formal publication for \poppr{} was published in the journal PeerJ: \url{http://peerj.com/articles/281/}. To cite \poppr{}, please type in your R console: -<>= +<>= citation(package = "poppr") @ @@ -4706,7 +4706,6 @@ to its definition and description. \end{longtable} \end{center} - -\bibliographystyle{pnas.bst} +\bibliographystyle{plain} \bibliography{poppr_man} \end{document} diff --git a/vignettes/poppr_manual.pdf b/vignettes/poppr_manual.pdf index 46379579..f9574eb0 100644 Binary files a/vignettes/poppr_manual.pdf and b/vignettes/poppr_manual.pdf differ