diff --git a/NAMESPACE b/NAMESPACE index af22964..03ece9e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,9 +8,4 @@ export(kmeans_estimation) export(kmeans_inference_1f) export(norm_phi_canonical_kmeans_1f) export(rect_hier_clusters) -export(test_clusters_approx) -export(test_clusters_approx_1f) -export(test_complete_hier_clusters_approx) -export(test_complete_hier_clusters_approx_1f) -export(test_hier_clusters_exact) export(test_hier_clusters_exact_1f) diff --git a/man/compute_S_average.Rd b/man/compute_S_average.Rd deleted file mode 100644 index 397fcad..0000000 --- a/man/compute_S_average.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_average} -\alias{compute_S_average} -\title{Computes the conditioning set for average linkage hierarchical clustering} -\usage{ -compute_S_average(X, hcl, K, k1, k2) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} -} -\value{ -Returns an "Intervals" object containing the conditioning set. 
-} -\description{ -Computes the conditioning set for average linkage hierarchical clustering -} -\keyword{internal} diff --git a/man/compute_S_average_gencov.Rd b/man/compute_S_average_gencov.Rd deleted file mode 100644 index 4ef6cc8..0000000 --- a/man/compute_S_average_gencov.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_average_gencov} -\alias{compute_S_average_gencov} -\title{Computes the conditioning set S for average linkage hierarchical clustering, -w/o assuming isotropic covariance matrix} -\usage{ -compute_S_average_gencov(X, hcl, K, k1, k2, stat) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} - -\item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} -} -\value{ -Returns an "Intervals" object containing the conditioning set. 
-} -\description{ -Computes the conditioning set S for average linkage hierarchical clustering, -w/o assuming isotropic covariance matrix -} -\keyword{internal} diff --git a/man/compute_S_centroid.Rd b/man/compute_S_centroid.Rd deleted file mode 100644 index 0be3429..0000000 --- a/man/compute_S_centroid.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_centroid} -\alias{compute_S_centroid} -\title{Computes the conditioning set for centroid linkage hierarchical clustering} -\usage{ -compute_S_centroid(X, hcl, K, k1, k2) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} -} -\value{ -Returns an "Intervals" object containing the conditioning set. -} -\description{ -Computes the conditioning set for centroid linkage hierarchical clustering -} -\keyword{internal} diff --git a/man/compute_S_centroid_gencov.Rd b/man/compute_S_centroid_gencov.Rd deleted file mode 100644 index 35b9aeb..0000000 --- a/man/compute_S_centroid_gencov.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_centroid_gencov} -\alias{compute_S_centroid_gencov} -\title{Computes the conditioning set S for centroid linkage hierarchical clustering, -w/o assuming isotropic covariance matrix} -\usage{ -compute_S_centroid_gencov(X, hcl, K, k1, k2, stat) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} - -\item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} -} -\value{ -Returns an "Intervals" object 
containing the conditioning set. -} -\description{ -Computes the conditioning set S for centroid linkage hierarchical clustering, -w/o assuming isotropic covariance matrix -} -\keyword{internal} diff --git a/man/compute_S_mcquitty.Rd b/man/compute_S_mcquitty.Rd deleted file mode 100644 index dc51c92..0000000 --- a/man/compute_S_mcquitty.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_mcquitty} -\alias{compute_S_mcquitty} -\title{Computes the conditioning set for McQuitty linkage hierarchical clustering (WPGMA)} -\usage{ -compute_S_mcquitty(X, hcl, K, k1, k2) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} -} -\value{ -Returns an "Intervals" object containing the conditioning set. -} -\description{ -Computes the conditioning set for McQuitty linkage hierarchical clustering (WPGMA) -} -\keyword{internal} diff --git a/man/compute_S_mcquitty_gencov.Rd b/man/compute_S_mcquitty_gencov.Rd deleted file mode 100644 index 8239aed..0000000 --- a/man/compute_S_mcquitty_gencov.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_mcquitty_gencov} -\alias{compute_S_mcquitty_gencov} -\title{Computes the conditioning set S for McQuitty linkage hierarchical clustering (WPGMA), -w/o assuming isotropic covariance matrix} -\usage{ -compute_S_mcquitty_gencov(X, hcl, K, k1, k2, stat) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} - -\item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T 
\nu||_2}} -} -\value{ -Returns an "Intervals" object containing the conditioning set. -} -\description{ -Computes the conditioning set S for McQuitty linkage hierarchical clustering (WPGMA), -w/o assuming isotropic covariance matrix -} -\keyword{internal} diff --git a/man/compute_S_median.Rd b/man/compute_S_median.Rd deleted file mode 100644 index ee5b204..0000000 --- a/man/compute_S_median.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_median} -\alias{compute_S_median} -\title{Computes the conditioning set for median linkage hierarchical clustering (WPGMC)} -\usage{ -compute_S_median(X, hcl, K, k1, k2) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} -} -\value{ -Returns an "Intervals" object containing the conditioning set. 
-} -\description{ -Computes the conditioning set for median linkage hierarchical clustering (WPGMC) -} -\keyword{internal} diff --git a/man/compute_S_median_gencov.Rd b/man/compute_S_median_gencov.Rd deleted file mode 100644 index a4fcdcb..0000000 --- a/man/compute_S_median_gencov.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_median_gencov} -\alias{compute_S_median_gencov} -\title{Computes the conditioning set S for median linkage hierarchical clustering (WPGMC), -w/o assuming isotropic covariance matrix} -\usage{ -compute_S_median_gencov(X, hcl, K, k1, k2, stat) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} - -\item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} -} -\value{ -Returns an "Intervals" object containing the conditioning set. 
-} -\description{ -Computes the conditioning set S for median linkage hierarchical clustering (WPGMC), -w/o assuming isotropic covariance matrix -} -\keyword{internal} diff --git a/man/compute_S_single_gencov.Rd b/man/compute_S_single_gencov.Rd deleted file mode 100644 index 8115d7d..0000000 --- a/man/compute_S_single_gencov.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_single_gencov} -\alias{compute_S_single_gencov} -\title{Computes the conditioning set S for single linkage hierarchical clustering, -w/o assuming isotropic covariance matrix} -\usage{ -compute_S_single_gencov(X, hcl, K, k1, k2, stat) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} - -\item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} -} -\value{ -Returns an "Intervals" object containing the conditioning set. 
-} -\description{ -Computes the conditioning set S for single linkage hierarchical clustering, -w/o assuming isotropic covariance matrix -} -\keyword{internal} diff --git a/man/compute_S_ward.Rd b/man/compute_S_ward.Rd deleted file mode 100644 index dc145f2..0000000 --- a/man/compute_S_ward.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_ward} -\alias{compute_S_ward} -\title{Computes the conditioning set for Ward linkage hierarchical clustering} -\usage{ -compute_S_ward(X, hcl, K, k1, k2) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} -} -\value{ -Returns an "Intervals" object containing the conditioning set. -} -\description{ -Computes the conditioning set for Ward linkage hierarchical clustering -} -\keyword{internal} diff --git a/man/compute_S_ward_gencov.Rd b/man/compute_S_ward_gencov.Rd deleted file mode 100644 index eca38d6..0000000 --- a/man/compute_S_ward_gencov.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_sets.R -\name{compute_S_ward_gencov} -\alias{compute_S_ward_gencov} -\title{Computes the conditioning set S for ward linkage hierarchical clustering, -w/o assuming isotropic covariance matrix} -\usage{ -compute_S_ward_gencov(X, hcl, K, k1, k2, stat) -} -\arguments{ -\item{X}{the n x q data set} - -\item{hcl}{hclust object obtained by clustering X} - -\item{K}{number of clusters} - -\item{k1}{the index of first cluster involved in the test} - -\item{k2}{the index of second cluster involved in the test} - -\item{stat}{the test statistic, \eqn{||\Sigma^{-1/2} x^T \nu||_2}} -} -\value{ -Returns an "Intervals" object containing the conditioning set. 
-} -\description{ -Computes the conditioning set S for ward linkage hierarchical clustering, -w/o assuming isotropic covariance matrix -} -\keyword{internal} diff --git a/man/preserve_cl.Rd b/man/preserve_cl.Rd deleted file mode 100644 index cded3a2..0000000 --- a/man/preserve_cl.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/util.R -\name{preserve_cl} -\alias{preserve_cl} -\title{Checks if Ck, Ck' in C(x'(phi))} -\usage{ -preserve_cl(cl, cl_phi, k1, k2) -} -\arguments{ -\item{cl}{clustering of x} - -\item{cl_phi}{clustering of x'(phi)} - -\item{k1, k2}{index of clusters involved in the test} -} -\value{ -Returns TRUE if Ck, Ck' in C(x'(phi)), and FALSE otherwise -} -\description{ -Checks if Ck, Ck' in C(x'(phi)) -} -\keyword{internal} diff --git a/man/same_cl.Rd b/man/same_cl.Rd deleted file mode 100644 index cbc05c7..0000000 --- a/man/same_cl.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/util.R -\name{same_cl} -\alias{same_cl} -\title{Checks if two clusterings are the same up to permutation} -\usage{ -same_cl(cl1, cl2, K) -} -\arguments{ -\item{cl1}{the first clustering} - -\item{cl2}{the second clustering} - -\item{K}{the number of clusters} -} -\value{ -Returns TRUE if they are the same, and FALSE otherwise -} -\description{ -Checks if two clusterings are the same up to permutation -} -\keyword{internal} diff --git a/man/test_clusters_approx.Rd b/man/test_clusters_approx.Rd deleted file mode 100644 index 0cde1b4..0000000 --- a/man/test_clusters_approx.Rd +++ /dev/null @@ -1,85 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_inf.R -\name{test_clusters_approx} -\alias{test_clusters_approx} -\title{Monte Carlo significance test for any clustering method} -\usage{ -test_clusters_approx( - X, - k1, - k2, - iso = TRUE, - sig = NULL, - SigInv = NULL, - ndraws = 2000, - cl_fun, - cl = NULL 
-) -} -\arguments{ -\item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} - -\item{k1, k2}{Integers selecting the clusters to test.} - -\item{iso}{Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not.} - -\item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} - -\item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} - -\item{ndraws}{Integer selecting the number of importance samples, default of 2000.} - -\item{cl_fun}{Function returning assignments to clusters 1 through \code{K}.} - -\item{cl}{Optionally pass in the results of calling \code{cl_fun} on your data. This is for -efficiency and reproducibility (when the clustering function is non-deterministic).} -} -\value{ -\item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } -\item{pval}{the approximate p-value} -\item{stderr}{standard error of the p-value estimate} -\item{clusters}{the estimated cluster assignments} -} -\description{ -This function performs a user-specified clustering method \code{cl_fun} on the rows of a -data matrix to obtain \code{K} clusters, and tests the null hypothesis of no difference in means -between clusters \code{k1} and \code{k2}. -} -\details{ -In order to account for the fact that the clusters have been estimated from the data, -the p-values are computed conditional on the fact that those clusters were estimated. -This function approximates p-values via importance sampling. - -This function assumes that \code{cl_fun} takes a \eqn{n \times p} numeric data matrix as input -and outputs integer assignments to clusters 1 through \code{K}. 
-} -\examples{ -# Simulates a 100 x 2 data set with three clusters -set.seed(123) -dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + -matrix(0.2*rnorm(200), 100, 2) - -# Function to run k-means clustering w/ k = 3 and 50 random starts -km_cluster <- function(X) { - km <- kmeans(X, 3, nstart=50) - return(km$cluster) -} - -# Cluster data using k-means -clusters <- km_cluster(dat) -table(rep(1:3, length=100), clusters) - -# tests for a difference in means between clusters 1 and 2 -# We pass in the k-means clustering results from earlier -results <- test_clusters_approx(dat, k1=1, k2=2, cl_fun=km_cluster, ndraws=500, cl=clusters) -results$stat -results$pval -results$stderr - -} -\references{ -Lucy L. Gao et al. "Selective inference for hierarchical clustering". arXiv preprint (2020). -} -\seealso{ -\code{\link{test_clusters_approx_1f}} for approximate p-values for a difference in the mean of one feature. -} diff --git a/man/test_clusters_approx_1f.Rd b/man/test_clusters_approx_1f.Rd deleted file mode 100644 index d1d8364..0000000 --- a/man/test_clusters_approx_1f.Rd +++ /dev/null @@ -1,81 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_inf.R -\name{test_clusters_approx_1f} -\alias{test_clusters_approx_1f} -\title{Monte Carlo significance test (with respect to a single feature) for any clustering function} -\usage{ -test_clusters_approx_1f( - X, - k1, - k2, - feat, - sig = NULL, - ndraws = 2000, - cl_fun, - cl = NULL -) -} -\arguments{ -\item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} - -\item{k1, k2}{Integers selecting the clusters to test.} - -\item{feat}{Integer selecting the feature to test.} - -\item{sig}{Optional scalar specifying \eqn{\sigma}.} - -\item{ndraws}{Integer selecting the number of importance samples, default of 2000.} - -\item{cl_fun}{Function returning assignments to clusters 1 through \code{K}.} - -\item{cl}{Optionally pass in the results of calling \code{cl_fun} on 
your data. This is for -efficiency and reproducibility (when the clustering function is non-deterministic).} -} -\value{ -\item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } -\item{pval}{the p-value} -\item{clusters}{the estimated cluster assignments} -} -\description{ -This function performs a user-specified clustering method \code{cl_fun} on the rows of a -data matrix to obtain \code{K} clusters, and tests the null hypothesis of no difference -between the mean of feature \code{feat} in clusters \code{k1} and \code{k2}. -} -\details{ -In order to account for the fact that the clusters have been estimated from the data, -the p-values are computed conditional on the fact that those clusters were estimated. -This function approximates p-values via importance sampling. - -This function assumes that \code{cl_fun} takes a \eqn{n \times p} numeric data matrix as input -and outputs integer assignments to clusters 1 through \code{K}. -} -\examples{ -# Simulates a 100 x 2 data set with three clusters -set.seed(123) -dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + -matrix(0.2*rnorm(200), 100, 2) - -# Function to run k-means clustering w/ k = 3 and 50 random starts -km_cluster <- function(X) { - km <- kmeans(X, 3, nstart=50) - return(km$cluster) -} - -# Cluster data using k-means -clusters <- km_cluster(dat) -table(rep(1:3, length=100), clusters) - -# tests for a difference in means between clusters 1 and 2 -results <- test_clusters_approx_1f(dat, k1=1, k2=2, feat = 1, cl_fun=km_cluster, -ndraws=500, cl=clusters) -results$stat -results$pval -results$stderr - -} -\references{ -Lucy L. Gao et al. "Selective inference for hierarchical clustering". arXiv preprint (2020). -} -\seealso{ -\code{\link{test_clusters_approx}} for approximate p-values for a difference in the mean of any feature. 
-} diff --git a/man/test_complete_hier_clusters_approx.Rd b/man/test_complete_hier_clusters_approx.Rd deleted file mode 100644 index 1c153f9..0000000 --- a/man/test_complete_hier_clusters_approx.Rd +++ /dev/null @@ -1,98 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_inf.R -\name{test_complete_hier_clusters_approx} -\alias{test_complete_hier_clusters_approx} -\title{Monte Carlo significance test for complete linkage hierarchical clustering} -\usage{ -test_complete_hier_clusters_approx( - X, - hcl, - K, - k1, - k2, - iso = TRUE, - sig = NULL, - SigInv = NULL, - ndraws = 2000 -) -} -\arguments{ -\item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} - -\item{hcl}{An object of the type \code{hclust} containing the hierarchical clustering of X.} - -\item{K}{Integer selecting the total number of clusters.} - -\item{k1, k2}{Integers selecting the clusters to test.} - -\item{iso}{Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not.} - -\item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} - -\item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} - -\item{ndraws}{Integer selecting the number of importance samples, default of 2000.} -} -\value{ -\item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } -\item{pval}{the approximate p-value} -\item{stderr}{estimated standard error of the p-value estimate} -} -\description{ -This tests the null hypothesis of no difference in means between -clusters \code{k1} and \code{k2} at level \code{K} in a complete -linkage hierarchical clustering. (The \code{K} clusters are numbered as per -the results of the \code{cutree} function in the \code{stats} package.) -} -\details{ -Important note: Before calling \code{hclust} and this function, make sure to -load the package \code{fastcluster}. 
This is because the p-value approximation -procedure requires running hierarchical clustering on a large number of simulated -data sets, and the version of \code{hclust} in the \code{fastcluster} package -is much faster than the version of \code{hclust} in \code{stats}. - -In order to account for the fact that the clusters have been estimated from the data, -the p-values are computed conditional on the fact that those clusters were estimated. -This function approximates p-values via importance sampling. - -Currently, this function supports squared Euclidean distance as a measure of dissimilarity -between observations. (Note that complete linkage is invariant under monotone transformations -of the measure of dissimilarity between observations, so unsquared Euclidean distance -would produce the same hierarchical clustering.) - -By default, this function assumes that the covariance matrix of the features is isotropic -i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that -\eqn{Cov(X_i) = \Sigma}. If known, \eqn{\sigma} can be passed in using the \code{sig} argument -or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an -estimate of \eqn{\sigma} or \eqn{\Sigma} will be used. -} -\examples{ -# Simulates a 100 x 2 data set with no clusters -set.seed(1) -dat <- matrix(rnorm(200), 100, 2) - -# Complete linkage hierarchical clustering -library(fastcluster) -hcl <- hclust(dist(dat, method="euclidean")^2, method="complete") - -# plot dendrograms with the 1st and 2nd clusters (cut at the third level) -# displayed in blue and orange -plot(hcl) -rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) - -# Monte Carlo test for a difference in means between the blue and orange clusters -test_complete_hier_clusters_approx(X=dat, hcl=hcl, K=3, k1=1, k2=2, ndraws=1000) - -} -\references{ -Lucy L. Gao et al. "Selective inference for hierarchical clustering". arXiv preprint (2020). 
-} -\seealso{ -\code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; - -\code{\link{test_hier_clusters_exact}} for exact p-values for hierarchical clustering with other linkages; - -\code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; - -\code{\link{test_complete_hier_clusters_approx_1f}} for approximate p-values for a difference in the mean of one feature. -} diff --git a/man/test_complete_hier_clusters_approx_1f.Rd b/man/test_complete_hier_clusters_approx_1f.Rd deleted file mode 100644 index 3b94637..0000000 --- a/man/test_complete_hier_clusters_approx_1f.Rd +++ /dev/null @@ -1,96 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_inf.R -\name{test_complete_hier_clusters_approx_1f} -\alias{test_complete_hier_clusters_approx_1f} -\title{Monte Carlo significance test (with respect to a single feature) for complete linkage hierarchical clustering} -\usage{ -test_complete_hier_clusters_approx_1f( - X, - hcl, - K, - k1, - k2, - feat, - sig = NULL, - ndraws = 2000 -) -} -\arguments{ -\item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} - -\item{hcl}{Object of the type \code{hclust} containing the hierarchical clustering of X.} - -\item{K}{Integer selecting the total number of clusters.} - -\item{k1, k2}{Integers selecting the clusters to test.} - -\item{feat}{Integer selecting the feature to test.} - -\item{sig}{Optional scalar specifying \eqn{\sigma}.} - -\item{ndraws}{Integer selecting the number of importance samples, default of 2000.} -} -\value{ -\item{stat}{the test statistic: the absolute difference between the mean of feature \code{feat} in cluster \code{k1} and the mean of feature \code{feat} in cluster \code{k2}} -\item{pval}{the approximate p-value} -\item{stderr}{standard error of the p-value estimate} -} -\description{ -This tests the null hypothesis of no difference in the mean of -feature \code{feat} 
between clusters \code{k1} and \code{k2} at -level \code{K} in a complete linkage hierarchical clustering. (The \code{K} -clusters are numbered as per the results of the \code{cutree} function in the -\code{stats} package.) -} -\details{ -Important note: Before calling \code{hclust} and this function, make sure to -load the package \code{fastcluster}. This is because the p-value approximation -procedure requires running hierarchical clustering on a large number of simulated -data sets, and the version of \code{hclust} in the \code{fastcluster} package -is much faster than the version of \code{hclust} in \code{stats}. - -In order to account for the fact that the clusters have been estimated from the data, -the p-values are computed conditional on the fact that those clusters were estimated. -This function approximates p-values via importance sampling. - -Currently, this function supports squared Euclidean distance as a measure of dissimilarity -between observations. (Note that complete linkage is invariant under monotone transformations -of the measure of dissimilarity between observations, so unsquared Euclidean distance -would produce the same hierarchical clustering.) - -This function assumes that the covariance matrix of the features is isotropic -i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. If known, \eqn{\sigma} can be passed in using the \code{sig} -argument; otherwise, an estimate of \eqn{\sigma} will be used. 
-} -\examples{ -# Simulates a 100 x 2 data set with no clusters -set.seed(1) -dat <- matrix(rnorm(200), 100, 2) - -# Complete linkage hierarchical clustering -library(fastcluster) -hcl <- hclust(dist(dat, method="euclidean")^2, method="complete") - -# plot dendrograms with the 1st and 2nd clusters (cut at the third split) -# displayed in blue and orange -plot(hcl) -rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) - -# Monte Carlo test for a difference in means between the blue and orange clusters -# wrt the 2nd feature -test_complete_hier_clusters_approx_1f(X=dat, hcl=hcl, -K=3, k1=1, k2=2, feat=2, ndraws=1000) - -} -\references{ -Lucy L. Gao et al. "Selective inference for hierarchical clustering". arXiv preprint (2020). -} -\seealso{ -\code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; - -\code{\link{test_hier_clusters_exact_1f}} for exact p-values for other linkages; - -\code{\link{test_clusters_approx_1f}} for approximate p-values for a user-specified clustering function; - -\code{\link{test_complete_hier_clusters_approx}} for approximate p-values for a difference in the mean of any feature. -} diff --git a/man/test_hier_clusters_exact.Rd b/man/test_hier_clusters_exact.Rd deleted file mode 100644 index c89c0f9..0000000 --- a/man/test_hier_clusters_exact.Rd +++ /dev/null @@ -1,91 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trunc_inf.R -\name{test_hier_clusters_exact} -\alias{test_hier_clusters_exact} -\title{Exact significance test for hierarchical clustering} -\usage{ -test_hier_clusters_exact( - X, - link, - hcl, - K, - k1, - k2, - iso = TRUE, - sig = NULL, - SigInv = NULL -) -} -\arguments{ -\item{X}{\eqn{n} by \eqn{p} matrix containing numeric data.} - -\item{link}{String selecting the linkage. 
Supported options are \code{"single", "average", "centroid", "ward.D", "median"}, and \code{"mcquitty"}.} - -\item{hcl}{Object of the type \code{hclust} containing the hierarchical clustering of X.} - -\item{K}{Integer selecting the total number of clusters.} - -\item{k1, k2}{Integers selecting the clusters to test, as indexed by the results of \code{cutree(hcl, K)}.} - -\item{iso}{Boolean. If \code{TRUE}, isotropic covariance matrix model, otherwise not.} - -\item{sig}{Optional scalar specifying \eqn{\sigma}, relevant if \code{iso} is \code{TRUE}.} - -\item{SigInv}{Optional matrix specifying \eqn{\Sigma^{-1}}, relevant if \code{iso} is \code{FALSE}.} -} -\value{ -\item{stat}{the test statistic: the Euclidean distance between the mean of cluster \code{k1} and the mean of cluster \code{k2} } -\item{pval}{the p-value} -\item{trunc}{object of the type \code{Intervals} containing the conditioning set} -} -\description{ -This tests the null hypothesis of no difference in means between -clusters \code{k1} and \code{k2} at level \code{K} in a hierarchical clustering. -(The \code{K} clusters are numbered as per the results of the \code{cutree} -function in the \code{stats} package.) -} -\details{ -In order to account for the fact that the clusters have been estimated from the data, -the p-values are computed conditional on the fact that those clusters were estimated. -This function computes p-values exactly via an analytic characterization of the conditioning set. - -Currently, this function supports squared Euclidean distance as a measure of dissimilarity -between observations, and the following six linkages: single, average, centroid, Ward, -McQuitty (also known as WPGMA), and median (also known as WPGMC). - -By default, this function assumes that the covariance matrix of the features is isotropic -i.e. \eqn{Cov(X_i) = \sigma^2 I_p}. Setting \code{iso} to \code{FALSE} instead assumes that -\eqn{Cov(X_i) = \Sigma}. 
If known, \eqn{\sigma} can be passed in using the \code{sig} argument -or \eqn{\Sigma^{-1}} can be passed in the \code{SigInv} argument; otherwise, an -estimate of \eqn{\sigma} or \eqn{\Sigma} will be used. -} -\examples{ -# Simulates a 100 x 2 data set with three clusters -set.seed(123) -dat <- rbind(c(-1, 0), c(0, sqrt(3)), c(1, 0))[rep(1:3, length=100), ] + -matrix(0.2*rnorm(200), 100, 2) - -# Average linkage hierarchical clustering -hcl <- hclust(dist(dat, method="euclidean")^2, method="average") - -# plot dendrograms with the 1st and 2nd clusters (cut at the third split) -# displayed in blue and orange -plot(hcl) -rect_hier_clusters(hcl, k=3, which=1:2, border=c("blue", "orange")) - -# tests for a difference in means between the blue and orange clusters -test_hier_clusters_exact(X=dat, link="average", hcl=hcl, K=3, k1=1, k2=2) - -} -\references{ -Lucy L. Gao et al. "Selective inference for hierarchical clustering". arXiv preprint (2020). -} -\seealso{ -\code{\link{rect_hier_clusters}} for visualizing clusters \code{k1} and \code{k2} in the dendrogram; - -\code{\link{test_complete_hier_clusters_approx}} for approximate p-values for complete linkage hierarchical clustering; - -\code{\link{test_clusters_approx}} for approximate p-values for a user-specified clustering function; - -\code{\link{test_hier_clusters_exact_1f}} for exact p-values for a difference in the mean of one feature. -}