diff --git a/DESCRIPTION b/DESCRIPTION index 7016526..5f1bd90 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: fastLink Type: Package Title: Fast Probabilistic Record Linkage with Missing Data -Version: 0.3.2 -Date: 2018-02-25 +Version: 0.4.0 +Date: 2018-05-15 Authors@R: c( person("Ted", "Enamorado", email = "fastlinkr@gmail.com", role = c("aut", "cre")), person("Ben", "Fifield", email = "fastlinkr@gmail.com", role = c("aut")), diff --git a/NAMESPACE b/NAMESPACE index 01c9d8a..6cc37ed 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,15 +3,15 @@ S3method(plot,fastLink) S3method(print,inspectEM) S3method(summary,fastLink) +export(aggconfusion) export(aggregateEM) export(calcMoversPriors) export(clusterMatch) export(confusion) -export(aggconfusion) export(dedupeMatches) export(emlinkMARmov) -export(emlinklog) export(emlinkRS) +export(emlinklog) export(fastLink) export(gammaCK2par) export(gammaCKpar) @@ -49,15 +49,15 @@ importFrom(parallel,makeCluster) importFrom(parallel,mclapply) importFrom(parallel,stopCluster) importFrom(plotrix,staxlab) +importFrom(stats,glm) importFrom(stats,kmeans) +importFrom(stats,model.matrix) importFrom(stats,na.omit) importFrom(stats,prcomp) importFrom(stats,predict) importFrom(stats,quantile) importFrom(stats,runif) importFrom(stats,var) -importFrom(stats,glm) -importFrom(stats,model.matrix) importFrom(stringdist,phonetic) importFrom(stringdist,stringdist) importFrom(stringdist,stringdistmatrix) diff --git a/R/aggconfusion.R b/R/aggconfusion.R index edaa93c..01986a4 100644 --- a/R/aggconfusion.R +++ b/R/aggconfusion.R @@ -1,13 +1,12 @@ -#' Get confusion table for fastLink objects after blocking via clusterMatch() +#' aggconfusion #' -#' Calculate confusion table after running fastLink(). +#' Aggregate confusion tables from separate runs of fastLink() (UNDER DEVELOPMENT) #' -#' @usage confusion(object, threshold) +#' @usage aggconfusion(object) #' -#' @param object A 'fastLink' object. 
Can only be run if 'return.all = TRUE' in 'fastLink().' -#' @param threshold The matching threshold above which a pair is a true match. Default is .85 +#' @param object A list of confusion tables. #' -#' @return 'confusion()' returns two tables - one calculating the confusion table, and another +#' @return 'aggconfusion()' returns two tables - one calculating the confusion table, and another #' calculating a series of additional summary statistics. #' #' @author Ted Enamorado and Ben Fifield diff --git a/R/emlinklog.R b/R/emlinklog.R index 64a8b38..acb8527 100644 --- a/R/emlinklog.R +++ b/R/emlinklog.R @@ -3,8 +3,8 @@ #' Expectation-Maximization algorithm for Record Linkage #' allowing for dependencies across linkage fields #' -#' @usage emlinklog(patterns, nobs.a, nobs.b, p.m, iter.max, -#' tol, varnames) +#' @usage emlinklog(patterns, nobs.a, nobs.b, p.m, p.gamma.j.m, p.gamma.j.u, +#' iter.max, tol, varnames) #' #' @param patterns table that holds the counts for each unique agreement #' pattern. This object is produced by the function: tableCounts. 
@@ -48,53 +48,54 @@ #' #' @export #' @importFrom gtools rdirichlet +#' @importFrom stats glm model.matrix emlinklog <- function(patterns, nobs.a, nobs.b, - p.m = 0.1, iter.max = 5000, tol = 1e-5, p.gamma.j.m = NULL, p.gamma.j.u = NULL, varnames = NULL) { + p.m = 0.1, p.gamma.j.m = NULL, p.gamma.j.u = NULL, iter.max = 5000, tol = 1e-5, varnames = NULL) { -## OPTIONS -## patterns <- tc; nobs.a <- nrow(dfA); nobs.a <- nrow(dfB); p.m <- 0.1; iter.max = 5000; -## tol = 1e-5; p.gamma.k.m = NULL; p.gamma.k.u = NULL + ## OPTIONS + ## patterns <- tc; nobs.a <- nrow(dfA); nobs.a <- nrow(dfB); p.m <- 0.1; iter.max = 5000; + ## tol = 1e-5; p.gamma.k.m = NULL; p.gamma.k.u = NULL - options(digits=16) - - ## EM Algorithm for a Fellegi-Sunter model that accounts for missing data (under MAR) - ## - ## Args: - ## patterns: - ## p.m: - ## p.gamma.k.m: - ## p.gamma.k.u: - ## tol: - ## - ## Returns: - ## The p.m, p.gamma.k.m, p.gamma.k.u, p.gamma.k.m, p.gamma.k.m, p.gamma.k.m, that - ## maximize the observed data log-likelihood of the agreement patterns - - - ## Number of fields - nfeatures <- ncol(patterns) - 1 - - ## Patterns: - gamma.j.k <- as.matrix(patterns[, 1:nfeatures]) + options(digits=16) + + ## EM Algorithm for a Fellegi-Sunter model that accounts for missing data (under MAR) + ## + ## Args: + ## patterns: + ## p.m: + ## p.gamma.k.m: + ## p.gamma.k.u: + ## tol: + ## + ## Returns: + ## The p.m, p.gamma.k.m, p.gamma.k.u, p.gamma.k.m, p.gamma.k.m, p.gamma.k.m, that + ## maximize the observed data log-likelihood of the agreement patterns + + + ## Number of fields + nfeatures <- ncol(patterns) - 1 + + ## Patterns: + gamma.j.k <- as.matrix(patterns[, 1:nfeatures]) - ## Patterns counts: - n.j <- as.matrix(patterns[, (nfeatures + 1)]) # Counts - - ## Number of unique patterns: - N <- nrow(gamma.j.k) - - p.gamma.k.m <- p.gamma.k.u <- NULL - - ## Overall Prob of finding a Match - p.u <- 1 - p.m - - ## Field specific probability of observing gamma.k conditional on M + ## Patterns 
counts: + n.j <- as.matrix(patterns[, (nfeatures + 1)]) # Counts + + ## Number of unique patterns: + N <- nrow(gamma.j.k) + + p.gamma.k.m <- p.gamma.k.u <- NULL + + ## Overall Prob of finding a Match + p.u <- 1 - p.m + + ## Field specific probability of observing gamma.k conditional on M if (is.null(p.gamma.k.m)) { p.gamma.k.m <- list() for (i in 1:nfeatures) { - l.m <- length(unique(na.omit(gamma.j.k[, i]))) - c.m <- seq(from = 1, to = 50 * l.m, by = 50) - p.gamma.k.m[[i]] <- sort(rdirichlet(1, c.m), decreasing = FALSE) + l.m <- length(unique(na.omit(gamma.j.k[, i]))) + c.m <- seq(from = 1, to = 50 * l.m, by = 50) + p.gamma.k.m[[i]] <- sort(rdirichlet(1, c.m), decreasing = FALSE) } } @@ -102,138 +103,145 @@ emlinklog <- function(patterns, nobs.a, nobs.b, if (is.null(p.gamma.k.u)) { p.gamma.k.u <- list() for (i in 1:nfeatures) { - l.u <- length(unique(na.omit(gamma.j.k[, i]))) - c.u <- seq(from = 1, to = 50 * l.u, by = 50) - p.gamma.k.u[[i]] <- sort(rdirichlet(1, c.u), decreasing = TRUE) + l.u <- length(unique(na.omit(gamma.j.k[, i]))) + c.u <- seq(from = 1, to = 50 * l.u, by = 50) + p.gamma.k.u[[i]] <- sort(rdirichlet(1, c.u), decreasing = TRUE) } } - - p.gamma.k.j.m <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N) - p.gamma.k.j.u <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N) - - p.gamma.j.m <- matrix(rep(NA, N), nrow = N, ncol = 1) - p.gamma.j.u <- matrix(rep(NA, N), nrow = N, ncol = 1) - - for (i in 1:nfeatures) { - temp.01 <- temp.02 <- gamma.j.k[, i] - temp.1 <- unique(na.omit(temp.01)) - temp.2 <- p.gamma.k.m[[i]] - temp.3 <- p.gamma.k.u[[i]] - for (j in 1:length(temp.1)) { - temp.01[temp.01 == temp.1[j]] <- temp.2[j] - temp.02[temp.02 == temp.1[j]] <- temp.3[j] - } - p.gamma.k.j.m[i, ] <- temp.01 - p.gamma.k.j.u[i, ] <- temp.02 - } - - sumlog <- function(x) { sum(log(x), na.rm = T) } - - p.gamma.j.m <- as.matrix((apply(p.gamma.k.j.m, 2, sumlog))) - p.gamma.j.m <- exp(p.gamma.j.m) - - p.gamma.j.u <- as.matrix((apply(p.gamma.k.j.u, 
2, sumlog))) - p.gamma.j.u <- exp(p.gamma.j.u) - - delta <- 1 - count <- 1 - warn.once <- 1 - - ## The EM Algorithm presented in the paper starts here: - while (abs(delta) >= tol) { - - if((count %% 100) == 0) { - cat("Iteration number", count, "\n") - cat("Maximum difference in log-likelihood =", round(delta, 4), "\n") - } - ## Old Paramters - p.old <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u)) - - ## ------ - ## E-Step: - ## ------ + p.gamma.k.j.m <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N) + p.gamma.k.j.u <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N) - log.prod <- log(p.gamma.j.m) + log(p.m) - max.log.prod <- max(log.prod) + p.gamma.j.m <- matrix(rep(NA, N), nrow = N, ncol = 1) + p.gamma.j.u <- matrix(rep(NA, N), nrow = N, ncol = 1) - logxpy <- function(lx,ly) { - temp <- cbind(lx, ly) - apply(temp, 1, max) + log1p(exp(-abs(lx-ly))) + for (i in 1:nfeatures) { + temp.01 <- temp.02 <- gamma.j.k[, i] + temp.1 <- unique(na.omit(temp.01)) + temp.2 <- p.gamma.k.m[[i]] + temp.3 <- p.gamma.k.u[[i]] + for (j in 1:length(temp.1)) { + temp.01[temp.01 == temp.1[j]] <- temp.2[j] + temp.02[temp.02 == temp.1[j]] <- temp.3[j] + } + p.gamma.k.j.m[i, ] <- temp.01 + p.gamma.k.j.u[i, ] <- temp.02 } - log.sum <- logxpy(log(p.gamma.j.m) + log(p.m), log(p.gamma.j.u) + log(p.u)) - zeta.j <- exp(log.prod - max.log.prod)/(exp(log.sum - max.log.prod)) + sumlog <- function(x) { sum(log(x), na.rm = T) } - ## -------- - ## M-step : - ## -------- - num.prod <- n.j * zeta.j - p.m <- sum(num.prod)/sum(n.j) - p.u <- 1 - p.m + p.gamma.j.m <- as.matrix((apply(p.gamma.k.j.m, 2, sumlog))) + p.gamma.j.m <- exp(p.gamma.j.m) + p.gamma.j.u <- as.matrix((apply(p.gamma.k.j.u, 2, sumlog))) + p.gamma.j.u <- exp(p.gamma.j.u) - pat <- data.frame(gamma.j.k) - pat[is.na(pat)] <- -1 - pat <- replace(pat, TRUE, lapply(pat, factor)) - factors <- model.matrix(~ ., pat) - - ## get theta.m and theta.u - c <- 1e-06 - matches <- glm(count ~ ., data = data.frame(count = 
((zeta.j * n.j) + c), factors), - family = "quasipoisson") + delta <- 1 + count <- 1 + warn.once <- 1 + + ## The EM Algorithm presented in the paper starts here: + while (abs(delta) >= tol) { + + if((count %% 100) == 0) { + cat("Iteration number", count, "\n") + cat("Maximum difference in log-likelihood =", round(delta, 4), "\n") + } + + ## Old Paramters + p.old <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u)) - non.matches <- glm(count ~ .*., data = data.frame(count = ((1 - zeta.j) * n.j + c), factors), - family = "quasipoisson") - - ## Predict & renormalization fn as in Murray 2017 - g <- function(fit) { - logwt = predict( fit ) - logwt = logwt - max(logwt) - wt = exp(logwt) - wt/sum(wt) - } + ## ------ + ## E-Step: + ## ------ + + log.prod <- log(p.gamma.j.m) + log(p.m) + max.log.prod <- max(log.prod) + + logxpy <- function(lx,ly) { + temp <- cbind(lx, ly) + apply(temp, 1, max) + log1p(exp(-abs(lx-ly))) + } + + log.sum <- logxpy(log(p.gamma.j.m) + log(p.m), log(p.gamma.j.u) + log(p.u)) + zeta.j <- exp(log.prod - max.log.prod)/(exp(log.sum - max.log.prod)) + + ## -------- + ## M-step : + ## -------- + num.prod <- n.j * zeta.j + p.m <- sum(num.prod)/sum(n.j) + p.u <- 1 - p.m + - p.gamma.j.m = as.matrix(g(matches)) - p.gamma.j.u = as.matrix(g(non.matches)) + pat <- data.frame(gamma.j.k) + pat[is.na(pat)] <- -1 + pat <- replace(pat, TRUE, lapply(pat, factor)) + factors <- model.matrix(~ ., pat) + + ## get theta.m and theta.u + c <- 1e-06 + matches <- glm(count ~ ., data = data.frame(count = ((zeta.j * n.j) + c), factors), + family = "quasipoisson") - ## Updated parameters: - p.new <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u)) - - if(p.m < 1e-13 & warn.once == 1) { - warning("The overall probability of finding a match is too small. 
Increasing the amount of overlap between the datasets might help, see e.g., clusterMatch()") - warn.once <- 0 + non.matches <- glm(count ~ .*., data = data.frame(count = ((1 - zeta.j) * n.j + c), factors), + family = "quasipoisson") + + ## Predict & renormalization fn as in Murray 2017 + g <- function(fit) { + logwt = predict( fit ) + logwt = logwt - max(logwt) + wt = exp(logwt) + wt/sum(wt) + } + + p.gamma.j.m = as.matrix(g(matches)) + p.gamma.j.u = as.matrix(g(non.matches)) + + ## Updated parameters: + p.new <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u)) + + if(p.m < 1e-13 & warn.once == 1) { + warning("The overall probability of finding a match is too small. Increasing the amount of overlap between the datasets might help, see e.g., clusterMatch()") + warn.once <- 0 + } + + ## Max difference between the updated and old parameters: + delta <- max(abs(p.new - p.old)) + count <- count + 1 + + if(count > iter.max) { + warning("The EM algorithm has run for the specified number of iterations but has not converged yet.") + break + } } - ## Max difference between the updated and old parameters: - delta <- max(abs(p.new - p.old)) - count <- count + 1 + weights <- log(p.gamma.j.m) - log(p.gamma.j.u) + + data.w <- cbind(patterns, weights, p.gamma.j.m, p.gamma.j.u) + nc <- ncol(data.w) + colnames(data.w)[nc-2] <- "weights" + colnames(data.w)[nc-1] <- "p.gamma.j.m" + colnames(data.w)[nc] <- "p.gamma.j.u" - if(count > iter.max) { - warning("The EM algorithm has run for the specified number of iterations but has not converged yet.") - break + inf <- which(data.w == Inf, arr.ind = T) + ninf <- which(data.w == -Inf, arr.ind = T) + + data.w[inf[, 1], unique(inf[, 2])] <- 150 + data.w[ninf[, 1], unique(ninf[, 2])] <- -150 + + if(!is.null(varnames)){ + output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, + "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count, + "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = 
varnames) + }else{ + output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, + "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count, + "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures)) } - } - - weights <- log(p.gamma.j.m) - log(p.gamma.j.u) - - data.w <- cbind(patterns, weights, p.gamma.j.m, p.gamma.j.u) - nc <- ncol(data.w) - colnames(data.w)[nc-2] <- "weights" - colnames(data.w)[nc-1] <- "p.gamma.j.m" - colnames(data.w)[nc] <- "p.gamma.j.u" - - inf <- which(data.w == Inf, arr.ind = T) - ninf <- which(data.w == -Inf, arr.ind = T) - - data.w[inf[, 1], unique(inf[, 2])] <- 150 - data.w[ninf[, 1], unique(ninf[, 2])] <- -150 - - output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, - "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count, - "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures)) - - class(output) <- c("fastLink", "fastLink.EM") - - return(output) + + + class(output) <- c("fastLink", "fastLink.EM") + + return(output) } diff --git a/R/fastLink.R b/R/fastLink.R index 811d3b9..d9b17d0 100644 --- a/R/fastLink.R +++ b/R/fastLink.R @@ -10,7 +10,7 @@ #' priors.obj, w.lambda, w.pi, #' address.field, gender.field, estimate.only, em.obj, #' dedupe.matches, linprog.dedupe, -#' reweight.names, firstname.field, +#' reweight.names, firstname.field, cond.indep, #' n.cores, tol.em, threshold.match, return.all, return.df, verbose) #' #' @param dfA Dataset A - to be matched to Dataset B @@ -51,6 +51,8 @@ #' @param linprog.dedupe If deduping matches, whether to use Winkler's linear programming solution to dedupe. Default is FALSE. #' @param reweight.names Whether to reweight the posterior match probabilities by the frequency of individual first names. Default is FALSE. #' @param firstname.field The name of the field indicating first name. Must be provided if reweight.names = TRUE. 
+#' @param cond.indep Estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. +#' If set to FALSE, parameter estimates are obtained from a model that allows for dependencies across linkage fields. #' @param n.cores Number of cores to parallelize over. Default is NULL. #' @param tol.em Convergence tolerance for the EM Algorithm. Default is 1e-04. #' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the @@ -59,8 +61,6 @@ #' @param return.all Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting #' \code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is FALSE. #' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE. -#' @param cond.indep estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. -#' If set to FALSE parameters estimates are obtained from a model that allows for dependencies across linkage fields. #' @param verbose Whether to print elapsed time for each step. Default is FALSE. 
#' #' @return \code{fastLink} returns a list of class 'fastLink' containing the following components if calculating matches: @@ -97,9 +97,9 @@ fastLink <- function(dfA, dfB, varnames, w.lambda = NULL, w.pi = NULL, address.field = NULL, gender.field = NULL, estimate.only = FALSE, em.obj = NULL, dedupe.matches = TRUE, linprog.dedupe = FALSE, - reweight.names = FALSE, firstname.field = NULL, + reweight.names = FALSE, firstname.field = NULL, cond.indep = TRUE, n.cores = NULL, tol.em = 1e-04, threshold.match = 0.85, - return.all = FALSE, return.df = FALSE, cond.indep = TRUE, verbose = FALSE){ + return.all = FALSE, return.df = FALSE, verbose = FALSE){ cat("\n") cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n")) @@ -193,6 +193,14 @@ fastLink <- function(dfA, dfB, varnames, }else{ cat("If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.\n") } + if(!is.null(priors.obj) & cond.indep == FALSE){ + cat("The current implementation of fastLink can only incorporate prior information under the conditionally independent model. 
Ignoring prior information in estimation.") + priors.obj <- NULL + w.lambda <- NULL + w.pi <- NULL + address.field <- NULL + gender.field <- NULL + } ## Create boolean indicators sm.bool <- which(varnames %in% stringdist.match) @@ -294,12 +302,12 @@ fastLink <- function(dfA, dfB, varnames, ## ------------------------------ ## Get counts for zeta parameters ## ------------------------------ - cat("Getting counts for zeta parameters.\n") + cat("Getting counts for parameter estimation.\n") start <- Sys.time() counts <- tableCounts(gammalist, nobs.a = nr_a, nobs.b = nr_b, n.cores = n.cores) end <- Sys.time() if(verbose){ - cat("Getting counts for zeta parameters took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n") + cat("Getting counts for parameter estimation took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n") } ## ------------------------------ @@ -325,15 +333,17 @@ fastLink <- function(dfA, dfB, varnames, pi.prior <- NULL } } - if(cond.indep == FALSE) { - resultsEM <- emlinklog(patterns = counts, nobs.a = nr_a, nobs.b = nr_b, tol = tol.em) - } else { resultsEM <- emlinkMARmov(patterns = counts, nobs.a = nr_a, nobs.b = nr_b, - tol = tol.em, - prior.lambda = lambda.prior, w.lambda = w.lambda, - prior.pi = pi.prior, w.pi = w.pi, - address.field = address.field, - gender.field = gender.field, - varnames = varnames) + if(cond.indep == FALSE){ + resultsEM <- emlinklog(patterns = counts, nobs.a = nr_a, nobs.b = nr_b, + tol = tol.em, varnames = varnames) + }else{ + resultsEM <- emlinkMARmov(patterns = counts, nobs.a = nr_a, nobs.b = nr_b, + tol = tol.em, + prior.lambda = lambda.prior, w.lambda = w.lambda, + prior.pi = pi.prior, w.pi = w.pi, + address.field = address.field, + gender.field = gender.field, + varnames = varnames) } end <- Sys.time() if(verbose){ diff --git a/man/aggconfusion.Rd b/man/aggconfusion.Rd index 0d9a8eb..4e85cc7 100644 --- a/man/aggconfusion.Rd +++ b/man/aggconfusion.Rd @@ -1,26 +1,20 @@ % Generated by roxygen2: do 
not edit by hand -% Please edit documentation in R/confusion.R +% Please edit documentation in R/aggconfusion.R \name{aggconfusion} \alias{aggconfusion} -\title{Aggregates confusion tables obtained from fastLink} +\title{aggconfusion} \usage{ aggconfusion(object) } \arguments{ -\item{object}{A list that contains at least one confusion table obtained via 'confusion()' } +\item{object}{A list of confusion tables.} } \value{ 'aggconfusion()' returns two tables - one calculating the confusion table, and another calculating a series of additional summary statistics. } \description{ -Calculate confusion table after running fastLink(). -} -\examples{ -\dontrun{ - ct <- aggconfusion() -} - +Aggregate confusion tables from separate runs of fastLink() } \author{ Ted Enamorado and Ben Fifield diff --git a/man/emlinkMARmov.Rd b/man/emlinkMARmov.Rd index a9357ae..82f7eb7 100644 --- a/man/emlinkMARmov.Rd +++ b/man/emlinkMARmov.Rd @@ -47,12 +47,12 @@ clean visualization of EM results in summary functions.} \value{ \code{emlinkMARmov} returns a list with the following components: \item{zeta.j}{The posterior match probabilities for each unique pattern.} -\item{p.m}{The probability of finding a match.} -\item{p.u}{The probability of finding a non-match.} -\item{p.gamma.k.m}{The probabilities of observing the values in field k conditional of being in the set of matches.} -\item{p.gamma.k.u}{The probabilities of observing the values in field k conditional of being in the set of non-matches.} -\item{p.gamma.j.m}{The probability of observing a particular agreement pattern conditional on being in the set of matches.} -\item{p.gamma.j.u}{The probability of observing a particular agreement pattern conditional on being in the set of non-matches.} +\item{p.m}{The probability of a pair matching.} +\item{p.u}{The probability of a pair not matching.} +\item{p.gamma.k.m}{The matching probability for a specific matching field.} +\item{p.gamma.k.u}{The non-matching probability for a specific 
matching field.} +\item{p.gamma.j.m}{The probability of observing a particular agreement pattern given that a pair is in the matched set.} +\item{p.gamma.j.u}{The probability of observing a particular agreement pattern given that a pair is in the unmatched set.} \item{patterns.w}{Counts of the agreement patterns observed, along with the Felligi-Sunter Weights.} \item{iter.converge}{The number of iterations it took the EM algorithm to converge.} \item{nobs.a}{The number of observations in dataset A.} diff --git a/man/emlinkRS.Rd b/man/emlinkRS.Rd index 73e95b4..0e5808f 100644 --- a/man/emlinkRS.Rd +++ b/man/emlinkRS.Rd @@ -21,12 +21,12 @@ on a smaller random sample to apply to counts from a larger sample} \value{ \code{emlinkMARmov} returns a list with the following components: \item{zeta.j}{The posterior match probabilities for each unique pattern.} -\item{p.m}{The probability of finding a match.} -\item{p.u}{The probability of finding a non-match.} -\item{p.gamma.k.m}{The probabilities of observing the values in field k conditional of being in the set of matches.} -\item{p.gamma.k.u}{The probabilities of observing the values in field k conditional of being in the set of non-matches.} -\item{p.gamma.j.m}{The probability of observing a particular agreement pattern conditional on being in the set of matches.} -\item{p.gamma.j.u}{The probability of observing a particular agreement pattern conditional on being in the set of non-matches.} +\item{p.m}{The posterior probability of a pair matching.} +\item{p.u}{The posterior probability of a pair not matching.} +\item{p.gamma.k.m}{The posterior of the matching probability for a specific matching field.} +\item{p.gamma.k.u}{The posterior of the non-matching probability for a specific matching field.} +\item{p.gamma.j.m}{The probability of observing a particular agreement pattern given that a pair is in the matched set.} +\item{p.gamma.j.u}{The probability of observing a particular agreement pattern given that a pair is in the unmatched set.} 
\item{patterns.w}{Counts of the agreement patterns observed, along with the Felligi-Sunter Weights.} \item{iter.converge}{The number of iterations it took the EM algorithm to converge.} \item{nobs.a}{The number of observations in dataset A.} diff --git a/man/emlinklog.Rd b/man/emlinklog.Rd index 4b628f3..7e4f1c7 100644 --- a/man/emlinklog.Rd +++ b/man/emlinklog.Rd @@ -4,7 +4,8 @@ \alias{emlinklog} \title{emlinklog} \usage{ -emlinklog(patterns, nobs.a, nobs.b, p.m, iter.max, tol, p.gamma.j.m, p.gamma.j.u, varnames) +emlinklog(patterns, nobs.a, nobs.b, p.m, p.gamma.j.m, p.gamma.j.u, +iter.max, tol, varnames) } \arguments{ \item{patterns}{table that holds the counts for each unique agreement @@ -16,9 +17,9 @@ pattern. This object is produced by the function: tableCounts.} \item{p.m}{probability of finding a match. Default is 0.1} -\item{p.gamma.j.m}{The probability of observing a particular agreement pattern conditional on being in the set of matches.} +\item{p.gamma.j.m}{probability that conditional of being in the matched set we observed a specific agreement pattern.} -\item{p.gamma.j.u}{The probability of observing a particular agreement pattern conditional on being in the set of non-matches.} +\item{p.gamma.j.u}{probability that conditional of being in the non-matched set we observed a specific agreement pattern.} \item{iter.max}{Max number of iterations. Default is 5000} @@ -28,7 +29,7 @@ pattern. 
This object is produced by the function: tableCounts.} clean visualization of EM results in summary functions.} } \value{ -\code{emlinkMARmov} returns a list with the following components: +\code{emlinklog} returns a list with the following components: \item{zeta.j}{The posterior match probabilities for each unique pattern.} \item{p.m}{The probability of finding a match.} \item{p.u}{The probability of finding a non-match.} @@ -40,8 +41,8 @@ clean visualization of EM results in summary functions.} \item{nobs.b}{The number of observations in dataset B.} } \description{ -Expectation-Maximization algorithm for Record Linkage -allowing for dependecies across linakge fields. +Expectation-Maximization algorithm for Record Linkage +allowing for dependencies across linkage fields } \examples{ \dontrun{ @@ -60,5 +61,5 @@ em.log <- emlinklog(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB)) } \author{ -Ted Enamorado and Kosuke Imai +Ted Enamorado and Benjamin Fifield } diff --git a/man/fastLink.Rd b/man/fastLink.Rd index e48b0ef..059b7c6 100644 --- a/man/fastLink.Rd +++ b/man/fastLink.Rd @@ -11,8 +11,8 @@ cut.a.num, cut.p.num, priors.obj, w.lambda, w.pi, address.field, gender.field, estimate.only, em.obj, dedupe.matches, linprog.dedupe, -reweight.names, firstname.field, -n.cores, tol.em, threshold.match, return.all, return.df, cond.indep, verbose) +reweight.names, firstname.field, cond.indep, +n.cores, tol.em, threshold.match, return.all, return.df, verbose) } \arguments{ \item{dfA}{Dataset A - to be matched to Dataset B} @@ -76,6 +76,9 @@ estimated on a smaller sample, and the user wants to apply them to the full data \item{firstname.field}{The name of the field indicating first name. Must be provided if reweight.names = TRUE.} +\item{cond.indep}{Estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. 
+If set to FALSE, parameter estimates are obtained from a model that allows for dependencies across linkage fields.} + \item{n.cores}{Number of cores to parallelize over. Default is NULL.} \item{tol.em}{Convergence tolerance for the EM Algorithm. Default is 1e-04.} @@ -89,9 +92,6 @@ while threshold.match = c(.85, .95) will return all pairs with posterior probabi \item{return.df}{Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.} -\item{cond.indep}{estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. -If set to FALSE parameters estimates are obtained from a model that allows for dependencies across linkage fields.} - \item{verbose}{Whether to print elapsed time for each step. Default is FALSE.} } \value{