From dc9ebb510890c39cb615324b3039dc18db99a17f Mon Sep 17 00:00:00 2001
From: Ben Fifield <benfifield@gmail.com>
Date: Tue, 15 May 2018 15:39:55 -0400
Subject: [PATCH] prep for submission to cran - v0.4.0

---
 DESCRIPTION         |   4 +-
 NAMESPACE           |   8 +-
 R/aggconfusion.R    |  11 +-
 R/emlinklog.R       | 326 +++++++++++++++++++++++---------------------
 R/fastLink.R        |  42 +++---
 man/aggconfusion.Rd |  14 +-
 man/emlinkMARmov.Rd |  12 +-
 man/emlinkRS.Rd     |  12 +-
 man/emlinklog.Rd    |  15 +-
 man/fastLink.Rd     |  10 +-
 10 files changed, 233 insertions(+), 221 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7016526..5f1bd90 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: fastLink
 Type: Package
 Title: Fast Probabilistic Record Linkage with Missing Data
-Version: 0.3.2
-Date: 2018-02-25
+Version: 0.4.0
+Date: 2018-05-15
 Authors@R: c(
   person("Ted", "Enamorado", email = "fastlinkr@gmail.com", role = c("aut", "cre")),
   person("Ben", "Fifield", email = "fastlinkr@gmail.com", role = c("aut")),
diff --git a/NAMESPACE b/NAMESPACE
index 01c9d8a..6cc37ed 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,15 +3,15 @@
 S3method(plot,fastLink)
 S3method(print,inspectEM)
 S3method(summary,fastLink)
+export(aggconfusion)
 export(aggregateEM)
 export(calcMoversPriors)
 export(clusterMatch)
 export(confusion)
-export(aggconfusion)
 export(dedupeMatches)
 export(emlinkMARmov)
-export(emlinklog)
 export(emlinkRS)
+export(emlinklog)
 export(fastLink)
 export(gammaCK2par)
 export(gammaCKpar)
@@ -49,15 +49,15 @@ importFrom(parallel,makeCluster)
 importFrom(parallel,mclapply)
 importFrom(parallel,stopCluster)
 importFrom(plotrix,staxlab)
+importFrom(stats,glm)
 importFrom(stats,kmeans)
+importFrom(stats,model.matrix)
 importFrom(stats,na.omit)
 importFrom(stats,prcomp)
 importFrom(stats,predict)
 importFrom(stats,quantile)
 importFrom(stats,runif)
 importFrom(stats,var)
-importFrom(stats,glm)
-importFrom(stats,model.matrix)
 importFrom(stringdist,phonetic)
 importFrom(stringdist,stringdist)
 importFrom(stringdist,stringdistmatrix)
diff --git a/R/aggconfusion.R b/R/aggconfusion.R
index edaa93c..01986a4 100644
--- a/R/aggconfusion.R
+++ b/R/aggconfusion.R
@@ -1,13 +1,12 @@
-#' Get confusion table for fastLink objects after blocking via clusterMatch()
+#' aggconfusion
 #'
-#' Calculate confusion table after running fastLink().
+#' Aggregate confusion tables from separate runs of fastLink() (UNDER DEVELOPMENT)
 #'
-#' @usage confusion(object, threshold)
+#' @usage aggconfusion(object)
 #'
-#' @param object A 'fastLink' object. Can only be run if 'return.all = TRUE' in 'fastLink().'
-#' @param threshold The matching threshold above which a pair is a true match. Default is .85
+#' @param object A list of confusion tables. 
 #'
-#' @return 'confusion()' returns two tables - one calculating the confusion table, and another
+#' @return 'aggconfusion()' returns two tables - one calculating the confusion table, and another
 #' calculating a series of additional summary statistics.
 #'
 #' @author Ted Enamorado <ted.enamorado@gmail.com> and Ben Fifield <benfifield@gmail.com>
diff --git a/R/emlinklog.R b/R/emlinklog.R
index 64a8b38..acb8527 100644
--- a/R/emlinklog.R
+++ b/R/emlinklog.R
@@ -3,8 +3,8 @@
 #' Expectation-Maximization algorithm for Record Linkage 
 #' allowing for dependencies across linkage fields
 #'
-#' @usage emlinklog(patterns, nobs.a, nobs.b, p.m, iter.max,
-#' tol, varnames)
+#' @usage emlinklog(patterns, nobs.a, nobs.b, p.m, p.gamma.j.m, p.gamma.j.u,
+#' iter.max, tol, varnames)
 #'
 #' @param patterns table that holds the counts for each unique agreement
 #' pattern. This object is produced by the function: tableCounts.
@@ -48,53 +48,54 @@
 #'
 #' @export
 #' @importFrom gtools rdirichlet
+#' @importFrom stats glm model.matrix
 emlinklog <- function(patterns, nobs.a, nobs.b,
-                        p.m = 0.1, iter.max = 5000, tol = 1e-5, p.gamma.j.m = NULL, p.gamma.j.u = NULL, varnames = NULL) {
+                      p.m = 0.1, p.gamma.j.m = NULL, p.gamma.j.u = NULL, iter.max = 5000, tol = 1e-5, varnames = NULL) {
 
-## OPTIONS  
-## patterns <- tc; nobs.a <- nrow(dfA); nobs.a <- nrow(dfB); p.m <- 0.1; iter.max = 5000; 
-## tol = 1e-5; p.gamma.k.m = NULL; p.gamma.k.u = NULL
+    ## OPTIONS  
+    ## patterns <- tc; nobs.a <- nrow(dfA); nobs.b <- nrow(dfB); p.m <- 0.1; iter.max = 5000; 
+    ## tol = 1e-5; p.gamma.k.m = NULL; p.gamma.k.u = NULL
 
-  options(digits=16)
-  
-  ## EM Algorithm for a Fellegi-Sunter model that accounts for missing data (under MAR)
-  ##
-  ## Args:
-  ##   patterns:
-  ##   p.m:
-  ##   p.gamma.k.m:
-  ##   p.gamma.k.u:
-  ##   tol:
-  ##
-  ## Returns:
-  ##   The p.m, p.gamma.k.m, p.gamma.k.u, p.gamma.k.m, p.gamma.k.m, p.gamma.k.m, that
-  ##   maximize the observed data log-likelihood of the agreement patterns
-    
-  
-  ## Number of fields
-  nfeatures <- ncol(patterns) - 1
-  
-  ## Patterns:
-  gamma.j.k <- as.matrix(patterns[, 1:nfeatures])
+    options(digits=16)
+    
+    ## EM Algorithm for a Fellegi-Sunter model that accounts for missing data (under MAR)
+    ##
+    ## Args:
+    ##   patterns:
+    ##   p.m:
+    ##   p.gamma.k.m:
+    ##   p.gamma.k.u:
+    ##   tol:
+    ##
+    ## Returns:
+    ##   The p.m, p.gamma.k.m, and p.gamma.k.u that
+    ##   maximize the observed data log-likelihood of the agreement patterns
+    
+    
+    ## Number of fields
+    nfeatures <- ncol(patterns) - 1
+    
+    ## Patterns:
+    gamma.j.k <- as.matrix(patterns[, 1:nfeatures])
 
-  ## Patterns counts:
-  n.j <- as.matrix(patterns[, (nfeatures + 1)])  # Counts
-  
-  ## Number of unique patterns:
-  N <- nrow(gamma.j.k)
-  
-  p.gamma.k.m <- p.gamma.k.u <- NULL
-  
-  ## Overall Prob of finding a Match
-  p.u <- 1 - p.m
-  
-  ## Field specific probability of observing gamma.k conditional on M
+    ## Patterns counts:
+    n.j <- as.matrix(patterns[, (nfeatures + 1)])  # Counts
+    
+    ## Number of unique patterns:
+    N <- nrow(gamma.j.k)
+    
+    p.gamma.k.m <- p.gamma.k.u <- NULL
+    
+    ## Overall Prob of finding a Match
+    p.u <- 1 - p.m
+    
+    ## Field specific probability of observing gamma.k conditional on M
     if (is.null(p.gamma.k.m)) {
         p.gamma.k.m <- list()
         for (i in 1:nfeatures) {
-        	l.m <- length(unique(na.omit(gamma.j.k[, i])))
-        	c.m <- seq(from = 1, to = 50 * l.m, by = 50)
-          p.gamma.k.m[[i]] <- sort(rdirichlet(1, c.m), decreasing = FALSE)
+            l.m <- length(unique(na.omit(gamma.j.k[, i])))
+            c.m <- seq(from = 1, to = 50 * l.m, by = 50)
+            p.gamma.k.m[[i]] <- sort(rdirichlet(1, c.m), decreasing = FALSE)
         }
     }
 
@@ -102,138 +103,145 @@ emlinklog <- function(patterns, nobs.a, nobs.b,
     if (is.null(p.gamma.k.u)) {
         p.gamma.k.u <- list()
         for (i in 1:nfeatures) {
-        	l.u <- length(unique(na.omit(gamma.j.k[, i])))
-        	c.u <- seq(from = 1, to = 50 * l.u, by = 50)
-	        p.gamma.k.u[[i]] <- sort(rdirichlet(1, c.u), decreasing = TRUE)
+            l.u <- length(unique(na.omit(gamma.j.k[, i])))
+            c.u <- seq(from = 1, to = 50 * l.u, by = 50)
+            p.gamma.k.u[[i]] <- sort(rdirichlet(1, c.u), decreasing = TRUE)
         }
     }
-  
-  p.gamma.k.j.m <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N)
-  p.gamma.k.j.u <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N)
-  
-  p.gamma.j.m <- matrix(rep(NA, N), nrow = N, ncol = 1)
-  p.gamma.j.u <- matrix(rep(NA, N), nrow = N, ncol = 1)
-  
-  for (i in 1:nfeatures) {
-    temp.01 <- temp.02 <- gamma.j.k[, i]
-    temp.1 <- unique(na.omit(temp.01))
-    temp.2 <- p.gamma.k.m[[i]]
-    temp.3 <- p.gamma.k.u[[i]]
-    for (j in 1:length(temp.1)) {
-      temp.01[temp.01 == temp.1[j]] <- temp.2[j]
-      temp.02[temp.02 == temp.1[j]] <- temp.3[j]
-    }
-    p.gamma.k.j.m[i, ] <- temp.01
-    p.gamma.k.j.u[i, ] <- temp.02
-  }
-  
-  sumlog <- function(x) { sum(log(x), na.rm = T) }
-  
-  p.gamma.j.m <- as.matrix((apply(p.gamma.k.j.m, 2, sumlog)))
-  p.gamma.j.m <- exp(p.gamma.j.m)
-  
-  p.gamma.j.u <- as.matrix((apply(p.gamma.k.j.u, 2, sumlog)))
-  p.gamma.j.u <- exp(p.gamma.j.u)
-
-  delta <- 1
-  count <- 1
-  warn.once <- 1
-  
-  ## The EM Algorithm presented in the paper starts here:
-  while (abs(delta) >= tol) {
-    
-    if((count %% 100) == 0) {
-      cat("Iteration number", count, "\n")
-      cat("Maximum difference in log-likelihood =", round(delta, 4), "\n")
-    }
     
-    ## Old Paramters
-    p.old <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u))
-
-    ## ------
-    ## E-Step:
-    ## ------
+    p.gamma.k.j.m <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N)
+    p.gamma.k.j.u <- matrix(rep(NA, N * nfeatures), nrow = nfeatures, ncol = N)
     
-    log.prod <- log(p.gamma.j.m) + log(p.m)
-    max.log.prod <- max(log.prod)
+    p.gamma.j.m <- matrix(rep(NA, N), nrow = N, ncol = 1)
+    p.gamma.j.u <- matrix(rep(NA, N), nrow = N, ncol = 1)
     
-    logxpy <- function(lx,ly) {
-      temp <- cbind(lx, ly)
-      apply(temp, 1, max) + log1p(exp(-abs(lx-ly)))
+    for (i in 1:nfeatures) {
+        temp.01 <- temp.02 <- gamma.j.k[, i]
+        temp.1 <- unique(na.omit(temp.01))
+        temp.2 <- p.gamma.k.m[[i]]
+        temp.3 <- p.gamma.k.u[[i]]
+        for (j in 1:length(temp.1)) {
+            temp.01[temp.01 == temp.1[j]] <- temp.2[j]
+            temp.02[temp.02 == temp.1[j]] <- temp.3[j]
+        }
+        p.gamma.k.j.m[i, ] <- temp.01
+        p.gamma.k.j.u[i, ] <- temp.02
     }
     
-    log.sum <- logxpy(log(p.gamma.j.m) + log(p.m), log(p.gamma.j.u) + log(p.u))
-    zeta.j <- exp(log.prod - max.log.prod)/(exp(log.sum - max.log.prod))
+    sumlog <- function(x) { sum(log(x), na.rm = T) }
     
-    ## --------
-    ## M-step :
-    ## --------
-    num.prod <- n.j * zeta.j
-    p.m <- sum(num.prod)/sum(n.j)
-    p.u <- 1 - p.m
+    p.gamma.j.m <- as.matrix((apply(p.gamma.k.j.m, 2, sumlog)))
+    p.gamma.j.m <- exp(p.gamma.j.m)
     
+    p.gamma.j.u <- as.matrix((apply(p.gamma.k.j.u, 2, sumlog)))
+    p.gamma.j.u <- exp(p.gamma.j.u)
 
-    pat <- data.frame(gamma.j.k)
-    pat[is.na(pat)] <- -1
-    pat <- replace(pat, TRUE, lapply(pat, factor))
-    factors <- model.matrix(~ ., pat)
-    
-    ## get theta.m and theta.u
-    c <- 1e-06
-    matches <- glm(count ~ ., data = data.frame(count = ((zeta.j * n.j) + c), factors),
-                        family = "quasipoisson")
+    delta <- 1
+    count <- 1
+    warn.once <- 1
+    
+    ## The EM Algorithm presented in the paper starts here:
+    while (abs(delta) >= tol) {
+        
+        if((count %% 100) == 0) {
+            cat("Iteration number", count, "\n")
+            cat("Maximum difference in log-likelihood =", round(delta, 4), "\n")
+        }
+        
+        ## Old Parameters
+        p.old <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u))
 
-    non.matches <- glm(count ~ .*., data = data.frame(count = ((1 - zeta.j) * n.j + c), factors),
-                   family = "quasipoisson")
-    
-    ## Predict & renormalization fn as in Murray 2017
-    g <- function(fit) {
-      logwt = predict( fit )
-      logwt = logwt - max(logwt)
-      wt = exp(logwt)
-      wt/sum(wt)
-    }
+        ## ------
+        ## E-Step:
+        ## ------
+        
+        log.prod <- log(p.gamma.j.m) + log(p.m)
+        max.log.prod <- max(log.prod)
+        
+        logxpy <- function(lx,ly) {
+            temp <- cbind(lx, ly)
+            apply(temp, 1, max) + log1p(exp(-abs(lx-ly)))
+        }
+        
+        log.sum <- logxpy(log(p.gamma.j.m) + log(p.m), log(p.gamma.j.u) + log(p.u))
+        zeta.j <- exp(log.prod - max.log.prod)/(exp(log.sum - max.log.prod))
+        
+        ## --------
+        ## M-step :
+        ## --------
+        num.prod <- n.j * zeta.j
+        p.m <- sum(num.prod)/sum(n.j)
+        p.u <- 1 - p.m
+        
 
-    p.gamma.j.m = as.matrix(g(matches))
-    p.gamma.j.u = as.matrix(g(non.matches))
+        pat <- data.frame(gamma.j.k)
+        pat[is.na(pat)] <- -1
+        pat <- replace(pat, TRUE, lapply(pat, factor))
+        factors <- model.matrix(~ ., pat)
+        
+        ## get theta.m and theta.u
+        c <- 1e-06
+        matches <- glm(count ~ ., data = data.frame(count = ((zeta.j * n.j) + c), factors),
+                       family = "quasipoisson")
 
-    ## Updated parameters:
-    p.new <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u))
-    
-    if(p.m < 1e-13 & warn.once == 1) {
-      warning("The overall probability of finding a match is too small. Increasing the amount of overlap between the datasets might help, see e.g., clusterMatch()")
-      warn.once <- 0
+        non.matches <- glm(count ~ .*., data = data.frame(count = ((1 - zeta.j) * n.j + c), factors),
+                           family = "quasipoisson")
+        
+        ## Predict & renormalization fn as in Murray 2017
+        g <- function(fit) {
+            logwt = predict( fit )
+            logwt = logwt - max(logwt)
+            wt = exp(logwt)
+            wt/sum(wt)
+        }
+
+        p.gamma.j.m = as.matrix(g(matches))
+        p.gamma.j.u = as.matrix(g(non.matches))
+
+        ## Updated parameters:
+        p.new <- c(p.m, p.u, unlist(p.gamma.j.m), unlist(p.gamma.j.u))
+        
+        if(p.m < 1e-13 & warn.once == 1) {
+            warning("The overall probability of finding a match is too small. Increasing the amount of overlap between the datasets might help, see e.g., clusterMatch()")
+            warn.once <- 0
+        }
+        
+        ## Max difference between the updated and old parameters:
+        delta <- max(abs(p.new - p.old))
+        count <- count + 1
+        
+        if(count > iter.max) {
+            warning("The EM algorithm has run for the specified number of iterations but has not converged yet.")
+            break
+        }
     }
     
-    ## Max difference between the updated and old parameters:
-    delta <- max(abs(p.new - p.old))
-    count <- count + 1
+    weights <- log(p.gamma.j.m) - log(p.gamma.j.u)
+    
+    data.w <- cbind(patterns, weights, p.gamma.j.m, p.gamma.j.u)
+    nc <- ncol(data.w)
+    colnames(data.w)[nc-2] <- "weights"
+    colnames(data.w)[nc-1] <- "p.gamma.j.m"
+    colnames(data.w)[nc] <- "p.gamma.j.u"
     
-    if(count > iter.max) {
-      warning("The EM algorithm has run for the specified number of iterations but has not converged yet.")
-      break
+    inf <- which(data.w == Inf, arr.ind = T)
+    ninf <- which(data.w == -Inf, arr.ind = T)
+    
+    data.w[inf[, 1], unique(inf[, 2])] <- 150
+    data.w[ninf[, 1], unique(ninf[, 2])] <- -150
+
+    if(!is.null(varnames)){
+        output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, 
+                       "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
+                       "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = varnames)
+    }else{
+        output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, 
+                       "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
+                       "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures))
     }
-  }
-  
-  weights <- log(p.gamma.j.m) - log(p.gamma.j.u)
-  
-  data.w <- cbind(patterns, weights, p.gamma.j.m, p.gamma.j.u)
-  nc <- ncol(data.w)
-  colnames(data.w)[nc-2] <- "weights"
-  colnames(data.w)[nc-1] <- "p.gamma.j.m"
-  colnames(data.w)[nc] <- "p.gamma.j.u"
-  
-  inf <- which(data.w == Inf, arr.ind = T)
-  ninf <- which(data.w == -Inf, arr.ind = T)
-  
-  data.w[inf[, 1], unique(inf[, 2])] <- 150
-  data.w[ninf[, 1], unique(ninf[, 2])] <- -150
-  
-  output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, 
-                 "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
-                 "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures))
-  
-  class(output) <- c("fastLink", "fastLink.EM")
-  
-  return(output)
+
+    
+    class(output) <- c("fastLink", "fastLink.EM")
+    
+    return(output)
 }
diff --git a/R/fastLink.R b/R/fastLink.R
index 811d3b9..d9b17d0 100644
--- a/R/fastLink.R
+++ b/R/fastLink.R
@@ -10,7 +10,7 @@
 #' priors.obj, w.lambda, w.pi,
 #' address.field, gender.field, estimate.only, em.obj,
 #' dedupe.matches, linprog.dedupe,
-#' reweight.names, firstname.field,
+#' reweight.names, firstname.field, cond.indep,
 #' n.cores, tol.em, threshold.match, return.all, return.df, verbose)
 #'
 #' @param dfA Dataset A - to be matched to Dataset B
@@ -51,6 +51,8 @@
 #' @param linprog.dedupe If deduping matches, whether to use Winkler's linear programming solution to dedupe. Default is FALSE.
 #' @param reweight.names Whether to reweight the posterior match probabilities by the frequency of individual first names. Default is FALSE.
 #' @param firstname.field The name of the field indicating first name. Must be provided if reweight.names = TRUE.
+#' @param cond.indep Estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. 
+#' If set to FALSE, parameter estimates are obtained from a model that allows for dependencies across linkage fields.
 #' @param n.cores Number of cores to parallelize over. Default is NULL.
 #' @param tol.em Convergence tolerance for the EM Algorithm. Default is 1e-04.
 #' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the
@@ -59,8 +61,6 @@
 #' @param return.all Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting
 #' \code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is FALSE.
 #' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.
-#' @param cond.indep estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. 
-#' If set to FALSE parameters estimates are obtained from a model that allows for dependencies across linkage fields.
 #' @param verbose Whether to print elapsed time for each step. Default is FALSE.
 #'
 #' @return \code{fastLink} returns a list of class 'fastLink' containing the following components if calculating matches:
@@ -97,9 +97,9 @@ fastLink <- function(dfA, dfB, varnames,
                      w.lambda = NULL, w.pi = NULL, address.field = NULL,
                      gender.field = NULL, estimate.only = FALSE, em.obj = NULL,
                      dedupe.matches = TRUE, linprog.dedupe = FALSE,
-                     reweight.names = FALSE, firstname.field = NULL,
+                     reweight.names = FALSE, firstname.field = NULL, cond.indep = TRUE,
                      n.cores = NULL, tol.em = 1e-04, threshold.match = 0.85,
-                     return.all = FALSE, return.df = FALSE, cond.indep = TRUE, verbose = FALSE){
+                     return.all = FALSE, return.df = FALSE, verbose = FALSE){
 
     cat("\n")
     cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n"))
@@ -193,6 +193,14 @@ fastLink <- function(dfA, dfB, varnames,
     }else{
         cat("If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.\n")
     }
+    if(!is.null(priors.obj) & cond.indep == FALSE){
+        cat("The current implementation of fastLink can only incorporate prior information under the conditionally independent model. Ignoring prior information in estimation.")
+        priors.obj <- NULL
+        w.lambda <- NULL
+        w.pi <- NULL
+        address.field <- NULL
+        gender.field <- NULL
+    }
 
     ## Create boolean indicators
     sm.bool <- which(varnames %in% stringdist.match)
@@ -294,12 +302,12 @@ fastLink <- function(dfA, dfB, varnames,
     ## ------------------------------
     ## Get counts for zeta parameters
     ## ------------------------------
-    cat("Getting counts for zeta parameters.\n")
+    cat("Getting counts for parameter estimation.\n")
     start <- Sys.time()
     counts <- tableCounts(gammalist, nobs.a = nr_a, nobs.b = nr_b, n.cores = n.cores)
     end <- Sys.time()
     if(verbose){
-        cat("Getting counts for zeta parameters took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
+        cat("Getting counts for parameter estimation took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
     }
 
     ## ------------------------------
@@ -325,15 +333,17 @@ fastLink <- function(dfA, dfB, varnames,
                 pi.prior <- NULL
             }
         }
-        if(cond.indep == FALSE) {
-                     resultsEM <- emlinklog(patterns = counts, nobs.a = nr_a, nobs.b = nr_b, tol = tol.em)  
-        } else {     resultsEM <- emlinkMARmov(patterns = counts, nobs.a = nr_a, nobs.b = nr_b,
-                                  tol = tol.em,
-                                  prior.lambda = lambda.prior, w.lambda = w.lambda,
-                                  prior.pi = pi.prior, w.pi = w.pi,
-                                  address.field = address.field, 
-                                  gender.field = gender.field,
-                                  varnames = varnames)
+        if(cond.indep == FALSE){
+            resultsEM <- emlinklog(patterns = counts, nobs.a = nr_a, nobs.b = nr_b,
+                                   tol = tol.em, varnames = varnames)  
+        }else{
+            resultsEM <- emlinkMARmov(patterns = counts, nobs.a = nr_a, nobs.b = nr_b,
+                                      tol = tol.em,
+                                      prior.lambda = lambda.prior, w.lambda = w.lambda,
+                                      prior.pi = pi.prior, w.pi = w.pi,
+                                      address.field = address.field, 
+                                      gender.field = gender.field,
+                                      varnames = varnames)
         }
         end <- Sys.time()
         if(verbose){
diff --git a/man/aggconfusion.Rd b/man/aggconfusion.Rd
index 0d9a8eb..4e85cc7 100644
--- a/man/aggconfusion.Rd
+++ b/man/aggconfusion.Rd
@@ -1,26 +1,20 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/confusion.R
+% Please edit documentation in R/aggconfusion.R
 \name{aggconfusion}
 \alias{aggconfusion}
-\title{Aggregates confusion tables obtained from fastLink}
+\title{aggconfusion}
 \usage{
 aggconfusion(object)
 }
 \arguments{
-\item{object}{A list that contains at least one confusion table obtained via 'confusion()' }
+\item{object}{A list of confusion tables.}
 }
 \value{
 'aggconfusion()' returns two tables - one calculating the confusion table, and another
 calculating a series of additional summary statistics.
 }
 \description{
-Calculate confusion table after running fastLink().
-}
-\examples{
-\dontrun{
- ct <- aggconfusion()
-}
-
+Aggregate confusion tables from separate runs of fastLink() (UNDER DEVELOPMENT)
 }
 \author{
 Ted Enamorado <ted.enamorado@gmail.com> and Ben Fifield <benfifield@gmail.com>
diff --git a/man/emlinkMARmov.Rd b/man/emlinkMARmov.Rd
index a9357ae..82f7eb7 100644
--- a/man/emlinkMARmov.Rd
+++ b/man/emlinkMARmov.Rd
@@ -47,12 +47,12 @@ clean visualization of EM results in summary functions.}
 \value{
 \code{emlinkMARmov} returns a list with the following components:
 \item{zeta.j}{The posterior match probabilities for each unique pattern.}
-\item{p.m}{The probability of finding a match.}
-\item{p.u}{The probability of finding a non-match.}
-\item{p.gamma.k.m}{The probabilities of observing the values in field k conditional of being in the set of matches.}
-\item{p.gamma.k.u}{The probabilities of observing the values in field k conditional of being in the set of non-matches.}
-\item{p.gamma.j.m}{The probability of observing a particular agreement pattern conditional on being in the set of matches.}
-\item{p.gamma.j.u}{The probability of observing a particular agreement pattern conditional on being in the set of non-matches.}
+\item{p.m}{The probability of a pair matching.}
+\item{p.u}{The probability of a pair not matching.}
+\item{p.gamma.k.m}{The matching probability for a specific matching field.}
+\item{p.gamma.k.u}{The non-matching probability for a specific matching field.}
+\item{p.gamma.j.m}{The probability that a pair is in the matched set given a particular agreement pattern.}
+\item{p.gamma.j.u}{The probability that a pair is in the unmatched set given a particular agreement pattern.}
 \item{patterns.w}{Counts of the agreement patterns observed, along with the Felligi-Sunter Weights.}
 \item{iter.converge}{The number of iterations it took the EM algorithm to converge.}
 \item{nobs.a}{The number of observations in dataset A.}
diff --git a/man/emlinkRS.Rd b/man/emlinkRS.Rd
index 73e95b4..0e5808f 100644
--- a/man/emlinkRS.Rd
+++ b/man/emlinkRS.Rd
@@ -21,12 +21,12 @@ on a smaller random sample to apply to counts from a larger sample}
 \value{
 \code{emlinkMARmov} returns a list with the following components:
 \item{zeta.j}{The posterior match probabilities for each unique pattern.}
-\item{p.m}{The probability of finding a match.}
-\item{p.u}{The probability of finding a non-match.}
-\item{p.gamma.k.m}{The probabilities of observing the values in field k conditional of being in the set of matches.}
-\item{p.gamma.k.u}{The probabilities of observing the values in field k conditional of being in the set of non-matches.}
-\item{p.gamma.j.m}{The probability of observing a particular agreement pattern conditional on being in the set of matches.}
-\item{p.gamma.j.u}{The probability of observing a particular agreement pattern conditional on being in the set of non-matches.}
+\item{p.m}{The posterior probability of a pair matching.}
+\item{p.u}{The posterior probability of a pair not matching.}
+\item{p.gamma.k.m}{The posterior of the matching probability for a specific matching field.}
+\item{p.gamma.k.u}{The posterior of the non-matching probability for a specific matching field.}
+\item{p.gamma.j.m}{The posterior probability that a pair is in the matched set given a particular agreement pattern.}
+\item{p.gamma.j.u}{The posterior probability that a pair is in the unmatched set given a particular agreement pattern.}
 \item{patterns.w}{Counts of the agreement patterns observed, along with the Felligi-Sunter Weights.}
 \item{iter.converge}{The number of iterations it took the EM algorithm to converge.}
 \item{nobs.a}{The number of observations in dataset A.}
diff --git a/man/emlinklog.Rd b/man/emlinklog.Rd
index 4b628f3..7e4f1c7 100644
--- a/man/emlinklog.Rd
+++ b/man/emlinklog.Rd
@@ -4,7 +4,8 @@
 \alias{emlinklog}
 \title{emlinklog}
 \usage{
-emlinklog(patterns, nobs.a, nobs.b, p.m, iter.max, tol, p.gamma.j.m, p.gamma.j.u, varnames)
+emlinklog(patterns, nobs.a, nobs.b, p.m, p.gamma.j.m, p.gamma.j.u,
+iter.max, tol, varnames)
 }
 \arguments{
 \item{patterns}{table that holds the counts for each unique agreement
@@ -16,9 +17,9 @@ pattern. This object is produced by the function: tableCounts.}
 
 \item{p.m}{probability of finding a match. Default is 0.1}
 
-\item{p.gamma.j.m}{The probability of observing a particular agreement pattern conditional on being in the set of matches.}
+\item{p.gamma.j.m}{probability that conditional of being in the matched set we observed a specific agreement pattern.}
 
-\item{p.gamma.j.u}{The probability of observing a particular agreement pattern conditional on being in the set of non-matches.}
+\item{p.gamma.j.u}{probability that conditional of being in the non-matched set we observed a specific agreement pattern.}
 
 \item{iter.max}{Max number of iterations. Default is 5000}
 
@@ -28,7 +29,7 @@ pattern. This object is produced by the function: tableCounts.}
 clean visualization of EM results in summary functions.}
 }
 \value{
-\code{emlinkMARmov} returns a list with the following components:
+\code{emlinklog} returns a list with the following components:
 \item{zeta.j}{The posterior match probabilities for each unique pattern.}
 \item{p.m}{The probability of finding a match.}
 \item{p.u}{The probability of finding a non-match.}
@@ -40,8 +41,8 @@ clean visualization of EM results in summary functions.}
 \item{nobs.b}{The number of observations in dataset B.}
 }
 \description{
-Expectation-Maximization algorithm for Record Linkage
-allowing for dependecies across linakge fields.
+Expectation-Maximization algorithm for Record Linkage 
+allowing for dependencies across linkage fields
 }
 \examples{
 \dontrun{
@@ -60,5 +61,5 @@ em.log <- emlinklog(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
 
 }
 \author{
-Ted Enamorado <ted.enamorado@gmail.com> and Kosuke Imai
+Ted Enamorado <ted.enamorado@gmail.com> and Benjamin Fifield
 }
diff --git a/man/fastLink.Rd b/man/fastLink.Rd
index e48b0ef..059b7c6 100644
--- a/man/fastLink.Rd
+++ b/man/fastLink.Rd
@@ -11,8 +11,8 @@ cut.a.num, cut.p.num,
 priors.obj, w.lambda, w.pi,
 address.field, gender.field, estimate.only, em.obj,
 dedupe.matches, linprog.dedupe,
-reweight.names, firstname.field,
-n.cores, tol.em, threshold.match, return.all, return.df, cond.indep, verbose)
+reweight.names, firstname.field, cond.indep,
+n.cores, tol.em, threshold.match, return.all, return.df, verbose)
 }
 \arguments{
 \item{dfA}{Dataset A - to be matched to Dataset B}
@@ -76,6 +76,9 @@ estimated on a smaller sample, and the user wants to apply them to the full data
 
 \item{firstname.field}{The name of the field indicating first name. Must be provided if reweight.names = TRUE.}
 
+\item{cond.indep}{Estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. 
+If set to FALSE, parameter estimates are obtained from a model that allows for dependencies across linkage fields.}
+
 \item{n.cores}{Number of cores to parallelize over. Default is NULL.}
 
 \item{tol.em}{Convergence tolerance for the EM Algorithm. Default is 1e-04.}
@@ -89,9 +92,6 @@ while threshold.match = c(.85, .95) will return all pairs with posterior probabi
 
 \item{return.df}{Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.}
 
-\item{cond.indep}{estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. 
-If set to FALSE parameters estimates are obtained from a model that allows for dependencies across linkage fields.}
-
 \item{verbose}{Whether to print elapsed time for each step. Default is FALSE.}
 }
 \value{