Merge pull request #16 from kosukeimai/development

Merge development into master for new CRAN submission
kosukeimai · Sep 1, 2017 · d7806e9 · d7806e9
2 parents d885027 + 6b07bcb
commit d7806e9
Show file tree

Hide file tree

Showing 23 changed files with 642 additions and 106 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: fastLink
 Type: Package
 Title: Fast Probabilistic Record Linkage with Missing Data
-Version: 0.1.1
-Date: 2017-07-10
+Version: 0.2.0
+Date: 2017-08-31
 Authors@R: c(
   person("Ted", "Enamorado", email = "[email protected]", role = c("aut", "cre")),
   person("Ben", "Fifield", email = "[email protected]", role = c("aut")),
@@ -24,10 +24,14 @@ Imports:
     data.table,
     stringdist,
     stringr,
-    Rcpp (>= 0.12.9),
+    stringi,
+    Rcpp (>= 0.12.7),
     FactoClass,
     adagio,
-    dplyr
+    dplyr,
+    plotrix,
+    grDevices,
+    graphics
 Depends:
     R (>= 2.14.0)
 LinkingTo: RcppArmadillo, Rcpp, RcppEigen

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,9 +1,10 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(plot,fastLink)
+S3method(print,inspectEM)
 S3method(summary,fastLink)
 export(aggregateEM)
 export(calcMoversPriors)
-export(cleanAddressUSPS)
 export(clusterMatch)
 export(dedupeMatches)
 export(emlinkMARmov)
@@ -13,8 +14,11 @@ export(gammaCK2par)
 export(gammaCKpar)
 export(gammaKpar)
 export(getMatches)
+export(getPosterior)
+export(inspectEM)
 export(matchesLink)
 export(nameReweight)
+export(preprocText)
 export(tableCounts)
 import(Matrix)
 import(data.table)
@@ -28,20 +32,29 @@ importFrom(dplyr,n)
 importFrom(dplyr,summarise)
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
+importFrom(grDevices,colorRampPalette)
+importFrom(graphics,axis)
+importFrom(graphics,legend)
+importFrom(graphics,plot)
+importFrom(graphics,polygon)
 importFrom(gtools,rdirichlet)
 importFrom(parallel,detectCores)
 importFrom(parallel,makeCluster)
 importFrom(parallel,mclapply)
 importFrom(parallel,stopCluster)
+importFrom(plotrix,staxlab)
 importFrom(stats,kmeans)
 importFrom(stats,na.omit)
 importFrom(stats,prcomp)
 importFrom(stats,predict)
 importFrom(stats,quantile)
 importFrom(stats,runif)
 importFrom(stats,var)
+importFrom(stringdist,phonetic)
 importFrom(stringdist,stringdist)
 importFrom(stringdist,stringdistmatrix)
+importFrom(stringi,stri_trans_general)
+importFrom(stringi,stri_trans_list)
 importFrom(stringr,str_count)
 importFrom(utils,data)
 useDynLib(fastLink, .registration = TRUE)
diff --git a/R/cleanAddressUSPS.R b/R/cleanAddressUSPS.R
diff --git a/R/emlinkMARmov.R b/R/emlinkMARmov.R
@@ -5,7 +5,7 @@
 #'
 #' @usage emlinkMARmov(patterns, nobs.a, nobs.b, p.m, iter.max,
 #' tol, p.gamma.k.m, p.gamma.k.u, prior.lambda, w.lambda,
-#' prior.pi, w.pi, address.field, gender.field)
+#' prior.pi, w.pi, address.field, gender.field, varnames)
 #'
 #' @param patterns table that holds the counts for each unique agreement
 #' pattern. This object is produced by the function: tableCounts.
@@ -25,6 +25,8 @@
 #' Address fields should be set to TRUE while non-address fields are set to FALSE if provided.
 #' @param gender.field Boolean indicators for whether a given field is for gender. If so, exact match is conducted on gender.
 #' Default is NULL (FALSE for all fields). The one gender field should be set to TRUE while all other fields are set to FALSE if provided.
+#' @param varnames The vector of variable names used for matching. Automatically provided if using \code{fastLink()} wrapper. Used for
+#' clean visualization of EM results in summary functions.
 #'
 #' @return \code{emlinkMARmov} returns a list with the following components:
 #' \item{zeta.j}{The posterior match probabilities for each unique pattern.}
@@ -53,7 +55,7 @@
 #' tc <- tableCounts(list(g1, g2, g3, g4), nobs.a = nrow(dfA), nobs.b = nrow(dfB))
 #'
 #' ## Run EM
-#' em <- emlinkMAR(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
+#' em <- emlinkMARmov(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
 #' }
 #'
 #' @export
@@ -62,7 +64,7 @@ emlinkMARmov <- function(patterns, nobs.a, nobs.b,
                          p.m = 0.1, iter.max = 5000, tol = 1e-5, p.gamma.k.m = NULL, p.gamma.k.u = NULL,
                          prior.lambda = NULL, w.lambda = NULL, 
                          prior.pi = NULL, w.pi = NULL, address.field = NULL,
-                         gender.field = NULL) {
+                         gender.field = NULL, varnames = NULL) {
 
     options(digits=16)
 
@@ -326,15 +328,22 @@ emlinkMARmov <- function(patterns, nobs.a, nobs.b,
     colnames(data.w)[nc-1] <- "p.gamma.j.m"
     colnames(data.w)[nc] <- "p.gamma.j.u"
 
-	inf <- which(data.w == Inf, arr.ind = T)
-	ninf <- which(data.w == -Inf, arr.ind = T)
-
-	data.w[inf[, 1], unique(inf[, 2])] <- 150
-	data.w[ninf[, 1], unique(ninf[, 2])] <- -150
+    inf <- which(data.w == Inf, arr.ind = T)
+    ninf <- which(data.w == -Inf, arr.ind = T)
+
+    data.w[inf[, 1], unique(inf[, 2])] <- 150
+    data.w[ninf[, 1], unique(ninf[, 2])] <- -150
+
+    if(!is.null(varnames)){
+        output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u,
+                       "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
+                       "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = varnames)
+    }else{
+        output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u,
+                       "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
+                       "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures))
+    }
 
-    output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u,
-                   "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
-                   "nobs.a" = nobs.a, "nobs.b" = nobs.b)
     class(output) <- c("fastLink", "fastLink.EM")
 
     return(output)
@@ -481,15 +490,15 @@ emlinkRS <- function(patterns.out, em.out, nobs.a, nobs.b){
     colnames(data.w)[nc - 1] <- "p.gamma.j.m"  
     colnames(data.w)[nc] <- "p.gamma.j.u"
 
-	inf <- which(data.w == Inf, arr.ind = T)
-	ninf <- which(data.w == -Inf, arr.ind = T)
-		
-	data.w[inf[, 1], unique(inf[, 2])] <- 150
-	data.w[ninf[, 1], unique(ninf[, 2])] <- -150
+    inf <- which(data.w == Inf, arr.ind = T)
+    ninf <- which(data.w == -Inf, arr.ind = T)
+    
+    data.w[inf[, 1], unique(inf[, 2])] <- 150
+    data.w[ninf[, 1], unique(ninf[, 2])] <- -150
 
     output <- list("zeta.j" = zeta.j, "p.m" = em.out$p.m, "p.u" = em.out$p.u, "p.gamma.k.m" = em.out$p.gamma.k.m, "p.gamma.k.u" = em.out$p.gamma.k.u,
                    "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = em.out$iter.converge,
-                   "nobs.a" = nobs.a, "nobs.b" = nobs.b)
+                   "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = em.out$varnames)
     class(output) <- c("fastLink", "fastLink.EM")
 
     return(output)

diff --git a/R/fastLink-package.R b/R/fastLink-package.R
@@ -8,8 +8,8 @@
 #' the Fellegi-Sunter model, using the Expectation-Maximization Algorithm. In addition,
 #' tools for conducting and summarizing data merges are included. 
 #' 
-#' \tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.1.1-\cr
-#' Date: \tab 2017-07-10\cr License: \tab GPL (>= 3)\cr }
+#' \tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.2.0-\cr
+#' Date: \tab 2017-07-30\cr License: \tab GPL (>= 3)\cr }
 #'
 #' @name fastLink-package
 #' @useDynLib fastLink, .registration = TRUE

diff --git a/R/fastLink.R b/R/fastLink.R
@@ -9,7 +9,7 @@
 #' address.field, gender.field, estimate.only, em.obj,
 #' dedupe.matches, linprog.dedupe,
 #' reweight.names, firstname.field,
-#' return.df, n.cores, tol.em, threshold.match, verbose)
+#' n.cores, tol.em, threshold.match, return.all, return.df, verbose)
 #'
 #' @param dfA Dataset A - to be matched to Dataset B
 #' @param dfB Dataset B - to be matched to Dataset A
@@ -45,12 +45,14 @@
 #' @param linprog.dedupe If deduping matches, whether to use Winkler's linear programming solution to dedupe. Default is FALSE.
 #' @param reweight.names Whether to reweight the posterior match probabilities by the frequency of individual first names. Default is FALSE.
 #' @param firstname.field The name of the field indicating first name. Must be provided if reweight.names = TRUE.
-#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.
 #' @param n.cores Number of cores to parallelize over. Default is NULL.
 #' @param tol.em Convergence tolerance for the EM Algorithm. Default is 1e-04.
 #' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the
 #' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches,
 #' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.
+#' @param return.all Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting
+#' \code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is FALSE.
+#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.
 #' @param verbose Whether to print elapsed time for each step. Default is FALSE.
 #'
 #' @return \code{fastLink} returns a list of class 'fastLink' containing the following components if calculating matches:
@@ -83,8 +85,8 @@ fastLink <- function(dfA, dfB, varnames,
                      gender.field = NULL, estimate.only = FALSE, em.obj = NULL,
                      dedupe.matches = TRUE, linprog.dedupe = FALSE,
                      reweight.names = FALSE, firstname.field = NULL,
-                     return.df = FALSE, n.cores = NULL, tol.em = 1e-04,
-                     threshold.match = 0.85, verbose = FALSE){
+                     n.cores = NULL, tol.em = 1e-04, threshold.match = 0.85,
+                     return.all = FALSE, return.df = FALSE, verbose = FALSE){
 
     cat("\n")
     cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n"))
@@ -156,6 +158,13 @@ fastLink <- function(dfA, dfB, varnames,
             stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
         }
     }
+    if(return.all){
+        threshold.match <- 0.0001
+        if(!dedupe.matches){
+            cat("You have specified that all matches be returned but have not deduped the matches. Setting 'dedupe.matches' to TRUE.\n")
+            dedupe.matches <- TRUE
+        }
+    }
 
     ## Create boolean indicators
     sm.bool <- which(varnames %in% stringdist.match)
@@ -195,6 +204,10 @@ fastLink <- function(dfA, dfB, varnames,
     start <- Sys.time()
     gammalist <- vector(mode = "list", length = length(varnames))
     for(i in 1:length(gammalist)){
+        if(verbose){
+            sdb <- ifelse(stringdist.match[i], "string-distance", "exact")
+            cat("    Matching variable", varnames[i], "using", sdb, "matching.\n")
+        }
         ## Convert to character
         if(is.factor(dfA[,varnames[i]]) | is.factor(dfB[,varnames[i]])){
             dfA[,varnames[i]] <- as.character(dfA[,varnames[i]])
@@ -209,6 +222,9 @@ fastLink <- function(dfA, dfB, varnames,
                 stop(paste("You have no variation in dataset B for", varnames[i], "or all observations are missing."))
             }
         }
+        if(sum(dfA[,varnames[i]] %in% dfB[,varnames[i]]) == 0){
+            stop(paste0("You have no exact matches for ", varnames[i], ". Please drop this variable from your analysis."))
+        }
         ## Get patterns
         if(stringdist.match[i]){
             if(partial.match[i]){
@@ -270,7 +286,8 @@ fastLink <- function(dfA, dfB, varnames,
                                   prior.lambda = lambda.prior, w.lambda = w.lambda,
                                   prior.pi = pi.prior, w.pi = w.pi,
                                   address.field = address.field, 
-                                  gender.field = gender.field)
+                                  gender.field = gender.field,
+                                  varnames = varnames)
         end <- Sys.time()
         if(verbose){
             cat("Running the EM algorithm took", round(difftime(end, start, units = "secs"), 2), "seconds.\n\n")
@@ -310,9 +327,19 @@ fastLink <- function(dfA, dfB, varnames,
             if(verbose){
                 cat("Deduping the estimated matches took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
             }
+        }else{
+            cat("Calculating the posterior for each pair of matched observations.\n")
+            start <- Sys.time()
+            zeta <- getPosterior(dfA[matches$inds.a,], dfB[matches$inds.b,], EM = resultsEM,
+                                 varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match,
+                                 stringdist.method = stringdist.method, cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight)
+            end <- Sys.time()
+            if(verbose){
+                cat("Calculating the posterior for each matched pair took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
+            }
         }
 
-        ## Reweight first names
+        ## Reweight first names or get zeta
         if(reweight.names){
             cat("Reweighting match probabilities by frequency of occurrence.\n")
             start <- Sys.time()
@@ -335,14 +362,16 @@ fastLink <- function(dfA, dfB, varnames,
         }
         out[["matches"]] <- matches
         out[["EM"]] <- resultsEM
-        out[["nobs.a"]] <- nr_a
-        out[["nobs.b"]] <- nr_b
         if(dedupe.matches){
-            out[["max.zeta"]] <- ddm.out$max.zeta
+            out[["posterior"]] <- ddm.out$max.zeta
+        }else{
+            out[["posterior"]] <- zeta
         }
         if(reweight.names){
             out[["zeta.name"]] <- rwn.out
         }
+        out[["nobs.a"]] <- nr_a
+        out[["nobs.b"]] <- nr_b
         class(out) <- "fastLink"
     }else{
         out <- resultsEM