Merge pull request #11 from kosukeimai/stringdist_measures

Stringdist measures
kosukeimai · Jun 2, 2017 · c8173c8 · c8173c8
2 parents ad87f18 + 8477ea2
commit c8173c8
Show file tree

Hide file tree

Showing 10 changed files with 374 additions and 221 deletions.
diff --git a/R/dedupeMatches.R b/R/dedupeMatches.R
diff --git a/R/fastLink.R b/R/fastLink.R
@@ -4,7 +4,8 @@
 #' two datasets.
 #'
 #' @usage fastLink(dfA, dfB, varnames, stringdist.match,
-#' partial.match, cut.a, cut.p, priors.obj, w.lambda, w.pi,
+#' partial.match, stringdist.method, cut.a, cut.p, jw.weight,
+#' priors.obj, w.lambda, w.pi,
 #' address.field, gender.field, estimate.only, em.obj,
 #' dedupe.matches, linprog.dedupe,
 #' reweight.names, firstname.field,
@@ -20,8 +21,10 @@
 #' @param partial.match A vector of variable names indicating whether to include
 #' a partial matching category for the string distances. Must be a subset of 'varnames'
 #' and 'stringdist.match'.
+#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
 #' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92
 #' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88
+#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10
 #' @param priors.obj A list containing priors for auxiliary movers information,
 #' as output from calcMoversPriors(). Default is NULL
 #' @param w.lambda How much weight to give the prior on lambda versus the data. Must range between 0 (no weight on prior) and 1 (weight fully on prior).
@@ -71,7 +74,8 @@
 #' @export
 fastLink <- function(dfA, dfB, varnames,
                      stringdist.match = NULL, partial.match = NULL,
-                     cut.a = 0.92, cut.p = 0.88,
+                     stringdist.method = "jw",
+                     cut.a = 0.92, cut.p = 0.88, jw.weight = .10,
                      priors.obj = NULL,
                      w.lambda = NULL, w.pi = NULL, address.field = NULL,
                      gender.field = NULL, estimate.only = FALSE, em.obj = NULL,
@@ -139,6 +143,14 @@ fastLink <- function(dfA, dfB, varnames,
         estimate.only <- FALSE
         cat("You have provided an EM object but have set 'estimate.only' to TRUE. Setting 'estimate.only' to FALSE so that matched indices are returned.\n")
     }
+    if(!(stringdist.method %in% c("jw", "jaro", "lv"))){
+        stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
+    }
+    if(stringdist.method == "jw" & !is.null(jw.weight)){
+        if(jw.weight < 0 | jw.weight > 0.25){
+            stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
+        }
+    }
 
     ## Create boolean indicators
     sm.bool <- which(varnames %in% stringdist.match)
@@ -189,9 +201,11 @@ fastLink <- function(dfA, dfB, varnames,
         ## Get patterns
         if(stringdist.match[i]){
             if(partial.match[i]){
-                gammalist[[i]] <- gammaCKpar(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, n.cores = n.cores)
+                gammalist[[i]] <- gammaCKpar(
+                    dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, method = stringdist.method, w = jw.weight, n.cores = n.cores
+                )
             }else{
-                gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, n.cores = n.cores)
+                gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, method = stringdist.method, w = jw.weight, n.cores = n.cores)
             }
         }else{
             gammalist[[i]] <- gammaKpar(dfA[,varnames[i]], dfB[,varnames[i]], gender = gender.field[i], n.cores = n.cores)
@@ -277,7 +291,8 @@ fastLink <- function(dfA, dfB, varnames,
             ddm.out <- dedupeMatches(matchesA = dfA[matches$inds.a,], matchesB = dfB[matches$inds.b,],
                                      EM = resultsEM, matchesLink = matches, varnames = varnames,
                                      stringdist.match = stringdist.match, partial.match = partial.match,
-                                     linprog = linprog.dedupe, cut.a = cut.a, cut.p = cut.p)
+                                     linprog = linprog.dedupe, stringdist.method = stringdist.method,
+                                     cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight)
             matches <- ddm.out$matchesLink
             resultsEM <- ddm.out$EM
             end <- Sys.time()
@@ -293,7 +308,8 @@ fastLink <- function(dfA, dfB, varnames,
             rwn.out <- nameReweight(dfA, dfB, EM = resultsEM, gammalist = gammalist, matchesLink = matches,
                                     varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match,
                                     firstname.field = firstname.field, threshold.match = threshold.match,
-                                    cut.a = cut.a, cut.p = cut.p, n.cores = n.cores)
+                                    stringdist.method = stringdist.method, cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight,
+                                    n.cores = n.cores)
             end <- Sys.time()
             if(verbose){
                 cat("Reweighting by first name took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")

diff --git a/R/gammaCK2par.R b/R/gammaCK2par.R
@@ -4,12 +4,14 @@
 #' 0 total disagreement, 2 agreement.
 #' The distance between strings is calculated using a Jaro-Winkler distance.
 #'
-#' @usage gammaCK2par(matAp, matBp, n.cores, cut.a)
+#' @usage gammaCK2par(matAp, matBp, n.cores, cut.a, method, w)
 #'
 #' @param matAp vector storing the comparison field in data set 1
 #' @param matBp vector storing the comparison field in data set 2
 #' @param n.cores Number of cores to parallelize over. Default is NULL.
 #' @param cut.a Lower bound for full match, ranging between 0 and 1. Default is 0.92
+#' @param method String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
+#' @param w Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10
 #'
 #' @return \code{gammaCK2par} returns a list with the indices corresponding to each
 #' matching pattern, which can be fed directly into \code{tableCounts} and \code{matchesLink}.
@@ -28,7 +30,7 @@
 ## in parallel
 ## ------------------------
 
-gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) {
+gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, method = "jw", w = .10) {
 
     if(any(class(matAp) %in% c("tbl_df", "data.table"))){
         matAp <- as.data.frame(matAp)[,1]
@@ -46,6 +48,16 @@ gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) {
     if(sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 1){
         stop("You have no variation in this variable, or all observations are missing in dataset B.")
     }
+
+    if(!(method %in% c("jw", "jaro", "lv"))){
+        stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
+    }
+
+    if(method == "jw" & !is.null(w)){
+        if(w < 0 | w > 0.25){
+            stop("Invalid value provided for w. Remember, w in [0, 0.25].")
+        }
+    }
 
     if(is.null(n.cores)) {
         n.cores <- detectCores() - 1
@@ -78,13 +90,33 @@ gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) {
         temp.2[[i]] <- list(u.values.1[(limit.2[i]+1):limit.2[i+1]], limit.2[i])
     }
 
-    stringvec <- function(m, y, cut) {
+    stringvec <- function(m, y, cut, strdist = method, p1 = w) {
         x <- as.matrix(m[[1]])
-        e <- as.matrix(y[[1]])
-        t <- 1 - stringdistmatrix(e, x, method = "jw")
-        t[ t < cut ] <- 0
-        t <- Matrix(t, sparse = T)
-        t@x[t@x >= cut] <- 2; gc()
+        e <- as.matrix(y[[1]])        
+
+        if(strdist == "jw") {
+            t <- 1 - stringdistmatrix(e, x, method = "jw", p = p1, nthread = 1)
+            t[ t < cut ] <- 0
+            t <- Matrix(t, sparse = T)
+        }
+
+        if(strdist == "jaro") {
+            t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1)
+            t[ t < cut ] <- 0
+            t <- Matrix(t, sparse = T)
+        }
+
+        if(strdist == "lv") {
+            t <- stringdistmatrix(e, x, method = method, nthread = 1)
+            t.1 <- nchar(as.matrix(e))
+            t.2 <- nchar(as.matrix(x))
+            o <- t(apply(t.1, 1, function(w){ ifelse(w >= t.2, w, t.2)}))
+            t <- 1 - t * (1/o)
+            t[ t < cut ] <- 0
+            t <- Matrix(t, sparse = T)
+        }
+
+        t@x[t@x >= cut] <- 2; gc()       	
         slice.1 <- m[[2]]
         slice.2 <- y[[2]]
         indexes.2 <- which(t == 2, arr.ind = T)

diff --git a/R/gammaCKpar.R b/R/gammaCKpar.R
@@ -4,13 +4,15 @@
 #' 0 total disagreement, 1 partial agreement, 2 agreement.
 #' The distance between strings is calculated using a Jaro-Winkler distance.
 #'
-#' @usage gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p)
+#' @usage gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p, method, w)
 #'
 #' @param matAp vector storing the comparison field in data set 1
 #' @param matBp vector storing the comparison field in data set 2
 #' @param n.cores Number of cores to parallelize over. Default is NULL.
 #' @param cut.a Lower bound for full match, ranging between 0 and 1. Default is 0.92
 #' @param cut.p Lower bound for partial match, ranging between 0 and 1. Default is 0.88
+#' @param method String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
+#' @param w Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10
 #'
 #' @return \code{gammaCKpar} returns a list with the indices corresponding to each
 #' matching pattern, which can be fed directly into \code{tableCounts} and \code{matchesLink}.
@@ -30,7 +32,7 @@
 ## in parallel
 ## ------------------------
 
-gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88) {
+gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88, method = "jw", w = .10) {
 
     if(any(class(matAp) %in% c("tbl_df", "data.table"))){
         matAp <- as.data.frame(matAp)[,1]
@@ -48,6 +50,16 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88)
     if(sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 1){
         stop("You have no variation in this variable, or all observations are missing in dataset B.")
     }
+
+    if(!(method %in% c("jw", "jaro", "lv"))){
+        stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
+    }
+
+    if(method == "jw" & !is.null(w)){
+        if(w < 0 | w > 0.25){
+        	stop("Invalid value provided for w. Remember, w in [0, 0.25].")
+        }
+    }
 
     if(is.null(n.cores)) {
         n.cores <- detectCores() - 1
@@ -80,12 +92,32 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88)
         temp.2[[i]] <- list(u.values.1[(limit.2[i]+1):limit.2[i+1]], limit.2[i])
     }
 
-    stringvec <- function(m, y, cut) {
+    stringvec <- function(m, y, cut, strdist = method, p1 = w) {
         x <- as.matrix(m[[1]])
         e <- as.matrix(y[[1]])
-        t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1)
-        t[ t < cut[2] ] <- 0
-        t <- Matrix(t, sparse = T)
+
+        if(strdist == "jw") {
+        		t <- 1 - stringdistmatrix(e, x, method = "jw", p = p1, nthread = 1)
+        		t[ t < cut[[2]] ] <- 0
+        		t <- Matrix(t, sparse = T)
+        	}
+
+        if(strdist == "jaro") {
+        		t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1)
+        		t[ t < cut[[2]] ] <- 0
+        		t <- Matrix(t, sparse = T)
+        	}
+
+        if(strdist == "lv") {
+            t <- stringdistmatrix(e, x, method = method, nthread = 1)
+            t.1 <- nchar(as.matrix(e))
+            t.2 <- nchar(as.matrix(x))
+            o <- t(apply(t.1, 1, function(w){ ifelse(w >= t.2, w, t.2)}))
+            t <- 1 - t * (1/o)
+        		t[ t < cut[[2]] ] <- 0
+        		t <- Matrix(t, sparse = T)
+        	}
+
         t@x[t@x >= cut[1]] <- 2
         t@x[t@x >= cut[2] & t@x < cut[1]] <- 1; gc()
         slice.1 <- m[[2]]
@@ -105,7 +137,7 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88)
     cl <- makeCluster(nc)
     registerDoParallel(cl)
 
-    temp.f <- foreach(i = 1:nrow(do)) %dopar% { 
+    temp.f <- foreach(i = 1:nrow(do), .packages = c("stringdist", "Matrix")) %dopar% { 
         r1 <- do[i, 1]
         r2 <- do[i, 2]
         stringvec(temp.1[[r1]], temp.2[[r2]], c(cut.a, cut.p))

diff --git a/R/nameReweight.R b/R/nameReweight.R
@@ -6,7 +6,8 @@
 #'
 #' @usage nameReweight(dfA, dfB, EM, gammalist, matchesLink,
 #' varnames, stringdist.match, partial.match,
-#' firstname.field, threshold.match, cut.a, cut.p, n.cores)
+#' firstname.field, threshold.match, stringdist.method, cut.a, cut.p,
+#' jw.weight, n.cores)
 #' @param dfA The full version of dataset A that is being matched.
 #' @param dfB The full version of dataset B that is being matched.
 #' @param EM The EM object from \code{emlinkMARmov()}
@@ -26,8 +27,10 @@
 #' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the
 #' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches,
 #' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.
+#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
 #' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92
 #' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88
+#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10
 #' @param n.cores Number of cores to parallelize over. Default is NULL.
 #'
 #' @return \code{nameReweight()} returns a list containing the following elements:
@@ -38,8 +41,18 @@
 nameReweight <- function(dfA, dfB, EM, gammalist, matchesLink,
                          varnames, stringdist.match, partial.match, 
                          firstname.field, threshold.match,
-                         cut.a = .92, cut.p = .88, n.cores = NULL){
+                         stringdist.method = "jw", cut.a = .92, cut.p = .88,
+                         jw.weight = .10, n.cores = NULL){
 
+    if(!(stringdist.method %in% c("jw", "jaro", "lv"))){
+        stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
+    }
+    if(stringdist.method == "jw" & !is.null(jw.weight)){
+        if(jw.weight < 0 | jw.weight > 0.25){
+            stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
+        }
+    }
+
     ## Get cores
     if(is.null(n.cores)) {
         n.cores <- detectCores() - 1
@@ -85,7 +98,20 @@ nameReweight <- function(dfA, dfB, EM, gammalist, matchesLink,
         }
         ## Get matches
         if(stringdist.match[i]){
-            tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw")
+            if(stringdist.method %in% c("jw", "jaro")){
+                if(stringdist.method == "jw"){
+                    p1 <- jw.weight
+                }else{
+                    p1 <- NULL
+                }
+                tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw", p = p1)
+            }else{
+                t <- stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], method = stringdist.method)
+                t.1 <- nchar(matchesA[,varnames[i]])
+                t.2 <- nchar(matchesB[,varnames[i]])
+                o <- ifelse(t.1 > t.2, t.1, t.2)
+                tmp <- 1 - t * (1/o)
+            }
             if(partial.match[i]){
                 gammalist[[i]] <- ifelse(
                     tmp >= cut.a, 2, ifelse(tmp >= cut.p, 1, 0)

diff --git a/man/dedupeMatches.Rd b/man/dedupeMatches.Rd
diff --git a/man/fastLink.Rd b/man/fastLink.Rd
diff --git a/man/gammaCK2par.Rd b/man/gammaCK2par.Rd