Skip to content

Commit

Permalink
Merge pull request #11 from kosukeimai/stringdist_measures
Browse files Browse the repository at this point in the history
Stringdist measures
  • Loading branch information
tedenamorado authored Jun 2, 2017
2 parents ad87f18 + 8477ea2 commit c8173c8
Show file tree
Hide file tree
Showing 10 changed files with 374 additions and 221 deletions.
408 changes: 216 additions & 192 deletions R/dedupeMatches.R

Large diffs are not rendered by default.

28 changes: 22 additions & 6 deletions R/fastLink.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
#' two datasets.
#'
#' @usage fastLink(dfA, dfB, varnames, stringdist.match,
#' partial.match, cut.a, cut.p, priors.obj, w.lambda, w.pi,
#' partial.match, stringdist.method, cut.a, cut.p, jw.weight,
#' priors.obj, w.lambda, w.pi,
#' address.field, gender.field, estimate.only, em.obj,
#' dedupe.matches, linprog.dedupe,
#' reweight.names, firstname.field,
Expand All @@ -20,8 +21,10 @@
#' @param partial.match A vector of variable names indicating whether to include
#' a partial matching category for the string distances. Must be a subset of 'varnames'
#' and 'stringdist.match'.
#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
#' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92
#' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88
#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10
#' @param priors.obj A list containing priors for auxiliary movers information,
#' as output from calcMoversPriors(). Default is NULL
#' @param w.lambda How much weight to give the prior on lambda versus the data. Must range between 0 (no weight on prior) and 1 (weight fully on prior).
Expand Down Expand Up @@ -71,7 +74,8 @@
#' @export
fastLink <- function(dfA, dfB, varnames,
stringdist.match = NULL, partial.match = NULL,
cut.a = 0.92, cut.p = 0.88,
stringdist.method = "jw",
cut.a = 0.92, cut.p = 0.88, jw.weight = .10,
priors.obj = NULL,
w.lambda = NULL, w.pi = NULL, address.field = NULL,
gender.field = NULL, estimate.only = FALSE, em.obj = NULL,
Expand Down Expand Up @@ -139,6 +143,14 @@ fastLink <- function(dfA, dfB, varnames,
estimate.only <- FALSE
cat("You have provided an EM object but have set 'estimate.only' to TRUE. Setting 'estimate.only' to FALSE so that matched indices are returned.\n")
}
if(!(stringdist.method %in% c("jw", "jaro", "lv"))){
stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
}
if(stringdist.method == "jw" & !is.null(jw.weight)){
if(jw.weight < 0 | jw.weight > 0.25){
stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
}
}

## Create boolean indicators
sm.bool <- which(varnames %in% stringdist.match)
Expand Down Expand Up @@ -189,9 +201,11 @@ fastLink <- function(dfA, dfB, varnames,
## Get patterns
if(stringdist.match[i]){
if(partial.match[i]){
gammalist[[i]] <- gammaCKpar(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, n.cores = n.cores)
gammalist[[i]] <- gammaCKpar(
dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, method = stringdist.method, w = jw.weight, n.cores = n.cores
)
}else{
gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, n.cores = n.cores)
gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, method = stringdist.method, w = jw.weight, n.cores = n.cores)
}
}else{
gammalist[[i]] <- gammaKpar(dfA[,varnames[i]], dfB[,varnames[i]], gender = gender.field[i], n.cores = n.cores)
Expand Down Expand Up @@ -277,7 +291,8 @@ fastLink <- function(dfA, dfB, varnames,
ddm.out <- dedupeMatches(matchesA = dfA[matches$inds.a,], matchesB = dfB[matches$inds.b,],
EM = resultsEM, matchesLink = matches, varnames = varnames,
stringdist.match = stringdist.match, partial.match = partial.match,
linprog = linprog.dedupe, cut.a = cut.a, cut.p = cut.p)
linprog = linprog.dedupe, stringdist.method = stringdist.method,
cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight)
matches <- ddm.out$matchesLink
resultsEM <- ddm.out$EM
end <- Sys.time()
Expand All @@ -293,7 +308,8 @@ fastLink <- function(dfA, dfB, varnames,
rwn.out <- nameReweight(dfA, dfB, EM = resultsEM, gammalist = gammalist, matchesLink = matches,
varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match,
firstname.field = firstname.field, threshold.match = threshold.match,
cut.a = cut.a, cut.p = cut.p, n.cores = n.cores)
stringdist.method = stringdist.method, cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight,
n.cores = n.cores)
end <- Sys.time()
if(verbose){
cat("Reweighting by first name took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
Expand Down
48 changes: 40 additions & 8 deletions R/gammaCK2par.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
#' 0 total disagreement, 2 agreement.
#' The distance between strings is calculated using a Jaro-Winkler distance.
#'
#' @usage gammaCK2par(matAp, matBp, n.cores, cut.a)
#' @usage gammaCK2par(matAp, matBp, n.cores, cut.a, method, w)
#'
#' @param matAp vector storing the comparison field in data set 1
#' @param matBp vector storing the comparison field in data set 2
#' @param n.cores Number of cores to parallelize over. Default is NULL.
#' @param cut.a Lower bound for full match, ranging between 0 and 1. Default is 0.92
#' @param method String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
#' @param w Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10
#'
#' @return \code{gammaCK2par} returns a list with the indices corresponding to each
#' matching pattern, which can be fed directly into \code{tableCounts} and \code{matchesLink}.
Expand All @@ -28,7 +30,7 @@
## in parallel
## ------------------------

gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) {
gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, method = "jw", w = .10) {

if(any(class(matAp) %in% c("tbl_df", "data.table"))){
matAp <- as.data.frame(matAp)[,1]
Expand All @@ -46,6 +48,16 @@ gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) {
if(sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 1){
stop("You have no variation in this variable, or all observations are missing in dataset B.")
}

if(!(method %in% c("jw", "jaro", "lv"))){
stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
}

if(method == "jw" & !is.null(w)){
if(w < 0 | w > 0.25){
stop("Invalid value provided for w. Remember, w in [0, 0.25].")
}
}

if(is.null(n.cores)) {
n.cores <- detectCores() - 1
Expand Down Expand Up @@ -78,13 +90,33 @@ gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) {
temp.2[[i]] <- list(u.values.1[(limit.2[i]+1):limit.2[i+1]], limit.2[i])
}

stringvec <- function(m, y, cut) {
stringvec <- function(m, y, cut, strdist = method, p1 = w) {
x <- as.matrix(m[[1]])
e <- as.matrix(y[[1]])
t <- 1 - stringdistmatrix(e, x, method = "jw")
t[ t < cut ] <- 0
t <- Matrix(t, sparse = T)
t@x[t@x >= cut] <- 2; gc()
e <- as.matrix(y[[1]])

if(strdist == "jw") {
t <- 1 - stringdistmatrix(e, x, method = "jw", p = p1, nthread = 1)
t[ t < cut ] <- 0
t <- Matrix(t, sparse = T)
}

if(strdist == "jaro") {
t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1)
t[ t < cut ] <- 0
t <- Matrix(t, sparse = T)
}

if(strdist == "lv") {
t <- stringdistmatrix(e, x, method = method, nthread = 1)
t.1 <- nchar(as.matrix(e))
t.2 <- nchar(as.matrix(x))
o <- t(apply(t.1, 1, function(w){ ifelse(w >= t.2, w, t.2)}))
t <- 1 - t * (1/o)
t[ t < cut ] <- 0
t <- Matrix(t, sparse = T)
}

t@x[t@x >= cut] <- 2; gc()
slice.1 <- m[[2]]
slice.2 <- y[[2]]
indexes.2 <- which(t == 2, arr.ind = T)
Expand Down
46 changes: 39 additions & 7 deletions R/gammaCKpar.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
#' 0 total disagreement, 1 partial agreement, 2 agreement.
#' The distance between strings is calculated using a Jaro-Winkler distance.
#'
#' @usage gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p)
#' @usage gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p, method, w)
#'
#' @param matAp vector storing the comparison field in data set 1
#' @param matBp vector storing the comparison field in data set 2
#' @param n.cores Number of cores to parallelize over. Default is NULL.
#' @param cut.a Lower bound for full match, ranging between 0 and 1. Default is 0.92
#' @param cut.p Lower bound for partial match, ranging between 0 and 1. Default is 0.88
#' @param method String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
#' @param w Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10
#'
#' @return \code{gammaCKpar} returns a list with the indices corresponding to each
#' matching pattern, which can be fed directly into \code{tableCounts} and \code{matchesLink}.
Expand All @@ -30,7 +32,7 @@
## in parallel
## ------------------------

gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88) {
gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88, method = "jw", w = .10) {

if(any(class(matAp) %in% c("tbl_df", "data.table"))){
matAp <- as.data.frame(matAp)[,1]
Expand All @@ -48,6 +50,16 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88)
if(sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 1){
stop("You have no variation in this variable, or all observations are missing in dataset B.")
}

if(!(method %in% c("jw", "jaro", "lv"))){
stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
}

if(method == "jw" & !is.null(w)){
if(w < 0 | w > 0.25){
stop("Invalid value provided for w. Remember, w in [0, 0.25].")
}
}

if(is.null(n.cores)) {
n.cores <- detectCores() - 1
Expand Down Expand Up @@ -80,12 +92,32 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88)
temp.2[[i]] <- list(u.values.1[(limit.2[i]+1):limit.2[i+1]], limit.2[i])
}

stringvec <- function(m, y, cut) {
stringvec <- function(m, y, cut, strdist = method, p1 = w) {
x <- as.matrix(m[[1]])
e <- as.matrix(y[[1]])
t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1)
t[ t < cut[2] ] <- 0
t <- Matrix(t, sparse = T)

if(strdist == "jw") {
t <- 1 - stringdistmatrix(e, x, method = "jw", p = p1, nthread = 1)
t[ t < cut[[2]] ] <- 0
t <- Matrix(t, sparse = T)
}

if(strdist == "jaro") {
t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1)
t[ t < cut[[2]] ] <- 0
t <- Matrix(t, sparse = T)
}

if(strdist == "lv") {
t <- stringdistmatrix(e, x, method = method, nthread = 1)
t.1 <- nchar(as.matrix(e))
t.2 <- nchar(as.matrix(x))
o <- t(apply(t.1, 1, function(w){ ifelse(w >= t.2, w, t.2)}))
t <- 1 - t * (1/o)
t[ t < cut[[2]] ] <- 0
t <- Matrix(t, sparse = T)
}

t@x[t@x >= cut[1]] <- 2
t@x[t@x >= cut[2] & t@x < cut[1]] <- 1; gc()
slice.1 <- m[[2]]
Expand All @@ -105,7 +137,7 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88)
cl <- makeCluster(nc)
registerDoParallel(cl)

temp.f <- foreach(i = 1:nrow(do)) %dopar% {
temp.f <- foreach(i = 1:nrow(do), .packages = c("stringdist", "Matrix")) %dopar% {
r1 <- do[i, 1]
r2 <- do[i, 2]
stringvec(temp.1[[r1]], temp.2[[r2]], c(cut.a, cut.p))
Expand Down
32 changes: 29 additions & 3 deletions R/nameReweight.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#'
#' @usage nameReweight(dfA, dfB, EM, gammalist, matchesLink,
#' varnames, stringdist.match, partial.match,
#' firstname.field, threshold.match, cut.a, cut.p, n.cores)
#' firstname.field, threshold.match, stringdist.method, cut.a, cut.p,
#' jw.weight, n.cores)
#' @param dfA The full version of dataset A that is being matched.
#' @param dfB The full version of dataset B that is being matched.
#' @param EM The EM object from \code{emlinkMARmov()}
Expand All @@ -26,8 +27,10 @@
#' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the
#' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches,
#' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.
#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
#' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92
#' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88
#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10
#' @param n.cores Number of cores to parallelize over. Default is NULL.
#'
#' @return \code{nameReweight()} returns a list containing the following elements:
Expand All @@ -38,8 +41,18 @@
nameReweight <- function(dfA, dfB, EM, gammalist, matchesLink,
varnames, stringdist.match, partial.match,
firstname.field, threshold.match,
cut.a = .92, cut.p = .88, n.cores = NULL){
stringdist.method = "jw", cut.a = .92, cut.p = .88,
jw.weight = .10, n.cores = NULL){

if(!(stringdist.method %in% c("jw", "jaro", "lv"))){
stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
}
if(stringdist.method == "jw" & !is.null(jw.weight)){
if(jw.weight < 0 | jw.weight > 0.25){
stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
}
}

## Get cores
if(is.null(n.cores)) {
n.cores <- detectCores() - 1
Expand Down Expand Up @@ -85,7 +98,20 @@ nameReweight <- function(dfA, dfB, EM, gammalist, matchesLink,
}
## Get matches
if(stringdist.match[i]){
tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw")
if(stringdist.method %in% c("jw", "jaro")){
if(stringdist.method == "jw"){
p1 <- jw.weight
}else{
p1 <- NULL
}
tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw", p = p1)
}else{
t <- stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], method = stringdist.method)
t.1 <- nchar(matchesA[,varnames[i]])
t.2 <- nchar(matchesB[,varnames[i]])
o <- ifelse(t.1 > t.2, t.1, t.2)
tmp <- 1 - t * (1/o)
}
if(partial.match[i]){
gammalist[[i]] <- ifelse(
tmp >= cut.a, 2, ifelse(tmp >= cut.p, 1, 0)
Expand Down
7 changes: 6 additions & 1 deletion man/dedupeMatches.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion man/fastLink.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/gammaCK2par.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit c8173c8

Please sign in to comment.