diff --git a/R/dedupeMatches.R b/R/dedupeMatches.R index 949bd9b..1ac840c 100644 --- a/R/dedupeMatches.R +++ b/R/dedupeMatches.R @@ -4,7 +4,8 @@ #' #' @usage dedupeMatches(matchesA, matchesB, EM, #' matchesLink, varnames, stringdist.match, partial.match, -#' linprog, cut.a = 0.92, cut.p = 0.88) +#' linprog, stringdist.method, cut.a = 0.92, cut.p = 0.88, +#' jw.weight) #' @param matchesA A dataframe of the matched observations in #' dataset A, with all variables used to inform the match. #' @param matchesB A dataframe of the matched observations in @@ -21,8 +22,10 @@ #' as varnames. Default is FALSE for all variables. #' @param linprog Whether to implement Winkler's linear programming solution to the deduplication #' problem. Default is false. +#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit #' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92 #' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88 +#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10 #' #' @return \code{dedupeMatches()} returns a list containing the following elements: #' \item{matchesA}{A deduped version of matchesA} @@ -37,8 +40,8 @@ #' @importFrom stats runif dedupeMatches <- function(matchesA, matchesB, EM, matchesLink, varnames, stringdist.match, partial.match, - linprog = FALSE, - cut.a = 0.92, cut.p = 0.88){ + linprog = FALSE, stringdist.method = "jw", + cut.a = 0.92, cut.p = 0.88, jw.weight = .10){ ## -------------- ## Start function @@ -50,6 +53,14 @@ dedupeMatches <- function(matchesA, matchesB, EM, matchesLink, if(any(class(matchesB) %in% c("tbl_df", "data.table"))){ matchesB <- as.data.frame(matchesB) } + if(!(stringdist.method %in% c("jw", "jaro", "lv"))){ + stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.") + } + if(stringdist.method == "jw" & !is.null(jw.weight)){ + if(jw.weight < 0 | jw.weight > 0.25){ + stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].") + } + } ## Get original column names colnames.df.a <- colnames(matchesA) @@ -68,7 +79,20 @@ dedupeMatches <- function(matchesA, matchesB, EM, matchesLink, } ## Get matches if(stringdist.match[i]){ - tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw") + if(stringdist.method %in% c("jw", "jaro")){ + if(stringdist.method == "jw"){ + p1 <- jw.weight + }else{ + p1 <- NULL + } + tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw", p = p1) + }else{ + t <- stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], method = stringdist.method) + t.1 <- nchar(matchesA[,varnames[i]]) + t.2 <- nchar(matchesB[,varnames[i]]) + o <- ifelse(t.1 > t.2, t.1, t.2) + tmp <- 1 - t * (1/o) + } if(partial.match[i]){ gammalist[[i]] <- ifelse( tmp >= cut.a, 2, ifelse(tmp >= cut.p, 1, 0) @@ -76,201 +100,201 @@ dedupeMatches <- function(matchesA, matchesB, EM, matchesLink, }else{ gammalist[[i]] <- ifelse(tmp >= cut.a, 2, 0) } - }else{ - tmp <- matchesA[,varnames[i]] == matchesB[,varnames[i]] - gammalist[[i]] <- ifelse(tmp == TRUE, 2, 0) - } + }else{ + tmp <- matchesA[,varnames[i]] == matchesB[,varnames[i]] + gammalist[[i]] <- ifelse(tmp == TRUE, 2, 0) + } - namevec[i] <- paste0("gamma.", i) - - } - gammalist <- data.frame(do.call(cbind, gammalist)) - names(gammalist) <- namevec - - ## ------------------------------- - ## Convert EM object to data frame - ## ------------------------------- - emdf <- as.data.frame(EM$patterns.w) - emdf$zeta.j <- c(EM$zeta.j) - - ## --------------------- - ## Merge EM to gammalist - ## --------------------- - matchesA <- cbind(matchesA, gammalist) - matchesB <- cbind(matchesB, gammalist) - matchesA$roworder <- 1:nrow(matchesA) - matchesB$roworder <- 1:nrow(matchesB) - matchesA <- merge(matchesA, emdf, by = namevec, all.x = TRUE) - matchesB <- merge(matchesB, emdf, by = namevec, all.x = TRUE) - matchesA <- matchesA[order(matchesA$roworder),] - matchesB <- matchesB[order(matchesB$roworder),] - - ## ------------ - ## Start dedupe - ## ------------ - ## Ids - matchesA$idA <- matchesLink$inds.a - matchesB$idB <- matchesLink$inds.b - matchesB$idA <- matchesA$idA - matchesA$idB <- matchesB$idB - - ## Remove observations with NA for zeta.j - matchesA <- matchesA[!is.na(matchesA$zeta.j),] - matchesB <- matchesB[!is.na(matchesB$zeta.j),] - - if(!linprog){ - - ## Step 1: Find max zeta for each observation in dataset A: - ## Merge in maximum zeta for each observation in dataset A - temp <- as.matrix(tapply(matchesA$zeta.j, matchesA$idA, max, na.rm = T)) - temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp))) - names(temp) <- c("idA", "zeta.max") - matchesA <- merge(matchesA, temp, by = "idA") - - ## Calculate difference - matchesA <- matchesA[order(matchesA$roworder), ] - matchesB <- matchesB[order(matchesB$roworder), ] - matchesA$rm <- abs(matchesA$zeta.j - matchesA$zeta.max) - rm <- which(matchesA$rm == 0) - - ## Subset down to max zetas - matchesA <- matchesA[rm, ] - matchesB <- matchesB[rm, ] - - ## Step 2: Find max zeta for each observation in dataset B, if in first subset: - ## Merge in maximum zeta for each observation in dataset B - temp <- as.matrix(tapply(matchesB$zeta.j, matchesB$idB, max, na.rm = T)) - temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp))) - names(temp) <- c("idB", "zeta.max") - matchesB <- merge(matchesB, temp, by = "idB") - - ## Calculate difference - matchesA <- matchesA[order(matchesA$roworder), ] - matchesB <- matchesB[order(matchesB$roworder), ] - matchesB$rm <- abs(matchesB$zeta.j - matchesB$zeta.max) - rm <- which(matchesB$rm == 0) - - ## Subset down to max zetas - matchesA <- matchesA[rm, ] - matchesB <- matchesB[rm, ] - - ## Step 3: Break remaining ties - ## Find remaining duplicates in A - d1 <- duplicated(matchesA$idA) - d2 <- duplicated(matchesA$idA, fromLast = T) - matchesA$dA <- ifelse((d1 + d2) > 0, 1, 0) - matchesB$dA <- ifelse((d1 + d2) > 0, 1, 0) - - ## Draw uniform to break tie, and merge in - matchesA$uni <- runif(nrow(matchesA)) - temp <- as.matrix(tapply(matchesA$uni, matchesA$idA, max, na.rm = T)) - temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp))) - names(temp) <- c("idA", "uni.max") - matchesA <- merge(matchesA, temp, by = "idA") - matchesA <- matchesA[order(matchesA$roworder), ] - matchesB <- matchesB[order(matchesB$roworder), ] - matchesA$rm <- abs(matchesA$uni - matchesA$uni.max) - rm <- which(matchesA$rm == 0) - - ## Subset down to broken tie - matchesA <- matchesA[rm, ] - matchesB <- matchesB[rm, ] - - }else{ - - ## Find duplicates - dupA <- duplicated(matchesA$idA) - dupB <- duplicated(matchesA$idA, fromLast = T) - matchesA$dupA <- ifelse(dupA == 1 | dupB == 1, 1, 0) - - dupA <- duplicated(matchesB$idB) - dupB <- duplicated(matchesB$idB, fromLast = T) - matchesA$dupB <- ifelse(dupA == 1 | dupB == 1, 1, 0) - - ## Split into dupes, not dups - dups <- subset(matchesA, dupA == 1 | dupB == 1) - nodups <- subset(matchesA, dupA == 0 & dupB == 0) - - dups$idA.t <- as.numeric(as.factor(dups$idA)) - dups$idB.t <- as.numeric(as.factor(dups$idB)) - - nr <- max(dups$idA.t) - nc <- max(dups$idB.t) - dim <- max(nr, nc) - - ## Create adjacency matrix to id duplicates - mat.adj <- sparseMatrix(i = dups$idA.t, j = dups$idB.t, x = dups$zeta.j, - dims = c(dim, dim)) - mat.adj <- as.matrix(mat.adj) - - ## Solve linear sum assignment problem - T1 <- suppressWarnings(assignment(-mat.adj)) - temp.0 <- cbind(1:dim, T1$perm) - n1 <- which(rowSums(mat.adj) == 0) - n2 <- which(colSums(mat.adj) == 0) - - if(length(n1) > 0) { - temp.0 <- temp.0[-n1, ] + namevec[i] <- paste0("gamma.", i) + } + gammalist <- data.frame(do.call(cbind, gammalist)) + names(gammalist) <- namevec + + ## ------------------------------- + ## Convert EM object to data frame + ## ------------------------------- + emdf <- as.data.frame(EM$patterns.w) + emdf$zeta.j <- c(EM$zeta.j) + + ## --------------------- + ## Merge EM to gammalist + ## --------------------- + matchesA <- cbind(matchesA, gammalist) + matchesB <- cbind(matchesB, gammalist) + matchesA$roworder <- 1:nrow(matchesA) + matchesB$roworder <- 1:nrow(matchesB) + matchesA <- merge(matchesA, emdf, by = namevec, all.x = TRUE) + matchesB <- merge(matchesB, emdf, by = namevec, all.x = TRUE) + matchesA <- matchesA[order(matchesA$roworder),] + matchesB <- matchesB[order(matchesB$roworder),] + + ## ------------ + ## Start dedupe + ## ------------ + ## Ids + matchesA$idA <- matchesLink$inds.a + matchesB$idB <- matchesLink$inds.b + matchesB$idA <- matchesA$idA + matchesA$idB <- matchesB$idB + + ## Remove observations with NA for zeta.j + matchesA <- matchesA[!is.na(matchesA$zeta.j),] + matchesB <- matchesB[!is.na(matchesB$zeta.j),] + + if(!linprog){ + + ## Step 1: Find max zeta for each observation in dataset A: + ## Merge in maximum zeta for each observation in dataset A + temp <- as.matrix(tapply(matchesA$zeta.j, matchesA$idA, max, na.rm = T)) + temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp))) + names(temp) <- c("idA", "zeta.max") + matchesA <- merge(matchesA, temp, by = "idA") + + ## Calculate difference + matchesA <- matchesA[order(matchesA$roworder), ] + matchesB <- matchesB[order(matchesB$roworder), ] + matchesA$rm <- abs(matchesA$zeta.j - matchesA$zeta.max) + rm <- which(matchesA$rm == 0) + + ## Subset down to max zetas + matchesA <- matchesA[rm, ] + matchesB <- matchesB[rm, ] + + ## Step 2: Find max zeta for each observation in dataset B, if in first subset: + ## Merge in maximum zeta for each observation in dataset B + temp <- as.matrix(tapply(matchesB$zeta.j, matchesB$idB, max, na.rm = T)) + temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp))) + names(temp) <- c("idB", "zeta.max") + matchesB <- merge(matchesB, temp, by = "idB") + + ## Calculate difference + matchesA <- matchesA[order(matchesA$roworder), ] + matchesB <- matchesB[order(matchesB$roworder), ] + matchesB$rm <- abs(matchesB$zeta.j - matchesB$zeta.max) + rm <- which(matchesB$rm == 0) + + ## Subset down to max zetas + matchesA <- matchesA[rm, ] + matchesB <- matchesB[rm, ] + + ## Step 3: Break remaining ties + ## Find remaining duplicates in A + d1 <- duplicated(matchesA$idA) + d2 <- duplicated(matchesA$idA, fromLast = T) + matchesA$dA <- ifelse((d1 + d2) > 0, 1, 0) + matchesB$dA <- ifelse((d1 + d2) > 0, 1, 0) + + ## Draw uniform to break tie, and merge in + matchesA$uni <- runif(nrow(matchesA)) + temp <- as.matrix(tapply(matchesA$uni, matchesA$idA, max, na.rm = T)) + temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp))) + names(temp) <- c("idA", "uni.max") + matchesA <- merge(matchesA, temp, by = "idA") + matchesA <- matchesA[order(matchesA$roworder), ] + matchesB <- matchesB[order(matchesB$roworder), ] + matchesA$rm <- abs(matchesA$uni - matchesA$uni.max) + rm <- which(matchesA$rm == 0) + + ## Subset down to broken tie + matchesA <- matchesA[rm, ] + matchesB <- matchesB[rm, ] + + }else{ - if(length(n2) > 0) { - temp.0 <- temp.0[, -n2] - } + ## Find duplicates + dupA <- duplicated(matchesA$idA) + dupB <- duplicated(matchesA$idA, fromLast = T) + matchesA$dupA <- ifelse(dupA == 1 | dupB == 1, 1, 0) - temp.0 <- data.frame(temp.0) - names(temp.0) <- c("idA.t", "idB.t") + dupA <- duplicated(matchesB$idB) + dupB <- duplicated(matchesB$idB, fromLast = T) + matchesA$dupB <- ifelse(dupA == 1 | dupB == 1, 1, 0) - ## Merge in dedupe information - dedup <- merge(temp.0, dups, by = c("idA.t", "idB.t")) - dedup$idA.t <- dedup$idB.t <- NULL + ## Split into dupes, not dups + dups <- subset(matchesA, dupA == 1 | dupB == 1) + nodups <- subset(matchesA, dupA == 0 & dupB == 0) - ## Combine dupes, dedupes - matchesA <- rbind(dedup, nodups) - matchesA$dupA <- matchesA$dupB <- NULL - listA <- paste(matchesA$idA, matchesA$idB, sep = "-") - listB <- paste(matchesB$idA, matchesB$idB, sep = "-") - keep <- which(listB %in% listA) - matchesB <- matchesB[keep, ] + dups$idA.t <- as.numeric(as.factor(dups$idA)) + dups$idB.t <- as.numeric(as.factor(dups$idB)) - ## Subset down and order - matchesA <- matchesA[order(matchesA$idA, matchesA$idB), ] - matchesB <- matchesB[order(matchesB$idA, matchesB$idB), ] + nr <- max(dups$idA.t) + nc <- max(dups$idB.t) + dim <- max(nr, nc) - } + ## Create adjacency matrix to id duplicates + mat.adj <- sparseMatrix(i = dups$idA.t, j = dups$idB.t, x = dups$zeta.j, + dims = c(dim, dim)) + mat.adj <- as.matrix(mat.adj) - ## ----------------- - ## Correct EM object - ## ----------------- - counts <- eval(parse( - text = paste0("data.frame(matchesA %>% group_by(", - paste(namevec, collapse = ", "), - ") %>% summarise(counts = n()))")) - ) - patterns <- as.data.frame(EM$patterns.w) - patterns$rownum <- 1:nrow(patterns) - patterns <- merge(patterns, counts, by = namevec, all.x = TRUE) - patterns$counts.x <- ifelse(!is.na(patterns$counts.y), patterns$counts.y, - patterns$counts.x) - patterns <- patterns[order(patterns$rownum),] - patterns$counts.y <- NULL; patterns$rownum <- NULL - names(patterns) <- c(namevec, "counts", "weights", "p.gamma.j.m", "p.gamma.j.u") - EM$patterns.w <- as.matrix(patterns) - - ## -------------------------- - ## Correct matchesLink object - ## -------------------------- - matchesLink <- data.frame(inds.a = matchesA$idA, inds.b = matchesB$idB) - - ## -------------------------- - ## Correct dataframes objects - ## -------------------------- - matchesA <- subset(matchesA, select = colnames.df.a) - matchesB <- subset(matchesB, select = colnames.df.b) - - ## Return deduped object - out <- list(matchesA = matchesA, matchesB = matchesB, - EM = EM, matchesLink = matchesLink) - return(out) - -} + ## Solve linear sum assignment problem + T1 <- suppressWarnings(assignment(-mat.adj)) + temp.0 <- cbind(1:dim, T1$perm) + n1 <- which(rowSums(mat.adj) == 0) + n2 <- which(colSums(mat.adj) == 0) + + if(length(n1) > 0) { + temp.0 <- temp.0[-n1, ] + } + + if(length(n2) > 0) { + temp.0 <- temp.0[, -n2] + } + + temp.0 <- data.frame(temp.0) + names(temp.0) <- c("idA.t", "idB.t") + + ## Merge in dedupe information + dedup <- merge(temp.0, dups, by = c("idA.t", "idB.t")) + dedup$idA.t <- dedup$idB.t <- NULL + + ## Combine dupes, dedupes + matchesA <- rbind(dedup, nodups) + matchesA$dupA <- matchesA$dupB <- NULL + listA <- paste(matchesA$idA, matchesA$idB, sep = "-") + listB <- paste(matchesB$idA, matchesB$idB, sep = "-") + keep <- which(listB %in% listA) + matchesB <- matchesB[keep, ] + + ## Subset down and order + matchesA <- matchesA[order(matchesA$idA, matchesA$idB), ] + matchesB <- matchesB[order(matchesB$idA, matchesB$idB), ] + + } + + ## ----------------- + ## Correct EM object + ## ----------------- + counts <- eval(parse( + text = paste0("data.frame(matchesA %>% group_by(", + paste(namevec, collapse = ", "), + ") %>% summarise(counts = n()))")) + ) + patterns <- as.data.frame(EM$patterns.w) + patterns$rownum <- 1:nrow(patterns) + patterns <- merge(patterns, counts, by = namevec, all.x = TRUE) + patterns$counts.x <- ifelse(!is.na(patterns$counts.y), patterns$counts.y, + patterns$counts.x) + patterns <- patterns[order(patterns$rownum),] + patterns$counts.y <- NULL; patterns$rownum <- NULL + names(patterns) <- c(namevec, "counts", "weights", "p.gamma.j.m", "p.gamma.j.u") + EM$patterns.w <- as.matrix(patterns) + + ## -------------------------- + ## Correct matchesLink object + ## -------------------------- + matchesLink <- data.frame(inds.a = matchesA$idA, inds.b = matchesB$idB) + + ## -------------------------- + ## Correct dataframes objects + ## -------------------------- + matchesA <- subset(matchesA, select = colnames.df.a) + matchesB <- subset(matchesB, select = colnames.df.b) + + ## Return deduped object + out <- list(matchesA = matchesA, matchesB = matchesB, + EM = EM, matchesLink = matchesLink) + return(out) + + } diff --git a/R/fastLink.R b/R/fastLink.R index 02ff9f2..46a4b8a 100644 --- a/R/fastLink.R +++ b/R/fastLink.R @@ -4,7 +4,8 @@ #' two datasets. #' #' @usage fastLink(dfA, dfB, varnames, stringdist.match, -#' partial.match, cut.a, cut.p, priors.obj, w.lambda, w.pi, +#' partial.match, stringdist.method, cut.a, cut.p, jw.weight, +#' priors.obj, w.lambda, w.pi, #' address.field, gender.field, estimate.only, em.obj, #' dedupe.matches, linprog.dedupe, #' reweight.names, firstname.field, @@ -20,8 +21,10 @@ #' @param partial.match A vector of variable names indicating whether to include #' a partial matching category for the string distances. Must be a subset of 'varnames' #' and 'stringdist.match'. +#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit #' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92 #' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88 +#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10 #' @param priors.obj A list containing priors for auxiliary movers information, #' as output from calcMoversPriors(). Default is NULL #' @param w.lambda How much weight to give the prior on lambda versus the data. Must range between 0 (no weight on prior) and 1 (weight fully on prior). @@ -71,7 +74,8 @@ #' @export fastLink <- function(dfA, dfB, varnames, stringdist.match = NULL, partial.match = NULL, - cut.a = 0.92, cut.p = 0.88, + stringdist.method = "jw", + cut.a = 0.92, cut.p = 0.88, jw.weight = .10, priors.obj = NULL, w.lambda = NULL, w.pi = NULL, address.field = NULL, gender.field = NULL, estimate.only = FALSE, em.obj = NULL, @@ -139,6 +143,14 @@ fastLink <- function(dfA, dfB, varnames, estimate.only <- FALSE cat("You have provided an EM object but have set 'estimate.only' to TRUE. Setting 'estimate.only' to FALSE so that matched indices are returned.\n") } + if(!(stringdist.method %in% c("jw", "jaro", "lv"))){ + stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.") + } + if(stringdist.method == "jw" & !is.null(jw.weight)){ + if(jw.weight < 0 | jw.weight > 0.25){ + stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].") + } + } ## Create boolean indicators sm.bool <- which(varnames %in% stringdist.match) @@ -189,9 +201,11 @@ fastLink <- function(dfA, dfB, varnames, ## Get patterns if(stringdist.match[i]){ if(partial.match[i]){ - gammalist[[i]] <- gammaCKpar(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, n.cores = n.cores) + gammalist[[i]] <- gammaCKpar( + dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, method = stringdist.method, w = jw.weight, n.cores = n.cores + ) }else{ - gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, n.cores = n.cores) + gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, method = stringdist.method, w = jw.weight, n.cores = n.cores) } }else{ gammalist[[i]] <- gammaKpar(dfA[,varnames[i]], dfB[,varnames[i]], gender = gender.field[i], n.cores = n.cores) @@ -277,7 +291,8 @@ fastLink <- function(dfA, dfB, varnames, ddm.out <- dedupeMatches(matchesA = dfA[matches$inds.a,], matchesB = dfB[matches$inds.b,], EM = resultsEM, matchesLink = matches, varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match, - linprog = linprog.dedupe, cut.a = cut.a, cut.p = cut.p) + linprog = linprog.dedupe, stringdist.method = stringdist.method, + cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight) matches <- ddm.out$matchesLink resultsEM <- ddm.out$EM end <- Sys.time() @@ -293,7 +308,8 @@ fastLink <- function(dfA, dfB, varnames, rwn.out <- nameReweight(dfA, dfB, EM = resultsEM, gammalist = gammalist, matchesLink = matches, varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match, firstname.field = firstname.field, threshold.match = threshold.match, - cut.a = cut.a, cut.p = cut.p, n.cores = n.cores) + stringdist.method = stringdist.method, cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight, + n.cores = n.cores) end <- Sys.time() if(verbose){ cat("Reweighting by first name took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n") diff --git a/R/gammaCK2par.R b/R/gammaCK2par.R index df38a6e..0c60e33 100644 --- a/R/gammaCK2par.R +++ b/R/gammaCK2par.R @@ -4,12 +4,14 @@ #' 0 total disagreement, 2 agreement. #' The distance between strings is calculated using a Jaro-Winkler distance. #' -#' @usage gammaCK2par(matAp, matBp, n.cores, cut.a) +#' @usage gammaCK2par(matAp, matBp, n.cores, cut.a, method, w) #' #' @param matAp vector storing the comparison field in data set 1 #' @param matBp vector storing the comparison field in data set 2 #' @param n.cores Number of cores to parallelize over. Default is NULL. #' @param cut.a Lower bound for full match, ranging between 0 and 1. Default is 0.92 +#' @param method String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit +#' @param w Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10 #' #' @return \code{gammaCK2par} returns a list with the indices corresponding to each #' matching pattern, which can be fed directly into \code{tableCounts} and \code{matchesLink}. @@ -28,7 +30,7 @@ ## in parallel ## ------------------------ -gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) { +gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, method = "jw", w = .10) { if(any(class(matAp) %in% c("tbl_df", "data.table"))){ matAp <- as.data.frame(matAp)[,1] @@ -46,6 +48,16 @@ gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) { if(sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 1){ stop("You have no variation in this variable, or all observations are missing in dataset B.") } + + if(!(method %in% c("jw", "jaro", "lv"))){ + stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.") + } + + if(method == "jw" & !is.null(w)){ + if(w < 0 | w > 0.25){ + stop("Invalid value provided for w. Remember, w in [0, 0.25].") + } + } if(is.null(n.cores)) { n.cores <- detectCores() - 1 @@ -78,13 +90,33 @@ gammaCK2par <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92) { temp.2[[i]] <- list(u.values.1[(limit.2[i]+1):limit.2[i+1]], limit.2[i]) } - stringvec <- function(m, y, cut) { + stringvec <- function(m, y, cut, strdist = method, p1 = w) { x <- as.matrix(m[[1]]) - e <- as.matrix(y[[1]]) - t <- 1 - stringdistmatrix(e, x, method = "jw") - t[ t < cut ] <- 0 - t <- Matrix(t, sparse = T) - t@x[t@x >= cut] <- 2; gc() + e <- as.matrix(y[[1]]) + + if(strdist == "jw") { + t <- 1 - stringdistmatrix(e, x, method = "jw", p = p1, nthread = 1) + t[ t < cut ] <- 0 + t <- Matrix(t, sparse = T) + } + + if(strdist == "jaro") { + t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1) + t[ t < cut ] <- 0 + t <- Matrix(t, sparse = T) + } + + if(strdist == "lv") { + t <- stringdistmatrix(e, x, method = method, nthread = 1) + t.1 <- nchar(as.matrix(e)) + t.2 <- nchar(as.matrix(x)) + o <- t(apply(t.1, 1, function(w){ ifelse(w >= t.2, w, t.2)})) + t <- 1 - t * (1/o) + t[ t < cut ] <- 0 + t <- Matrix(t, sparse = T) + } + + t@x[t@x >= cut] <- 2; gc() slice.1 <- m[[2]] slice.2 <- y[[2]] indexes.2 <- which(t == 2, arr.ind = T) diff --git a/R/gammaCKpar.R b/R/gammaCKpar.R index 6c3e971..a6d6ace 100644 --- a/R/gammaCKpar.R +++ b/R/gammaCKpar.R @@ -4,13 +4,15 @@ #' 0 total disagreement, 1 partial agreement, 2 agreement. #' The distance between strings is calculated using a Jaro-Winkler distance. #' -#' @usage gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p) +#' @usage gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p, method, w) #' #' @param matAp vector storing the comparison field in data set 1 #' @param matBp vector storing the comparison field in data set 2 #' @param n.cores Number of cores to parallelize over. Default is NULL. #' @param cut.a Lower bound for full match, ranging between 0 and 1. Default is 0.92 #' @param cut.p Lower bound for partial match, ranging between 0 and 1. Default is 0.88 +#' @param method String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit +#' @param w Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10 #' #' @return \code{gammaCKpar} returns a list with the indices corresponding to each #' matching pattern, which can be fed directly into \code{tableCounts} and \code{matchesLink}. @@ -30,7 +32,7 @@ ## in parallel ## ------------------------ -gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88) { +gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88, method = "jw", w = .10) { if(any(class(matAp) %in% c("tbl_df", "data.table"))){ matAp <- as.data.frame(matAp)[,1] @@ -48,6 +50,16 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88) if(sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 1){ stop("You have no variation in this variable, or all observations are missing in dataset B.") } + + if(!(method %in% c("jw", "jaro", "lv"))){ + stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.") + } + + if(method == "jw" & !is.null(w)){ + if(w < 0 | w > 0.25){ + stop("Invalid value provided for w. Remember, w in [0, 0.25].") + } + } if(is.null(n.cores)) { n.cores <- detectCores() - 1 @@ -80,12 +92,32 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88) temp.2[[i]] <- list(u.values.1[(limit.2[i]+1):limit.2[i+1]], limit.2[i]) } - stringvec <- function(m, y, cut) { + stringvec <- function(m, y, cut, strdist = method, p1 = w) { x <- as.matrix(m[[1]]) e <- as.matrix(y[[1]]) - t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1) - t[ t < cut[2] ] <- 0 - t <- Matrix(t, sparse = T) + + if(strdist == "jw") { + t <- 1 - stringdistmatrix(e, x, method = "jw", p = p1, nthread = 1) + t[ t < cut[[2]] ] <- 0 + t <- Matrix(t, sparse = T) + } + + if(strdist == "jaro") { + t <- 1 - stringdistmatrix(e, x, method = "jw", nthread = 1) + t[ t < cut[[2]] ] <- 0 + t <- Matrix(t, sparse = T) + } + + if(strdist == "lv") { + t <- stringdistmatrix(e, x, method = method, nthread = 1) + t.1 <- nchar(as.matrix(e)) + t.2 <- nchar(as.matrix(x)) + o <- t(apply(t.1, 1, function(w){ ifelse(w >= t.2, w, t.2)})) + t <- 1 - t * (1/o) + t[ t < cut[[2]] ] <- 0 + t <- Matrix(t, sparse = T) + } + t@x[t@x >= cut[1]] <- 2 t@x[t@x >= cut[2] & t@x < cut[1]] <- 1; gc() slice.1 <- m[[2]] @@ -105,7 +137,7 @@ gammaCKpar <- function(matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88) cl <- makeCluster(nc) registerDoParallel(cl) - temp.f <- foreach(i = 1:nrow(do)) %dopar% { + temp.f <- foreach(i = 1:nrow(do), .packages = c("stringdist", "Matrix")) %dopar% { r1 <- do[i, 1] r2 <- do[i, 2] stringvec(temp.1[[r1]], temp.2[[r2]], c(cut.a, cut.p)) diff --git a/R/nameReweight.R b/R/nameReweight.R index 0cc13b4..0acbf9a 100644 --- a/R/nameReweight.R +++ b/R/nameReweight.R @@ -6,7 +6,8 @@ #' #' @usage nameReweight(dfA, dfB, EM, gammalist, matchesLink, #' varnames, stringdist.match, partial.match, -#' firstname.field, threshold.match, cut.a, cut.p, n.cores) +#' firstname.field, threshold.match, stringdist.method, cut.a, cut.p, +#' jw.weight, n.cores) #' @param dfA The full version of dataset A that is being matched. #' @param dfB The full version of dataset B that is being matched. #' @param EM The EM object from \code{emlinkMARmov()} @@ -26,8 +27,10 @@ #' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the #' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches, #' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches. +#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit #' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92 #' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88 +#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10 #' @param n.cores Number of cores to parallelize over. Default is NULL. #' #' @return \code{nameReweight()} returns a list containing the following elements: @@ -38,8 +41,18 @@ nameReweight <- function(dfA, dfB, EM, gammalist, matchesLink, varnames, stringdist.match, partial.match, firstname.field, threshold.match, - cut.a = .92, cut.p = .88, n.cores = NULL){ + stringdist.method = "jw", cut.a = .92, cut.p = .88, + jw.weight = .10, n.cores = NULL){ + if(!(stringdist.method %in% c("jw", "jaro", "lv"))){ + stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.") + } + if(stringdist.method == "jw" & !is.null(jw.weight)){ + if(jw.weight < 0 | jw.weight > 0.25){ + stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].") + } + } + ## Get cores if(is.null(n.cores)) { n.cores <- detectCores() - 1 @@ -85,7 +98,20 @@ nameReweight <- function(dfA, dfB, EM, gammalist, matchesLink, } ## Get matches if(stringdist.match[i]){ - tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw") + if(stringdist.method %in% c("jw", "jaro")){ + if(stringdist.method == "jw"){ + p1 <- jw.weight + }else{ + p1 <- NULL + } + tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw", p = p1) + }else{ + t <- stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], method = stringdist.method) + t.1 <- nchar(matchesA[,varnames[i]]) + t.2 <- nchar(matchesB[,varnames[i]]) + o <- ifelse(t.1 > t.2, t.1, t.2) + tmp <- 1 - t * (1/o) + } if(partial.match[i]){ gammalist[[i]] <- ifelse( tmp >= cut.a, 2, ifelse(tmp >= cut.p, 1, 0) diff --git a/man/dedupeMatches.Rd b/man/dedupeMatches.Rd index a8e6827..e1d16d6 100644 --- a/man/dedupeMatches.Rd +++ b/man/dedupeMatches.Rd @@ -6,7 +6,8 @@ \usage{ dedupeMatches(matchesA, matchesB, EM, matchesLink, varnames, stringdist.match, partial.match, -linprog, cut.a = 0.92, cut.p = 0.88) +linprog, stringdist.method, cut.a = 0.92, cut.p = 0.88, +jw.weight) } \arguments{ \item{matchesA}{A dataframe of the matched observations in @@ -33,9 +34,13 @@ as varnames. Default is FALSE for all variables.} \item{linprog}{Whether to implement Winkler's linear programming solution to the deduplication problem. Default is false.} +\item{stringdist.method}{String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit} + \item{cut.a}{Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92} \item{cut.p}{Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88} + +\item{jw.weight}{Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10} } \value{ \code{dedupeMatches()} returns a list containing the following elements: diff --git a/man/fastLink.Rd b/man/fastLink.Rd index ac16d62..2f36c19 100644 --- a/man/fastLink.Rd +++ b/man/fastLink.Rd @@ -5,7 +5,8 @@ \title{fastLink} \usage{ fastLink(dfA, dfB, varnames, stringdist.match, -partial.match, cut.a, cut.p, priors.obj, w.lambda, w.pi, +partial.match, stringdist.method, cut.a, cut.p, jw.weight, +priors.obj, w.lambda, w.pi, address.field, gender.field, estimate.only, em.obj, dedupe.matches, linprog.dedupe, reweight.names, firstname.field, @@ -27,10 +28,14 @@ which variables should use string distance matching. Must be a subset of a partial matching category for the string distances. Must be a subset of 'varnames' and 'stringdist.match'.} +\item{stringdist.method}{String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit} + \item{cut.a}{Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92} \item{cut.p}{Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88} +\item{jw.weight}{Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10} + \item{priors.obj}{A list containing priors for auxiliary movers information, as output from calcMoversPriors(). Default is NULL} diff --git a/man/gammaCK2par.Rd b/man/gammaCK2par.Rd index f729527..9df1a85 100644 --- a/man/gammaCK2par.Rd +++ b/man/gammaCK2par.Rd @@ -4,7 +4,7 @@ \alias{gammaCK2par} \title{gammaCK2par} \usage{ -gammaCK2par(matAp, matBp, n.cores, cut.a) +gammaCK2par(matAp, matBp, n.cores, cut.a, method, w) } \arguments{ \item{matAp}{vector storing the comparison field in data set 1} @@ -14,6 +14,10 @@ gammaCK2par(matAp, matBp, n.cores, cut.a) \item{n.cores}{Number of cores to parallelize over. Default is NULL.} \item{cut.a}{Lower bound for full match, ranging between 0 and 1. Default is 0.92} + +\item{method}{String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit} + +\item{w}{Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10} } \value{ \code{gammaCK2par} returns a list with the indices corresponding to each diff --git a/man/gammaCKpar.Rd b/man/gammaCKpar.Rd index 0babf4a..6080476 100644 --- a/man/gammaCKpar.Rd +++ b/man/gammaCKpar.Rd @@ -4,7 +4,7 @@ \alias{gammaCKpar} \title{gammaCKpar} \usage{ -gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p) +gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p, method, w) } \arguments{ \item{matAp}{vector storing the comparison field in data set 1} @@ -16,6 +16,10 @@ gammaCKpar(matAp, matBp, n.cores, cut.a, cut.p) \item{cut.a}{Lower bound for full match, ranging between 0 and 1. Default is 0.92} \item{cut.p}{Lower bound for partial match, ranging between 0 and 1. Default is 0.88} + +\item{method}{String distance method, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit} + +\item{w}{Parameter that describes the importance of the first characters of a string (only needed if method = "jw"). Default is .10} } \value{ \code{gammaCKpar} returns a list with the indices corresponding to each diff --git a/man/nameReweight.Rd b/man/nameReweight.Rd index 7a33044..158f4dd 100644 --- a/man/nameReweight.Rd +++ b/man/nameReweight.Rd @@ -6,7 +6,8 @@ \usage{ nameReweight(dfA, dfB, EM, gammalist, matchesLink, varnames, stringdist.match, partial.match, -firstname.field, threshold.match, cut.a, cut.p, n.cores) +firstname.field, threshold.match, stringdist.method, cut.a, cut.p, +jw.weight, n.cores) } \arguments{ \item{dfA}{The full version of dataset A that is being matched.} @@ -38,10 +39,14 @@ as varnames. Default is FALSE for all variables.} user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches, while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.} +\item{stringdist.method}{String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit} + \item{cut.a}{Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92} \item{cut.p}{Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88} +\item{jw.weight}{Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10} + \item{n.cores}{Number of cores to parallelize over. Default is NULL.} } \value{