diff --git a/DESCRIPTION b/DESCRIPTION index 4b2a5a2..ab4013f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: fastLink Type: Package Title: Fast Probabilistic Record Linkage with Missing Data -Version: 0.1.1 -Date: 2017-07-10 +Version: 0.2.0 +Date: 2017-08-31 Authors@R: c( person("Ted", "Enamorado", email = "fastlinkr@gmail.com", role = c("aut", "cre")), person("Ben", "Fifield", email = "fastlinkr@gmail.com", role = c("aut")), @@ -24,10 +24,14 @@ Imports: data.table, stringdist, stringr, - Rcpp (>= 0.12.9), + stringi, + Rcpp (>= 0.12.7), FactoClass, adagio, - dplyr + dplyr, + plotrix, + grDevices, + graphics Depends: R (>= 2.14.0) LinkingTo: RcppArmadillo, Rcpp, RcppEigen diff --git a/NAMESPACE b/NAMESPACE index a9157c2..f328a6a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,10 @@ # Generated by roxygen2: do not edit by hand +S3method(plot,fastLink) +S3method(print,inspectEM) S3method(summary,fastLink) export(aggregateEM) export(calcMoversPriors) -export(cleanAddressUSPS) export(clusterMatch) export(dedupeMatches) export(emlinkMARmov) @@ -13,8 +14,11 @@ export(gammaCK2par) export(gammaCKpar) export(gammaKpar) export(getMatches) +export(getPosterior) +export(inspectEM) export(matchesLink) export(nameReweight) +export(preprocText) export(tableCounts) import(Matrix) import(data.table) @@ -28,11 +32,17 @@ importFrom(dplyr,n) importFrom(dplyr,summarise) importFrom(foreach,"%dopar%") importFrom(foreach,foreach) +importFrom(grDevices,colorRampPalette) +importFrom(graphics,axis) +importFrom(graphics,legend) +importFrom(graphics,plot) +importFrom(graphics,polygon) importFrom(gtools,rdirichlet) importFrom(parallel,detectCores) importFrom(parallel,makeCluster) importFrom(parallel,mclapply) importFrom(parallel,stopCluster) +importFrom(plotrix,staxlab) importFrom(stats,kmeans) importFrom(stats,na.omit) importFrom(stats,prcomp) @@ -40,8 +50,11 @@ importFrom(stats,predict) importFrom(stats,quantile) importFrom(stats,runif) importFrom(stats,var) +importFrom(stringdist,phonetic) importFrom(stringdist,stringdist) importFrom(stringdist,stringdistmatrix) +importFrom(stringi,stri_trans_general) +importFrom(stringi,stri_trans_list) importFrom(stringr,str_count) importFrom(utils,data) useDynLib(fastLink, .registration = TRUE) diff --git a/R/cleanAddressUSPS.R b/R/cleanAddressUSPS.R deleted file mode 100644 index 5c35ff4..0000000 --- a/R/cleanAddressUSPS.R +++ /dev/null @@ -1,38 +0,0 @@ -#' cleanAddressUSPS -#' -#' Apply USPS address standardization to address field. -#' -#' @usage cleanAddressUSPS(address.field) -#' @param address.field A vector containing address information to be cleaned. -#' -#' @return \code{cleanAddressUSPS()} returns a cleaned version of \code{address.field}. -#' -#' @author Ted Enamorado and Ben Fifield -#' @examples dfA$streetname <- cleanAddressUSPS(dfA$streetname) -#' @export -cleanAddressUSPS <- function(address.field){ - - ## Standardization - address.field <- ifelse(grepl(" avenue", address.field), - gsub(" avenue", " ave", address.field), - ifelse(grepl(" avn", address.field), - gsub(" avn", " ave", address.field), - gsub(" av", " ave", address.field))) - address.field <- gsub(" avee", " ave", address.field) - address.field <- gsub(" boulevard", " blvd", address.field) - address.field <- gsub(" circle", " cir", address.field) - address.field <- gsub(" court", " ct", address.field) - address.field <- gsub(" drive", " dr", address.field) - address.field <- gsub(" junction", " jct", address.field) - address.field <- gsub(" place", " pl", address.field) - address.field <- gsub(" road", " rd", address.field) - address.field <- gsub(" route", " rte", address.field) - address.field <- gsub(" square", " sq", address.field) - address.field <- gsub(" street", " st", address.field) - address.field <- gsub(" apartment", " apt", address.field) - address.field <- gsub(" building", " bldg", address.field) - - return(trimws(address.field)) - -} - diff --git a/R/emlinkMARmov.R b/R/emlinkMARmov.R index 8637527..0c4fd18 100644 --- a/R/emlinkMARmov.R +++ b/R/emlinkMARmov.R @@ -5,7 +5,7 @@ #' #' @usage emlinkMARmov(patterns, nobs.a, nobs.b, p.m, iter.max, #' tol, p.gamma.k.m, p.gamma.k.u, prior.lambda, w.lambda, -#' prior.pi, w.pi, address.field, gender.field) +#' prior.pi, w.pi, address.field, gender.field, varnames) #' #' @param patterns table that holds the counts for each unique agreement #' pattern. This object is produced by the function: tableCounts. @@ -25,6 +25,8 @@ #' Address fields should be set to TRUE while non-address fields are set to FALSE if provided. #' @param gender.field Boolean indicators for whether a given field is for gender. If so, exact match is conducted on gender. #' Default is NULL (FALSE for all fields). The one gender field should be set to TRUE while all other fields are set to FALSE if provided. +#' @param varnames The vector of variable names used for matching. Automatically provided if using \code{fastLink()} wrapper. Used for +#' clean visualization of EM results in summary functions. #' #' @return \code{emlinkMARmov} returns a list with the following components: #' \item{zeta.j}{The posterior match probabilities for each unique pattern.} @@ -53,7 +55,7 @@ #' tc <- tableCounts(list(g1, g2, g3, g4), nobs.a = nrow(dfA), nobs.b = nrow(dfB)) #' #' ## Run EM -#' em <- emlinkMAR(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB)) +#' em <- emlinkMARmov(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB)) #' } #' #' @export @@ -62,7 +64,7 @@ emlinkMARmov <- function(patterns, nobs.a, nobs.b, p.m = 0.1, iter.max = 5000, tol = 1e-5, p.gamma.k.m = NULL, p.gamma.k.u = NULL, prior.lambda = NULL, w.lambda = NULL, prior.pi = NULL, w.pi = NULL, address.field = NULL, - gender.field = NULL) { + gender.field = NULL, varnames = NULL) { options(digits=16) @@ -326,15 +328,22 @@ emlinkMARmov <- function(patterns, nobs.a, nobs.b, colnames(data.w)[nc-1] <- "p.gamma.j.m" colnames(data.w)[nc] <- "p.gamma.j.u" - inf <- which(data.w == Inf, arr.ind = T) - ninf <- which(data.w == -Inf, arr.ind = T) - - data.w[inf[, 1], unique(inf[, 2])] <- 150 - data.w[ninf[, 1], unique(ninf[, 2])] <- -150 + inf <- which(data.w == Inf, arr.ind = T) + ninf <- which(data.w == -Inf, arr.ind = T) + + data.w[inf[, 1], unique(inf[, 2])] <- 150 + data.w[ninf[, 1], unique(ninf[, 2])] <- -150 + + if(!is.null(varnames)){ + output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u, + "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count, + "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = varnames) + }else{ + output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u, + "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count, + "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures)) + } - output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u, - "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count, - "nobs.a" = nobs.a, "nobs.b" = nobs.b) class(output) <- c("fastLink", "fastLink.EM") return(output) @@ -481,15 +490,15 @@ emlinkRS <- function(patterns.out, em.out, nobs.a, nobs.b){ colnames(data.w)[nc - 1] <- "p.gamma.j.m" colnames(data.w)[nc] <- "p.gamma.j.u" - inf <- which(data.w == Inf, arr.ind = T) - ninf <- which(data.w == -Inf, arr.ind = T) - - data.w[inf[, 1], unique(inf[, 2])] <- 150 - data.w[ninf[, 1], unique(ninf[, 2])] <- -150 + inf <- which(data.w == Inf, arr.ind = T) + ninf <- which(data.w == -Inf, arr.ind = T) + + data.w[inf[, 1], unique(inf[, 2])] <- 150 + data.w[ninf[, 1], unique(ninf[, 2])] <- -150 output <- list("zeta.j" = zeta.j, "p.m" = em.out$p.m, "p.u" = em.out$p.u, "p.gamma.k.m" = em.out$p.gamma.k.m, "p.gamma.k.u" = em.out$p.gamma.k.u, "p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = em.out$iter.converge, - "nobs.a" = nobs.a, "nobs.b" = nobs.b) + "nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = em.out$varnames) class(output) <- c("fastLink", "fastLink.EM") return(output) diff --git a/R/fastLink-package.R b/R/fastLink-package.R index 01dade5..4da2681 100644 --- a/R/fastLink-package.R +++ b/R/fastLink-package.R @@ -8,8 +8,8 @@ #' the Fellegi-Sunter model, using the Expectation-Maximization Algorithm. In addition, #' tools for conducting and summarizing data merges are included. #' -#' \tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.1.1-\cr -#' Date: \tab 2017-07-10\cr License: \tab GPL (>= 3)\cr } +#' \tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.2.0-\cr +#' Date: \tab 2017-07-30\cr License: \tab GPL (>= 3)\cr } #' #' @name fastLink-package #' @useDynLib fastLink, .registration = TRUE diff --git a/R/fastLink.R b/R/fastLink.R index 48653af..c6d4ce0 100644 --- a/R/fastLink.R +++ b/R/fastLink.R @@ -9,7 +9,7 @@ #' address.field, gender.field, estimate.only, em.obj, #' dedupe.matches, linprog.dedupe, #' reweight.names, firstname.field, -#' return.df, n.cores, tol.em, threshold.match, verbose) +#' n.cores, tol.em, threshold.match, return.all, return.df, verbose) #' #' @param dfA Dataset A - to be matched to Dataset B #' @param dfB Dataset B - to be matched to Dataset A @@ -45,12 +45,14 @@ #' @param linprog.dedupe If deduping matches, whether to use Winkler's linear programming solution to dedupe. Default is FALSE. #' @param reweight.names Whether to reweight the posterior match probabilities by the frequency of individual first names. Default is FALSE. #' @param firstname.field The name of the field indicating first name. Must be provided if reweight.names = TRUE. -#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE. #' @param n.cores Number of cores to parallelize over. Default is NULL. #' @param tol.em Convergence tolerance for the EM Algorithm. Default is 1e-04. #' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the #' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches, #' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches. +#' @param return.all Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting +#' \code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is FALSE. +#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE. #' @param verbose Whether to print elapsed time for each step. Default is FALSE. #' #' @return \code{fastLink} returns a list of class 'fastLink' containing the following components if calculating matches: @@ -83,8 +85,8 @@ fastLink <- function(dfA, dfB, varnames, gender.field = NULL, estimate.only = FALSE, em.obj = NULL, dedupe.matches = TRUE, linprog.dedupe = FALSE, reweight.names = FALSE, firstname.field = NULL, - return.df = FALSE, n.cores = NULL, tol.em = 1e-04, - threshold.match = 0.85, verbose = FALSE){ + n.cores = NULL, tol.em = 1e-04, threshold.match = 0.85, + return.all = FALSE, return.df = FALSE, verbose = FALSE){ cat("\n") cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n")) @@ -156,6 +158,13 @@ fastLink <- function(dfA, dfB, varnames, stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].") } } + if(return.all){ + threshold.match <- 0.0001 + if(!dedupe.matches){ + cat("You have specified that all matches be returned but have not deduped the matches. Setting 'dedupe.matches' to TRUE.\n") + dedupe.matches <- TRUE + } + } ## Create boolean indicators sm.bool <- which(varnames %in% stringdist.match) @@ -195,6 +204,10 @@ fastLink <- function(dfA, dfB, varnames, start <- Sys.time() gammalist <- vector(mode = "list", length = length(varnames)) for(i in 1:length(gammalist)){ + if(verbose){ + sdb <- ifelse(stringdist.match[i], "string-distance", "exact") + cat(" Matching variable", varnames[i], "using", sdb, "matching.\n") + } ## Convert to character if(is.factor(dfA[,varnames[i]]) | is.factor(dfB[,varnames[i]])){ dfA[,varnames[i]] <- as.character(dfA[,varnames[i]]) @@ -209,6 +222,9 @@ fastLink <- function(dfA, dfB, varnames, stop(paste("You have no variation in dataset B for", varnames[i], "or all observations are missing.")) } } + if(sum(dfA[,varnames[i]] %in% dfB[,varnames[i]]) == 0){ + stop(paste0("You have no exact matches for ", varnames[i], ". Please drop this variable from your analysis.")) + } ## Get patterns if(stringdist.match[i]){ if(partial.match[i]){ @@ -270,7 +286,8 @@ fastLink <- function(dfA, dfB, varnames, prior.lambda = lambda.prior, w.lambda = w.lambda, prior.pi = pi.prior, w.pi = w.pi, address.field = address.field, - gender.field = gender.field) + gender.field = gender.field, + varnames = varnames) end <- Sys.time() if(verbose){ cat("Running the EM algorithm took", round(difftime(end, start, units = "secs"), 2), "seconds.\n\n") @@ -310,9 +327,19 @@ fastLink <- function(dfA, dfB, varnames, if(verbose){ cat("Deduping the estimated matches took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n") } + }else{ + cat("Calculating the posterior for each pair of matched observations.\n") + start <- Sys.time() + zeta <- getPosterior(dfA[matches$inds.a,], dfB[matches$inds.b,], EM = resultsEM, + varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match, + stringdist.method = stringdist.method, cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight) + end <- Sys.time() + if(verbose){ + cat("Calculating the posterior for each matched pair took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n") + } } - ## Reweight first names + ## Reweight first names or get zeta if(reweight.names){ cat("Reweighting match probabilities by frequency of occurrence.\n") start <- Sys.time() @@ -335,14 +362,16 @@ fastLink <- function(dfA, dfB, varnames, } out[["matches"]] <- matches out[["EM"]] <- resultsEM - out[["nobs.a"]] <- nr_a - out[["nobs.b"]] <- nr_b if(dedupe.matches){ - out[["max.zeta"]] <- ddm.out$max.zeta + out[["posterior"]] <- ddm.out$max.zeta + }else{ + out[["posterior"]] <- zeta } if(reweight.names){ out[["zeta.name"]] <- rwn.out } + out[["nobs.a"]] <- nr_a + out[["nobs.b"]] <- nr_b class(out) <- "fastLink" }else{ out <- resultsEM diff --git a/R/getPosterior.R b/R/getPosterior.R new file mode 100644 index 0000000..6e00a1b --- /dev/null +++ b/R/getPosterior.R @@ -0,0 +1,120 @@ +#' getPosterior +#' +#' Get the posterior probability of a match for each matched pair of observations +#' +#' @usage getPosterior(matchesA, matchesB, EM, varnames, stringdist.match, +#' partial.match, stringdist.method, cut.a, cut.p, jw.weight) +#' @param matchesA A dataframe of the matched observations in +#' dataset A, with all variables used to inform the match. +#' @param matchesB A dataframe of the matched observations in +#' dataset B, with all variables used to inform the match. +#' @param EM The EM object from \code{emlinkMARmov()} +#' @param varnames A vector of variable names to use for matching. +#' Must be present in both matchesA and matchesB. +#' @param stringdist.match A vector of booleans, indicating whether to use +#' string distance matching when determining matching patterns on +#' each variable. Must be same length as varnames. +#' @param partial.match A vector of booleans, indicating whether to include +#' a partial matching category for the string distances. Must be same length +#' as varnames. Default is FALSE for all variables. +#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit +#' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92 +#' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88 +#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10 +#' +#' @return \code{getPosterior} returns the posterior probability of a match for each matched pair of observations +#' in matchesA and matchesB +#' @author Ben Fifield +#' @export +getPosterior <- function(matchesA, matchesB, EM, varnames, stringdist.match, partial.match, + stringdist.method, cut.a, cut.p, jw.weight){ + + ## -------------- + ## Start function + ## -------------- + ## Convert to dataframe + if(any(class(matchesA) %in% c("tbl_df", "data.table"))){ + matchesA <- as.data.frame(matchesA) + } + if(any(class(matchesB) %in% c("tbl_df", "data.table"))){ + matchesB <- as.data.frame(matchesB) + } + if(!(stringdist.method %in% c("jw", "jaro", "lv"))){ + stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.") + } + if(stringdist.method == "jw" & !is.null(jw.weight)){ + if(jw.weight < 0 | jw.weight > 0.25){ + stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].") + } + } + + ## Get original column names + colnames.df.a <- colnames(matchesA) + colnames.df.b <- colnames(matchesB) + + ## ---------- + ## Get gammas + ## ---------- + gammalist <- vector(mode = "list", length = length(varnames)) + namevec <- rep(NA, length(varnames)) + for(i in 1:length(gammalist)){ + ## Convert to character + if(is.factor(matchesA[,varnames[i]]) | is.factor(matchesB[,varnames[i]])){ + matchesA[,varnames[i]] <- as.character(matchesA[,varnames[i]]) + matchesB[,varnames[i]] <- as.character(matchesB[,varnames[i]]) + } + ## Get matches + if(stringdist.match[i]){ + if(stringdist.method %in% c("jw", "jaro")){ + if(stringdist.method == "jw"){ + p1 <- jw.weight + }else{ + p1 <- NULL + } + tmp <- 1 - stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], "jw", p = p1) + }else{ + t <- stringdist(matchesA[,varnames[i]], matchesB[,varnames[i]], method = stringdist.method) + t.1 <- nchar(matchesA[,varnames[i]]) + t.2 <- nchar(matchesB[,varnames[i]]) + o <- ifelse(t.1 > t.2, t.1, t.2) + tmp <- 1 - t * (1/o) + } + if(partial.match[i]){ + gammalist[[i]] <- ifelse( + tmp >= cut.a, 2, ifelse(tmp >= cut.p, 1, 0) + ) + }else{ + gammalist[[i]] <- ifelse(tmp >= cut.a, 2, 0) + } + }else{ + tmp <- matchesA[,varnames[i]] == matchesB[,varnames[i]] + gammalist[[i]] <- ifelse(tmp == TRUE, 2, 0) + } + + namevec[i] <- paste0("gamma.", i) + + } + gammalist <- data.frame(do.call(cbind, gammalist)) + names(gammalist) <- namevec + + ## ------------------------------- + ## Convert EM object to data frame + ## ------------------------------- + emdf <- as.data.frame(EM$patterns.w) + emdf$zeta.j <- c(EM$zeta.j) + + ## --------------------- + ## Merge EM to gammalist + ## --------------------- + matchesA <- cbind(matchesA, gammalist) + matchesA$roworder <- 1:nrow(matchesA) + matchesA <- merge(matchesA, emdf, by = namevec, all.x = TRUE) + matchesA <- matchesA[order(matchesA$roworder),] + + ## ------------- + ## Get max zetas + ## ------------- + return(matchesA$zeta.j) + +} + diff --git a/R/inspectEM.R b/R/inspectEM.R new file mode 100644 index 0000000..b1ce1eb --- /dev/null +++ b/R/inspectEM.R @@ -0,0 +1,79 @@ +#' inspectEM +#' +#' Inspect EM objects to analyze successfully and +#' unsuccessfully matched patterns. +#' +#' @usage inspectEM(object, posterior.range, digits) +#' @param object The output from either \code{fastLink} or \code{emlinkMARmov}. +#' @param posterior.range The range of posterior probabilities to display. +#' Default is c(0.85, 1). +#' @param digits How many digits to include in inspectEM dataframe. Default is 3. +#' +#' @return \code{inspectEM} returns a data frame with information +#' about patterns around the provided threshold. +#' +#' @author Ben Fifield +#' @export +inspectEM <- function(object, posterior.range = c(0.85, 1), digits = 3){ + + ## Extract EM object + if("fastLink.EM" %in% class(object)){ + em <- object + }else if(length(class(object)) == 1 & "fastLink" %in% class(object)){ + em <- object$EM + }else if(!("fastLink" %in% class(object))){ + stop("inspectEM() is not compatible with the input object.") + } + + if(min(posterior.range) < 0 | max(posterior.range) > 1){ + stop("Please make sure that posterior.range is between 0 and 1.") + } + if(length(posterior.range) == 1){ + posterior.range <- c(posterior.range, 1) + } + + ## --------------- + ## Output patterns + ## --------------- + ## Clean up object + em.ins <- data.frame(em$patterns.w) + em.ins$zeta.j <- em$zeta.j + em.ins <- em.ins[order(em.ins[, "zeta.j"]), ] + + ## Which pattern is closest to the threshold? + min <- which.min(abs(em.ins$zeta.j - min(posterior.range))) + max <- which.min(abs(em.ins$zeta.j - max(posterior.range))) + em.ins <- em.ins[min:max,] + + ## Clean up outputted object + inds.gamma <- grep("gamma.[[:digit:]]", colnames(em.ins)) + em.ins[,inds.gamma] <- ifelse(em.ins[,inds.gamma] == 2, "M", + ifelse(em.ins[,inds.gamma] == 1, "PM", + ifelse(em.ins[,inds.gamma] == 0, "NM", NA))) + em.ins[,(max(inds.gamma)+1):ncol(em.ins)] <- round(em.ins[,(max(inds.gamma)+1):ncol(em.ins)], digits) + if(is.null(em$varnames)){ + varnames <- paste0("gamma.", 1:max(inds.gamma)) + }else{ + varnames <- em$varnames + } + colnames(em.ins)[inds.gamma] <- varnames + + ## ------------------------ + ## Output other information + ## ------------------------ + ## Number of matches + num.matches <- sum(em.ins$counts * em.ins$zeta.j) + + ## Gammas + gammaprob <- em$p.gamma.k.m + names(gammaprob) <- varnames + gammaprob <- lapply(gammaprob, function(x){round(x, digits)}) + out <- list(match.patterns = em.ins, matchprob.by.variable = gammaprob, + num.matches = num.matches, posterior.range = posterior.range, + nobs.a = em$nobs.a, nobs.b = em$nobs.b, iter.converge = em$iter.converge, + lambda = em$p.m) + class(out) <- c("fastLink", "inspectEM") + + return(out) + +} diff --git a/R/matchesLink.R b/R/matchesLink.R index d2d6691..6c88750 100644 --- a/R/matchesLink.R +++ b/R/matchesLink.R @@ -131,7 +131,6 @@ matchesLink <- function(gammalist, nobs.a, nobs.b, em, thresh, n.cores = NULL) { ## Run main function if(Sys.info()[['sysname']] == 'Darwin') { - cat("Parallelizing gamma calculation using", nc, "cores.\n") cl <- makeCluster(nc) registerDoParallel(cl) diff --git a/R/plot.R b/R/plot.R new file mode 100644 index 0000000..e114ff7 --- /dev/null +++ b/R/plot.R @@ -0,0 +1,88 @@ +#' Plot matching patterns of the EM object by posterior probability of match +#' +#' \code{plot.fastLink()} plots the matching patterns of the EM object, +#' ordering the matching patterns by the posterior probability of the match. +#' +#' @usage \method{plot}{fastLink}(x, posterior.range, ...) +#' @param x Either a \code{fastLink} or \code{fastLink.EM} object to be plotted. +#' @param posterior.range The range of posterior probabilities to display. +#' Default is c(0.85, 1). +#' @param ... Further arguments to be passed to \code{plot.fastLink()}. +#' +#' @export +#' @method plot fastLink +#' @importFrom plotrix staxlab +#' @importFrom grDevices colorRampPalette +#' @importFrom graphics axis legend plot polygon +plot.fastLink <- function(x, posterior.range = c(.85, 1), ...){ + + ## Extract EM object + if("fastLink.EM" %in% class(x)){ + em <- x + }else if(length(class(x)) == 1 & "fastLink" %in% class(x)){ + em <- x$EM + } + + if(min(posterior.range) < 0 | max(posterior.range) > 1){ + stop("Please make sure that posterior.range is between 0 and 1.") + } + if(length(posterior.range) == 1){ + posterior.range <- c(posterior.range, 1) + } + + em.ins <- em + em.ins <- data.frame(em.ins$patterns.w) + em.ins$zeta.j <- em$zeta.j + em.ins <- em.ins[order(em.ins[, "zeta.j"]), ] + inds.gamma <- grep("gamma.[[:digit:]]", colnames(em.ins)) + + ## Subset to the neighborhood around threshold + min <- which.min(abs(em.ins$zeta.j - posterior.range[1])) + max <- which.min(abs(em.ins$zeta.j - posterior.range[2])) + em.ins <- em.ins[min:max,] + colfunc <- colorRampPalette(c("darkred", "white")) + cols <- colfunc(3) + if(is.null(em$varnames)){ + varnames <- paste0("gamma.", 1:max(inds.gamma)) + }else{ + varnames <- em$varnames + } + ylabs <- seq(min(posterior.range), max(posterior.range), by = .05) + yinds <- sapply(ylabs, function(x){which.min(abs(em.ins$zeta.j - x))}) + + ## Plot polygons + extra.x <- ceiling(length(inds.gamma)/3) + plot(1, + type = "n", + xlim = c(0, length(inds.gamma) + extra.x), + ylim = c(0, nrow(em.ins)), + xaxt = "n", xlab = "", + yaxt = "n", ylab = "Posterior Probability of a Match", + bty = "n", + main = "Matching Patterns Ordered by Posterior Probability of Match" + ) + staxlab(1, 1:length(inds.gamma)-.5, varnames, + srt = 45, top.line = 0) + axis(2, yinds-.5, ylabs) + for(i in 1:nrow(em.ins)){ + + for(j in 1:length(inds.gamma)){ + val <- em.ins[i,j] + c.val <- ifelse(is.na(val), "grey", + ifelse(val == 0, cols[3], + ifelse(val == 1, cols[2], + cols[1]))) + polygon(c(j-1, j, j, j-1), + c(i-1, i-1, i, i), + col = c.val) + } + + } + legend("topright", + c("Match", "Partial Match", "Non-Match", "NA"), + pch = rep(22, 4), col = rep("black", 4), + pt.bg = c(cols[1], cols[2], cols[3], "grey"), + bty = "n") + +} + diff --git a/R/preprocText.R b/R/preprocText.R new file mode 100644 index 0000000..593c12f --- /dev/null +++ b/R/preprocText.R @@ -0,0 +1,60 @@ +#' preprocText +#' +#' Preprocess text data such as names and addresses. +#' +#' @usage preprocText(text, convert_text, tolower, soundex, +#' usps_address, convert_text_to) +#' @param text A vector of text data to convert. +#' @param convert_text Whether to convert text to the desired encoding, where +#' the encoding is specified in the 'convert_text_to' argument. Default is +#' TRUE +#' @param tolower Whether to normalize the text to be all lowercase. Default is +#' TRUE. +#' @param soundex Whether to convert the field to the Census's soundex encoding. +#' Default is FALSE. +#' @param usps_address Whether to use USPS address standardization rules to clean address fields. +#' Default is FALSE. +#' @param convert_text_to Which encoding to use when converting text. Default is 'Latin-ASCII'. +#' Full list of encodings in the \code{stri_trans_list()} function in the \code{stringi} package. +#' +#' @return \code{preprocText()} returns the preprocessed vector of text. +#' +#' @author Ben Fifield +#' @export +#' @importFrom stringi stri_trans_list stri_trans_general +#' @importFrom stringdist phonetic +preprocText <- function(text, convert_text = TRUE, tolower = TRUE, + soundex = FALSE, usps_address = FALSE, convert_text_to = "Latin-ASCII"){ + + if(!(convert_text_to %in% stri_trans_list())){ + stop("Sorry, that encoding is not included in the set of valid encodings. Please check 'stri_trans_list()' in the 'stringi' package for the full set of valid encodings.") + } + + if(convert_text){ + text <- stri_trans_general(text, convert_text_to) + } + if(tolower){ + text <- tolower(text) + } + if(soundex){ + text <- phonetic(text) + } + if(usps_address){ + text <- gsub(" avenue", " ave", text) + text <- gsub(" boulevard", " blvd", text) + text <- gsub(" circle", " cir", text) + text <- gsub(" court", " ct", text) + text <- gsub(" drive", " dr", text) + text <- gsub(" junction", " jct", text) + text <- gsub(" place", " pl", text) + text <- gsub(" road", " rd", text) + text <- gsub(" route", " rte", text) + text <- gsub(" square", " sq", text) + text <- gsub(" street", " st", text) + text <- gsub(" apartment", " apt", text) + text <- gsub(" building", " bldg", text) + } + + return(text) + +} diff --git a/R/print.R b/R/print.R new file mode 100644 index 0000000..b461784 --- /dev/null +++ b/R/print.R @@ -0,0 +1,40 @@ +#' print.inspectEM +#' +#' Print information from the EM algorithm to console. +#' +#' @usage \method{print}{inspectEM}(x, ...) +#' @param x An \code{inspectEM} object +#' @param ... Further arguments to be passed to \code{print.fastLink()}. +#' +#' @export +#' @method print inspectEM +print.inspectEM <- function(x, ...){ + + ## ------ + ## Output + ## ------ + ## Details of match + cat("\nMatched", x$nobs.a, "observations in dataset A to", x$nobs.b, "observations in dataset B.\n") + cat("EM algorithm converged in", x$iter.converge, "iterations.\n") + + ## Number of matches + min <- min(x$posterior.range) + max <- max(x$posterior.range) + cat(paste0("\nNumber of matches found for posterior between ", min, " and ", max, ":\n")) + cat(x$num.matches, "\n") + + ## Quality of each variable + cat("\nProbability of observing pattern conditional on being in matched set\n(By Variable):\n") + print(x$matchprob.by.variable) + cat("\n") + + cat("\nProbability of a match across all pairwise comparisons:\n") + print(x$lambda) + cat("\n") + + ## Posteriors + cat("\nPosterior probability of a match, by matching pattern:\n") + print(x$match.patterns) + cat("\n") + +} diff --git a/R/tableCounts.R b/R/tableCounts.R index 9936e88..aa867d6 100644 --- a/R/tableCounts.R +++ b/R/tableCounts.R @@ -80,7 +80,6 @@ tableCounts <- function(gammalist, nobs.a, nobs.b, n.cores = NULL) { ## Run main function if(Sys.info()[['sysname']] == 'Darwin') { - cat("Parallelizing gamma calculation using", nc, "cores.\n") cl <- makeCluster(nc) registerDoParallel(cl) diff --git a/man/cleanAddressUSPS.Rd b/man/cleanAddressUSPS.Rd deleted file mode 100644 index aa0efdd..0000000 --- a/man/cleanAddressUSPS.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cleanAddressUSPS.R -\name{cleanAddressUSPS} -\alias{cleanAddressUSPS} -\title{cleanAddressUSPS} -\usage{ -cleanAddressUSPS(address.field) -} -\arguments{ -\item{address.field}{A vector containing address information to be cleaned.} -} -\value{ -\code{cleanAddressUSPS()} returns a cleaned version of \code{address.field}. -} -\description{ -Apply USPS address standardization to address field. -} -\examples{ -dfA$streetname <- cleanAddressUSPS(dfA$streetname) -} -\author{ -Ted Enamorado and Ben Fifield -} - diff --git a/man/emlinkMARmov.Rd b/man/emlinkMARmov.Rd index 8b03a00..ecb0fec 100644 --- a/man/emlinkMARmov.Rd +++ b/man/emlinkMARmov.Rd @@ -6,7 +6,7 @@ \usage{ emlinkMARmov(patterns, nobs.a, nobs.b, p.m, iter.max, tol, p.gamma.k.m, p.gamma.k.u, prior.lambda, w.lambda, -prior.pi, w.pi, address.field, gender.field) +prior.pi, w.pi, address.field, gender.field, varnames) } \arguments{ \item{patterns}{table that holds the counts for each unique agreement @@ -40,6 +40,9 @@ Address fields should be set to TRUE while non-address fields are set to FALSE i \item{gender.field}{Boolean indicators for whether a given field is for gender. If so, exact match is conducted on gender. Default is NULL (FALSE for all fields). The one gender field should be set to TRUE while all other fields are set to FALSE if provided.} + +\item{varnames}{The vector of variable names used for matching. Automatically provided if using \code{fastLink()} wrapper. Used for +clean visualization of EM results in summary functions.} } \value{ \code{emlinkMARmov} returns a list with the following components: @@ -71,7 +74,7 @@ g4 <- gammaKpar(dfA$birthyear, dfB$birthyear) tc <- tableCounts(list(g1, g2, g3, g4), nobs.a = nrow(dfA), nobs.b = nrow(dfB)) ## Run EM -em <- emlinkMAR(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB)) +em <- emlinkMARmov(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB)) } } diff --git a/man/fastLink-package.Rd b/man/fastLink-package.Rd index 8c5bf2b..85bcc62 100644 --- a/man/fastLink-package.Rd +++ b/man/fastLink-package.Rd @@ -14,8 +14,8 @@ the Fellegi-Sunter model, using the Expectation-Maximization Algorithm. In addit tools for conducting and summarizing data merges are included. } \details{ -\tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.1.1-\cr -Date: \tab 2017-07-10\cr License: \tab GPL (>= 3)\cr } +\tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.2.0-\cr +Date: \tab 2017-07-30\cr License: \tab GPL (>= 3)\cr } } \author{ Ted Enamorado \email{fastlinkr@gmail.com}, Ben Fifield \email{fastlinkr@gmail.com}, and Kosuke Imai \email{kimai@princeton.edu} diff --git a/man/fastLink.Rd b/man/fastLink.Rd index 498aa38..8311917 100644 --- a/man/fastLink.Rd +++ b/man/fastLink.Rd @@ -10,7 +10,7 @@ priors.obj, w.lambda, w.pi, address.field, gender.field, estimate.only, em.obj, dedupe.matches, linprog.dedupe, reweight.names, firstname.field, -return.df, n.cores, tol.em, threshold.match, verbose) +n.cores, tol.em, threshold.match, return.all, return.df, verbose) } \arguments{ \item{dfA}{Dataset A - to be matched to Dataset B} @@ -67,8 +67,6 @@ estimated on a smaller sample, and the user wants to apply them to the full data \item{firstname.field}{The name of the field indicating first name. Must be provided if reweight.names = TRUE.} -\item{return.df}{Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.} - \item{n.cores}{Number of cores to parallelize over. Default is NULL.} \item{tol.em}{Convergence tolerance for the EM Algorithm. Default is 1e-04.} @@ -77,6 +75,11 @@ estimated on a smaller sample, and the user wants to apply them to the full data user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches, while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.} +\item{return.all}{Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting +\code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is FALSE.} + +\item{return.df}{Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.} + \item{verbose}{Whether to print elapsed time for each step. Default is FALSE.} } \value{ diff --git a/man/getPosterior.Rd b/man/getPosterior.Rd new file mode 100644 index 0000000..1083cab --- /dev/null +++ b/man/getPosterior.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPosterior.R +\name{getPosterior} +\alias{getPosterior} +\title{getPosterior} +\usage{ +getPosterior(matchesA, matchesB, EM, varnames, stringdist.match, +partial.match, stringdist.method, cut.a, cut.p, jw.weight) +} +\arguments{ +\item{matchesA}{A dataframe of the matched observations in +dataset A, with all variables used to inform the match.} + +\item{matchesB}{A dataframe of the matched observations in +dataset B, with all variables used to inform the match.} + +\item{EM}{The EM object from \code{emlinkMARmov()}} + +\item{varnames}{A vector of variable names to use for matching. +Must be present in both matchesA and matchesB.} + +\item{stringdist.match}{A vector of booleans, indicating whether to use +string distance matching when determining matching patterns on +each variable. Must be same length as varnames.} + +\item{partial.match}{A vector of booleans, indicating whether to include +a partial matching category for the string distances. Must be same length +as varnames. Default is FALSE for all variables.} + +\item{stringdist.method}{String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit} + +\item{cut.a}{Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.92} + +\item{cut.p}{Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88} + +\item{jw.weight}{Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10} +} +\value{ +\code{getPosterior} returns the posterior probability of a match for each matched pair of observations +in matchesA and matchesB +} +\description{ +Get the posterior probability of a match for each matched pair of observations +} +\author{ +Ben Fifield +} + diff --git a/man/inspectEM.Rd b/man/inspectEM.Rd new file mode 100644 index 0000000..a1a29f8 --- /dev/null +++ b/man/inspectEM.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inspectEM.R +\name{inspectEM} +\alias{inspectEM} +\title{inspectEM} +\usage{ +inspectEM(object, posterior.range, digits) +} +\arguments{ +\item{object}{The output from either \code{fastLink} or \code{emlinkMARmov}.} + +\item{posterior.range}{The range of posterior probabilities to display. +Default is c(0.85, 1).} + +\item{digits}{How many digits to include in inspectEM dataframe. Default is 3.} +} +\value{ +\code{inspectEM} returns a data frame with information +about patterns around the provided threshold. +} +\description{ +Inspect EM objects to analyze successfully and +unsuccessfully matched patterns. +} +\author{ +Ben Fifield +} + diff --git a/man/plot.fastLink.Rd b/man/plot.fastLink.Rd new file mode 100644 index 0000000..e21f031 --- /dev/null +++ b/man/plot.fastLink.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot.fastLink} +\alias{plot.fastLink} +\title{Plot matching patterns of the EM object by posterior probability of match} +\usage{ +\method{plot}{fastLink}(x, posterior.range, ...) +} +\arguments{ +\item{x}{Either a \code{fastLink} or \code{fastLink.EM} object to be plotted.} + +\item{posterior.range}{The range of posterior probabilities to display. +Default is c(0.85, 1).} + +\item{...}{Further arguments to be passed to \code{plot.fastLink()}.} +} +\description{ +\code{plot.fastLink()} plots the matching patterns of the EM object, +ordering the matching patterns by the posterior probability of the match. +} + diff --git a/man/preprocText.Rd b/man/preprocText.Rd new file mode 100644 index 0000000..0afb752 --- /dev/null +++ b/man/preprocText.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocText.R +\name{preprocText} +\alias{preprocText} +\title{preprocText} +\usage{ +preprocText(text, convert_text, tolower, soundex, +usps_address, convert_text_to) +} +\arguments{ +\item{text}{A vector of text data to convert.} + +\item{convert_text}{Whether to convert text to the desired encoding, where +the encoding is specified in the 'convert_text_to' argument. Default is +TRUE} + +\item{tolower}{Whether to normalize the text to be all lowercase. Default is +TRUE.} + +\item{soundex}{Whether to convert the field to the Census's soundex encoding. +Default is FALSE.} + +\item{usps_address}{Whether to use USPS address standardization rules to clean address fields. +Default is FALSE.} + +\item{convert_text_to}{Which encoding to use when converting text. Default is 'Latin-ASCII'. +Full list of encodings in the \code{stri_trans_list()} function in the \code{stringi} package.} +} +\value{ +\code{preprocText()} returns the preprocessed vector of text. +} +\description{ +Preprocess text data such as names and addresses. +} +\author{ +Ben Fifield +} + diff --git a/man/print.inspectEM.Rd b/man/print.inspectEM.Rd new file mode 100644 index 0000000..266eae1 --- /dev/null +++ b/man/print.inspectEM.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/print.R +\name{print.inspectEM} +\alias{print.inspectEM} +\title{print.inspectEM} +\usage{ +\method{print}{inspectEM}(x, ...) +} +\arguments{ +\item{x}{An \code{inspectEM} object} + +\item{...}{Further arguments to be passed to \code{print.fastLink()}.} +} +\description{ +Print information from the EM algorithm to console. +} + diff --git a/src/m_func_cpp.cpp b/src/m_func_cpp.cpp index 70b056f..7372431 100644 --- a/src/m_func_cpp.cpp +++ b/src/m_func_cpp.cpp @@ -324,9 +324,9 @@ std::vector< std::vector > m_func_par(const std::vector< std::vector< #ifdef _OPENMP omp_set_num_threads(threads); int threadsused = omp_get_max_threads(); - Rcout << "(Using OpenMP to parallelize calculation. " + Rcout << " Parallelizing calculation using OpenMP. " << threadsused << " threads out of " - << omp_get_num_procs() << " are used.)" + << omp_get_num_procs() << " are used." << std::endl; #pragma omp parallel for private(n, m, temp_feature, ptemp_feature) firstprivate(lims, lims_2, templist, ptemplist, natemplist, mf_out) #endif