Skip to content

Commit

Permalink
Merge pull request #16 from kosukeimai/development
Browse files Browse the repository at this point in the history
Merge development into master for new CRAN submission
  • Loading branch information
bfifield authored Sep 1, 2017
2 parents d885027 + 6b07bcb commit d7806e9
Show file tree
Hide file tree
Showing 23 changed files with 642 additions and 106 deletions.
12 changes: 8 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: fastLink
Type: Package
Title: Fast Probabilistic Record Linkage with Missing Data
Version: 0.1.1
Date: 2017-07-10
Version: 0.2.0
Date: 2017-08-31
Authors@R: c(
person("Ted", "Enamorado", email = "[email protected]", role = c("aut", "cre")),
person("Ben", "Fifield", email = "[email protected]", role = c("aut")),
Expand All @@ -24,10 +24,14 @@ Imports:
data.table,
stringdist,
stringr,
Rcpp (>= 0.12.9),
stringi,
Rcpp (>= 0.12.7),
FactoClass,
adagio,
dplyr
dplyr,
plotrix,
grDevices,
graphics
Depends:
R (>= 2.14.0)
LinkingTo: RcppArmadillo, Rcpp, RcppEigen
Expand Down
15 changes: 14 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,fastLink)
S3method(print,inspectEM)
S3method(summary,fastLink)
export(aggregateEM)
export(calcMoversPriors)
export(cleanAddressUSPS)
export(clusterMatch)
export(dedupeMatches)
export(emlinkMARmov)
Expand All @@ -13,8 +14,11 @@ export(gammaCK2par)
export(gammaCKpar)
export(gammaKpar)
export(getMatches)
export(getPosterior)
export(inspectEM)
export(matchesLink)
export(nameReweight)
export(preprocText)
export(tableCounts)
import(Matrix)
import(data.table)
Expand All @@ -28,20 +32,29 @@ importFrom(dplyr,n)
importFrom(dplyr,summarise)
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(grDevices,colorRampPalette)
importFrom(graphics,axis)
importFrom(graphics,legend)
importFrom(graphics,plot)
importFrom(graphics,polygon)
importFrom(gtools,rdirichlet)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,mclapply)
importFrom(parallel,stopCluster)
importFrom(plotrix,staxlab)
importFrom(stats,kmeans)
importFrom(stats,na.omit)
importFrom(stats,prcomp)
importFrom(stats,predict)
importFrom(stats,quantile)
importFrom(stats,runif)
importFrom(stats,var)
importFrom(stringdist,phonetic)
importFrom(stringdist,stringdist)
importFrom(stringdist,stringdistmatrix)
importFrom(stringi,stri_trans_general)
importFrom(stringi,stri_trans_list)
importFrom(stringr,str_count)
importFrom(utils,data)
useDynLib(fastLink, .registration = TRUE)
38 changes: 0 additions & 38 deletions R/cleanAddressUSPS.R

This file was deleted.

43 changes: 26 additions & 17 deletions R/emlinkMARmov.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#'
#' @usage emlinkMARmov(patterns, nobs.a, nobs.b, p.m, iter.max,
#' tol, p.gamma.k.m, p.gamma.k.u, prior.lambda, w.lambda,
#' prior.pi, w.pi, address.field, gender.field)
#' prior.pi, w.pi, address.field, gender.field, varnames)
#'
#' @param patterns table that holds the counts for each unique agreement
#' pattern. This object is produced by the function: tableCounts.
Expand All @@ -25,6 +25,8 @@
#' Address fields should be set to TRUE while non-address fields are set to FALSE if provided.
#' @param gender.field Boolean indicators for whether a given field is for gender. If so, exact match is conducted on gender.
#' Default is NULL (FALSE for all fields). The one gender field should be set to TRUE while all other fields are set to FALSE if provided.
#' @param varnames The vector of variable names used for matching. Automatically provided if using \code{fastLink()} wrapper. Used for
#' clean visualization of EM results in summary functions.
#'
#' @return \code{emlinkMARmov} returns a list with the following components:
#' \item{zeta.j}{The posterior match probabilities for each unique pattern.}
Expand Down Expand Up @@ -53,7 +55,7 @@
#' tc <- tableCounts(list(g1, g2, g3, g4), nobs.a = nrow(dfA), nobs.b = nrow(dfB))
#'
#' ## Run EM
#' em <- emlinkMAR(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
#' em <- emlinkMARmov(tc, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
#' }
#'
#' @export
Expand All @@ -62,7 +64,7 @@ emlinkMARmov <- function(patterns, nobs.a, nobs.b,
p.m = 0.1, iter.max = 5000, tol = 1e-5, p.gamma.k.m = NULL, p.gamma.k.u = NULL,
prior.lambda = NULL, w.lambda = NULL,
prior.pi = NULL, w.pi = NULL, address.field = NULL,
gender.field = NULL) {
gender.field = NULL, varnames = NULL) {

options(digits=16)

Expand Down Expand Up @@ -326,15 +328,22 @@ emlinkMARmov <- function(patterns, nobs.a, nobs.b,
colnames(data.w)[nc-1] <- "p.gamma.j.m"
colnames(data.w)[nc] <- "p.gamma.j.u"

inf <- which(data.w == Inf, arr.ind = T)
ninf <- which(data.w == -Inf, arr.ind = T)

data.w[inf[, 1], unique(inf[, 2])] <- 150
data.w[ninf[, 1], unique(ninf[, 2])] <- -150
inf <- which(data.w == Inf, arr.ind = T)
ninf <- which(data.w == -Inf, arr.ind = T)

data.w[inf[, 1], unique(inf[, 2])] <- 150
data.w[ninf[, 1], unique(ninf[, 2])] <- -150

if(!is.null(varnames)){
output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u,
"p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
"nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = varnames)
}else{
output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u,
"p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
"nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = paste0("gamma.", 1:nfeatures))
}

output <- list("zeta.j"= zeta.j,"p.m"= p.m, "p.u" = p.u, "p.gamma.k.m" = p.gamma.k.m, "p.gamma.k.u" = p.gamma.k.u,
"p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = count,
"nobs.a" = nobs.a, "nobs.b" = nobs.b)
class(output) <- c("fastLink", "fastLink.EM")

return(output)
Expand Down Expand Up @@ -481,15 +490,15 @@ emlinkRS <- function(patterns.out, em.out, nobs.a, nobs.b){
colnames(data.w)[nc - 1] <- "p.gamma.j.m"
colnames(data.w)[nc] <- "p.gamma.j.u"

inf <- which(data.w == Inf, arr.ind = T)
ninf <- which(data.w == -Inf, arr.ind = T)
data.w[inf[, 1], unique(inf[, 2])] <- 150
data.w[ninf[, 1], unique(ninf[, 2])] <- -150
inf <- which(data.w == Inf, arr.ind = T)
ninf <- which(data.w == -Inf, arr.ind = T)
data.w[inf[, 1], unique(inf[, 2])] <- 150
data.w[ninf[, 1], unique(ninf[, 2])] <- -150

output <- list("zeta.j" = zeta.j, "p.m" = em.out$p.m, "p.u" = em.out$p.u, "p.gamma.k.m" = em.out$p.gamma.k.m, "p.gamma.k.u" = em.out$p.gamma.k.u,
"p.gamma.j.m" = p.gamma.j.m, "p.gamma.j.u" = p.gamma.j.u, "patterns.w" = data.w, "iter.converge" = em.out$iter.converge,
"nobs.a" = nobs.a, "nobs.b" = nobs.b)
"nobs.a" = nobs.a, "nobs.b" = nobs.b, "varnames" = em.out$varnames)
class(output) <- c("fastLink", "fastLink.EM")

return(output)
Expand Down
4 changes: 2 additions & 2 deletions R/fastLink-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#' the Fellegi-Sunter model, using the Expectation-Maximization Algorithm. In addition,
#' tools for conducting and summarizing data merges are included.
#'
#' \tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.1.1-\cr
#' Date: \tab 2017-07-10\cr License: \tab GPL (>= 3)\cr }
#' \tabular{ll}{ Package: \tab fastLink\cr Type: \tab Package\cr Version: \tab 0.2.0-\cr
#' Date: \tab 2017-07-30\cr License: \tab GPL (>= 3)\cr }
#'
#' @name fastLink-package
#' @useDynLib fastLink, .registration = TRUE
Expand Down
47 changes: 38 additions & 9 deletions R/fastLink.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' address.field, gender.field, estimate.only, em.obj,
#' dedupe.matches, linprog.dedupe,
#' reweight.names, firstname.field,
#' return.df, n.cores, tol.em, threshold.match, verbose)
#' n.cores, tol.em, threshold.match, return.all, return.df, verbose)
#'
#' @param dfA Dataset A - to be matched to Dataset B
#' @param dfB Dataset B - to be matched to Dataset A
Expand Down Expand Up @@ -45,12 +45,14 @@
#' @param linprog.dedupe If deduping matches, whether to use Winkler's linear programming solution to dedupe. Default is FALSE.
#' @param reweight.names Whether to reweight the posterior match probabilities by the frequency of individual first names. Default is FALSE.
#' @param firstname.field The name of the field indicating first name. Must be provided if reweight.names = TRUE.
#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.
#' @param n.cores Number of cores to parallelize over. Default is NULL.
#' @param tol.em Convergence tolerance for the EM Algorithm. Default is 1e-04.
#' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the
#' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches,
#' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.
#' @param return.all Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting
#' \code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is FALSE.
#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.
#' @param verbose Whether to print elapsed time for each step. Default is FALSE.
#'
#' @return \code{fastLink} returns a list of class 'fastLink' containing the following components if calculating matches:
Expand Down Expand Up @@ -83,8 +85,8 @@ fastLink <- function(dfA, dfB, varnames,
gender.field = NULL, estimate.only = FALSE, em.obj = NULL,
dedupe.matches = TRUE, linprog.dedupe = FALSE,
reweight.names = FALSE, firstname.field = NULL,
return.df = FALSE, n.cores = NULL, tol.em = 1e-04,
threshold.match = 0.85, verbose = FALSE){
n.cores = NULL, tol.em = 1e-04, threshold.match = 0.85,
return.all = FALSE, return.df = FALSE, verbose = FALSE){

cat("\n")
cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n"))
Expand Down Expand Up @@ -156,6 +158,13 @@ fastLink <- function(dfA, dfB, varnames,
stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
}
}
if(return.all){
threshold.match <- 0.0001
if(!dedupe.matches){
cat("You have specified that all matches be returned but have not deduped the matches. Setting 'dedupe.matches' to TRUE.\n")
dedupe.matches <- TRUE
}
}

## Create boolean indicators
sm.bool <- which(varnames %in% stringdist.match)
Expand Down Expand Up @@ -195,6 +204,10 @@ fastLink <- function(dfA, dfB, varnames,
start <- Sys.time()
gammalist <- vector(mode = "list", length = length(varnames))
for(i in 1:length(gammalist)){
if(verbose){
sdb <- ifelse(stringdist.match[i], "string-distance", "exact")
cat(" Matching variable", varnames[i], "using", sdb, "matching.\n")
}
## Convert to character
if(is.factor(dfA[,varnames[i]]) | is.factor(dfB[,varnames[i]])){
dfA[,varnames[i]] <- as.character(dfA[,varnames[i]])
Expand All @@ -209,6 +222,9 @@ fastLink <- function(dfA, dfB, varnames,
stop(paste("You have no variation in dataset B for", varnames[i], "or all observations are missing."))
}
}
if(sum(dfA[,varnames[i]] %in% dfB[,varnames[i]]) == 0){
stop(paste0("You have no exact matches for ", varnames[i], ". Please drop this variable from your analysis."))
}
## Get patterns
if(stringdist.match[i]){
if(partial.match[i]){
Expand Down Expand Up @@ -270,7 +286,8 @@ fastLink <- function(dfA, dfB, varnames,
prior.lambda = lambda.prior, w.lambda = w.lambda,
prior.pi = pi.prior, w.pi = w.pi,
address.field = address.field,
gender.field = gender.field)
gender.field = gender.field,
varnames = varnames)
end <- Sys.time()
if(verbose){
cat("Running the EM algorithm took", round(difftime(end, start, units = "secs"), 2), "seconds.\n\n")
Expand Down Expand Up @@ -310,9 +327,19 @@ fastLink <- function(dfA, dfB, varnames,
if(verbose){
cat("Deduping the estimated matches took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
}
}else{
cat("Calculating the posterior for each pair of matched observations.\n")
start <- Sys.time()
zeta <- getPosterior(dfA[matches$inds.a,], dfB[matches$inds.b,], EM = resultsEM,
varnames = varnames, stringdist.match = stringdist.match, partial.match = partial.match,
stringdist.method = stringdist.method, cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight)
end <- Sys.time()
if(verbose){
cat("Calculating the posterior for each matched pair took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
}
}

## Reweight first names
## Reweight first names or get zeta
if(reweight.names){
cat("Reweighting match probabilities by frequency of occurrence.\n")
start <- Sys.time()
Expand All @@ -335,14 +362,16 @@ fastLink <- function(dfA, dfB, varnames,
}
out[["matches"]] <- matches
out[["EM"]] <- resultsEM
out[["nobs.a"]] <- nr_a
out[["nobs.b"]] <- nr_b
if(dedupe.matches){
out[["max.zeta"]] <- ddm.out$max.zeta
out[["posterior"]] <- ddm.out$max.zeta
}else{
out[["posterior"]] <- zeta
}
if(reweight.names){
out[["zeta.name"]] <- rwn.out
}
out[["nobs.a"]] <- nr_a
out[["nobs.b"]] <- nr_b
class(out) <- "fastLink"
}else{
out <- resultsEM
Expand Down
Loading

0 comments on commit d7806e9

Please sign in to comment.