Skip to content

Commit

Permalink
up return max zeta when deduping
Browse files Browse the repository at this point in the history
  • Loading branch information
bfifield committed Jun 7, 2017
1 parent c8173c8 commit 10f9c40
Show file tree
Hide file tree
Showing 5 changed files with 549 additions and 497 deletions.
382 changes: 194 additions & 188 deletions R/dedupeMatches.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,201 +100,207 @@ dedupeMatches <- function(matchesA, matchesB, EM, matchesLink,
}else{
gammalist[[i]] <- ifelse(tmp >= cut.a, 2, 0)
}
}else{
tmp <- matchesA[,varnames[i]] == matchesB[,varnames[i]]
gammalist[[i]] <- ifelse(tmp == TRUE, 2, 0)
}

namevec[i] <- paste0("gamma.", i)

}
gammalist <- data.frame(do.call(cbind, gammalist))
names(gammalist) <- namevec

## -------------------------------
## Convert EM object to data frame
## -------------------------------
emdf <- as.data.frame(EM$patterns.w)
emdf$zeta.j <- c(EM$zeta.j)

## ---------------------
## Merge EM to gammalist
## ---------------------
matchesA <- cbind(matchesA, gammalist)
matchesB <- cbind(matchesB, gammalist)
matchesA$roworder <- 1:nrow(matchesA)
matchesB$roworder <- 1:nrow(matchesB)
matchesA <- merge(matchesA, emdf, by = namevec, all.x = TRUE)
matchesB <- merge(matchesB, emdf, by = namevec, all.x = TRUE)
matchesA <- matchesA[order(matchesA$roworder),]
matchesB <- matchesB[order(matchesB$roworder),]

## ------------
## Start dedupe
## ------------
## Ids
matchesA$idA <- matchesLink$inds.a
matchesB$idB <- matchesLink$inds.b
matchesB$idA <- matchesA$idA
matchesA$idB <- matchesB$idB

## Remove observations with NA for zeta.j
matchesA <- matchesA[!is.na(matchesA$zeta.j),]
matchesB <- matchesB[!is.na(matchesB$zeta.j),]

if(!linprog){

## Step 1: Find max zeta for each observation in dataset A:
## Merge in maximum zeta for each observation in dataset A
temp <- as.matrix(tapply(matchesA$zeta.j, matchesA$idA, max, na.rm = T))
temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp)))
names(temp) <- c("idA", "zeta.max")
matchesA <- merge(matchesA, temp, by = "idA")

## Calculate difference
matchesA <- matchesA[order(matchesA$roworder), ]
matchesB <- matchesB[order(matchesB$roworder), ]
matchesA$rm <- abs(matchesA$zeta.j - matchesA$zeta.max)
rm <- which(matchesA$rm == 0)

## Subset down to max zetas
matchesA <- matchesA[rm, ]
matchesB <- matchesB[rm, ]

## Step 2: Find max zeta for each observation in dataset B, if in first subset:
## Merge in maximum zeta for each observation in dataset B
temp <- as.matrix(tapply(matchesB$zeta.j, matchesB$idB, max, na.rm = T))
temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp)))
names(temp) <- c("idB", "zeta.max")
matchesB <- merge(matchesB, temp, by = "idB")

## Calculate difference
matchesA <- matchesA[order(matchesA$roworder), ]
matchesB <- matchesB[order(matchesB$roworder), ]
matchesB$rm <- abs(matchesB$zeta.j - matchesB$zeta.max)
rm <- which(matchesB$rm == 0)

## Subset down to max zetas
matchesA <- matchesA[rm, ]
matchesB <- matchesB[rm, ]

## Step 3: Break remaining ties
## Find remaining duplicates in A
d1 <- duplicated(matchesA$idA)
d2 <- duplicated(matchesA$idA, fromLast = T)
matchesA$dA <- ifelse((d1 + d2) > 0, 1, 0)
matchesB$dA <- ifelse((d1 + d2) > 0, 1, 0)

## Draw uniform to break tie, and merge in
matchesA$uni <- runif(nrow(matchesA))
temp <- as.matrix(tapply(matchesA$uni, matchesA$idA, max, na.rm = T))
temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp)))
names(temp) <- c("idA", "uni.max")
matchesA <- merge(matchesA, temp, by = "idA")
matchesA <- matchesA[order(matchesA$roworder), ]
matchesB <- matchesB[order(matchesB$roworder), ]
matchesA$rm <- abs(matchesA$uni - matchesA$uni.max)
rm <- which(matchesA$rm == 0)

## Subset down to broken tie
matchesA <- matchesA[rm, ]
matchesB <- matchesB[rm, ]

}else{
tmp <- matchesA[,varnames[i]] == matchesB[,varnames[i]]
gammalist[[i]] <- ifelse(tmp == TRUE, 2, 0)
}

## Find duplicates
dupA <- duplicated(matchesA$idA)
dupB <- duplicated(matchesA$idA, fromLast = T)
matchesA$dupA <- ifelse(dupA == 1 | dupB == 1, 1, 0)

dupA <- duplicated(matchesB$idB)
dupB <- duplicated(matchesB$idB, fromLast = T)
matchesA$dupB <- ifelse(dupA == 1 | dupB == 1, 1, 0)

## Split into dupes, not dups
dups <- subset(matchesA, dupA == 1 | dupB == 1)
nodups <- subset(matchesA, dupA == 0 & dupB == 0)

dups$idA.t <- as.numeric(as.factor(dups$idA))
dups$idB.t <- as.numeric(as.factor(dups$idB))

nr <- max(dups$idA.t)
nc <- max(dups$idB.t)
dim <- max(nr, nc)

## Create adjacency matrix to id duplicates
mat.adj <- sparseMatrix(i = dups$idA.t, j = dups$idB.t, x = dups$zeta.j,
dims = c(dim, dim))
mat.adj <- as.matrix(mat.adj)

## Solve linear sum assignment problem
T1 <- suppressWarnings(assignment(-mat.adj))
temp.0 <- cbind(1:dim, T1$perm)
n1 <- which(rowSums(mat.adj) == 0)
n2 <- which(colSums(mat.adj) == 0)

if(length(n1) > 0) {
temp.0 <- temp.0[-n1, ]
}

if(length(n2) > 0) {
temp.0 <- temp.0[, -n2]
}

temp.0 <- data.frame(temp.0)
names(temp.0) <- c("idA.t", "idB.t")
namevec[i] <- paste0("gamma.", i)

}
gammalist <- data.frame(do.call(cbind, gammalist))
names(gammalist) <- namevec

## -------------------------------
## Convert EM object to data frame
## -------------------------------
emdf <- as.data.frame(EM$patterns.w)
emdf$zeta.j <- c(EM$zeta.j)

## ---------------------
## Merge EM to gammalist
## ---------------------
matchesA <- cbind(matchesA, gammalist)
matchesB <- cbind(matchesB, gammalist)
matchesA$roworder <- 1:nrow(matchesA)
matchesB$roworder <- 1:nrow(matchesB)
matchesA <- merge(matchesA, emdf, by = namevec, all.x = TRUE)
matchesB <- merge(matchesB, emdf, by = namevec, all.x = TRUE)
matchesA <- matchesA[order(matchesA$roworder),]
matchesB <- matchesB[order(matchesB$roworder),]

## ------------
## Start dedupe
## ------------
## Ids
matchesA$idA <- matchesLink$inds.a
matchesB$idB <- matchesLink$inds.b
matchesB$idA <- matchesA$idA
matchesA$idB <- matchesB$idB

## Remove observations with NA for zeta.j
matchesA <- matchesA[!is.na(matchesA$zeta.j),]
matchesB <- matchesB[!is.na(matchesB$zeta.j),]

if(!linprog){

## Step 1: Find max zeta for each observation in dataset A:
## Merge in maximum zeta for each observation in dataset A
temp <- as.matrix(tapply(matchesA$zeta.j, matchesA$idA, max, na.rm = T))
temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp)))
names(temp) <- c("idA", "zeta.max")
matchesA <- merge(matchesA, temp, by = "idA")

## Calculate difference
matchesA <- matchesA[order(matchesA$roworder), ]
matchesB <- matchesB[order(matchesB$roworder), ]
matchesA$rm <- abs(matchesA$zeta.j - matchesA$zeta.max)
rm <- which(matchesA$rm == 0)

## Subset down to max zetas
matchesA <- matchesA[rm, ]
matchesB <- matchesB[rm, ]

## Step 2: Find max zeta for each observation in dataset B, if in first subset:
## Merge in maximum zeta for each observation in dataset B
temp <- as.matrix(tapply(matchesB$zeta.j, matchesB$idB, max, na.rm = T))
temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp)))
names(temp) <- c("idB", "zeta.max")
matchesB <- merge(matchesB, temp, by = "idB")

## Calculate difference
matchesA <- matchesA[order(matchesA$roworder), ]
matchesB <- matchesB[order(matchesB$roworder), ]
matchesB$rm <- abs(matchesB$zeta.j - matchesB$zeta.max)
rm <- which(matchesB$rm == 0)

## Subset down to max zetas
matchesA <- matchesA[rm, ]
matchesB <- matchesB[rm, ]

## Step 3: Break remaining ties
## Find remaining duplicates in A
d1 <- duplicated(matchesA$idA)
d2 <- duplicated(matchesA$idA, fromLast = T)
matchesA$dA <- ifelse((d1 + d2) > 0, 1, 0)
matchesB$dA <- ifelse((d1 + d2) > 0, 1, 0)

## Draw uniform to break tie, and merge in
matchesA$uni <- runif(nrow(matchesA))
temp <- as.matrix(tapply(matchesA$uni, matchesA$idA, max, na.rm = T))
temp <- data.frame(cbind(as.numeric(rownames(temp)), as.numeric(temp)))
names(temp) <- c("idA", "uni.max")
matchesA <- merge(matchesA, temp, by = "idA")
matchesA <- matchesA[order(matchesA$roworder), ]
matchesB <- matchesB[order(matchesB$roworder), ]
matchesA$rm <- abs(matchesA$uni - matchesA$uni.max)
rm <- which(matchesA$rm == 0)

## Subset down to broken tie
matchesA <- matchesA[rm, ]
matchesB <- matchesB[rm, ]

}else{

## Find duplicates
dupA <- duplicated(matchesA$idA)
dupB <- duplicated(matchesA$idA, fromLast = T)
matchesA$dupA <- ifelse(dupA == 1 | dupB == 1, 1, 0)

dupA <- duplicated(matchesB$idB)
dupB <- duplicated(matchesB$idB, fromLast = T)
matchesA$dupB <- ifelse(dupA == 1 | dupB == 1, 1, 0)

## Split into dupes, not dups
dups <- subset(matchesA, dupA == 1 | dupB == 1)
nodups <- subset(matchesA, dupA == 0 & dupB == 0)

dups$idA.t <- as.numeric(as.factor(dups$idA))
dups$idB.t <- as.numeric(as.factor(dups$idB))

nr <- max(dups$idA.t)
nc <- max(dups$idB.t)
dim <- max(nr, nc)

## Create adjacency matrix to id duplicates
mat.adj <- sparseMatrix(i = dups$idA.t, j = dups$idB.t, x = dups$zeta.j,
dims = c(dim, dim))
mat.adj <- as.matrix(mat.adj)

## Solve linear sum assignment problem
T1 <- suppressWarnings(assignment(-mat.adj))
temp.0 <- cbind(1:dim, T1$perm)
n1 <- which(rowSums(mat.adj) == 0)
n2 <- which(colSums(mat.adj) == 0)

if(length(n1) > 0) {
temp.0 <- temp.0[-n1, ]
}

## Merge in dedupe information
dedup <- merge(temp.0, dups, by = c("idA.t", "idB.t"))
dedup$idA.t <- dedup$idB.t <- NULL
if(length(n2) > 0) {
temp.0 <- temp.0[, -n2]
}

## Combine dupes, dedupes
matchesA <- rbind(dedup, nodups)
matchesA$dupA <- matchesA$dupB <- NULL
listA <- paste(matchesA$idA, matchesA$idB, sep = "-")
listB <- paste(matchesB$idA, matchesB$idB, sep = "-")
keep <- which(listB %in% listA)
matchesB <- matchesB[keep, ]
temp.0 <- data.frame(temp.0)
names(temp.0) <- c("idA.t", "idB.t")

## Subset down and order
matchesA <- matchesA[order(matchesA$idA, matchesA$idB), ]
matchesB <- matchesB[order(matchesB$idA, matchesB$idB), ]
## Merge in dedupe information
dedup <- merge(temp.0, dups, by = c("idA.t", "idB.t"))
dedup$idA.t <- dedup$idB.t <- NULL

}
## Combine dupes, dedupes
matchesA <- rbind(dedup, nodups)
matchesA$dupA <- matchesA$dupB <- NULL
listA <- paste(matchesA$idA, matchesA$idB, sep = "-")
listB <- paste(matchesB$idA, matchesB$idB, sep = "-")
keep <- which(listB %in% listA)
matchesB <- matchesB[keep, ]

## -----------------
## Correct EM object
## -----------------
counts <- eval(parse(
text = paste0("data.frame(matchesA %>% group_by(",
paste(namevec, collapse = ", "),
") %>% summarise(counts = n()))"))
)
patterns <- as.data.frame(EM$patterns.w)
patterns$rownum <- 1:nrow(patterns)
patterns <- merge(patterns, counts, by = namevec, all.x = TRUE)
patterns$counts.x <- ifelse(!is.na(patterns$counts.y), patterns$counts.y,
patterns$counts.x)
patterns <- patterns[order(patterns$rownum),]
patterns$counts.y <- NULL; patterns$rownum <- NULL
names(patterns) <- c(namevec, "counts", "weights", "p.gamma.j.m", "p.gamma.j.u")
EM$patterns.w <- as.matrix(patterns)

## --------------------------
## Correct matchesLink object
## --------------------------
matchesLink <- data.frame(inds.a = matchesA$idA, inds.b = matchesB$idB)

## --------------------------
## Correct dataframes objects
## --------------------------
matchesA <- subset(matchesA, select = colnames.df.a)
matchesB <- subset(matchesB, select = colnames.df.b)

## Return deduped object
out <- list(matchesA = matchesA, matchesB = matchesB,
EM = EM, matchesLink = matchesLink)
return(out)
## Subset down and order
matchesA <- matchesA[order(matchesA$idA, matchesA$idB), ]
matchesB <- matchesB[order(matchesB$idA, matchesB$idB), ]

}

## -----------------
## Correct EM object
## -----------------
counts <- eval(parse(
text = paste0("data.frame(matchesA %>% group_by(",
paste(namevec, collapse = ", "),
") %>% summarise(counts = n()))"))
)
patterns <- as.data.frame(EM$patterns.w)
patterns$rownum <- 1:nrow(patterns)
patterns <- merge(patterns, counts, by = namevec, all.x = TRUE)
patterns$counts.x <- ifelse(!is.na(patterns$counts.y), patterns$counts.y,
patterns$counts.x)
patterns <- patterns[order(patterns$rownum),]
patterns$counts.y <- NULL; patterns$rownum <- NULL
names(patterns) <- c(namevec, "counts", "weights", "p.gamma.j.m", "p.gamma.j.u")
EM$patterns.w <- as.matrix(patterns)

## --------------------------
## Correct matchesLink object
## --------------------------
matchesLink <- data.frame(inds.a = matchesA$idA, inds.b = matchesB$idB)

## -------------------
## Get the zeta values
## -------------------
max.zeta <- matchesA$zeta.j

## --------------------------
## Correct dataframes objects
## --------------------------
matchesA <- subset(matchesA, select = colnames.df.a)
matchesB <- subset(matchesB, select = colnames.df.b)

## Return deduped object
out <- list(matchesA = matchesA, matchesB = matchesB,
EM = EM, matchesLink = matchesLink,
max.zeta = max.zeta)
return(out)

}

Loading

0 comments on commit 10f9c40

Please sign in to comment.