-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathduplicate_subjects_kmeans.R
94 lines (79 loc) · 2.63 KB
/
duplicate_subjects_kmeans.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Purpose -----------------------------------------------------------------
# Identify potentially duplicate enrolled subjects based on:
# age
# dob
# sex
# ethnicity
# race
# height
# weight
# temperature
# systolic_bp
# diastolic_bp
# Libraries ---------------------------------------------------------------
library(tidyverse)
library(haven)
library(tidymodels)
library(mclust)
# Data --------------------------------------------------------------------
# Read the VS and DM transport files from data/ into a list whose elements
# are named after the domain they hold ("vs", "dm").
data <- c('vs', 'dm') %>%
  purrr::set_names() %>%
  purrr::map(function(domain) {
    haven::read_xpt(paste0('data/', domain, '.xpt'))
  })
# Demographics for subjects who are not screen failures.
# Rows with a missing ARM are dropped as well, because the comparison
# evaluates to NA and filter() only keeps TRUE.
# NOTE(review): YOB is not read from the source data; it is a random
# placeholder year drawn from 1950-2004. No seed is set in this block, so the
# values differ between runs.
dm <- data$dm %>%
  select(USUBJID, SEX, RACE, ETHNIC, ARM) %>%
  filter(ARM != 'Screen Failure') %>%
  # A single vectorised draw covers every row; no rowwise()/ungroup() needed.
  mutate(YOB = sample(1950:2004, size = n(), replace = TRUE))
# Screening vital signs: for every subject / test code, keep the first record
# (in file order) whose original result VSORRES is present, and retain only
# the standardised numeric result VSSTRESN.
vs <- data$vs %>%
  filter(EPOCH == 'SCREENING', !is.na(VSORRES)) %>%
  distinct(USUBJID, EPOCH, VSTESTCD, .keep_all = TRUE) %>%
  select(USUBJID, VSTESTCD, VSSTRESN)
# One row per subject: demographics (without ARM) joined to the screening
# vitals, with each VSTESTCD value spread into its own column.
patient_data <- dm %>%
  select(-ARM) %>%
  left_join(vs, by = 'USUBJID') %>%
  pivot_wider(names_from = VSTESTCD, values_from = VSSTRESN) %>%
  # Append a copy of the first row.
  # NOTE(review): presumably this seeds one known duplicate for the search
  # below - confirm this is intended outside of a demo.
  bind_rows(., slice(., 1))
# Model ------------------------------------------------------------------------
# Partition the subjects 80/20 into training and testing sets, stratified
# by SEX.
# NOTE(review): the steps that follow only use patient_train, so a duplicate
# whose rows land in patient_test is never examined - confirm the split is
# wanted for this unsupervised task.
patient_split <- rsample::initial_split(
  data = patient_data,
  prop = 0.8,
  strata = "SEX"
)
patient_train <- rsample::training(patient_split)
patient_test <- rsample::testing(patient_split)
# Preprocessing recipe for clustering:
#   1. drop the subject identifier (declared as the outcome in the formula),
#   2. dummy-encode SEX, RACE and ETHNIC,
#   3. center, then scale, every numeric non-outcome column.
patient_recipe <- recipes::recipe(USUBJID ~ ., data = patient_train) %>%
  recipes::step_rm(USUBJID) %>%
  recipes::step_dummy(SEX, RACE, ETHNIC) %>%
  recipes::step_center(all_numeric(), -all_outcomes()) %>%
  recipes::step_scale(all_numeric(), -all_outcomes())
# Cluster the preprocessed training data.
# Despite the object name this is not k-means: mclust::Mclust() performs
# Gaussian-mixture model-based clustering. predict() without new data returns
# the fit's classification of the rows it was trained on.
# NOTE(review): `method` is not among Mclust()'s documented arguments -
# confirm whether it has any effect.
patient_kmeans <- local({
  baked_train <- bake(prep(patient_recipe), patient_train)
  mixture_fit <- mclust::Mclust(baked_train, method = "EM")
  predict(mixture_fit)
})
# Attach the predicted cluster label of every training subject as a new
# `cluster` column.
patient_train <- dplyr::mutate(
  patient_train,
  cluster = patient_kmeans[["classification"]]
)
# Identify potential duplicate patients within each cluster: combinations of
# cluster, YOB, SEX, RACE and ETHNIC that are shared by more than one subject.
# `cluster` must be listed in the same group_by() as the demographic keys: a
# separate, later group_by() replaces the earlier grouping instead of
# extending it, which would both ignore the cluster and drop the column from
# the summary.
# Vitals (HEIGHT, WEIGHT, TEMP, SYSBP, DIABP) are deliberately not part of the
# key at the moment.
potential_duplicates <- patient_train %>%
  group_by(cluster, YOB, SEX, RACE, ETHNIC) %>%
  summarize(n_duplicates = n(),
            .groups = 'keep') %>%
  filter(n_duplicates > 1)

# Print the potential duplicate patients
potential_duplicates
# Plot the clustered training subjects, colored by cluster assignment.
# patient_train is used as the data source because it carries both the
# per-subject measurements and the `cluster` label.
# cluster is wrapped in factor() so it maps onto the discrete color scale.
# NOTE(review): HEIGHT and WEIGHT are assumed to be VSTESTCD codes that became
# columns in the pivot - confirm, or substitute the two measures of interest.
P <- ggplot(patient_train,
            aes(x = HEIGHT, y = WEIGHT, color = factor(cluster))) +
  geom_point() +
  scale_color_discrete(name = "Cluster") +
  theme_bw()
P