-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinkage_prep_CHdata.R
153 lines (111 loc) · 4.13 KB
/
linkage_prep_CHdata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
## ==========================================================================##
# Project: DACHA
# Team: Improvement Analytics Unit (IAU) at the Health Foundation
# Script: linkage_prep_CHdata.R
# Corresponding author: Liz Crellin ([email protected])
# Description: Prepare digital carehome record by adding necessary pseudo
# identifiers
# Inputs: List of all residents with IDs and cleaned CH data
#
# Outputs:
# Digital care home record information ready for linkage in MDS
# Notes: To use, need to adjust locations of R scripts and csv files
## ==========================================================================##
# Source relevant scripts -------------------------------------------------
library(aws.s3)
library(dplyr)
library(tidyverse)
library(lubridate)
project_bucket <- '' # assign project directory
#Latest list of residents with all ids
resident_ids <- s3read_using(readRDS,
object = 'residents_all_linkage.rds',
bucket = project_bucket)
#import the cleaned CH data
ch_dataset <- s3read_using(read.csv,
object = 'Final_resident_dataset_update.csv',
bucket = project_bucket,
na.strings = c("", "NA")
)
#Aims:
#add pseudonhsnos and pseudocqcids to the CH dataset so can be linked (based on client ids)
#one row - wave 1 only
#any changes for us in r/for linkage
#check waves
ch_dataset %>%
group_by(wave) %>%
tally()
#check provider
ch_dataset %>%
filter(wave==1) %>%
group_by(provider) %>%
tally()
resident_ids %>%
group_by(Provider) %>%
tally()
#wave 1 and wave 2 cuts
ch_dataset_w1 <- ch_dataset %>%
filter(wave==1)
ch_dataset_w2 <- ch_dataset %>%
filter(wave==2)
column_names_old <- colnames(ch_dataset_w2)
columns_names_new <- paste0(column_names_old, "_w2")
colnames(ch_dataset_w2) <- columns_names_new
#separate by provider and then join back together afterwards
ch_dataset_nw1 <- ch_dataset_w1 %>%
filter(provider==1)
ch_dataset_pw1 <- ch_dataset_w1 %>%
filter(provider==2) %>%
mutate(hashedid = toupper(hashedid))
ch_dataset_nw2 <- ch_dataset_w2 %>%
filter(provider_w2==1)
ch_dataset_pw2 <- ch_dataset_w2 %>%
filter(provider_w2==2) %>%
mutate(hashedid_w2 = toupper(hashedid_w2))
#check distinct ids
ch_dataset_nw1 %>%
select(clientid) %>%
n_distinct()
ch_dataset_nw2 %>%
select(clientid_w2) %>%
n_distinct()
ch_dataset_pw1 %>%
select(hashedid) %>%
n_distinct()
ch_dataset_pw2 %>%
select(hashedid_w2) %>%
n_distinct()
#join to get pseudonhsnos and pseudocqcids
ch_dataset_n_linked <- resident_ids %>%
filter(Provider=="N") %>%
left_join(ch_dataset_nw1, by = c("ClientID_N" = "clientid")) %>%
select(-provider, -wave, -ch_admission, -index_wave1, -index_wave2, -ascotwave1, -ascotwave2, -hashedid, -ClientID_PCS ) %>%
left_join(ch_dataset_nw2, by = c("ClientID_N" = "clientid_w2")) %>%
select(-provider_w2, -wave_w2, -ClientID_N, -residentid_w2, -hashedid_w2, -both_waves_w2) %>%
relocate(pseudonhsno, pseudocqcid)
ch_dataset_p_linked <- resident_ids %>%
filter(Provider=="P") %>%
left_join(ch_dataset_pw1, by = c("ClientID_P" = "hashedid")) %>%
select(-provider, -ch_admission, -index_wave1, -index_wave2, -ascotwave1, -ascotwave2, -clientid, -ClientID_N) %>%
left_join(ch_dataset_pw2, by = c("ClientID_P" = "hashedid_w2")) %>%
select(-provider_w2, -wave_w2, -ClientID_P, -residentid_w2, -both_waves_w2) %>%
relocate(pseudonhsno, pseudocqcid)
#Append them together again
ch_dataset_linkage <- bind_rows(ch_dataset_n_linked, ch_dataset_p_linked)
#double check the numbers
ch_dataset_linkage %>%
group_by(Provider) %>%
tally()
############################
# save to S3
s3saveRDS(x = ch_dataset_linkage
,object = 'ch_dataset_linkage.rds'
,bucket = project_bucket
,multipart=TRUE)
#Export as csv for review with Yannis (wrt final output to WP5)
s3write_using(ch_dataset_linkage,
FUN=write.csv,
object = "ch_dataset_linkage.csv",
bucket = project_bucket,
row.names = FALSE,
quote=FALSE)