-
Notifications
You must be signed in to change notification settings - Fork 2
/
johannaskod.R
583 lines (475 loc) · 30 KB
/
johannaskod.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
library(tidyverse)
library(dplyr)
library(tableone)
library(lubridate)
library(noacsr)
library(glm2)
library(ggplot2)
library(pROC)
#setwd("C:/Users/maria/Downloads/Packt Learning RStudio for R Statistical Computing 2012 RETAIL eBook-repackb00k/unmet-ICU-beds")
source("johanna/functions.R")
# 1. Import the data
#
# Read the dataset and codebook from github
# data <- import_csv_from_github("https://github.com/titco/titco-I/blob/master/titco-I-full-dataset-v1.csv")
# #data <- import_csv_from_github("https://github.com/titco/titco-I/blob/master/titco-I-limited-dataset-v1.csv")
# codebook <- import_csv_from_github("https://github.com/titco/titco-I/blob/master/codebook-titco-I-v1.csv")
# #för jag vill kunna köra offline
# save(data, file = "data.RData")
# save(codebook, file = "codebook.RData")
load("data.RData")
load("codebook.RData")
outcome=data %>%
select(doi, toi, doa, toa, doar, toar, dom_1, tom_1,dom_2, tom_2,dodd, todd,tran,spo2_o2_1) %>%
# Combine date and time to make a datetime object
mutate(injury_time = as_datetime(paste0(doi, toi))) %>%
mutate(arrival_time = as_datetime(paste0(doa, toa))) %>%
mutate(admission_time = as_datetime(paste0(doa, toa))) %>%
mutate(firstsurvey_time = as_datetime(paste0(dom_1, tom_1))) %>%
mutate(secondsurvey_time = as_datetime(paste0(dom_2, tom_2))) %>%
mutate(time_of_death = as_datetime(paste0(dodd, todd))) %>%
# Calculate difference in hours between arrival and death
mutate(time_to_death = round(as.numeric(time_of_death - arrival_time, units = "hours"),0))%>%
#här börjar min påbyggnad
mutate(time_to_first = round(as.numeric(firstsurvey_time-arrival_time , units = "hours"),2))%>%
mutate(time_between_surveys = round(as.numeric(secondsurvey_time - firstsurvey_time, units = "hours"),2)) %>%
#vi tar dessa åt sidan, för vi vill bara göra detta på de som inte är transfererade, eftersom de snurrat förbi ett annat sjukhus blir tiden från injury till arrival missvisande
# approach 1, blev kortare
# filter(tran %in% c("No"))%>%
# mutate(injury_time_direct = as_datetime(paste0(doi, toi))) %>%
# mutate(arrival_time_direct = as_datetime(paste0(doar, toar))) %>%
# mutate(firstsurvey_time_direct = as_datetime(paste0(dom_1, tom_1)))
mutate(stillalive_6h = case_when(time_to_death > 6 ~ 1,
time_to_death < 6 ~ 0)) %>%
#återkom fatta hur du filtrerar på detta
mutate(delay2 = round(as.numeric(arrival_time-injury_time , units = "hours"),0))%>%
mutate(delay3 = round(as.numeric(firstsurvey_time-arrival_time , units = "hours"),0))%>%
mutate(delay23 = round(as.numeric(firstsurvey_time-injury_time , units = "hours"),0))%>%
mutate(delay2ifdirect = case_when(str_detect(tran, "No") & delay2<72 ~ delay2)) # %>%
#addera kolumnerna
data=data.frame( data, outcome)
#lite cleaning: ersätt alla data med GCS vars summor inte add up ordentligt
data <- data %>%
mutate(gcs_e_1int = case_when(str_detect(gcs_e_1, "1c") ~ 1,
str_detect(gcs_e_1, "1") ~ 1,
str_detect(gcs_e_1, "2") ~ 2,
str_detect(gcs_e_1, "3") ~ 3,
str_detect(gcs_e_1, "4") ~ 4,
str_detect(gcs_e_1, "5") ~ 5,
str_detect(gcs_e_1, "6") ~ 6)) %>%
mutate(gcs_v_1int = case_when(str_detect(gcs_v_1, "1") ~ 1,
str_detect(gcs_v_1, "1v") ~ 1,
str_detect(gcs_v_1, "2") ~ 2,
str_detect(gcs_v_1, "3") ~ 3,
str_detect(gcs_v_1, "4") ~ 4,
str_detect(gcs_v_1, "5") ~ 5,
str_detect(gcs_v_1, "6") ~ 6))%>%
mutate(gcs_e_2int = case_when(str_detect(gcs_e_2, "1c") ~ 1,
str_detect(gcs_e_2, "1") ~ 1,
str_detect(gcs_e_2, "2") ~ 2,
str_detect(gcs_e_2, "3") ~ 3,
str_detect(gcs_e_2, "4") ~ 4,
str_detect(gcs_e_2, "5") ~ 5,
str_detect(gcs_e_2, "6") ~ 6)) %>%
mutate(gcs_v_2int = case_when(str_detect(gcs_v_2, "1") ~ 1,
str_detect(gcs_v_2, "1v") ~ 1,
str_detect(gcs_v_2, "2") ~ 2,
str_detect(gcs_v_2, "3") ~ 3,
str_detect(gcs_v_2, "4") ~ 4,
str_detect(gcs_v_2, "5") ~ 5,
str_detect(gcs_v_2, "6") ~ 6))
data$gcs_e_1int <- as.integer(data$gcs_e_1int)
data$gcs_v_1int <- as.integer(data$gcs_v_1int)
data$gcs_m_1 <- as.integer(data$gcs_m_1)
sum_individual_1=data$gcs_e_1int +data$gcs_v_1int+data$gcs_m_1
diff_1=sum_individual_1-data$gcs_t_1
data$gcs_e_2int <- as.integer(data$gcs_e_2int)
data$gcs_v_2int <- as.integer(data$gcs_v_2int)
data$gcs_m_2 <- as.integer(data$gcs_m_2)
sum_individual_2=data$gcs_e_2int +data$gcs_v_2int+data$gcs_m_2
diff_2=sum_individual_2-data$gcs_t_2
data=data.frame( data, diff_1,diff_2)
data <- data %>%
mutate(weird_gcs_1=case_when(diff_1 ==0 ~ 0))%>%
mutate(weird_gcs_2=case_when(diff_2 ==0 ~ 0))
#lite rensning
# Updates to the dataset
data <- data %>%
# Convert age to numeric
mutate(age = parse_number(age)) %>%
#applicera överenskomna filtreringar
# filter(data$stillalive==1)%>%
#... återkom
mutate(rts_sbp1 = case_when(sbp_1 ==0 & hr_1 <1 & sbp_1 !='NA' ~ 0,
sbp_1 < 49 & sbp_1 !='NA' ~ 1,
sbp_1 < 75 & sbp_1 !='NA' ~ 2,
sbp_1 < 89 & sbp_1 !='NA' ~ 3,
sbp_1 < 300 & sbp_1 !='NA' ~4)) %>%
mutate(rts_sbp2 = case_when(sbp_2 ==0 & hr_2<1 ~ 0,
sbp_2 < 49 & sbp_2 !='NA' ~ 1,
sbp_2 < 75 & sbp_2 !='NA' ~ 2,
sbp_2 < 89 & sbp_2 !='NA' ~ 3,
sbp_2 < 300 & sbp_2 !='NA' ~ 4)) %>%
mutate(rts_rr1 = case_when(rr_1 ==0 ~ 0,
rr_1 < 5 ~ 1,
rr_1 < 9 ~ 2,
rr_1 > 29 ~ 3,
rr_1 > 9 & rr_1 < 29 ~ 4)) %>%
mutate(rts_rr2 = case_when(rr_2 ==0 ~ 0,
rr_2 < 5 ~ 1,
rr_2 < 9 ~ 2,
rr_2 > 29 ~ 3,
rr_2 > 9 & rr_2 < 29 ~ 4)) %>%
#får in transfer
mutate(ambulancefromthescene = case_when(str_detect(tran, "No") & str_detect(mot, "Ambulance") ~ 1)) %>%
mutate(tranclass= case_when(ambulancefromthescene==1 ~ 1,
str_detect(mot, "Police") ~ 2,
ambulancefromthescene==0 ~ 3,
str_detect(mot, "Carried by man") ~ 3,
str_detect(mot, "Other") ~ 3,
str_detect(mot, "Private car") ~ 3,
str_detect(mot, "Taxi, motor rickshaw") ~ 3 )) %>%
mutate(bl_rec= case_when(ubr_1>1 ~ "Yes",
ubr_1<1.5 ~ "No")) %>%
mutate(sc_hi= case_when(sc>3 ~ "Yes",
sc<3.000001 ~ "No")) %>%
# logical or för paste ||
mutate(gcs_v_1_class= case_when(str_detect(gcs_v_1, "1") & weird_gcs_1==0 ~ 1,
str_detect(gcs_v_1, "1v") & weird_gcs_1==0 ~ 2,
str_detect(gcs_v_1, "2") &weird_gcs_1==0 ~ 3,
str_detect(gcs_v_1, "3" ) &weird_gcs_1==0 ~ 4,
str_detect(gcs_v_1, "4") &weird_gcs_1==0 ~ 4,
str_detect(gcs_v_1, "5" ) &weird_gcs_1==0 ~ 4,
str_detect(gcs_v_1, "6" ) &weird_gcs_1==0 ~ 4)) %>%
mutate(gcs_m_1_class= case_when(str_detect(gcs_m_1, "1") &weird_gcs_1==0 ~ 1,
str_detect(gcs_m_1, "2") &weird_gcs_1==0 ~ 3,
str_detect(gcs_m_1, "3" ) &weird_gcs_1==0 ~ 4,
str_detect(gcs_m_1, "4") &weird_gcs_1==0 ~ 4,
str_detect(gcs_m_1, "5" ) &weird_gcs_1==0 ~ 4,
str_detect(gcs_m_1, "6" ) &weird_gcs_1==0 ~ 4)) %>%
mutate(gcs_e_1_class= case_when(str_detect(gcs_e_1, "1") &weird_gcs_1==0 ~ 1,
str_detect(gcs_e_1, "1v") &weird_gcs_1==0 ~ 2,
str_detect(gcs_e_1, "2") &weird_gcs_1==0 ~ 3,
str_detect(gcs_e_1, "3" ) &weird_gcs_1==0 ~ 4,
str_detect(gcs_e_1, "4") &weird_gcs_1==0 ~ 4,
str_detect(gcs_e_1, "5" ) &weird_gcs_1==0 ~ 4,
str_detect(gcs_e_1, "6" ) &weird_gcs_1==0 ~ 4)) %>%
#second survey. du ,åste pricka allt så du kan få NA
mutate(gcs_v_2_class= case_when(str_detect(gcs_v_2, "1") ~ 1,
str_detect(gcs_v_2, "1v") ~ 2,
str_detect(gcs_v_2, "2") ~ 3,
str_detect(gcs_v_2, "3" ) ~ 4,
str_detect(gcs_v_2, "4") ~ 4,
str_detect(gcs_v_2, "5" ) ~ 4,
str_detect(gcs_v_2, "6" ) ~ 4)) %>%
mutate(gcs_m_2_class= case_when(str_detect(gcs_m_2, "1") ~ 1,
str_detect(gcs_m_2, "2") ~ 3,
str_detect(gcs_m_2, "3" ) ~ 4,
str_detect(gcs_m_2, "4") ~ 4,
str_detect(gcs_m_2, "5" ) ~ 4,
str_detect(gcs_m_2, "6" ) ~ 4)) %>%
mutate(gcs_e_2_class= case_when(str_detect(gcs_e_2, "1") ~ 1,
str_detect(gcs_e_2, "1v") ~ 2,
str_detect(gcs_e_2, "2") ~ 3,
str_detect(gcs_e_2, "3" ) ~ 4,
str_detect(gcs_e_2, "4") ~ 4,
str_detect(gcs_e_2, "5" ) ~ 4,
str_detect(gcs_e_2, "6" ) ~ 4)) %>%
# Add id column to the dataset
# mutate(id = row_number()) %>% #jag byter ut, finns seqn redan
# Add our outcome, if patient has a recorded length in the ICU then treated_icu = 1
mutate(treated_icu = case_when(licu > 24 & hos ==6273 ~ 1,
licu < 25 & hos ==6273 ~ 0,
licu > 0 & hos ==7215 ~ 1,
licu > 0 & hos ==7842 ~ 1,
licu > 0 & hos ==8264 ~ 1,
licu == 0 & hos ==7215 ~ 0,
licu == 0 & hos ==7842 ~ 0,
licu == 0 & hos ==8264 ~ 0)) %>% #var det inte det här vi sa?
# mutate(treated_icu = case_when(licu > 24 ~ 1,
# licu < 25 ~ 0))
mutate(spO2_1_wO2 = case_when(str_detect(spo2_o2_1, "Yes") ~ spo2_1))%>%
mutate(spO2_1_woO2 = case_when(str_detect(spo2_o2_1, "No") ~ spo2_1))%>%
mutate(spO2_2_wO2 = case_when(str_detect(spo2_o2_2, "Yes") ~ spo2_2))%>%
mutate(spO2_2_woO2 = case_when(str_detect(spo2_o2_2, "No") ~ spo2_2))%>%
#using NEWS to make a reverse
mutate(spO2_1_cat = case_when(str_detect(spo2_o2_1, "No") & spo2_1 > 96 ~ 0,
str_detect(spo2_o2_1, "Yes") & spo2_1 > 96 ~ 2,
str_detect(spo2_o2_1, "No") & spo2_1 > 94 ~ 1,
str_detect(spo2_o2_1, "Yes") & spo2_1 > 94 ~ 3,
str_detect(spo2_o2_1, "No") & spo2_1 > 92 ~ 2,
str_detect(spo2_o2_1, "Yes") & spo2_1 > 92 ~ 4,
str_detect(spo2_o2_1, "No") & spo2_1 < 92 ~ 3,
str_detect(spo2_o2_1, "Yes") & spo2_1 < 92 ~ 5))%>%
# str_detect(spo2_o2_1, "Yes") & spo2_1 > 93 ~ 3,
# str_detect(spo2_o2_1, "Yes") & spo2_1 > 88 ~ 4,
# str_detect(spo2_o2_1, "No") & spo2_1 > 93 ~ 4,
# str_detect(spo2_o2_1, "No") & spo2_1 > 86 ~ 3,
# str_detect(spo2_o2_1, "No") & spo2_1 > 83 ~ 2,
# str_detect(spo2_o2_1, "No") & spo2_1 < 83 ~ 1))%>%
mutate(spO2_2_cat = case_when(str_detect(spo2_o2_2, "No") & spo2_2 > 96 ~ 0,
str_detect(spo2_o2_2, "Yes") & spo2_2 > 96 ~ 2,
str_detect(spo2_o2_2, "No") & spo2_2 > 94 ~ 1,
str_detect(spo2_o2_2, "Yes") & spo2_2 > 94 ~ 3,
str_detect(spo2_o2_2, "No") & spo2_2 > 92 ~ 2,
str_detect(spo2_o2_2, "Yes") & spo2_2 > 92 ~ 4,
str_detect(spo2_o2_2, "No") & spo2_2 < 92 ~ 3,
str_detect(spo2_o2_2, "Yes") & spo2_2 < 92 ~ 5))
# mutate(spO2_2_cat = case_when(str_detect(spo2_o2_2, "Yes") & spo2_2 > 97 ~ 1,
# str_detect(spo2_o2_2, "Yes") & spo2_2 > 95 ~ 2,
# str_detect(spo2_o2_2, "Yes") & spo2_2 > 93 ~ 3,
# str_detect(spo2_o2_2, "Yes") & spo2_2 > 88 ~ 4,
# str_detect(spo2_o2_2, "No") & spo2_2 > 93 ~ 4,
# str_detect(spo2_o2_2, "No") & spo2_2 > 86 ~ 3,
# str_detect(spo2_o2_2, "No") & spo2_2 > 83 ~ 2,
# str_detect(spo2_o2_2, "No") & spo2_2 < 83 ~ 1))
#måste nu tillfogar dessa som variabler til codebook
newnames<-data.frame("delay2ifdirect","rts_sbp1","rts_sbp2","rts_rr1","rts_rr2","ambulancefromthescene","tranclass","bl_rec","sc_hi","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class","spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2","spO2_1_cat","spO2_2_cat")
newlabels<-data.frame("delay2ifdirect","revised trauma score sbp1","revised trauma score sbp2","revised trauma score rr1","revised trauma score rr2","ambulancefromthescene","tranclass","blood received 1st hour", "serum creatinine high","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class","spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2","spO2_1_cat","spO2_2_cat")
newtypes<-data.frame("quantitative","quantitative","quantitative","quantitative","quantitative","qualitative","qualitative","qualitative","qualitative","qualitative","qualitative","qualitative","qualitative","qualitative","qualitative","quantitative","quantitative","quantitative","quantitative","qualitative","qualitative")
#men jag vet inte hur jag lägger till nya rader så...
headersnew = data.frame(matrix(,nrow = nrow(codebook)+21, ncol = ncol(codebook)-1))
headersnew[1:193,1:ncol(codebook)]<-data.frame(codebook)
colnames(headersnew)<-(colnames(codebook))
#start=220 #nrow(headerproperties)+1
#endat1=233 #nrow(headerproperties)+3
count=0
for (a in 1:21){
count=count+1
headersnew[nrow(codebook)+count,1]=newnames[1,count]
headersnew[nrow(codebook)+count,2]=newlabels[1,count]
headersnew[nrow(codebook)+count,6]=newtypes[1,count]
}
count=0
#fixar att type hamnat på note i dessa rader
for (a in 69:193){
count=count+1
misplacedtype=headersnew[a,5]
headersnew[a,6]=misplacedtype
}
codebook=headersnew
# 3. Set seed and split into training and test samples.
# The seed allows us to always get the same randomized sample when we run the code.
set.seed(46)
# Use 60% of the data for training
train_data <- data %>% sample_frac(.60)
# Use the remaining 40% as test data
test_data <- anti_join(data, train_data, by = 'seqn')
# Optional, Verify that no rows overlap, should return 0
count(train_data %>% filter(seqn %in% test_data$seqn))
# 4. Compare variables between ICU and non-ICU patients
# Define the categorical and continuous variables
cat_variables <- codebook %>% filter(type %in% "qualitative") %>%
# We filter out pid, not to include that in our analysis
filter(name != "pid") %>% pull(name)
cont_variables <- codebook %>% filter(type %in% "quantitative") %>%
# We filter out licu since this is our outcome
filter(!name %in% c("licu")) %>% pull(name)
# Look at the data using table one, jag fattar inte hur jag kan se om inte variabelnamn
tableone::CreateContTable(data = train_data, vars = cont_variables, strata = "treated_icu")
tableone::CreateCatTable(data = train_data, vars = cat_variables, strata = "treated_icu")
look_cont1=tableone::CreateContTable(data = test_data, vars = cont_variables, strata = "treated_icu")
look_cat1=tableone::CreateCatTable(data = test_data, vars = cat_variables, strata = "treated_icu")
#nu lägger vi på själva analysen. Vi har alla nya skapade parametrar exkl ambulansransporten med
predictors=data
#"rts_sbp1","rts_sbp2","rts_rr1","rts_rr2","tranclass","bl_rec","sc_hi","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class", "spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2"
#samt de gamla
#..... kolla i saker jag vill ta upp
#died, NISS, Operation 1a timmen, Mode of injury, transfer, age sex
#predictors=data.frame(data["rts_sbp1”],data[”rts_sbp2”],data[”rts_rr1”],data[”rts_rr2”],data[”tranclass”],data[”bl_rec”],data[”sc_hi”],data[”gcs_v_1_class”],data[”gcs_m_1_class”],data[”gcs_e_1_class”],data[”gcs_v_2_class”],data[”gcs_m_2_class”],data[”gcs_e_2_class”], data["spO2_1_wO2"],data[”spO2_1_woO2”],data[”spO2_2_wO2”],data[”spO2_2_woO2"],data["NISS"],data["age"],data["sex"],data["age"],data["op_1"],data["moi"], data["tran"],data["died"])
predictors = data %>% select("rts_sbp1","rts_sbp2","rts_rr1","rts_rr2","tranclass","bl_rec","sc_hi","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class", "spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2", "niss", "age", "sex", "age", "ot_1", "moi", "tran", "died","spO2_1_cat","spO2_2_cat","died" )
#titta på kont värdena och bedöm
continuous <-select_if(predictors, is.numeric)
summary(continuous)
# Histogram with kernel density curve, exempel med ålder
ggplot(continuous, aes(x = age)) +
geom_density(alpha = .2, fill = "#FF6666")
# Select categorical column
factor <- data.frame(select_if(predictors, is.character))
ncol(factor)
# Create graph for each column
graph <- lapply(names(factor),
function(x)
ggplot(factor, aes(get(x))) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90)))
graph
#addera outputten till prediktor data framen
treated_icu=data$treated_icu
tran=data$tran
predictors$treated_icu=treated_icu
#make 2 factor
predictors$bl_rec <- as.factor(predictors$bl_rec)
predictors$sc_hi <- as.factor(predictors$sc_hi)
predictors$sex <- as.factor(predictors$sex)
predictors$tran <- as.factor(predictors$tran)
predictors$tranclass <- as.factor(predictors$tranclass)
predictors$ot_1 <- as.factor(predictors$ot_1)
predictors$moi <- as.factor(predictors$moi)
predictors$tran <- as.factor(predictors$tran)
predictors$died <- as.factor(predictors$died)
#make integer
predictors$rts_sbp1 <- as.integer(predictors$rts_sbp1)
predictors$rts_sbp2 <- as.integer(predictors$rts_sbp2)
predictors$rts_rr1 <- as.integer(predictors$rts_rr1)
predictors$rts_rr2 <- as.integer(predictors$rts_rr2)
predictors$gcs_v_1_class <- as.integer(predictors$gcs_v_1_class)
predictors$gcs_m_1_class <- as.integer(predictors$gcs_m_1_class)
predictors$gcs_e_1_class <- as.integer(predictors$gcs_e_1_class)
predictors$gcs_v_2_class <- as.integer(predictors$gcs_v_2_class)
predictors$gcs_m_2_class <- as.integer(predictors$gcs_m_2_class)
predictors$gcs_e_2_class <- as.integer(predictors$gcs_e_2_class)
#funkar inte
#model=glm2(treated_icu~rts_sbp1+rts_sbp2+rts_rr1+rts_rr2+tranclass+bl_rec+sc_hi+gcs_v_1_class+gcs_m_1_class+gcs_e_1_class+gcs_v_2_class+gcs_m_2_class+gcs_e_2_class+spO2_1_wO2+spO2_1_woO2+spO2_2_wO2+spO2_2_woO2+ niss+ age+ sex+ ot_1+ moi+ tran+ died,data=predictors, family=binomial)
#funkar
#model=glm2(treated_icu~rts_sbp1+rts_sbp2+rts_rr1+rts_rr2+tranclass+bl_rec+sc_hi+gcs_v_1_class+gcs_e_1_class+gcs_v_2_class+gcs_m_2_class+gcs_e_2_class+niss+age+sex+died+ot_1+spO2_1_cat,data=predictors, family=binomial)
#funkar
glm.fit=glm2(treated_icu~rts_sbp1+rts_sbp2+rts_rr1+rts_rr2+tranclass+bl_rec+sc_hi+gcs_v_1_class+gcs_m_1_class+gcs_e_1_class+gcs_v_2_class+gcs_m_2_class+gcs_e_2_class+ niss+ age+ sex+ ot_1+ moi+ tran+ died+spO2_1_cat+spO2_2_cat,data=predictors, family=binomial)
#spO2_1_woO2+spO2_2_wO2+spO2_2_woO2 funkar inte, eftersom de har så många NA
map(predictors, ~sum(is.na(.)))
summary(predictors)
sapply(predictors, table)
#okej, nu ska jag bedöma hur bra det blev. Steg 1 verkar vara att återskapa prediktor-dataframen så som den var innan jag la till beroende variabeln
#predictors <- predictors %>% pull(treted_icu)
# predictors = data %>% select("rts_sbp1","rts_sbp2","rts_rr1","rts_rr2","tranclass","bl_rec","sc_hi","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class", "spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2", "niss", "age", "sex", "age", "ot_1", "moi", "tran", "died","spO2_1_cat","spO2_2_cat" )
#
# #make 2 factor
# predictors$bl_rec <- as.factor(predictors$bl_rec)
# predictors$sc_hi <- as.factor(predictors$sc_hi)
# predictors$sex <- as.factor(predictors$sex)
# predictors$tran <- as.factor(predictors$tran)
# predictors$tranclass <- as.factor(predictors$tranclass)
# predictors$ot_1 <- as.factor(predictors$ot_1)
# predictors$moi <- as.factor(predictors$moi)
# predictors$tran <- as.factor(predictors$tran)
# predictors$died <- as.factor(predictors$died)
#
# #make integer
# predictors$rts_sbp1 <- as.integer(predictors$rts_sbp1)
# predictors$rts_sbp2 <- as.integer(predictors$rts_sbp2)
# predictors$rts_rr1 <- as.integer(predictors$rts_rr1)
# predictors$rts_rr2 <- as.integer(predictors$rts_rr2)
# predictors$gcs_v_1_class <- as.integer(predictors$gcs_v_1_class)
# predictors$gcs_m_1_class <- as.integer(predictors$gcs_m_1_class)
# predictors$gcs_e_1_class <- as.integer(predictors$gcs_e_1_class)
# predictors$gcs_v_2_class <- as.integer(predictors$gcs_v_2_class)
# predictors$gcs_m_2_class <- as.integer(predictors$gcs_m_2_class)
# predictors$gcs_e_2_class <- as.integer(predictors$gcs_e_2_class)
treated_icu=data.frame(treated_icu)
fitval=glm.fit$fitted.values
#lines(treated_icu, glm.fit$fitted.values)
#roc(predictors, glm.fit$fitted.values, plot=TRUE)
################# Johanna ##################
# To check the model
summary(glm.fit)
# Here we see that, using the test data in the way it was treated above, 14115 observations were deleted and not included in the model due to missing values.
# This is because each entry containing an NA for a predictor variable that is not a factor, will be removed. That's why there's an error with unknown factor levels when you try to apply the model on the data
# The model is acutally built on just 16000 - 14115 observations. To improve this, we remove cases where we have continious variables containg NA (Better to be explicit and not let the model drop them)
# We convert NA to a category, Missing, that way it gets treated as another category and not NA (and hence get dropped).
####### Data pre-processing for modeling #######
# We can't use any observations that has NA in the outcome we predict to, we delete these.
train_data <- train_data %>% filter(!is.na(treated_icu))
# Select the predictors and outcome
model_data <- train_data %>% select("treated_icu", "rts_sbp1","rts_sbp2","rts_rr1","rts_rr2","tranclass","bl_rec","sc_hi","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class", "spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2", "niss", "age", "sex", "age", "ot_1", "moi", "tran", "died","spO2_1_cat","spO2_2_cat" )
# Analyse missing
library(naniar)
# Visulise number of missing for each variable
vis_miss(model_data)
# We will treat all variables, except for age, and niss as categorical. This based on them being categorical variables or having a large amout of missing that we will treat as categorical.
# For age and niss, we remove observations that contain NA for these variables.
model_data <- model_data %>% filter(!is.na(niss), !is.na(age))
# For the rest, all to be treated as categorical variables, replace NA with "Missing", for it to be modeled as it's own category. This will automaticly convert for exampel numeric cariables to characters.
model_data <- model_data %>% replace(is.na(.), "Missing")
# Handle the test data in the same way as the training data
test_data <- test_data %>% select("treated_icu", "rts_sbp1","rts_sbp2","rts_rr1","rts_rr2","tranclass","bl_rec","sc_hi","gcs_v_1_class","gcs_m_1_class","gcs_e_1_class","gcs_v_2_class","gcs_m_2_class","gcs_e_2_class", "spO2_1_wO2","spO2_1_woO2","spO2_2_wO2","spO2_2_woO2", "niss", "age", "sex", "age", "ot_1", "moi", "tran", "died","spO2_1_cat","spO2_2_cat" )
test_data <- test_data %>% filter(!is.na(treated_icu))
test_data <- test_data %>% filter(!is.na(niss), !is.na(age))
test_data <- test_data %>% replace(is.na(.), "Missing")
# Fot the model using the training data
glm_model <- glm(treated_icu ~ rts_sbp1+rts_sbp2+rts_rr1+rts_rr2+tranclass+bl_rec+sc_hi+gcs_v_1_class+gcs_m_1_class+gcs_e_1_class+gcs_v_2_class+gcs_m_2_class+gcs_e_2_class+niss+age+sex+ot_1+moi+tran+died+spO2_1_cat+spO2_2_cat,
family=binomial, data = model_data)
summary(glm_model)
# Use the model to create ROC curve and calculate AUC on the training data
train_prob <- predict(glm_model, newdata = model_data, type = "response")
train_roc = roc(model_data$treated_icu ~ train_prob, plot = TRUE, print.auc = TRUE)
# Apply the model to the test data and calculate AUC
test_prob <- predict(glm_model, newdata = test_data, type = "response")
test_roc = roc(test_data$treated_icu ~ test_prob, plot = TRUE, print.auc = TRUE)
### Matching/Youdens statistic ###
#youdens bedömer ROC-objektet. den anger vilken cutoff som ger bäst specificitet och sensitivity
youdens_train=coords(train_roc, x="best", input="threshold", best.method="youden")
youdens_test=coords(test_roc, x="best", input="threshold", best.method="youden")
#cutoff
cutoff_train=youdens_train$threshold
cutoff_test=youdens_train$threshold
#unmet
#sen räknar vi hur många pat fått högre propensity scores av modellen än den cutoffen . det är så många som har tillräckligt höga propensity scores att de rimligen borde ha ICU-vårdats.
overcutoff_train=sum(train_prob>cutoff_train)
overcutoff_test=sum(test_prob>cutoff_test)
# och genom att sätta denna beräkning i relation till hur många som faktiskt ICU-vårdades, som vi kan uttala oss om underskottet på ICU sängar.
ICUtrain=sum(train_data$treated_icu,na.rm=T)
ICUtest=sum(test_data$treated_icu,na.rm=T)
unmet_train=overcutoff_train-ICUtrain
unmetprop_train=unmet_train/ICUtrain
unmet_test=overcutoff_test-ICUtest
unmetprop_test=unmet_test/ICUtest
#pga nyfiken, hur många vårdades på ICU trots att de hade för låga propensity scores för att "platsa"
df_prob=data.frame(test_prob)
df_question=data.frame(test_data,df_prob)
question <- df_question %>%
mutate(unneccesary = case_when(df_prob <cutoff_test & treated_icu >0.5 ~ 1))
unneccesary_ICU=sum(question$unneccesary,na.rm=t)
unnecessary_prop=unneccesary_ICU/ICUtest
#bootstrapping
sum(data$weird_gcs_1,na.rm=t)
sum(data$weird_gcs_2,na.rm=t)
#cont
continuous <-select_if(predictors, is.numeric)
# Select categorical column
factor <- data.frame(select_if(predictors, is.character))
#för jämförelse räknar vi också p- och t-värdena så som vi gjorde förra gången
#den vill inte filtrera bort ett fel i codebook så jag har gjort en rättad version (spo2_o2_1 och 2 felklassificerade som kontinuerliga variabler)
codebook <- read.csv("codebook.csv")
# 4. Compare variables between ICU and non-ICU patients
# Define the categorical and continuous variables
cat_variables <- codebook %>% filter(type %in% "qualitative") %>%
# We filter out pid, not to include that in our analysis
filter(name != "pid") %>% pull(name)
cont_variables <- codebook %>% filter(type %in% "quantitative") %>%
# We filter out licu since this is our outcome
filter(!name %in% c("licu")) %>% pull(name)
#we filter for spO2_o2_1 och spO2_o2_2 för de är felklassificerade och det klarar inte loopen -funkar ej för character
# filter(!name %in% c("spO2_o2_1")) %>% pull(name)%>%
# filter(!name %in% c("spO2_o2_2")) %>% pull(name)
#bygg samma stratum så här istället
licu=filter(test_data, licu >0.5)
nolicu=filter(test_data, licu <0.5)
#välj kvant kolumnerna (jag måste speca age som numeric igen, vet inte varför)
quantLicu <- licu[, which((names(licu) %in% cont_variables)==TRUE)]
#quantLicu[,1] <- as.numeric(quantLicu[,1])
quantNolicu <- nolicu[, which((names(nolicu) %in% cont_variables)==TRUE)]
#quantNolicu[,1] <- as.numeric(quantNolicu[,1])
#gör stat analysen
quantstatlicu=CreateTableOne(data = quantLicu,includeNA=FALSE)
quantstatnolicu=CreateTableOne(data = quantNolicu,includeNA=FALSE)
#gör t.test för alla kvant kolumner
cont_variables_df=data.frame(cont_variables) #för nästa rad
columncount=nrow(cont_variables_df) #hur många kolumner ska vi rulla
pvalues<- data.frame(matrix(ncol = 1, nrow = columncount))
tvalues<- data.frame(matrix(ncol = 1, nrow = columncount))
for (a in 1:columncount) {
c=t.test(quantLicu[,a],quantNolicu[,a],na.action=na.exclude)
pval=c[["p.value"]]
tval=c[["statistic"]][["t"]]
pvalues[a,1]=pval
tvalues[a,1]=tval
}
#så tittar vi på hur de blev ur tableone. konvertera till
l=look_cont1
look_cont=print(l, showAllLevels = TRUE, formatOptions = list(big.mark = ","))
look_cont=data.frame(look_cont)
pval=data.frame(look_cont$p)
pval=data.frame(look_cont$p[2:34])
#och nu för jämförelsen
otherpvalues=data.frame(cont_variables_df,pvalues,pval,tvalues )
colnames(otherpvalues)<-data.frame("variables","p_values old way","p_values new way","t values")