-
Notifications
You must be signed in to change notification settings - Fork 1
/
reshaping_dataframe.R
595 lines (478 loc) · 32.8 KB
/
reshaping_dataframe.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
#######
##restacking data
#########
##restacks data frame so each intervention that leads to a series of time points has its own study number (study_number_new)
##also recodes coverage: cut off 70% for LLIN, 90% for IRS and MDA
data$study_area[data$study_area=='Papua New Guinea, Ilahita area of Maprik district, East Sepik Province']<-'Papua New Guinea, Ilahita area'
#adding row numbers
data$row_number<-seq.int(nrow(data))
data$row_number_smaller<-data$row_number-1
data$row_number_smaller[data$row_number_smaller==0]<-1
#############
##incidence data
#############
#make subsets but only with data points that follow one intervention, and also add the last data point that happened before the intervention
data_int1_inc<-subset(data, ((data$time_incidence_int1[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1]) & data$time_incidence_int1[data$row_number]>0)) & ((data$time_incidence_int1<=0 & (data$time_incidence_int2[data$row_number+1]<=0 | is.na(data$time_incidence_int2[data$row_number+1]))) | ((data$time_incidence_int2<=0 | is.na(data$time_incidence_int2)) & data$time_incidence_int1>0)) |((data$Sum_Intervention1=='control' | data$Sum_Intervention1[data$row_number_smaller]=='control') & data$incidence_YN=='WAHR'& data$time_incidence_int1>=-6)))
data_int2_inc<-subset(data, ((data$time_incidence_int2[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1]) & data$time_incidence_int2[data$row_number]>0)) & ((data$time_incidence_int2<=0 & (data$time_incidence_int3[data$row_number+1]<=0 | is.na(data$time_incidence_int3[data$row_number+1]))) | ((data$time_incidence_int3<=0 | is.na(data$time_incidence_int3)) & data$time_incidence_int2>0)) | ((data$Sum_Intervention2[row_number]=='control' | data$Sum_Intervention2[data$row_number_smaller]=='control') & data$incidence_YN[row_number]=='WAHR'& data$time_incidence_int2>=-6)))
data_int3_inc<-subset(data, ((data$time_incidence_int3[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1]) & data$time_incidence_int3[data$row_number]>0)) & ((data$time_incidence_int3<=0 & (data$time_incidence_int4[data$row_number+1]<=0 | is.na(data$time_incidence_int4[data$row_number+1]))) | ((data$time_incidence_int4<=0 | is.na(data$time_incidence_int4)) & data$time_incidence_int3>0)) | ((data$Sum_Intervention3[row_number]=='control' | data$Sum_Intervention3[data$row_number_smaller]=='control') & data$incidence_YN[row_number]=='WAHR'& data$time_incidence_int3>=-6)))
data_int4_inc<-subset(data, ((data$time_incidence_int4[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1]) & data$time_incidence_int4[data$row_number]>0)) & ((data$time_incidence_int4<=0 & (data$time_incidence_int5[data$row_number+1]<=0 | is.na(data$time_incidence_int5[data$row_number+1]))) | ((data$time_incidence_int5<=0 | is.na(data$time_incidence_int5)) & data$time_incidence_int4>0)) | ((data$Sum_Intervention4[row_number]=='control' | data$Sum_Intervention4[data$row_number_smaller]=='control') & data$incidence_YN[row_number]=='WAHR'& data$time_incidence_int4>=-6)))
#######
#first intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int1_inc$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_incidence_int1<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_incidence_int1>-6])}
#for (val in k) {print(data$time_incidence_int1[data$ID==val])}
data_int1_inc_extra<-data_int1_inc
for (val in k)
{data_int1_inc_extra<-rbind(data_int1_inc_extra,data[data$ID==val,])}
data_int1_inc_extra<-unique(data_int1_inc_extra)
data_int1_inc<-data_int1_inc_extra
data_int1_inc$Intervention<-data_int1_inc$Sum_Intervention1
data_int1_inc$Intervention<- as.character(data_int1_inc$Intervention)
data_int1_inc$Intervention<- as.factor(data_int1_inc$Intervention)
data_int1_inc$coverage<-data_int1_inc$coverage1
data_int1_inc$implementation_start<-data_int1_inc$implementation1_start
data_int1_inc$implementation_end<-data_int1_inc$implementation1_end
data_int1_inc$study_number_new<-data_int1_inc$study_number
data_int1_inc$time_incidence<-data_int1_inc$time_incidence_int1
data_int1_inc$transmission_falc<-data_int1_inc$trans_falc_int1
data_int1_inc$transmission_viv<-data_int1_inc$trans_viv_int1
#########
#second intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int2_inc$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_incidence_int2<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_incidence_int2>-6])}
#for (val in k)
#{print(data$time_incidence_int2[data$ID==val])}
data_int2_inc_extra<-data_int2_inc
for (val in k)
{data_int2_inc_extra<-rbind(data_int2_inc_extra,data[data$ID==val,])}
data_int2_inc_extra<-unique(data_int2_inc_extra)
data_int2_inc<-data_int2_inc_extra
data_int2_inc$Intervention<-data_int2_inc$Sum_Intervention2
data_int2_inc$Intervention<- as.character(data_int2_inc$Intervention)
data_int2_inc$Intervention<- as.factor(data_int2_inc$Intervention)
data_int2_inc$coverage<-data_int2_inc$coverage2
data_int2_inc$implementation_start<-data_int2_inc$implementation2_start
data_int2_inc$implementation_end<-data_int2_inc$implementation2_end
data_int2_inc$study_number_new<-data_int2_inc$study_number + 1*166
data_int2_inc$time_incidence<-data_int2_inc$time_incidence_int2
data_int2_inc$transmission_falc<-data_int2_inc$trans_falc_int2
data_int2_inc$transmission_viv<-data_int2_inc$trans_viv_int2
##########
#third intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int3_inc$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_incidence_int3<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_incidence_int3>-6])}
#for (val in k)
#{print(data$time_incidence_int3[data$ID==val])}
data_int3_inc_extra<-data_int3_inc
for (val in k)
{data_int3_inc_extra<-rbind(data_int3_inc_extra,data[data$ID==val,])}
data_int3_inc_extra<-unique(data_int3_inc_extra)
data_int3_inc<-data_int3_inc_extra
data_int3_inc$Intervention<-data_int3_inc$Sum_Intervention3
data_int3_inc$Intervention<- as.character(data_int3_inc$Intervention)
data_int3_inc$Intervention<- as.factor(data_int3_inc$Intervention)
data_int3_inc$coverage<-data_int3_inc$coverage3
data_int3_inc$implementation_start<-data_int3_inc$implementation3_start
data_int3_inc$implementation_end<-data_int3_inc$implementation3_end
data_int3_inc$study_number_new<-data_int3_inc$study_number + 2*166
data_int3_inc$time_incidence<-data_int3_inc$time_incidence_int3
data_int3_inc$transmission_falc<-data_int3_inc$trans_falc_int3
data_int3_inc$transmission_viv<-data_int3_inc$trans_viv_int3
##########
#fourth intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int4_inc$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_incidence_int4<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_incidence_int4>-6])}
#for (val in k)
#{print(data$time_incidence_int4[data$ID==val])}
data_int4_inc_extra<-data_int4_inc
for (val in k)
{data_int4_inc_extra<-rbind(data_int4_inc_extra,data[data$ID==val,])}
data_int4_inc_extra<-unique(data_int4_inc_extra)
data_int4_inc<-data_int4_inc_extra
data_int4_inc$Intervention<-data_int4_inc$Sum_Intervention4
data_int4_inc$Intervention<- as.character(data_int4_inc$Intervention)
data_int4_inc$Intervention<- as.factor(data_int4_inc$Intervention)
data_int4_inc$coverage<-data_int4_inc$coverage4
data_int4_inc$implementation_start<-data_int4_inc$implementation4_start
data_int4_inc$implementation_end<-data_int4_inc$implementation4_end
data_int4_inc$study_number_new<-data_int4_inc$study_number + 3*166
data_int4_inc$time_incidence<-data_int4_inc$time_incidence_int4
data_int4_inc$transmission_falc<-data_int4_inc$trans_falc_int4
data_int4_inc$transmission_viv<-data_int4_inc$trans_viv_int4
#rbind the datasets
data_new_inc<-rbind(data_int1_inc, data_int2_inc, data_int3_inc, data_int4_inc)
#only keep one time point before, for kondrashin repeated MDA before because had first MDA 3-6 months before
data_new_inc<-subset(data_new_inc, subset=!(study_number_new==268&time_incidence<(-1)))
data_new_inc<-subset(data_new_inc, subset=!(study_number_new==269&time_incidence<(-1)))
data_new_inc<-subset(data_new_inc, subset=!(study_number_new==270&time_incidence<(-1)))
data_new_inc<-subset(data_new_inc, subset=!(study_number_new==271&time_incidence<(-1)))
#get the coverage coded:
#LLIN
data_new_inc$coverage[data_new_inc$coverage=="'extensive' distribution"]<-'high'
data_new_inc$coverage[data_new_inc$coverage=='1 net per 2.5 for the whole village']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='10% of commuity got nets and then looked at these (so no community effect) compared to other people. So in people who were in bed net group all of them had nets']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='2.3 ITN per household']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='all households got them depending on household size, however LLIN use lower half a year around 50%, then 26% and in second yar only around 6-8%']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='all villagers according to family size etc, usage depended on season, above 80% in rainy,58.6 in cool and around 50 in hot']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='all villagers according to family size etc, usage: in rainy season 93.7%, cool season 46%, hot season 32.1%']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='around 25% covered afterwards']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='around 50% of population covered afterwards']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='between 96% and 715 ownership and usage between 67% and 27%']<-'low'
data_new_inc$coverage[data_new_inc$coverage=="high ('nearly universal')"]<-'high'
data_new_inc$coverage[data_new_inc$coverage=='in Lihir villages 36-89% of households with one net']<-'high'
data_new_inc$coverage[data_new_inc$coverage=="in all of meghalaya state: 900'000 nets for 3.5 mio people. According to number and ages of individuals in household)"]<-'low'
data_new_inc$coverage[data_new_inc$coverage=='increase from 40% of households at risk to 100% in 2011']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='missing but in one commune had high coverage with only 2.5% of households no nets']<-'high'
data_new_inc$coverage[data_new_inc$coverage=="missing, MP at villlage and villages not too big, so covers all?, says 'LLINs were distributed to all hosueholds at M0'"]<-'high'
data_new_inc$coverage[data_new_inc$coverage=='self repordet: 77%, 89%, 86% in 2012,13,14']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='self reported use after distribution:69% in 2013 and 48% in 2014']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='self reported use: 69%']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='self reported: 47% and 48% in 2013 and 2014']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='self reported: 73%,53%,61% in 2011,12,13']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='selfrepored use99%']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='selfreported use: always above 98% in 2012 until 2014']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='LLINs to all households']<-'high'
#net retreatment
data_new_inc$coverage[data_new_inc$coverage=='64% (for incidenc eonly looked at families who retreated in 94 so would be 100% there)']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='87%']<-'high'
#increased case management
data_new_inc$coverage[data_new_inc$coverage=='60-90% of population tested, and treated']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='assume high as not many people living at farm around 150 --< could easly cover all?']<-'high'
#MDA with cut of at 90% covered at least once
data_new_inc$coverage[data_new_inc$coverage=='29% three rounds, 18% two, 23% one']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='57% three rounds, 18% two, 17% one']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='80% of censued people. Estimated 70% of population']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='90.9% at least one round, 69.5% two and 52.5% three rounds']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='estimated 85%']<-'low'
data_new_inc$coverage[data_new_inc$coverage=='one round 91% three rounds 64%']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='86.8% at least one round, 74% two and 60.3% three rounds']<-'low'
#IRS
data_new_inc$coverage[data_new_inc$coverage=='91% of houses sprayed and 97.5% the following year']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='96%']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='97%']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='around 80% for DDT, almost 100% in 88']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='around 80%. Around 96% for 88']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='high (says all the people with 92% of rooms coverede)']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='over 90%']<-'high'
data_new_inc$coverage[data_new_inc$coverage=='always mor ethan 90%']<-'high'
data_new_inc$coverage<-as.factor(data_new_inc$coverage)
data_new_inc$coverage<-as.character(data_new_inc$coverage)
x<-c(1:length(data_new_inc$Intervention))
for (val in x){
if(data_new_inc$Intervention[val]=='control'){data_new_inc$coverage[val]<-data_new_inc$coverage[val+1]}}
data_new_inc$coverage<-as.factor(data_new_inc$coverage)
#see if the the season of survey span different seasons: for incidence
data_new_inc$season_span<-data_new_inc$season_survey_incidence
data_new_inc$season_span<-as.character(data_new_inc$season_span)
data_new_inc$season_span<-'NA'
x<-c(1:max(data_new_inc$study_number_new))
for (val in x)
{n<-c()
n<-unique(data_new_inc$season_survey_incidence[data_new_inc$study_number_new==val])
n<-paste(n,collapse=" ")
data_new_inc$season_span[data_new_inc$study_number_new==val]<-as.character(n)}
data_new_inc$season_span[data_new_inc$season_span=='wet dry']<- 'yes'
data_new_inc$season_span[data_new_inc$season_span=='dry wet']<- 'yes'
data_new_inc$season_span[data_new_inc$season_span=='both dry']<- 'yes'
data_new_inc$season_span[data_new_inc$season_span=='dry both']<- 'yes'
data_new_inc$season_span[data_new_inc$season_span=='wet']<- 'no'
data_new_inc$season_span[data_new_inc$season_span=='dry']<- 'no'
data_new_inc$season_span[data_new_inc$season_span=='wet both']<- 'yes'
data_new_inc$season_span[data_new_inc$season_span=='both']<- 'no'
data_new_inc$season_span[data_new_inc$season_span=='missing both']<- 'unknown'
data_new_inc$season_span<-as.factor(data_new_inc$season_span)
#quantify transmission
data_new_inc$transmission_study<-paste(data_new_inc$transmission_falc,data_new_inc$transmission_viv)
data_new_inc$transmission_study<-as.factor(data_new_inc$transmission_study)
#############
##prevalence data
#############
#make subsets but only with data points that follow on one intervention, also last data point before intervention happened
data_int1_prev<-subset(data, ((data$time_prevalence_int1[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1])& data$time_prevalence_int1[data$row_number]>0)) & ((data$time_prevalence_int1<=0 & (data$time_prevalence_int2[data$row_number+1]<=0 | is.na(data$time_prevalence_int2[data$row_number+1]))) | ((data$time_prevalence_int2<=0 | is.na(data$time_prevalence_int2)) & data$time_prevalence_int1>0))| ((data$Sum_Intervention1[row_number]=='control' | data$Sum_Intervention1[data$row_number_smaller]=='control') & data$prevalence_YN[row_number]=='WAHR')))
data_int2_prev<-subset(data, ((data$time_prevalence_int2[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1])& data$time_prevalence_int2[data$row_number]>0)) & ((data$time_prevalence_int2<=0 & (data$time_prevalence_int3[data$row_number+1]<=0 | is.na(data$time_prevalence_int3[data$row_number+1]))) | ((data$time_prevalence_int3<=0 | is.na(data$time_prevalence_int3)) & data$time_prevalence_int2>0))| ((data$Sum_Intervention2[row_number]=='control' | data$Sum_Intervention2[data$row_number_smaller]=='control') & data$prevalence_YN[row_number]=='WAHR')))
data_int3_prev<-subset(data, ((data$time_prevalence_int3[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1])& data$time_prevalence_int3[data$row_number]>0)) & ((data$time_prevalence_int3<=0 & (data$time_prevalence_int4[data$row_number+1]<=0 | is.na(data$time_prevalence_int4[data$row_number+1]))) | ((data$time_prevalence_int4<=0 | is.na(data$time_prevalence_int4)) & data$time_prevalence_int3>0))| ((data$Sum_Intervention3[row_number]=='control' | data$Sum_Intervention3[data$row_number_smaller]=='control') & data$prevalence_YN[row_number]=='WAHR')))
data_int4_prev<-subset(data, ((data$time_prevalence_int4[data$row_number+1]>0 | ((data$study_number[data$row_number] != data$study_number[data$row_number + 1])& data$time_prevalence_int4[data$row_number]>0)) & ((data$time_prevalence_int4<=0 & (data$time_prevalence_int5[data$row_number+1]<=0 | is.na(data$time_prevalence_int5[data$row_number+1]))) | ((data$time_prevalence_int5<=0 | is.na(data$time_prevalence_int5)) & data$time_prevalence_int4>0))| ((data$Sum_Intervention4[row_number]=='control' | data$Sum_Intervention4[data$row_number_smaller]=='control') & data$prevalence_YN[row_number]=='WAHR')))
#######
#first intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int1_prev$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_prevalence_int1<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_prevalence_int1>-6])}
#for (val in k)
#{print(data$time_prevalence_int1[data$ID==val])}
data_int1_prev_extra<-data_int1_prev
for (val in k)
{data_int1_prev_extra<-rbind(data_int1_prev_extra,data[data$ID==val,])}
data_int1_prev_extra<-unique(data_int1_prev_extra)
data_int1_prev<-data_int1_prev_extra
data_int1_prev$Intervention<-data_int1_prev$Sum_Intervention1
data_int1_prev$Intervention<- as.character(data_int1_prev$Intervention)
data_int1_prev$Intervention<- as.factor(data_int1_prev$Intervention)
data_int1_prev$implementation_start<-data_int1_prev$implementation1_start
data_int1_prev$implementation_end<-data_int1_prev$implementation1_end
data_int1_prev$coverage<-data_int1_prev$coverage1
data_int1_prev$study_number_new<-data_int1_prev$study_number
data_int1_prev$time_prevalence<-data_int1_prev$time_prevalence_int1
data_int1_prev$transmission_falc<-data_int1_prev$trans_falc_int1
data_int1_prev$transmission_viv<-data_int1_prev$trans_viv_int1
#######
#second intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int2_prev$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_prevalence_int2<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_prevalence_int2>-6])}
#for (val in k)
#{print(data$time_prevalence_int2[data$ID==val])}
data_int2_prev_extra<-data_int2_prev
for (val in k)
{data_int2_prev_extra<-rbind(data_int2_prev_extra,data[data$ID==val,])}
data_int2_prev_extra<-unique(data_int2_prev_extra)
data_int2_prev<-data_int2_prev_extra
data_int2_prev$Intervention<-data_int2_prev$Sum_Intervention2
data_int2_prev$Intervention<- as.character(data_int2_prev$Intervention)
data_int2_prev$Intervention<- as.factor(data_int2_prev$Intervention)
data_int2_prev$coverage<-data_int2_prev$coverage2
data_int2_prev$implementation_start<-data_int2_prev$implementation2_start
data_int2_prev$implementation_end<-data_int2_prev$implementation2_end
data_int2_prev$study_number_new<-data_int2_prev$study_number + 1*182
data_int2_prev$time_prevalence<-data_int2_prev$time_prevalence_int2
data_int2_prev$transmission_falc<-data_int2_prev$trans_falc_int2
data_int2_prev$transmission_viv<-data_int2_prev$trans_viv_int2
#######
#third intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int3_prev$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_prevalence_int3<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_prevalence_int3>-6])}
#for (val in k)
#{print(data$time_prevalence_int3[data$ID==val])}
data_int3_prev_extra<-data_int3_prev
for (val in k)
{data_int3_prev_extra<-rbind(data_int3_prev_extra,data[data$ID==val,])}
data_int3_prev_extra<-unique(data_int3_prev_extra)
data_int3_prev<-data_int3_prev_extra
data_int3_prev$Intervention<-data_int3_prev$Sum_Intervention3
data_int3_prev$Intervention<- as.character(data_int3_prev$Intervention)
data_int3_prev$Intervention<- as.factor(data_int3_prev$Intervention)
data_int3_prev$coverage<-data_int3_prev$coverage3
data_int3_prev$implementation_start<-data_int3_prev$implementation3_start
data_int3_prev$implementation_end<-data_int3_prev$implementation3_end
data_int3_prev$study_number_new<-data_int3_prev$study_number + 2*182
data_int3_prev$time_prevalence<-data_int3_prev$time_prevalence_int3
data_int3_prev$transmission_falc<-data_int3_prev$trans_falc_int3
data_int3_prev$transmission_viv<-data_int3_prev$trans_viv_int3
#######
#fourth intervention: this code is to find more baseline data points but only up until 6 months before intervention
s<-unique(data_int4_prev$study_number)
w<-c()
for (val in s)
{w<-c(w,data$ID[data$study_number==val])}
g<-c()
for (val in w)
{g<-c(g,data$ID[data$ID==val&data$time_prevalence_int4<=0])}
k<-c()
for (val in g)
{k<-c(k,data$ID[data$ID==val&data$time_prevalence_int4>-6])}
#for (val in k)
#{print(data$time_prevalence_int4[data$ID==val])}
data_int4_prev_extra<-data_int4_prev
for (val in k)
{data_int4_prev_extra<-rbind(data_int4_prev_extra,data[data$ID==val,])}
data_int4_prev_extra<-unique(data_int4_prev_extra)
data_int4_prev<-data_int4_prev_extra
data_int4_prev$Intervention<-data_int4_prev$Sum_Intervention4
data_int4_prev$Intervention<- as.character(data_int4_prev$Intervention)
data_int4_prev$Intervention<- as.factor(data_int4_prev$Intervention)
data_int4_prev$coverage<-data_int4_prev$coverage4
data_int4_prev$implementation_start<-data_int4_prev$implementation4_start
data_int4_prev$implementation_end<-data_int4_prev$implementation4_end
data_int4_prev$study_number_new<-data_int4_prev$study_number + 3*182
data_int4_prev$time_prevalence<-data_int4_prev$time_prevalence_int4
data_int4_prev$transmission_falc<-data_int4_prev$trans_falc_int4
data_int4_prev$transmission_viv<-data_int4_prev$trans_viv_int4
#rbind the datasets together
data_new_prev<-rbind(data_int1_prev, data_int2_prev, data_int3_prev, data_int4_prev)
#excluding metseelar for MDA because timing cant be pinpointed well enough
data_new_prev<-subset(data_new_prev, study_number_new!=225)
data_new_prev<-subset(data_new_prev, study_number_new!=407)
#exclude the ones that either have no cases before or no cases after the implementation of interventions
data_new_prev<-subset(data_new_prev, study_number_new!=38)
data_new_prev<-subset(data_new_prev, study_number_new!=22)
data_new_prev<-subset(data_new_prev, study_number_new!=170)
data_new_prev<-subset(data_new_prev, study_number_new!=182)
data_new_prev<-subset(data_new_prev, study_number_new!=167)
data_new_prev<-subset(data_new_prev, study_number_new!=168)
data_new_prev<-subset(data_new_prev, study_number_new!=171)
data_new_prev<-subset(data_new_prev, study_number_new!=176)
data_new_prev<-subset(data_new_prev, study_number_new!=204)
data_new_prev<-subset(data_new_prev, study_number_new!=364)
#recoding of coverage
#LLIN
data_new_prev$coverage[data_new_prev$coverage=="'high'"]<-'high'
data_new_prev$coverage[data_new_prev$coverage=='0.94 nets per villager']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='10% of commuity got nets and then looked at these (so no community effect) compared to other people. So in people who were in bed net group all of them had nets']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='375 nets for 580 people']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='52% bought net']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='56% used nets (before only 33%), but 80% reported to having rceived a net, but only 60% of remporary workers']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='59% bought net']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='68% bought net']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='71% covered with nets afterwards but only 27% use']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='91% covered afterwards but only 60% use']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='91% covered but only 61% use']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='92% covered but only 49% use']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='92% covered but only 72% use']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='94.9% reported use of nets']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='at least one LLIN per household but tried 1 per 2 people']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='each child received one net, bigger if slept with more than 2 people, only covered and also only tested children']<-'high'
data_new_prev$coverage[data_new_prev$coverage=="high ('nearly universal')"]<-'high'
data_new_prev$coverage[data_new_prev$coverage=='high, 98.6 reported using net in final survey']<-'high'
data_new_prev$coverage[data_new_prev$coverage=="missing, MP at villlage and villages not too big, so covers all?, says 'LLINs were distributed to all hosueholds at M0'"]<-'high'
data_new_prev$coverage[data_new_prev$coverage=='selfreported use above 90%']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='39% bought net']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='LLINs to all households']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='104']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='105']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='117']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='118']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='12']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='122']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='18']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='20']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='37']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='38']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='42']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='55']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='7']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='74']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='80']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='92']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='96']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='98']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='36.70%']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='38.70%']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='43.30%']<-'low'
#MDA, cut of 90% at least one round
data_new_prev$coverage[data_new_prev$coverage=='29% three rounds, 18% two, 23% one']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='57% three rounds, 18% two, 17% one']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='90.9% at least one round, 69.5% two and 52.5% three rounds']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='97.2% (93.1-100)']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='adults 72%,76% and 78%fully treated, children 85%, 84% 85% fully treated, also some partially treated']<-'low'
data_new_prev$coverage[data_new_prev$coverage=='all of the children that were followed completed treatment (but other people in village were not covered)']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='average attendance 90%']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='38.6% at least one dose']<-'low'
#increased case management
data_new_prev$coverage[data_new_prev$coverage=='mean screening 89%']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='mean screening87%']<-'high'
#net retreatment
data_new_prev$coverage[data_new_prev$coverage=='64% (for incidenc eonly looked at families who retreated in 94 so would be 100% there)']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='87%']<-'high'
#IRS
data_new_prev$coverage[data_new_prev$coverage=='95% (85%-100%)']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='95.3% of houses sprayed and second round 87%']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='96%']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='97%']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='97% (92-100%)']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='all houses']<-'high'
data_new_prev$coverage[data_new_prev$coverage=='always mor ethan 90%']<-'high'
data_new_prev$coverage<-as.factor(data_new_prev$coverage)
data_new_prev$coverage<-as.character(data_new_prev$coverage)
x<-c(1:length(data_new_prev$Intervention))
for (val in x){
if(data_new_prev$Intervention[val]=='control'){data_new_prev$coverage[val]<-data_new_prev$coverage[val+1]}}
data_new_prev$coverage<-as.factor(data_new_prev$coverage)
#see if the the season of survey span different seasons: for prevalence
data_new_prev$season_span<-data_new_prev$season_survey_prevalence
data_new_prev$season_span<-as.character(data_new_prev$season_span)
data_new_prev$season_span<-'NA'
x<-c(1:max(data_new_prev$study_number_new))
for (val in x)
{n<-c()
n<-unique(data_new_prev$season_survey_prevalence[data_new_prev$study_number_new==val])
n<-paste(n,collapse=" ")
data_new_prev$season_span[data_new_prev$study_number_new==val]<-as.character(n)}
data_new_prev$season_span[data_new_prev$season_span=='wet dry']<- 'yes'
data_new_prev$season_span[data_new_prev$season_span=='dry wet']<- 'yes'
data_new_prev$season_span[data_new_prev$season_span=='dry both']<- 'yes'
data_new_prev$season_span[data_new_prev$season_span=='dry wet both']<- 'yes'
data_new_prev$season_span[data_new_prev$season_span=='both dry']<- 'yes'
data_new_prev$season_span[data_new_prev$season_span=='wet']<- 'no'
data_new_prev$season_span[data_new_prev$season_span=='dry']<- 'no'
data_new_prev$season_span[data_new_prev$season_span=='wet both']<- 'yes'
data_new_prev$season_span[data_new_prev$season_span=='both']<- 'no'
data_new_prev$season_span[data_new_prev$season_span=='missing both']<- 'unknown'
data_new_prev$season_span[data_new_prev$season_span=='missing']<- 'unknown'
data_new_prev$season_span[data_new_prev$season_span=='missing wet']<- 'unknown'
data_new_prev$season_span<-as.factor(data_new_prev$season_span)
#quantify transmission: 4 categories
data_new_prev$transmission_study<-paste(data_new_prev$transmission_falc,data_new_prev$transmission_viv)
data_new_prev$transmission_study<-as.factor(data_new_prev$transmission_study)
#exclude studies that are not included
#LOHA master study number 51: the one where there is LLIN and IRS pretty close together
data_new_inc$ID[data_new_inc$study_number_new==149]
data_new_inc$row_number<-seq.int(nrow(data_new_inc))
b<-data_new_inc$row_number[data_new_inc$study_number_new==149]
data_new_inc<-data_new_inc[-c(b),]
data_new_inc$row_number<-seq.int(nrow(data_new_inc))
#exclude this LOHA one with master study number 51 for IRS as well...
data_new_inc$ID[data_new_inc$study_number_new==315]
data_new_inc$row_number<-seq.int(nrow(data_new_inc))
b<-data_new_inc$row_number[data_new_inc$study_number_new==315]
data_new_inc<-data_new_inc[-c(b),]
data_new_inc$row_number<-seq.int(nrow(data_new_inc))
#because there are two 183 make ome-kaius one to 184!
v<-unique(data_new_prev[,c('first_authors',"study_number_new")])
v$study_number_new[duplicated(v$study_number_new)]
data_new_prev$study_number_new[data_new_prev$study_number_new==183&data_new_prev$first_authors=='Ome-Kaius, Maria']<-184
v<-unique(data_new_inc[,c('first_authors',"study_number_new")])
v$study_number_new[duplicated(v$study_number_new)]
remove(v)