-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_preparation_and_cleaning.R
814 lines (647 loc) · 57.6 KB
/
data_preparation_and_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
###########################################
## data_preparation_and_cleaning.r
##script for data cleaning and preparation
###########################################
# Load the study database: a semicolon-separated file whose first row holds
# the column names.
data <- read.csv("database_rel_impact.csv", header = TRUE, sep = ";")
# Overwrite the name of the first column (the imported name needs correcting).
names(data)[1] <- "first_authors"
## This script:
##  - recodes variables
##  - adds variables such as the time between data collection and intervention and the dot size for the plots
##  - converts incidence rates and prevalence rates into actual case numbers / numbers of people infected
##  - builds 'shorter_time', a data frame that shows all studies with two interventions within 6 months
##  - adds mixed cases to vivax and falciparum
# Convert every date column from text (day.month.year, e.g. "31.12.2010")
# to Date: the incidence/prevalence collection windows and the start/end of
# implementations 1 to 9.
date_columns <- c(
  "date_start_incidence", "date_end_incidence",
  "date_start_prevalence", "date_end_prevalence",
  paste0("implementation", rep(seq_len(9), each = 2), c("_start", "_end"))
)
data[date_columns] <- lapply(data[date_columns], as.Date, "%d.%m.%Y")
# Drop the helper so the workspace only contains `data`.
rm(date_columns)
# Midpoint of each implementation window: start + half of (end - start).
# The result is stored as implementation1 ... implementation9.
for (impl_idx in seq_len(9)) {
  impl_start <- data[[paste0("implementation", impl_idx, "_start")]]
  impl_end <- data[[paste0("implementation", impl_idx, "_end")]]
  data[[paste0("implementation", impl_idx)]] <-
    impl_start + ((impl_end - impl_start) / 2)
}
rm(impl_idx, impl_start, impl_end)

# Diagnostic (printed when run at top level): study numbers whose first
# implementation has a start date but no midpoint, i.e. the end date is
# missing. Only implementation 1 is checked here.
data$study_number[!is.na(data$implementation1_start) & is.na(data$implementation1)]

# Midpoints of the incidence and prevalence data collection windows.
data$date_incidence <- data$date_start_incidence +
  ((data$date_end_incidence - data$date_start_incidence) / 2)
data$date_prevalence <- data$date_start_prevalence +
  ((data$date_end_prevalence - data$date_start_prevalence) / 2)
#converting characters to factors for: incidence_diagnostics, mode_control and interventions
# Intervention1 ... Intervention9 are stored as factors.
for (intervention_col in paste0("Intervention", seq_len(9))) {
  data[[intervention_col]] <- as.factor(data[[intervention_col]])
}
rm(intervention_col)
data$incidence_diagnostics <- as.factor(data$incidence_diagnostics)

#recoding mode_control
# Merge the two spellings of the combined category. The recode is done on a
# character copy: assigning a label to a factor only works when that label is
# already one of its levels (otherwise R inserts NA and warns), so recoding
# the factor in place would depend on 'drug and vector' already occurring in
# the data.
mode_control_chr <- as.character(data$mode_control)
mode_control_chr[mode_control_chr %in% 'vector and drug'] <- 'drug and vector'
# as.factor() derives the levels from the values present, so the merged-away
# spelling does not remain as an empty level.
data$mode_control <- as.factor(mode_control_chr)
rm(mode_control_chr)
#recoding of seasonality with a cut off at 0.6
# The recognised index values lie on a 0.2 grid from 0.2 to 2.0: values up to
# and including 0.6 are 'low', values from 0.8 are 'high'. Empty or NA entries
# are 'missing'. Anything else is left NA by the final factor() call.
#
# The column is parsed to numeric once and classified from that, instead of
# comparing a (possibly text) column against numeric literals. This way
# entries such as "1.0" or "0.2 " in a text column are recognised too.
seasonality_txt <- as.character(data$seasonality)
# Non-numeric entries turn into NA here on purpose (hence the suppressed
# coercion warning); they are dealt with below.
seasonality_num <- suppressWarnings(as.numeric(seasonality_txt))

seasonality_lab <- rep(NA_character_, length(seasonality_txt))
seasonality_lab[seasonality_num %in% c(0.2, 0.4, 0.6)] <- 'low'
seasonality_lab[seasonality_num %in% c(0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0)] <- 'high'
seasonality_lab[is.na(seasonality_txt) | seasonality_txt %in% ''] <- 'missing'

data$seasonality <- factor(seasonality_lab, levels = c("low", "high", "missing"))
rm(seasonality_txt, seasonality_num, seasonality_lab)
# Recode the season of the incidence survey into dry / wet / both / missing.
# Labels outside the listed spellings and the four target levels become NA
# in the final factor.
season_inc <- as.character(data$season_survey_incidence)

season_inc[is.na(season_inc) | season_inc %in% ''] <- 'missing'
season_inc[season_inc %in% c(
  'non_monsoon',
  'non-monsoon',
  'dry (non-monsoon)',
  'dry and following monsoon',
  'following monsoon'
)] <- 'dry'
season_inc[season_inc %in% c('monsoon', 'moderate wet')] <- 'wet'
season_inc[season_inc %in% 'both monson and non-monsoon'] <- 'both'

data$season_survey_incidence <- factor(
  season_inc,
  levels = c("dry", "wet", "both", "missing")
)
rm(season_inc)
# Recode the season of the prevalence survey into dry / wet / both / missing.
# Labels outside the listed spellings and the four target levels become NA
# in the final factor.
# NOTE(review): unlike the incidence recode, 'monsoon', 'non_monsoon' and
# 'moderate wet' are not mapped here, so any such entries would end up NA.
# Confirm that these spellings do not occur in this column.
season_prev <- as.character(data$season_survey_prevalence)

season_prev[is.na(season_prev) | season_prev %in% ''] <- 'missing'
season_prev[season_prev %in% c(
  'dry/cool',
  'post-monsoon',
  'cool season (=dry?)',
  'dry (non-monsoon)',
  'dry and following monsoon',
  'following monsoon',
  'non-monsoon'
)] <- 'dry'
season_prev[season_prev %in% c(
  'both monson and non-monsoon',
  'both monsoon and non-monsoon'
)] <- 'both'

data$season_survey_prevalence <- factor(
  season_prev,
  levels = c("dry", "wet", "both", "missing")
)
rm(season_prev)
# Harmonise the spellings of the diagnostic tool used for incidence data.
# Empty entries and "doesn't say" are grouped as 'missing'; NA is left as NA.
diagnostics <- as.character(data$incidence_diagnostics)

diagnostics[diagnostics %in% c('', "doesn't say")] <- 'missing'
diagnostics[diagnostics %in% 'microscopy or RDT'] <- 'RDT or microscopy'
diagnostics[diagnostics %in% c('LM', 'missing (assume microscopy)')] <- 'microscopy'
diagnostics[diagnostics %in% 'RDT positive and then species determination with LM'] <-
  'RDT then microscopy'

# Levels are taken from the labels that actually occur after recoding.
data$incidence_diagnostics <- as.factor(diagnostics)
rm(diagnostics)
#recoding of age groups for age_incidence and age_prevalence
#incidence
# Work on a character copy, as the other recodes in this script do. If the
# column arrives as a factor, assigning labels that are not yet levels would
# insert NA (with a warning) instead of the new label.
age_inc <- as.character(data$age_incidence)

age_inc[age_inc %in% c('all ages above 6 months', 'all ages')] <- 'age_all'
age_inc[age_inc %in% c('1-3 yo', '<5')] <- 'age_5'
age_inc[age_inc %in% 'below 10'] <- 'age_10'
age_inc[age_inc %in% 'above 10'] <- 'age_10+'
age_inc[age_inc %in% ''] <- 'missing'

# Fixed level order; any label not listed becomes NA.
data$age_incidence <- factor(
  age_inc,
  levels = c("age_5", "age_10", "age_10+", "age_all", "missing")
)
rm(age_inc)
# Recode the age groups of the prevalence surveys into five categories.
# Labels that are neither listed below nor already one of the target levels
# (including empty entries) become NA in the final factor.
age_prev <- as.character(data$age_prevalence)

age_prev[age_prev %in% c(
  'all ages above 6 months',
  'all ages above 5 months',
  'all ages'
)] <- 'age_all'
age_prev[age_prev %in% c(
  '4-15 yo',
  'pre-school children',
  '1-3 yo',
  '1-3yo',
  'under 10',
  '0-14',
  '1-9 year olds',
  '5-10 year olds',
  'below 10',
  '5-15 year old school children',
  '5-15 years, age adjusted',
  'age below 10',
  'schoolchildren'
)] <- 'age_15'
age_prev[age_prev %in% 'above 15'] <- 'age_15+'
age_prev[age_prev %in% 'above 2 year olds'] <- 'age_2+'
age_prev[age_prev %in% 'all ages above 5 year olds'] <- 'age_5+'

data$age_prevalence <- factor(
  age_prev,
  levels = c("age_2+", "age_5+", "age_15", "age_15+", "age_all")
)
rm(age_prev)
# Time from the midpoint of each intervention (1 to 9) to the midpoint of the
# incidence and prevalence data collection, expressed in months: the
# difference in days is divided by the average month length (365.25 / 12).
# The columns are still difftime objects at this point.
days_per_month <- 365.25 / 12
for (impl_idx in seq_len(9)) {
  impl_mid <- data[[paste0("implementation", impl_idx)]]
  data[[paste0("time_incidence_int", impl_idx)]] <-
    (data$date_incidence - impl_mid) / days_per_month
  data[[paste0("time_prevalence_int", impl_idx)]] <-
    (data$date_prevalence - impl_mid) / days_per_month
}
rm(days_per_month, impl_idx, impl_mid)
# Create Sum_Intervention1 ... Sum_Intervention9 as copies of the raw
# Intervention columns; the copies are the ones that get categorised.
for (impl_idx in seq_len(9)) {
  data[[paste0("Sum_Intervention", impl_idx)]] <-
    data[[paste0("Intervention", impl_idx)]]
}
rm(impl_idx)
# Summarise the first intervention: collapse the free-text descriptions in
# Sum_Intervention1 into a small set of categories ('LLIN 1st dist',
# 'LLIN later dist', 'LLIN', 'untreated nets', 'IRS 1st time',
# 'IRS repeated time', 'IRS', 'MDA 1st', 'increased case management',
# 'control'). Each line replaces one exact description; descriptions that are
# not listed are kept unchanged.
# Work on character values so that new labels can be assigned freely.
data$Sum_Intervention1<- as.character(data$Sum_Intervention1)
# --- treated nets (LLIN / ITN) ---
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN 1st round'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN 2nd round'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN 3rd round'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (5-yearly)'] <- 'LLIN'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (repeated probably cause 65% had at least one per hosuehold already)'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (not first distribution, 1 net per individual)'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITN (permethrin) (80% used nets before, there were distributions before, probably untreated)'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITN (permethrin treated, first time, or not mentioned if happened before)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITNs (first time deltamethrin)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'treated nets (alphacypermethrin) (no mention of distribution before)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== "introduction benets/Permethrin-net re impregnation (first time or don't mentioned it happened before)"] <- 'LLIN 1st dist'
# --- untreated nets ---
data$Sum_Intervention1[data$Sum_Intervention1== 'untreated nets (76% used nets before, there were dirtsibutions before)'] <- 'untreated nets'
data$Sum_Intervention1[data$Sum_Intervention1== 'untreated nets (no mention of any distribution before)'] <- 'untreated nets'
data$Sum_Intervention1[data$Sum_Intervention1== 'untreated nets, probably first dist'] <- 'untreated nets'
# --- treated nets (LLIN / ITN), continued ---
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (used nets before (11%), not sure which round)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (had ITNs before but first state wide distribution, before could get ITN distributed from PHC)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN 2nd round(one was done 6 years previous)'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN distribution (first)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITN (does not speak of a round of distribution before, but not sure if they had nets before)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITN (permethrin or lambdacyhalothrin), offering of reatreatment in following years, probably first dist'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITN (permethrin or lambdacyhalothrin), offering of reatreatment in following years, probably first distribution'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'ITN (some already distributde before but only low coverage, so first time?)'] <- 'LLIN 1st dist' # this is the Chaves one in Vanuatu
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN distribution (not sure which round, had nets before, no mention of previous distributipon but might have happened)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (seemed to hae nets before and distributed them, not sure how extensivelxy)'] <- 'LLIN'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN distribution (probably first round)'] <- 'LLIN 1st dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'later time LLIN'] <- 'LLIN later dist'
data$Sum_Intervention1[data$Sum_Intervention1== 'LLIN (none before mentioned)'] <- 'LLIN 1st dist'
# --- indoor residual spraying (IRS) ---
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (restarted with deltamethrin)'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS biannualy with DDT (continued IRS) until 78 then irregular'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== 'first time IRS (twice a year DDT)'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (some areas received before otherwise first, 2.3 spraying per year avergae, DDT)'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (some areas received before otherwise first, 2.0 spraying per year avergae, DDT)'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== 'spraying (alphacypermethrin suspeincion concentrate), spraying happened in pakistan before but doesnt say anything about this area'] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'spraying (alphacypermethrin wettable powder), spraying happened in pakistan before but doesnt say anything about this area'] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (lambdacyhalothrin, malathion was used before, so repeated)'] <- 'IRS repeated time'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (malathion, repeated spraying)'] <- 'IRS repeated time'
data$Sum_Intervention1[data$Sum_Intervention1== 'spraying with lambda-cyhalotrin 10% WP (first time with lambda-cyhalotrin but DDT before)'] <- 'IRS repeated time'
data$Sum_Intervention1[data$Sum_Intervention1== 'spraying (malathion or lambdacyhalothrin) (probably first time, but very unsure)'] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'DDT spraying (no spraying in previous year, one spraiyng in feb and one in sept 87, had spraying at some point before)'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== "DDT spraying two rounds one aug/sep the second nov/dec, dont know if any spraying happened the year before, if it did, then not mentioned, but 'control' villages were also sprayed by state health authorities, so could have happened before)"] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (as spraying done by government could be that not first time, one in july 13 and sept 13)'] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (none in year before so first?, sep 14, july 15, july 16)'] <- 'IRS 1st time'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS (nov 13, apr 14, april 15, august 15) not clear if there was sprayin before or not)'] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'IRS with DDT (yearly, june 09 with DDT and july 10 with deltamethrin, there was some spraying nationally but doesnt say if in this area or not. So we dont know if first time)'] <- 'IRS'
data$Sum_Intervention1[data$Sum_Intervention1== 'in 87 DDT two rounds, probably not first time and previous spraying the year before?, then in 88 one spraying with ICON)'] <- 'IRS'
# --- mass drug administration (MDA) ---
data$Sum_Intervention1[data$Sum_Intervention1== 'AL(over 3 days) and CQ (over 3 days 25mg/kg total dose) and PQ 20 days (no mention of it being done before)'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'AL(over 3 days) and CQ (over 3 days 25mg/kg total dose) and placebo (no mention of it being done before)'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA (chloroquine and primaquine, three day course (subtherapeutic dose of primaquine for exo-erythorcyte stages of Pv), adults 600 mg,450,450 of chloroquine, and 15 mg of primaquine each day), did MDA in 1967 before but not since 1969'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA(chloroquine, primaquine, pyrimethamine, sulfadoxine,) once a week for nine weeks, week 1,5,9: 45mg primaquine, 600mg chloroquine, 1500mg sulfadoxine and 3 tablets of 75mg pyrimethamine, other weeks only45 mg primaquine and 300 mg chloroquine, no mention of any MDA before this'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'treatment (chloroquine and sulfadoxine/pyrimethamine) (first time)'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA (dihydroartemisinin (7mg/kg) and piperaquine tetraphosphate (55mg/kg) every month three days for three months, probably first time)'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== "MDA (might be first time, doesn't really say anything, doesn't say what drug used)"] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA (three rounds,daily dosage adults for 5 days: plasmochine (30mg) and quinine sulphate (900mg), twice daily, 3x every three weeks), did a study the year before not sure if at same scale. Then would be repeated MDA)'] <- 'MDA 1st'
# The next description contains a byte-escaped non-ASCII character
# (\342\200\246); it has to stay exactly as written to match the data.
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA every four weeks, elevn treatment rounds, chloroquine, if above 45 kg then a dose of 450mg (also diethylcarbamazine for filariasis), measured after 10 rounds (nothing like this before mentioned\342\200\246 but kinda over a long time? So repeated or not?)'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'three monthly rounds MDA (3 days of dihydroartemisinin-piperaquine 7.5/60mg/kg) only people above 14 years'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA (3 times a month apart, dihydroartemisnin (7mg per kg) and piperaquine (55mg per kg), 3 days once per day, probably first time apart from maybe pilot'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA 600 mg chloroquine (and 45mg primaquine where Pf is predominant), probably first time'] <- 'MDA 1st'
data$Sum_Intervention1[data$Sum_Intervention1== 'MDA (low dose primaquine, dihydroartemisinin-piperaquine, three rounds, 1 monthly, no mention of any MDA done before'] <- 'MDA 1st'
# --- screening, testing and treatment ---
data$Sum_Intervention1[data$Sum_Intervention1== 'more screening (weekly screening)'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'test and treat'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'community health workers that diagnosed and treated and also could give LLINs'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'village malaria workers scale up'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'first mass screening and treating (DHP and primaquine for Pf and primaquine over 14 days for Pv)'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'early diagnosis and access to treatment via malaria posts whcih also provided LLIN to all'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'increased case management (community agent that detects and treats, passive case detection though, but before needed to go 25km to next village)'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'aggressive active case detection: monthly testing and treating of around 60-90% of population, by 2000 15 trained in 2001 39)'] <- 'increased case management'
data$Sum_Intervention1[data$Sum_Intervention1== 'increased case management: health posts with health extension workers, village level'] <- 'increased case management'
# --- control arms and 'none' ---
data$Sum_Intervention1[data$Sum_Intervention1== 'control to ITN (permethrin treated)'] <- 'control'
data$Sum_Intervention1[data$Sum_Intervention1== 'control to ITNs (first time deltamethrin)'] <- 'control'
data$Sum_Intervention1[data$Sum_Intervention1== 'control to nets'] <- 'control'
data$Sum_Intervention1[data$Sum_Intervention1== 'control to spraying'] <- 'control'
data$Sum_Intervention1[data$Sum_Intervention1== 'none'] <- 'control'
# Store the result as a factor. The factor -> character -> factor round trip
# has no further effect: the levels are the labels present after recoding.
data$Sum_Intervention1<- as.factor(data$Sum_Intervention1)
data$Sum_Intervention1<- as.character(data$Sum_Intervention1)
data$Sum_Intervention1<- as.factor(data$Sum_Intervention1)
# Summarise the second intervention: collapse the free-text descriptions in
# Sum_Intervention2 into summary categories via a lookup table
# (column 1 = exact description, column 2 = category). Descriptions that are
# not listed are kept unchanged. Note that an explicit 'none' is counted as
# 'control', whereas an empty entry becomes 'none'.
int2_labels <- as.character(data$Sum_Intervention2)
int2_recode <- matrix(c(
  # treated nets (LLIN / ITN)
  'LLIN 2nd round', 'LLIN later dist',
  'LLIN 3rd round', 'LLIN later dist',
  'LLIN (replacing older ones)', 'LLIN later dist',
  'ITN (retreated yearly) no mention of any ITN provdied before this time', 'LLIN 1st dist',
  'LLIN distribution (first)', 'LLIN 1st dist',
  'LLIN 2nd distribution', 'LLIN later dist',
  'LLIN 2nd dist', 'LLIN later dist',
  'second ITN distribution', 'LLIN later dist',
  'ITN distribution (not first time)', 'LLIN later dist',
  'ITN scale up (no big scale up before, more gradual)', 'LLIN 1st dist',
  'permethrin-impregnated nets (many owned nets before, not sure when and how they got them)', 'LLIN',
  'LLIN (not known which round)', 'LLIN',
  'later time LLIN', 'LLIN later dist',
  # net retreatment
  'retreatment', 'net retreatment',
  'Permethrin-net re impregnation', 'net retreatment',
  'first retreatment', 'net retreatment',
  # indoor residual spraying (IRS)
  'IRS (on a larger scale)', 'IRS repeated time',
  'IRS (itensification, also doesnt specify what was before and what this intensification exactly was, spraying with cypermethrin)', 'IRS repeated time',
  'IRS (none in year before so first?, sep 14, july 15, july 16)', 'IRS 1st time',
  'IRS (pyrtehroid, repeated, before they used DDT, so there was spraying the year before)', 'IRS repeated time',
  # screening, testing and treatment
  'test and treat', 'increased case management',
  'increased case management: health posts with health extension workers, village level', 'increased case management',
  'second mass screening and treating (DHP and primaquine for Pf and primaquine over 14 days for Pv)', 'increased case management',
  # mass drug administration (MDA)
  'MDA (low dose primaquine, dihydroartemisinin-piperaquine, three rounds, 1 monthly, no mention of any MDA done before', 'MDA 1st',
  'MDA chloeoquine and pyrimethamine, whenever spraying happened, for three years (68-70), pyrimethamine and chloroquine, first time)', 'MDA 1st',
  'second of three monthly MDA (3 days of dihydroartemisinin-piperaquine 7.5/60mg/kg) only people above 14 years', 'MDA repeated',
  # considered repeated because the measurement was taken after the last
  # dose and the spacing between doses is not known
  'MDA (450 mgchloroquine, 50 mg pyrimethamine, 4 doses in 1958)', 'MDA repeated',
  'MDA 600 mg chloroquine (and 45mg primaquine where Pf is predominant), repeated time', 'MDA repeated',
  # control arms, explicit 'none' and empty entries
  'control to starting of anual retreating', 'control',
  'none', 'control',
  '', 'none'
), ncol = 2, byrow = TRUE)

# Replace every description found in the table; leave everything else as is.
int2_row <- match(int2_labels, int2_recode[, 1])
int2_found <- !is.na(int2_row)
int2_labels[int2_found] <- int2_recode[int2_row[int2_found], 2]

data$Sum_Intervention2 <- as.factor(int2_labels)
rm(int2_labels, int2_recode, int2_row, int2_found)
# Summarise intervention 3: map free-text descriptions to summary categories.
# Each row of the table is: raw text, category. A recorded 'none' becomes
# 'control'; an empty field and 'window screening' both become 'none'.
data$Sum_Intervention3 <- as.character(data$Sum_Intervention3)
int3_recode <- matrix(c(
  'LLIN 3rd round', 'LLIN later dist',
  'LLIN 2nd round(replacing of old nets)', 'LLIN later dist',
  'second retreatment', 'net retreatment',
  'third of three monthly MDA (3 days of dihydroartemisinin-piperaquine 7.5/60mg/kg) only people above 14 years', 'MDA repeated',
  'MDA (adults: 450mg chloroquine,50 mg pyrimethamine, 2 doses in 1959)', 'MDA repeated',
  'none', 'control',
  'control to retreat', 'control',
  '', 'none',
  'window screening', 'none'
), ncol = 2, byrow = TRUE)
int3_hit <- match(data$Sum_Intervention3, int3_recode[, 1])
int3_found <- !is.na(int3_hit)
data$Sum_Intervention3[int3_found] <- int3_recode[int3_hit[int3_found], 2]
data$Sum_Intervention3 <- as.factor(as.character(data$Sum_Intervention3))
# Summarise intervention 4: map free-text descriptions to summary categories.
# Each row of the table is: raw text, category. An empty field becomes 'none'.
data$Sum_Intervention4 <- as.character(data$Sum_Intervention4)
int4_recode <- matrix(c(
  'control to retreatment', 'control',
  'third retreatment', 'net retreatment',
  '', 'none'
), ncol = 2, byrow = TRUE)
int4_hit <- match(data$Sum_Intervention4, int4_recode[, 1])
int4_found <- !is.na(int4_hit)
data$Sum_Intervention4[int4_found] <- int4_recode[int4_hit[int4_found], 2]
data$Sum_Intervention4 <- as.factor(as.character(data$Sum_Intervention4))
# Baseline intervention as a yes/no/unknown flag: every row starts as 'yes',
# an empty field becomes 'unknown', and 'nothing' or 'none' becomes 'no'.
# Rows where the source value is NA keep 'yes'.
# NOTE(review): the source column is spelled `baselin_intervention` (no 'e');
# confirm this matches the real column name, otherwise every row stays 'yes'.
baseline_raw <- data$baselin_intervention
data$baseline_intervention_YN <- 'yes'
data$baseline_intervention_YN[baseline_raw == ''] <- 'unknown'
data$baseline_intervention_YN[baseline_raw %in% c('nothing', 'none')] <- 'no'
# Make the time columns numeric: time_incidence_int1..9 followed by
# time_prevalence_int1..9.
# NOTE(review): if any of these columns is a factor, as.numeric() returns the
# level codes rather than the printed values; confirm they are character or
# numeric at this point.
time_columns <- c(
  paste0('time_incidence_int', 1:9),
  paste0('time_prevalence_int', 1:9)
)
for (time_col in time_columns) {
  data[[time_col]] <- as.numeric(data[[time_col]])
}
# Code the study type as either 'trial' or 'observational'.
# Several designs that were trials on paper are coded 'observational' because
# only a before/after comparison or a single arm is used here (see labels).
# Labels not listed in either set are left unchanged.
study_design <- as.character(as.factor(data$study_type))
trial_labels <- c(
  'RCT (pilot)',
  'RCT',
  'trial but not randomized',
  'RCT in 1996, then follow up more observational'
)
observational_labels <- c(
  'RCT but couldnt use control because doesnt give prevalence, so before and after',
  'observational (before after)',
  'observational, before and after',
  'bservational, cross sectional',
  'observational: longitudinal',
  'RCT, but here used before and after as control is untreated nets and not no nets',
  'before-after',
  'longitudinal',
  'longitudinal??',
  'observational before after',
  'observational, cross sectional',
  'uncontrolled before and after study',
  'uncotrolled before and after',
  'RCT (but here only control used because compare LLIN and LLIN and repellent)',
  'trial but we look at control',
  'trial: no control though because compared one regimen with primaquine and one without',
  'RCT, but this is control group that received intervention later',
  'RCT (pilot), but took before time points',
  'trial but control group with untreated nets'
)
study_design[study_design %in% trial_labels] <- 'trial'
study_design[study_design %in% observational_labels] <- 'observational'
data$study_type <- as.factor(study_design)
# Code the relapse zones (from Battle et al.): replace the zone number by the
# zone name. Zone codes that are not listed are left as they are.
zone_codes <- c('3', '2', '12', '7', '10', '8', '11', '5')
zone_names <- c(
  'South America',
  'Central America',
  'Melanesia',
  'Sub Saharan Africa',
  'South East Asia',
  'Monsoon Asia',
  'Northern Europe and Asia',
  'Mediterranean and North Africa'
)
relapse_zone <- as.character(data$relapse_pattern_zone)
zone_hit <- match(relapse_zone, zone_codes)
zone_found <- !is.na(zone_hit)
relapse_zone[zone_found] <- zone_names[zone_hit[zone_found]]
data$relapse_pattern_zone <- as.factor(relapse_zone)
# Relapse pattern from White: only converted to a factor.
data$relapse_pattern_white <- as.factor(data$relapse_pattern_white)
# Recode the incidence sample size (the number of people that could get ill)
# from free text to an integer. Each row of the table is: raw text, digits.
# An empty replacement means "no usable number" and ends up as NA.
# Ranges are replaced by a single value (e.g. 120000-130000 -> 125000).
# NOTE(review): strtoi() returns NA for any remaining text that is not a plain
# integer, so unlisted free-text entries become NA without a warning.
size_text <- as.character(as.factor(data$sample_size_incidence_number_people))
size_recode <- matrix(c(
  '103799 people at risk', '103799',
  '106882 people at risk', '106882',
  '109827 people at risk', '109827',
  '112755 people at risk', '112755',
  '120457 pop mimika around apr 2004-mar 2006', '120457',
  '143723 pop mimika around apr2006-mar2008', '143723',
  '148124 pop mimika around apr 2008-dec 2009', '148124',
  '189447pop mimika around jan 2010-dec 2013', '189447',
  '31292 (population in district in 2009)', '31292',
  '96496 people at risk', '96496',
  'ariound 150000 people at risk', '150000',
  'a it below 130000 people at risk', '',
  'around 130000 people at risk', '130000',
  'around 135000 people at risk', '135000',
  'around 140000 people at risk', '140000',
  'around 145000 people at risk', '145000',
  'around 155000 people at risk', '155000',
  'around 160000 people at risk', '160000',
  'around 165000 people at risk', '165000',
  'around 170000 people at risk', '170000',
  'around 175000 people at risk', '175000',
  'around 180000 people at risk', '180000',
  'around 50000 residents', '50000',
  'because national: whople population: 2.7 million people', '2700000',
  'between 120000 and 130000 people at risk', '125000',
  'between 297 and 366 people', '332',
  'between 710 and 928 people', '819',
  'missing, say something abot how many people in subdiustrict but dont know what that means', '',
  'no exact numbers: in 86/87 around 3600 people, a lot of migration, porbably had less people before that', '',
  'none given, but at least 5000 catchement population', '5000',
  'not clear, this is a mixture of all the hotspots that got MDA, there are different follow up times. So the closer to time point zero the bigger the population size and at the end tehre will be less because not all have such a long follow up. In total 12465 people in hotspot village', '12465',
  'not know for the individual PHCs in total would be 51325', '',
  'missing', ''
), ncol = 2, byrow = TRUE)
size_hit <- match(size_text, size_recode[, 1])
size_found <- !is.na(size_hit)
size_text[size_found] <- size_recode[size_hit[size_found], 2]
# Empty strings are turned into NA before the integer conversion.
size_text[size_text %in% ''] <- NA
data$sample_size_incidence_number_people <- strtoi(size_text, base = 10L)
#######
## Add mixed infections to both the vivax and the falciparum figures
#######
# For every (species column, mixed column) pair below, rows that have a
# species value get the mixed value added to it; a missing mixed value counts
# as zero. Rows where the species value is NA stay NA, and the mixed columns
# themselves are never modified.
# The update is done per column instead of per row, so it also works on a
# data frame with zero rows (the former 1:length(...) loops did not).
# NOTE(review): the mixed value is added for every study, whatever
# `mixed_what_done` / `pr_what_done_mixed` says; confirm that the mixed
# columns are empty for studies that already counted mixed cases towards a
# species.
data$mixed_what_done <- as.factor(data$mixed_what_done)
data$pr_what_done_mixed <- as.factor(data$pr_what_done_mixed)
mixed_pairs <- list(
  # prevalence
  c("pr_numb_vivax_LM", "pr_numb_mixed_LM"),
  c("pr_numb_falciparum_LM", "pr_numb_mixed_LM"),
  c("pr_numb_vivax_PCR", "pr_numb_mixed_PCR"),
  c("pr_numb_falciparum_PCR", "pr_numb_mixed_PCR"),
  c("prevalence_vivax_LM", "prevalence_mixed_LM"),
  c("prevalence_falciparum_LM", "prevalence_mixed_LM"),
  c("prevalence_vivax_PCR", "prevalence_mixed_PCR"),
  c("prevalence_falciparum_PCR", "prevalence_mixed_PCR"),
  c("prevalence_vivax_RDT", "prevalence_mixed_RDT"),
  c("prevalence_falciparum_RDT", "prevalence_mixed_RDT"),
  # incidence
  c("incidence_rate_vivax", "incidence_rate_mixed"),
  c("incidence_rate_falciparum", "incidence_rate_mixed"),
  c("case_numbers_vivax", "case_numbers_mixed"),
  c("case_numbers_falciparum", "case_numbers_mixed")
)
for (mixed_pair in mixed_pairs) {
  species_col <- mixed_pair[1]
  has_species <- !is.na(data[[species_col]])
  # Skip columns with nothing to update so their type is left untouched.
  if (any(has_species)) {
    data[[species_col]][has_species] <-
      rowSums(data[has_species, mixed_pair, drop = FALSE], na.rm = TRUE)
  }
}
# Remove the Kessler study (study_number 48) because no vivax cases were found
# in that time.
# A logical mask is used instead of negative row indices: `data[-b, ]` with an
# empty `b` (study 48 absent, e.g. when this section is run twice) would drop
# every row, and an NA in `study_number` would make the negative index fail.
# Rows with a missing study_number are kept.
data <- data[!(data$study_number %in% 48), ]
data$row_number <- seq_len(nrow(data))
# Convert incidence rates into actual case numbers, for vivax and falciparum:
#   cases = rate * number of people * observed person-time factor
# The factor depends on the unit of the rate. A month is taken as 30 days and
# a year as 365 days. Rows whose unit is not listed below (or is NA) get NA.
# Done column-wise: the former row loops stopped with an error on an NA unit
# (`if (NA)`) and repeated the same formula fourteen times.
data$unit_incidence_rate <- as.character(data$unit_incidence_rate)

# Length of the observation window in days. The unit is fixed explicitly so
# the result does not depend on how the difference of the two date columns
# picks its unit.
obs_span <- data$date_end_incidence - data$date_start_incidence
if (inherits(obs_span, "difftime")) {
  units(obs_span) <- "days"
}
obs_days <- as.numeric(obs_span)

per_1000_months <- data$unit_incidence_rate %in% c(
  'cases per 1000 people per month',
  'cases per 1000 child-months',
  'cases per 1000 person months',
  'cases per 1000 per month'
)
per_1000_years <- data$unit_incidence_rate %in% 'cases per 1000 person years!'
per_person_years <- data$unit_incidence_rate %in% c(
  'cases per person per year',
  'episodes/child/year-at-risk'
)

person_time <- rep(NA_real_, nrow(data))
person_time[per_1000_months] <- obs_days[per_1000_months] / 30 / 1000
person_time[per_1000_years] <- obs_days[per_1000_years] / 365 / 1000
person_time[per_person_years] <- obs_days[per_person_years] / 365

data$inc_converted_vivax <- data$incidence_rate_vivax *
  data$sample_size_incidence_number_people * person_time
# for falciparum
data$inc_converted_falc <- data$incidence_rate_falciparum *
  data$sample_size_incidence_number_people * person_time
# Do the same for prevalence: convert prevalence (in percent) into the actual
# number of infections found, i.e. prevalence / 100 * sample size.
# For both species the diagnostic precedence is LM, then PCR, then RDT: a
# later method is only used where the earlier ones give no value.
# The vivax column used to be filled in the opposite order (RDT, PCR, LM),
# which disagreed with falciparum and with the LM-then-PCR order used for the
# reported infection numbers; both species now use the same order so that
# their counts in one row come from the same diagnostic where possible.
prev_sample <- data$sample_size_prevalence
test_order <- c("LM", "PCR", "RDT")

# for falciparum
falc_found <- rep(NA_real_, nrow(data))
for (test_name in test_order) {
  still_missing <- is.na(falc_found)
  candidate <- data[[paste0("prevalence_falciparum_", test_name)]] / 100 * prev_sample
  falc_found[still_missing] <- candidate[still_missing]
}
data$prev_converted_falc <- falc_found

# for vivax
viv_found <- rep(NA_real_, nrow(data))
for (test_name in test_order) {
  still_missing <- is.na(viv_found)
  candidate <- data[[paste0("prevalence_vivax_", test_name)]] / 100 * prev_sample
  viv_found[still_missing] <- candidate[still_missing]
}
data$prev_converted_viv <- viv_found
# Code categories for the incidence case numbers, which give the size of the
# dots in the plots.
# Column-wise version of the former row loops: no 1:length(...) index (which
# misbehaves on an empty data frame) and no elementwise `&` inside `if`.
data$row_number <- seq_len(nrow(data))
# vivax_new_inc and falciparum_new_inc are the actual case numbers found in
# incidence: the reported case numbers, or where those are missing the case
# numbers converted from the incidence rate.
data$vivax_new_inc <- data$case_numbers_vivax
viv_missing <- is.na(data$vivax_new_inc)
data$vivax_new_inc[viv_missing] <- data$inc_converted_vivax[viv_missing]
data$falciparum_new_inc <- data$case_numbers_falciparum
falc_missing <- is.na(data$falciparum_new_inc)
data$falciparum_new_inc[falc_missing] <- data$inc_converted_falc[falc_missing]
# The total is NA as soon as one of the two species is missing.
data$case_numbers_total_inc <- data$vivax_new_inc + data$falciparum_new_inc
# Dot size by total cases:
#   < 200 -> 0.75, 200-499 -> 1.0, 500-999 -> 1.25, 1000-1999 -> 1.5,
#   >= 2000 -> 1.75; a missing total keeps NA.
inc_breaks <- c(200, 500, 1000, 2000)
inc_dot_sizes <- c(0.75, 1.0, 1.25, 1.5, 1.75)
data$case_numbers_total_cat_inc <-
  inc_dot_sizes[findInterval(data$case_numbers_total_inc, inc_breaks) + 1L]
# Code for getting the dot size for the prevalence data.
# Column-wise version of the former row loops: no 1:length(...) index (which
# misbehaves on an empty data frame) and no elementwise `&` inside `if`.
data$row_number <- seq_len(nrow(data))
# Number of infections per species: the reported LM number, else the reported
# PCR number, else the number converted from the prevalence.
data$vivax_new_prev <- data$pr_numb_vivax_LM
for (fallback_col in c("pr_numb_vivax_PCR", "prev_converted_viv")) {
  viv_missing <- is.na(data$vivax_new_prev)
  data$vivax_new_prev[viv_missing] <- data[[fallback_col]][viv_missing]
}
data$falciparum_new_prev <- data$pr_numb_falciparum_LM
for (fallback_col in c("pr_numb_falciparum_PCR", "prev_converted_falc")) {
  falc_missing <- is.na(data$falciparum_new_prev)
  data$falciparum_new_prev[falc_missing] <- data[[fallback_col]][falc_missing]
}
# The total is NA as soon as one of the two species is missing.
data$case_numbers_total_prev <- data$vivax_new_prev + data$falciparum_new_prev
# Dot size by total infections:
#   < 100 -> 0.75, 100-199 -> 1.0, 200-499 -> 1.25, 500-999 -> 1.5,
#   >= 1000 -> 1.75; a missing total keeps NA.
prev_breaks <- c(100, 200, 500, 1000)
prev_dot_sizes <- c(0.75, 1.0, 1.25, 1.5, 1.75)
data$case_numbers_total_cat_prev <-
  prev_dot_sizes[findInterval(data$case_numbers_total_prev, prev_breaks) + 1L]
# What was done with mixed cases (incidence): recode the free-text answers to
# a small set of categories. Each row of the table is: raw text, category.
# Answers not listed are left unchanged.
mixed_inc <- as.character(data$mixed_what_done)
mixed_inc_recode <- matrix(c(
  'counted separetly', 'given separately',
  'counted seperately', 'given separately',
  'reported seperately', 'given separately',
  "doesn't clearly say", 'not reported',
  "doesn't say", 'not reported',
  "doesn't say, give mixed but could also be with Pm", 'mixed given but could be Pm too, hence not added',
  "gives mixed but could also be Pm although unlikely because only few Pm cases", 'mixed given, could be with Pm but because few cases counted towards both Pv and Pf',
  "mixed are counted towards falciparum", 'counted towards falciparum',
  "not sure, not seperate, probably counted towards falciparum, but only few mixed cases", 'probably counted towards falciparum'
), ncol = 2, byrow = TRUE)
mixed_inc_hit <- match(mixed_inc, mixed_inc_recode[, 1])
mixed_inc_found <- !is.na(mixed_inc_hit)
mixed_inc[mixed_inc_found] <- mixed_inc_recode[mixed_inc_hit[mixed_inc_found], 2]
data$mixed_what_done <- as.factor(mixed_inc)
# What was done with mixed cases (prevalence): recode the free-text answers to
# a small set of categories. Each row of the table is: raw text, category.
# Answers not listed are left unchanged.
mixed_prev <- as.character(data$pr_what_done_mixed)
mixed_prev_recode <- matrix(c(
  'counted separately', 'given separately',
  'counted seperately', 'given separately',
  'counted towards both', 'counted towards both',
  'counted towards both Pv and Pf infections', 'counted towards both',
  'counted towards both: vivax as well as falciparum', 'counted towards both',
  'counted towards falciparum', 'counted towards falciparum',
  "doesn't say", 'not reported',
  "doesn't say (assume counted towards both Pv and Pf)", 'not reported',
  "doesn't say, probably counted towards both, in table 2 P spp is less than if you add up Pv and Pf", 'not reported',
  'mixed cases counted towards falciparum', 'counted towards falciparum',
  'mixed counted towards both species', 'counted towards both',
  'mixed counted towards falciparum and vivax cases as well (probably), otherwise would be supe rhigh numbers!', 'counted towards both',
  'most probably counted towards both', 'counted towards both',
  'reported seperately', 'given separately',
  "there are some (not too many) but doesn't say if counted towards both or not", 'not reported',
  "they have a seperate category but could also contain Pm, but could also be already counted towards the different groups and then mentioned again in the mixed ones", 'not reported'
), ncol = 2, byrow = TRUE)
mixed_prev_hit <- match(mixed_prev, mixed_prev_recode[, 1])
mixed_prev_found <- !is.na(mixed_prev_hit)
mixed_prev[mixed_prev_found] <- mixed_prev_recode[mixed_prev_hit[mixed_prev_found], 2]
data$pr_what_done_mixed <- as.factor(mixed_prev)
# Code the diagnostic tool used for prevalence, judged from which falciparum
# columns hold a value. The assignments are applied in order, so a later rule
# overrides an earlier one:
#   prevalence by LM -> 'LM', prevalence by PCR -> 'PCR',
#   number by LM -> 'LM', number by PCR -> 'PCR',
#   both numbers, or both prevalences -> 'PCR and LM'.
# Rows with none of these four values stay NA (RDT columns are not checked).
# Column-wise version of the former row loop: no 1:length(...) index, no
# elementwise `&` inside `if`, and it also works on an empty data frame.
x <- seq_along(data$first_authors)
has_prev_lm <- !is.na(data$prevalence_falciparum_LM)
has_prev_pcr <- !is.na(data$prevalence_falciparum_PCR)
has_numb_lm <- !is.na(data$pr_numb_falciparum_LM)
has_numb_pcr <- !is.na(data$pr_numb_falciparum_PCR)
diagnostic_tool <- rep(NA_character_, nrow(data))
diagnostic_tool[has_prev_lm] <- 'LM'
diagnostic_tool[has_prev_pcr] <- 'PCR'
diagnostic_tool[has_numb_lm] <- 'LM'
diagnostic_tool[has_numb_pcr] <- 'PCR'
diagnostic_tool[has_numb_pcr & has_numb_lm] <- 'PCR and LM'
diagnostic_tool[has_prev_pcr & has_prev_lm] <- 'PCR and LM'
data$prevalence_diagnostics <- as.factor(diagnostic_tool)