forked from mszyszkowicz/DataGEMM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SCHIF.sas
1761 lines (1477 loc) · 57.5 KB
/
SCHIF.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*************************************************************************************************
*
* Nonlinear concentraton-response function: model fitting and plotting procedure
*
* Date: Sept 19, 2016
* Version 2.17
*
* Purpose: This macro will fit a series of nonlinear concentration-response functions using
* cox proportional hazards models to identify optimal nonlinear relationship
*
* It will produce three outputs in the file directory specified by users:
*
* (1) a graph (.png) showing nonlinear relationship based on an optimal model, and if pre-specified, it will overlay an ensemble model,
* based on either all models examined or the best 3 models, according to their -2LogLik
* (2) a summary table (.pdf) listing coefficient, standard error, loglik, and function form of each cox model that was fitted
* (3) a summary table (.pdf) showing descriptive stats of air pollution exposure variable from the original and the trimmed datasets
*
* For more details about this modeling approach, please refer to the accompanying paper Masoud, Szyszkowicz,...,Burnett et al 2016 Air Quality, Atmosphere and Health
*
* Should there be any question with this macro, please contact:
* Drs Hong Chen ([email protected]) and Rick Burnett ([email protected])
*
*
* Please read the following notes before running the macro
*
* 1. All categorical variables should to be prepared as a series of dichotomous variables prior to calling macro
*
* 2. Exposure variable need to be define by using "fitvar" parameter
*
* 3. The following 3 parameters are optional: "strata", "label_exposure", and "label_unit"
*
* 4. To run time-fixed cox model, users need to specify "case" and "time" parameters,
* and omit "start" and "stop" parameters
*
* 5. To run time-varying cox model, users need to specify "case", "start", and "stop" parameters,
* and omit "time" parameter
*
* 6. If users do not wish to translate data, please specify translate=N or NO
*
* 7. By default, the option "best_model" is set to be 1. This means that the program will display concentration-response curve for best fitting model only.
*
* 7.1. Set "best_model"=2, the program will display concentration-response curve from ensemble models based on all models examined.
*
* 7.2. Set "best_model"=3, the program will display two concentration-response curves by overlaying best fitting model and ensemble models based on best 3 models examined.
*
* 7.3. Set "best_model"=4, the program will display two concentration-response curves by overlaying best fitting model and ensemble models based on ALL models examined.
*
* 8. By default, the option "lowest" is set to be "minimum". This means that the concentration-response curve covers the full range of the exposure variable.
*
* 8.1. Set "lowest"=1, the program will display concentration-response curve from 1 to max concentration of the exposure variable.
*
* 9. By default, the option "x_min" is set to be 1. This means that the concentration-response plot has x axis with min=1.
*
* 9.1. Set "x_min" as any positive number, x axis will start from this user-defined value.
*
* 9.2. Set "x_min" as any NON-numeric character including blank (eg, x_min=, x_min=NO, x_min=Null), x axis will start from the minimum of exposure variable.
*
* 10. By default, the option "export" is set to be NULL. This means that this program will NOT produce any matrix for HRs derived over the range of AP exposure.
*
* 10.1. Set "export" as YES or Y, this program will produce two csv files:
* (1) a table "export_hr.csv" presenting HRs derived from ~1000 curves at a series of locations: 0% to 100% by 1% over the range of AP exposure, and
* (2) a table "ap_exp.csv" presenting concentrations at 0% to 100% by 1% over the range of AP exposure.
*
*
* To illustrate how to use this macro, 7 examples are given below:
*
* Example 1: (translate data, fit time-fixed cox models, with strata variable, only show optimal model, c-r curve from min to max)
*
* %fitap(datain=cohort_ABC,perc_trim=1,translate=yes,dataout=allmodels,case=status,time=time,fitvar=no2,covvars=age sex, strata=inst,
* label_exposure=NO2, label_unit=ppb, output_path=D:\Working directory\Cohort);
*
* Example 2: (translate data, fit time-fixed cox models, without strata variable, only show optimal model, c-r curve from min to max)
*
* %fitap(datain=cohort_ABC,perc_trim=1,translate=y,dataout=allmodels,case=status,time=time,fitvar=no2,covvars=age sex, strata=,
* label_exposure=NO2, output_path=D:\Working directory\Cohort);
*
* Example 3: (translate data, fit time-fixed cox models, with two strata variables, only show optimal model, c-r curve from min to max)
*
* %fitap(datain=cohort_ABC,perc_trim=5,translate=YES,dataout=allmodels,case=death,time=time,fitvar=pm25,covvars=ses,strata=age sex,
* label_exposure=pm25, label_unit=ug/m3, output_path=D:\Working directory\Cohort);
*
* Example 4: (translate data, fit time-varying cox models, with strata variable, only show optimal model, c-r curve from min to max)
*
* %fitap(datain=cohort_ABC,perc_trim=1,dataout=allmodels,case=status,start=T1,stop=T2,fitvar=pm25,covvars=sex ses,strata=age,
* label_exposure=pm25, label_unit=ug/m3, output_path=D:\Working directory\Cohort);
*
* Example 5: (no translate data, fit time-varying cox models, with strata variable, only show optimal model, c-r curve from min to max)
*
* %fitap(datain=cohort_ABC,perc_trim=1,translate=N,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, output_path=D:\Working directory\Cohort);
*
* Example 6: (no translate data, fit time-varying cox models, with strata variable, c-r curve from min to max, only show ensemble model based on all models examined)
*
* Note that to display only ensemble model based on all models examined, please define best_model=2
*
* %fitap(datain=cohort_ABC,perc_trim=1,translate=no,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, best_model=2, output_path=D:\Working directory\Cohort);
*
* Example 7: (no translate data, fit time-varying cox models, with strata variable, c-r curve from min to max, overlay optimal model with ensemble model based on all models examined)
*
* Note that to overlay best fitting model with ensemble model based on all models examined, please define best_model=4
*
* %fitap(datain=cohort_ABC,perc_trim=1,translate=no,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, output_path=D:\Working directory\Cohort, best_model=4);
*
* Example 8: (no translate data, fit time-varying cox models, with strata variable, c-r curve from min to max, overlay optimal model with ensemble model based on best 3 models examined)
*
* Note that to overlay best fitting model with ensemble model based on best 3 models examined, please define best_model=3
*
* %fitap(datain=cohort_ABC,perc_trim=1,translate=No,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, output_path=D:\Working directory\Cohort, best_model=3);
*
* Example 9: (no translate data, fit time-varying cox models, with strata variable, c-r curve from 1 to max, overlay optimal model with ensemble model based on all models examined)
*
* %fitap(datain=cohort_ABC,perc_trim=1,lowest=1,translate=No,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, output_path=D:\Working directory\Cohort, best_model=4);
*
* Example 10: (no translate data, fit time-varying cox models, with strata variable, x axis starts at 10 unit, c-r curve from 1 to max, overlay optimal model with ensemble model based on all models examined)
*
* %fitap(datain=cohort_ABC,perc_trim=1,lowest=1,translate=No,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, x_min=10, output_path=D:\Working directory\Cohort, best_model=4);
*
* Example 11: (translated data, fit time-varying cox models, with strata variable, x axis starts at 10 unit, c-r curve from 1 to max, export HR matrix, overlay optimal model with ensemble model based on all models examined)
*
* %fitap(datain=cohort_ABC,perc_trim=1,lowest=1,translate=Yes,dataout=allmodels,case=Status,start=TStart,stop=TStop,fitvar=pm25,covvars=sex ses income bmi,strata=age,
* label_exposure=pm25, label_unit=ug/m3, x_min=10, output_path=D:\Working directory\Cohort, best_model=4, export=Yes);
*
*************************************************************************************************/
/**************************
* main function to select optimal model and ensemble model
***************************/
option LINESIZE=MAX;
%macro fitap(datain=,perc_trim=0,lowest=minimum,translate=yes,dataout=,case=,time=,start=,stop=,fitvar=,covvars=,strata=,label_exposure=,label_unit=,x_min=1,best_model=1,export=,output_path=);
/*Clean up any existing global macro variables*/
%symdel low_pct low_wc low_pct_1 low_wc_1 linear_model time start stop LL_min_all sum_LL_all LL_min_3 sum_LL_3 ensemble_model overlay;
/*lowest must be either min or 1, but not anything else*/
%if %upcase(&lowest.) ne MINIMUM and &lowest. ne 1 %then %do;
%put "WARNING: lowest must be either MINIMUM or 1";
%abort;
%end;
/*Identify user's choice on model overlay*/
/*Show best fitting model only*/
%if &best_model.=1 %then %do;
%let ensemble_model=;
%let overlay=;
/* %put &ensemble_model.; */
/* %put %length(&ensemble_model.); */
%end;
/*Show only ensemble model based on all models examined*/
%else %if &best_model.=2 %then %do;
%let ensemble_model=ensemble;
%let overlay=ALL;
/* %put &ensemble_model.; */
/* %put %length(&ensemble_model.); */
%end;
/*Overlay best fitting model with ensemble model based on best 3 models examined*/
%else %if &best_model.=3 %then %do;
%let ensemble_model=;
%let overlay=best3;
/* %put &overlay.; */
/* %put %length(&overlay.); */
%end;
/*Overlay best fitting model with ensemble model based on ALL models examined*/
%else %if &best_model.>=4 %then %do;
%let ensemble_model=;
%let overlay=ALL;
/* %put &overlay.; */
/* %put %length(&overlay.); */
%end;
/*Consider non-linear models first*/
%let linear_model = 0;
/*convert any conc < 1 to 1*/
/*
data &datain.; set &datain.;
if &fitvar.<1 then &fitvar.=1;
run;
*/
/*Trim data*/
%if 0<=&perc_trim.<= 10 %then %do;
%let trim_l=%SYSEVALF(&perc_trim.);
%let trim_r=%SYSEVALF(100-&perc_trim.);
%end;
%else %do; %put "WARNING: perc_trim must be an integer between 0 and 10"; %abort;%end;
%if 0=&perc_trim. %then %do;
data aftertrim; set &datain.; run;
proc sql noprint;
select min(&fitvar.) into: pctll from aftertrim;
select max(&fitvar.) into: pctlr from aftertrim;
quit;
%end;
%else %do;
proc univariate data=&datain. noprint;
var &fitvar.;
output out=percentiles pctlpts=&trim_l. &trim_r. pctlpre=ppp;
run;
data _null_;set percentiles;call symput('pctll',ppp%left(&trim_l.));call symput('pctlr',ppp%left(&trim_r.));run;
data aftertrim; set &datain.; where &pctll.<=&fitvar.<=&pctlr.; run;
%end;
/*Descriptive statistics of exp variable in the original and trimmed datasets*/
proc means data=&datain. n nmiss min q1 mean median q3 max;
var &fitvar.;
output out=stats_original n=total_obs nmiss=miss_obs min=min_ap q1=q1_ap mean=mean_ap median=median_ap q3=q3_ap max=max_ap;
run;
proc means data=aftertrim n nmiss min q1 mean median q3 max;
var &fitvar.;
output out=stats_aftertrim n=total_obs nmiss=miss_obs min=min_ap q1=q1_ap mean=mean_ap median=median_ap q3=q3_ap max=max_ap;
run;
data stats_original (drop=_type_ _freq_); set stats_original;
data_description="original dataset";
exp_var="&fitvar.";
run;
data stats_aftertrim (drop=_type_ _freq_); set stats_aftertrim;
data_description="trimmed dataset";
exp_var="&fitvar.";
run;
data overall_stat; set stats_original stats_aftertrim; run;
/*Translate data*/
%put "check translate";
%put &translate.;
%put &pctll.;
%put &pctlr.;
%let pctll_bk=&pctll.; /* save a backup macro variable for &pctll. for translate=N */
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
/*%let Tran=1;*/ /* translate z to have the min of 1 */
%let Tran=0; /* translate z to have the min of 1 */
%end;
%else %do;
%let Tran=0; /* no translate */
%let pctll=0; /* no translate */
%end;
%put &Tran.;
%put &pctll.;
/*retain an original copy to be reused at each time when modelpct() is called*/
data aftertrim_backup; set aftertrim; run;
/*set tau=0.1*/
%let set_tau=0.1;
%let model_tau=1;
/* %LET tau=%SYSEVALF(&set_tau.*(&p100.-&p0.)); */
%LET count=0;
/*run 8 models to decide the model type and pct with smallest coefficient*/
%do mdltp=1 %TO 2;
%do pctt=0 %TO 75 %BY 25;
%modelpct(modeltype=&mdltp.,pct=&pctt.);
%end;%end;
data fourmodel_1;set mtll_:; run;
proc sort data=fourmodel_1;by WithCovariates;run;
data fourmodel_1;set fourmodel_1; if _N_=1;run;
data _null_;set fourmodel_1;
call symput('modeltp_a',modeltype); call symput('low_pct_a',pct); call symput('low_wc_a',WithCovariates);
run;
/*set tau=0.2*/
%let set_tau=0.2;
%let model_tau=2;
/* %LET tau=%SYSEVALF(&set_tau.*(&p100.-&p0.)); */
%LET count=0;
/*run 8 models to decide the model type and pct with smallest coefficient*/
%do mdltp=1 %TO 2;
%do pctt=0 %TO 75 %BY 25;
%modelpct(modeltype=&mdltp.,pct=&pctt.);
%end;%end;
data fourmodel_2;set mtll_:; run;
proc sort data=fourmodel_2;by WithCovariates;run;
data fourmodel_2;set fourmodel_2; if _N_=1;run;
data _null_;set fourmodel_2;
call symput('modeltp_b',modeltype); call symput('low_pct_b',pct); call symput('low_wc_b',WithCovariates);
run;
/*compare and find the optimal tau*/
%if &low_wc_a.<=low_wc_b. %then %do;
%let modeltp=&modeltp_a.;
%let low_pct=&low_pct_a.;
%let low_wc=&low_wc_a.;
%let set_tau=0.1;
/* %LET tau=%SYSEVALF(&set_tau.*(&p100.-&p0.)); */
%let model_tau=9;
%put &set_tau.;
%put &model_tau.;
/*retain rejected tau and related models*/
data dataout_reject; set dataout_2; call symput('set_tau_reject',0.2); run;
%end;
%else %do;
%let modeltp=&modeltp_b.;
%let low_pct=&low_pct_b.;
%let low_wc=&low_wc_b.;
%let set_tau=0.2;
/* %LET tau=%SYSEVALF(&set_tau.*(&p100.-&p0.)); */
%let model_tau=9;
%put &set_tau.;
%put &model_tau.;
/*retain rejected tau and related models*/
data dataout_reject; set dataout_1; call symput('set_tau_reject',0.1); run;
%end;
/*compare -5/+5 percentile around low_pct from above, which is either 0 or 25 or 50 or 75 */
%let low_wc_1=&low_wc.;
%let low_pct_1=&low_pct.;
%do pct_=(&low_pct.+5) %to (&low_pct.-5) %by -10;
%modelpct(modeltype=&modeltp., pct=&pct_.);
data _null_; set fits; call symput('new_wc',WithCovariates); call symput('new_pct',pct); run;
%if &new_wc.<&low_wc. %then %do; %let low_wc_1=&new_wc.; %let low_pct_1=&new_pct.; %end;
%end;
/* STOP if reaching mu=-15th, 100th, or LL is no longer smaller */
%do %while ( &low_pct_1. >= -10 and &low_pct_1. <= 95 and &low_pct_1. NE &low_pct.);
%let low_pct_temp = %SYSEVALF(&low_pct_1. + (&low_pct_1. - &low_pct.));
%let low_pct=&low_pct_1.;
%modelpct(modeltype=&modeltp., pct=&low_pct_temp.);
data _null_; set fits; call symput('new_wc',WithCovariates); call symput('new_pct',pct); run;
%if &new_wc.<&low_wc_1. %then %do; %let low_wc_1=&new_wc.; %let low_pct_1=&new_pct.; %end;
%end;
proc datasets;delete mtll: percentiles fourmodel: fits ;run;quit;
/*drop last run if mu was -20 or 105*/
data &dataout.; set &dataout.; where (pct NE -20); run;
data &dataout.; set &dataout.; where (pct NE 105); run;
/*add tau and append the rejected model outputs*/
proc sort data=&dataout.; by iteration; run;
data &dataout.; set &dataout.; tau=&set_tau.; run;
data dataout1_8; set &dataout.; where iteration<=8; run;
data dataout9_n; set &dataout.; where iteration>8; run;
data dataout9_n; set dataout9_n; iteration=iteration+8; run;
proc sort data=dataout_reject;by iteration;run;
data dataout_reject; set dataout_reject; tau=&set_tau_reject.; iteration=iteration+8; run;
data &dataout.; set dataout1_8 dataout_reject; run;
data &dataout.; set &dataout. dataout9_n; run;
/* print out chosen percentage and corresponding coefficient */
data &dataout.;set &dataout.;
rename pct=mu;
rename pctl=z_at_mu;
drop WithoutCovariates;
run;
proc sort data=&dataout.;by iteration;run;
/* Calculate ensemble weights using 3 models around the best fit, ie., based on best mu with +/- 5th%
The three model include last 2 models from the search + a 3rd model corresponding mu+5 of last model */
%symdel best_LL final_mu;
data &dataout. (drop=WithCovariates); set &dataout.; format LL d18.5; LL=WithCovariates; run;
data &dataout.; set &dataout.; rename LL=WithCovariates; run;
/* find optimal model corresonding to minimum LL */
proc sql noprint;
select min(WithCovariates) into: best_LL from &dataout.;
quit;
data &dataout.; set &dataout.; id=1; run;
data qaqc1; set &dataout.; run;
proc sort data=qaqc1; by WithCovariates; run;
data qaqc2; set qaqc1; id=_n_; run;
data qaqc2; set qaqc2; rename WithCovariates=best_LL2; where id=1; run;
data &dataout.; merge &dataout. (in=fro) qaqc2 (keep=id best_LL2); by id; if fro; run;
data &dataout.; set &dataout.;
if (WithCovariates = best_LL2) then do;
best3=1; call symput('final_mu',mu);
end;
run;
data &dataout.; set &dataout.; drop id best_LL2; run;
data &dataout.; set &dataout.; final_mu=&final_mu.; final_form=&modeltp.; run;
/* find 2 other alternative models */
proc sql noprint;
select sum(best3) into: count_best_LL from &dataout.; /* num of models with same smallest LL */
quit;
proc sql noprint;
select max(iteration) into: last_iteration from &dataout.; /* iteration id for last run */
quit;
%if &count_best_LL.=2 %then %do; /* a special case where last 2 runs had same smallest LL */
data &dataout.; set &dataout.; if mu=final_mu and tau=&set_tau. then call symput('best_final_LL_iteration',iteration); run;
%if &last_iteration. > &best_final_LL_iteration. %then %do; /* followed by additional run with a larger LL*/
data &dataout.; set &dataout.;
if iteration eq &last_iteration. then best3=3;
run;
%end;
%if &last_iteration. eq &best_final_LL_iteration. %then %do; /* followed by no more additional run*/
data &dataout.; set &dataout.;
if &new_pct. > &low_pct_1. and mu=(final_mu-10) and final_form=modeltype and tau=&set_tau. then best3=3; /* ascending*/
else if &new_pct. < &low_pct_1. and mu=(final_mu+10) and final_form=modeltype and tau=&set_tau. then best3=3; /* descending*/
run;
%end;
%end;
%if &count_best_LL.=1 %then
%do;
data &dataout.; set &dataout.;
if final_mu=-15 then do; /* a special case where last run reached mu=-15 */
if mu=-10 and tau=&set_tau. then best3=2;
if mu=-5 and tau=&set_tau. then best3=3;
end;
else if final_mu=100 then do; /* a special case where last run reached mu=100 */
if mu=95 and tau=&set_tau. then best3=2;
if mu=90 and tau=&set_tau. then best3=3;
end;
else do; /* all other cases */
if mu=(final_mu-5) and final_form=modeltype and tau=&set_tau. then best3=2;
if mu=(final_mu+5) and final_form=modeltype and tau=&set_tau. then best3=3;
end;
run;
%end;
/* derive LL from -2LL */
data &dataout.; set &dataout.;
LL = WithCovariates/(-2);
run;
/* calculate ensemble weights for all models */
proc sql noprint;
select max(LL) into: LL_min_all from &dataout.;
quit;
data &dataout.; set &dataout.;
LL_diff = exp(LL-&LL_min_all.);
run;
proc sql noprint;
select sum(LL_diff) into: sum_LL_all
from &dataout.;
quit;
data &dataout.; set &dataout.; wt=LL_diff/&sum_LL_all.;run;
data &dataout.; set &dataout.; drop LL_diff; run;
/* calculate ensemble weights for 3 final models */
data final3models; set &dataout.; where best3>=1; run;
proc sql noprint;
select max(LL) into: LL_min_3 from final3models;
quit;
data final3models; set final3models; LL_diff = exp(LL-&LL_min_3.); run;
proc sql noprint;
select sum(LL_diff) into: sum_LL_3 from final3models;
quit;
data &dataout.; set &dataout.;
if best3>=1 then LL_diff = exp(LL-&LL_min_3.);
else LL_diff=.;
run;
data &dataout.; set &dataout.; wt_final3=LL_diff/&sum_LL_3.;run;
data &dataout.; set &dataout.; if best3=. then wt_final3=.; drop LL_diff; run;
/* Bootstrap to sample beta based on joint model */
%if %length(&overlay.)>0 %then %do;
%simulate_z2(indata=&dataout., model="joint", over_lay=&overlay.);
proc sort data=newap3; by id; run;
%simulate_beta(indata=&dataout., model="joint", over_lay=&overlay.);
proc sort data=simdata; by id; run;
data newap3; merge newap3(in=fro) simdata; by id; if fro; run;
/* count max num of sim ap data points */
proc contents data=newap3 out=output_z; run;
proc sort data=output_z; by varnum; run;
data _null_;
set output_z;
call symputx("maximum",varnum-2);
run;
/* beta*transformed(z) for each simulated z data points */
data newap3 (drop=beta id); set newap3;
array CC{&maximum.} C1-C&maximum.;
do i=1 to &maximum.;
CC{i}=exp(beta*CC{i});
end;
run;
/* derive median and 2.5th% and 97th% */
%do i=1 %to &maximum.;
data newap_&i.; set newap3;
keep C&i.;
run;
proc univariate data=newap_&i. noprint;
var C&i.;
output out=beta_distr_&i. pctlpts= 2.5 50 97.5 pctlpre=P;
run;
data beta_distr_&i.;set beta_distr_&i.; id=&i.; run;
%if &i.=1 %then %do;
data beta_distr; set beta_distr_&i.; run;
%end;
%else %do;
data beta_distr; set beta_distr beta_distr_&i.; run; /* contain 4 variables: id, p2.5, p50, p97.5 */
%end;
%end;
/* translate=N, rescale &pctll. to its original pctll */
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
%let pctll_sim = &pctll.; /* no change */
%let pctlr_sim = &pctlr.; /* no change */
%end;
%else %do;
%let pctll_sim = &pctll_bk.; /* scale pctlr_sim to max-min+1 */
%let pctlr_sim = &pctlr.; /* no change */
%end;
%if &lowest.= 1 %then %do;
%let pctll_sim = 1; /* force to extend the min to have 1*/
%let pctlr_sim = &pctlr.; /* no change */
%end;
data newap4(keep = z_sim);
do i = &pctll_sim. to &pctlr_sim. by 0.1;
z_sim = i;
output;
end;
run;
data newap4; set newap4; id=_n_; run;
proc sort data=newap4; by id; run;
proc sort data=beta_distr; by id; run;
data beta_distr; merge beta_distr(in=fro) newap4; if fro; by id; run;
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
%if &modeltp.=1 %then %do;
data newap_joint; set beta_distr;
rename p50=rr_mean;
rename p97_5=rr_ucl;
rename p2_5=rr_lcl;
ap = z_sim;
run;
%end;
%else %if &modeltp.=2 %then %do;
data newap_joint; set beta_distr;
rename p50=rr_mean;
rename p97_5=rr_ucl;
rename p2_5=rr_lcl;
ap = z_sim;
run;
%end;
%end;
%else %do;
data newap_joint; set beta_distr;
rename p50=rr_mean;
rename p97_5=rr_ucl;
rename p2_5=rr_lcl;
ap = z_sim;
run;
%end;
/* export 1000*101 matrix */
%if %upcase(&export.)= YES or %upcase(&export.)= Y %then %do;
/* first, find ranking from 0th% to 100th% by 1, and reduce RR matrix to 1000:101 for export */
data sampleN (drop=i);
do i=1 to &maximum.;
CC=i;
output;
end;
run;
proc univariate data=sampleN ;
var CC;
output out=sampleID pctlpts= 0 to 100 by 1 pctlpre=PP;
run;
data sampleID2 (drop=i PP0-PP100); set sampleID;
array P{101} PP0-PP100;
do i=1 to 101;
CC_P=P{i};
output;
end;
run;
data sampleID2; set sampleID2;
CC_P=int(CC_P);
run;
data sampleID2; set sampleID2;
CC=compress('C'||CC_P);
id=CC_P;
run;
proc sql noprint;
select CC
into :CC1 - :CC101
from sampleID2;
quit;
/* %put _user_; */
%do i=1 %to 101;
%put &&CC&i.;
data export_newap_&i.; set newap3;
keep &&CC&i.;
run;
data export_newap_&i.; set export_newap_&i.;
id=_n_;
run;
proc sort data=export_newap_&i.; by id; run;
%if &i.=1 %then %do;
data export_newap; set export_newap_&i.; run;
%end;
%else %do;
data export_newap; merge export_newap (in=fro) export_newap_&i.; if fro; by id; run;
%end;
%end;
data export_newap; set export_newap; drop id; run;
proc export data=export_newap
outfile="&output_path.\export_hr.csv"
dbms=csv
replace;
run;
/* second, create another dataset to contain 101 data points from the range of AP exp */
proc sort data=sampleID2; by id; run;
data ap_exp; merge sampleID2 (in=fro) newap4; by id; if fro; run;
data ap_exp (drop=id CC_P); set ap_exp;
CC=compress('CC'||id);
rename z_sim=AP;
run;
proc export data=ap_exp
outfile="&output_path.\ap_exp.csv"
dbms=csv
replace;
run;
%end;
%end;
/* Bootstrap to sample beta based on optimal model */
%simulate_z2(indata=&dataout., model="optimal");
proc sort data=newap3; by id; run;
%simulate_beta(indata=&dataout., model="optimal");
proc sort data=simdata; by id; run;
data newap3; merge newap3(in=fro) simdata; by id; if fro; run;
/* count max num of sim ap data points */
proc contents data=newap3 out=output_z; run;
proc sort data=output_z; by varnum; run;
data _null_;
set output_z;
call symputx("maximum",varnum-2);
run;
/* beta*transformed(z) for each simulated z data points */
data newap3 (drop=beta id); set newap3;
array CC{&maximum.} C1-C&maximum.;
do i=1 to &maximum.;
CC{i}=exp(beta*CC{i});
end;
run;
/* derive median and 2.5th% and 97th% */
%do i=1 %to &maximum.;
data newap_&i.; set newap3;
keep C&i.;
run;
proc univariate data=newap_&i. noprint;
var C&i.;
output out=beta_distr_&i. pctlpts= 2.5 50 97.5 pctlpre=P;
run;
data beta_distr_&i.;set beta_distr_&i.; id=&i.; run;
%if &i=1 %then %do;
data beta_distr; set beta_distr_&i.; run;
%end;
%else %do;
data beta_distr; set beta_distr beta_distr_&i.; run; /* contain 4 variables: id, p2.5, p50, p97.5 */
%end;
%end;
/* translate=N, rescale &pctll. to its original pctll */
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
%let pctll_sim = &pctll.; /* no change */
%let pctlr_sim = &pctlr.; /* no change */
%end;
%else %do;
%let pctll_sim = &pctll_bk.; /* scale pctlr_sim to max-min+1 */
%let pctlr_sim = &pctlr.; /* no change */
%end;
%if &lowest.= 1 %then %do;
%let pctll_sim = 1; /* force to extend the min to have 1*/
%let pctlr_sim = &pctlr.; /* no change */
%end;
data newap4(keep = z_sim);
do i = &pctll_sim. to &pctlr_sim. by 0.1;
z_sim = i;
output;
end;
run;
data newap4; set newap4; id=_n_; run;
proc sort data=newap4; by id; run;
proc sort data=beta_distr; by id; run;
data beta_distr; merge beta_distr(in=fro) newap4; if fro; by id; run;
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
%if &modeltp.=1 %then %do;
data newap_optimal; set beta_distr;
rename p50=rr_mean;
rename p97_5=rr_ucl;
rename p2_5=rr_lcl;
ap = z_sim;
run;
%end;
%else %if &modeltp.=2 %then %do;
data newap_optimal; set beta_distr;
rename p50=rr_mean;
rename p97_5=rr_ucl;
rename p2_5=rr_lcl;
ap = z_sim;
run;
%end;
%end;
%else %do;
data newap_optimal; set beta_distr;
rename p50=rr_mean;
rename p97_5=rr_ucl;
rename p2_5=rr_lcl;
ap = z_sim;
run;
%end;
/* plot joint and optimal models */
%plot_cr(indata_joint=newap_joint, indata_optimal=newap_optimal, expo=&label_exposure., unit=&label_unit., ensemblemodel=&ensemble_model., over_lay=&overlay.);
/* add a pure linear model with z */
%let linear_model = 1;
%put &linear_model.;
%put &modeltp.;
%put &low_pct_1.;
%modelpct(modeltype=9, pct=&low_pct_1.);
/* add a pure log(z) model */
%let linear_model = 2;
proc datasets;delete fits ;run;quit;
%modelpct(modeltype=9, pct=&low_pct_1.);
/* print description of original/trimmed datasets and model fitting, and clean up working directory */
ods pdf file="&output_path.\model_fitting_summary.pdf";
title;
proc print data=overall_stat;
title "Summary statistics of air pollution exposure variable in the original and trimmed datasets";
run;
title;
title;
data &dataout.;set &dataout. (drop=LL);
length model_function $15.;
if modeltype=1 then model_function='z*logit';
else if modeltype=2 then model_function='log(z)*logit';
else if modeltype=3 then model_function='pure linear';
else if modeltype=4 then model_function='pure log';
rename WithCovariates=LL;
drop WithoutCovariates pct pctl best3 final_form final_mu modeltype;
run;
/* proc sort data=&dataout.;by iteration;run; */
data &dataout.; set &dataout.; drop iteration Criterion; run; /* simplif summary table so as to fit in one page */
data &dataout.; set &dataout.; rename mu=location; run;
data &dataout.; set &dataout.; rename z_at_mu=mu; rename wt_final3=final_wt; run;
/* data &dataout.; set &dataout.; tau=&set_tau.; run; */
data prt; set &dataout.; run;
proc print data=prt;
title "Summary description of model fitting";
run;
title;
ods pdf close;
/* export output table to csv format */
data &dataout.; set &dataout.; rename LL=Minus2LL; run;
proc export data=&dataout.
outfile="&output_path.\model_fitting_summary.csv"
dbms=csv
replace;
run;
/* clean up non-essential datasets */
proc datasets library=work;
delete sample: sample_all beta_distr final3models
newap newap_joint newap_optimal param prt overall_stat simdata temp_sim temp_sim2 qaqc: percentiles output_z
Z_sim_distr new_datain mtll: fits combined Beta_distr: dataout: newap: sampleID: sampleN export_newap_:;
run;quit;
%Mend fitap;
/**************************
* fit cox PH model with z
**************************/
%macro modelpct(modeltype=,pct=);
/*translate ap data to have the min of 1 or 0, depending on log or linear model*/
/*note that this only applies to translate=Y*/
/*note that if translate=N, then &pctll.=0 and &Tran.=0*/
/*for log model: translate dataset to have the min of 0, if translate=Y*/
%if &modeltype.=2 %then %do;
data aftertrim; set aftertrim_backup; z=&fitvar.-&pctll.+&Tran.; run;
%end;
/*for linear model: translate dataset to have the min of 0*/
%else %if &modeltype.=1 %then %do;
data aftertrim; set aftertrim_backup; z=&fitvar.-&pctll.; run;
%end;
/*for pure linear and pure log model: use original dataset*/
%else %if &modeltype.=9 %then %do;
data aftertrim; set aftertrim_backup; z=&fitvar.; run;
%end;
proc sql noprint;
select max(z) into: p100 from aftertrim;
select min(z) into: p0 from aftertrim;
quit;
proc univariate data=aftertrim noprint;
var z;
output out=percentiles pctlpts= 5 to 95 by 5 pctlpre=P;
run;
data percentiles; set percentiles; p_5=&p0.-(p5-&p0.); run; /* add a variable p_5 denoting 5 pctl below p0*/
data percentiles; set percentiles; p_10=&p0.-(p10-&p0.); run; /* add a variable p_10 denoting 10 pctl below p0*/
data percentiles; set percentiles; p_15=&p0.-(p15-&p0.); run; /* add a variable p_15 denoting 15 pctl below p0*/
proc transpose data=percentiles out=percentiles;run;
data _null_;set percentiles;
call symput(_name_,col1);
run;
/* calc tau based on set_tau */
%LET tau=%SYSEVALF(&set_tau.*(&p100.-&p0.));
/*calc log transformed ap data*/
%if &pct.= -5 %then %do;
%let mu=&p_5.; /* if 5 percentile less than p0, then change to _5 */
data aftertrim; set aftertrim;
%if &modeltype.=1 %then %do;
APvar=(z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%if &modeltype.=2 %then %do;
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%else %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%end;run;
%end;
%else %if &pct. = -10 %then %do;
%let mu=&p_10.; /* if 10 percentile less than p0, then change to _10 */
data aftertrim; set aftertrim;
%if &modeltype.=1 %then %do;
APvar=(z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%if &modeltype.=2 %then %do;
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%else %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%end;run;
%end;
%else %if &pct. < -10 %then %do;
%let mu=&p_15.; /* if 15 or more percentile less than p0, then change to _15 */
data aftertrim; set aftertrim;
%if &modeltype.=1 %then %do;
APvar=(z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%if &modeltype.=2 %then %do;
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%else %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%end;run;
%end;
%else %do;
%let mu=&&p&pct.;
data aftertrim; set aftertrim;
%if &modeltype.=1 %then %do;
APvar=(z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%if &modeltype.=2 %then %do;
%if %upcase(&translate.)= YES or %upcase(&translate.)= Y %then %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%else %do;
APvar= log(1+z)*(1/(1+exp(-(z-&mu.)/&tau.)));
%end;
%end;run;
%end;
/* non-linear model*/
%if &linear_model. eq 0 %then %do;
%if %length(&start.)=0 %then %do;
%if %length(&time.)=0 %then %do; %put "WARNING: TIME variable is missing"; %abort;%end;
%if %length(&strata.)=0 %then %do;
proc phreg data=aftertrim;
model &time.*&case.(0)=APvar &covvars.;
ods output FitStatistics=fits(where=(Criterion='-2 LOG L'))
ParameterEstimates=param (where=(parameter='APvar'));
run;
%end;
%else %do;
proc phreg data=aftertrim;
model &time.*&case.(0)=APvar &covvars.;
strata &strata.;
ods output FitStatistics=fits(where=(Criterion='-2 LOG L'))
ParameterEstimates=param (where=(parameter='APvar'));
run;
%end;
%end;
%if %length(&start.)>0 %then %do;
%if %length(&stop.)=0 %then %do; %put "WARNING: STOP variable is missing"; %abort;%end;
%if %length(&strata.)=0 %then %do;
proc phreg data=aftertrim;
model (&start.,&stop.)*&case.(0)=APvar &covvars.;
ods output FitStatistics=fits(where=(Criterion='-2 LOG L'))
ParameterEstimates=param (where=(parameter='APvar'));
run;
%end;
%else %do;
proc phreg data=aftertrim;
model (&start.,&stop.)*&case.(0)=APvar &covvars.;
strata &strata.;
ods output FitStatistics=fits(where=(Criterion='-2 LOG L'))
ParameterEstimates=param (where=(parameter='APvar'));
run;
%end;
%end;
%end;
/* pure linear z model */
%if &linear_model. eq 1 %then %do;