-
Notifications
You must be signed in to change notification settings - Fork 96
/
datachk.sas
359 lines (310 loc) · 11 KB
/
datachk.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
/*-----------------------------------------------------------------*
| Name: datachk.sas |
| Title: Perform basic data checking on a dataset |
Doc: http://www.datavis.ca/sasmac/datachk.html
| ----------------------------------------------------------------|
| Procs: contents sort transpose means standard print datasets |
| rank |
| Macdefs: datachk hilo |
| Macrefs: hilo splot |
| ----------------------------------------------------------------|
| Author: Michael Friendly <[email protected]> |
| Created: 12 Aug 1991 15:46:41 |
| Revised: 09 Oct 2004 13:34:43 |
| Version: 1.5 |
| - Fixed bug when no variable has a label |
| - Cleaned up work data sets |
| - Added display of obs with at least &nout |z-scores| > &zout |
| - Handle missing value codes (.A-.Z) |
| 1.5 |
| - Replaced splot with boxplot for hi-res plot |
| |
*-----------------------------------------------------------------*/
/*=
=Description:
The DATACHK macro performs basic data screening/checking on numeric variables
in a dataset, and is designed to give a compact overview of many variables.
The output is a small subset of that produced by
the univariate procedure. For a each variable, a few statistics
and the lowest/highest observations are shown in a format like:
Variable Stat Value Extremes Id
ATBAT N 322 16 Tony Armas
Miss 0 19 Cliff Johnson
Times at Bat Mean 380.9286 19 Terry Kennedy
Std 153.405 20 Mike Schmidt
Skew -0.07806
663 Joe Carter
677 Don Mattingly
680 Kirby Puckett
687 T Fernandez
--------------------------------------------
In addition, a schematic plot (boxplot) is shown for the standard
scores for all variables when the SPLOT=Y option is given (assuming
the BOXPLOT macro is available.)
If a CLASS= variable is specified, this output is produced for each
value of the CLASS= variable.
The macro also prints a table of standardized (Z) scores, for all
observations which have at least NOUT z-scores greater than ZOUT
in absolute value.
=Usage:
The DATACHK macro takes keyword arguments. The VAR= variable list
is required.
==Parameters:
* DATA=_last_ Name of input data set
* VAR= Variable(s) to be screened. You may use any of the
shorthands for variable lists, e.g., X1-X20,
STATA--STATZ, _NUMERIC_.
* CLASS= Class/grouping variable
* ID= Name of id variable, used to label observations
in the output.
* OUT=_hilo_ Name of the output dataset, containing the
highest and lowest observations on each variable.
* LS=80 Output linesize, mainly used for the boxplots
produced by the SPLOT option.
* LOHI=4 Number of low/high obs printed
* ZOUT=2 Z-score for treating an obs as unusual
* NOUT=3 Number of |z|>&zout to print. Use NOUT=0 to
suppress this output.
* SPLOT=yes Do boxplot of std scores?
=*/
%macro datachk(
data=_last_, /* name of input data set */
var=, /* variable(s) to be screened */
class=, /* class/grouping variable */
id=, /* name of id variable */
out=_hilo_, /* name of the output dataset */
ls=80, /* linesize for %splot */
lohi=4, /* number of low/high obs printed */
zout=2, /* z-score for treating an obs as unusual */
nout=3, /* number of |z|>&zout to print */
splot=yes, /* do boxplot of std scores? */
idheight=1
);
%let abort=0;
%let work=;
%if %upcase(&data)=_LAST_ %then %let data = &syslast;
%*-- Reset required global options;
%if %sysevalf(&sysver >= 7) %then %do;
%local o1 o2;
%let o1 = %sysfunc(getoption(notes));
%let o2 = %sysfunc(getoption(validvarname,keyword));
options nonotes /*validvarname=upcase */;
%end;
%else %do;
options nonotes;
%end;
%if %length(&var) =0 %then %do;
%put DATACHK: No variable(s) have been specified;
%put %str( )Specify a VAR= variable.;
%let abort=1;
%goto done;
%end;
%*-- Make sure at least one variable has a label (so id _label_ wont fail);
data _new_;
set &data;
keep &var;
proc contents data=_new_ out=_work_ noprint;
%if &syserr > 4 %then %let abort=1; %if &abort %then %goto DONE;
%let nlabs=0;
data _null_;
set _work_(keep=name label) end=eof;
if label^=' '
then nlabs + 1;
else var=name;
* put label= nlabs=;
if eof then do;
call symput('nlabs', put(nlabs,4.));
call symput('onevar', var);
end;
run;
*put DATACHK: nlabs= &nlabs onevar= &onevar;
data _new_;
set &data;
%if &nlabs=0 %then %do;
label &onevar = "&onevar";
%end;
%if %length(&id) =0 %then %do;
_id_ = _n_;
%let id=_id_;
%end;
%let data=_new_;
%if %length(&class) >0 %then %do;
proc sort data=&data out=_sorted_;
by &class;
data _sorted_;
set &data;
run;
%let data=_sorted_;
%let work = &work _sorted_;
%end;
proc transpose data=&data out=_trans_;
var &var;
by &id notsorted;
%if %length(&class) >0 %then %do;
copy &class;
%end;
proc sort;
by &class _name_;
%put DATACHK: Summarizing the variables...;
proc means data=_trans_ noprint;
var col1;
by &class _name_;
id _label_;
output out=_stat_ n=n nmiss=nmiss mean=mean std=std skewness=skew;
data _stat_;
set _stat_;
rename _freq_=nobs;
*proc print;
*-- Find lowest and highest observations;
%put DATACHK: Finding the &lohi Highest/Lowest values...;
%hilo(in=_trans_, out=_trup_, var=col1, id=&id, class=&class, dir=UP,
num=&lohi);
%hilo(in=_trans_, out=_trdn_, var=col1, id=&id, class=&class, dir=DN,
num=&lohi);
%put DATACHK: Printing the report...;
data _hilo_;
merge _stat_ _trup_ _trdn_;
by &class _name_;
file print linesleft=lines;
drop c1-c6 s1-s5 dash flag;
array s{*} $4 s1-s5 ( 'N' 'Miss' 'Mean' 'Std' 'Skew' );
array cols{*} c1-c6 ( 22 30 42 51 60 72 );
if _n_=1 then do;
put 'Variable'
@c1 'Stat' @c2 'Value' @c3 'Extremes' @c4 'Id'//;
end;
if lines < 12 then put _page_;
dash = repeat('-',c6-c2);
flag = ' ';
if abs(skew)>2 then flag='**';
else if abs(skew)>1 then flag='*';
put _name_ $20. @c1 s1 @c2 n best8. @c3 lo1 best8. @c4 idlo1 /
@c1 s2 @c2 nmiss best8. @c3 lo2 best8. @c4 idlo2 /
_label_ $20. @c1 s3 @c2 Mean best8. @c3 lo3 best8. @c4 idlo3 /
@c1 s4 @c2 std best8. @c3 lo4 best8. @c4 idlo4 /
@c1 s5 @c2 skew best8. +1 flag /
@c3 hi4 best8. @c4 idhi4 /
@c3 hi3 best8. @c4 idhi3 /
@c3 hi2 best8. @c4 idhi2 /
@c3 hi1 best8. @c4 idhi1 /
@c1 dash //
;
%if %upcase(%substr(&splot,1,1))=Y %then %do;
/* Convert to standard scores for comparative boxplots */
%put DATACHK: Preparing boxplot of standard scores...;
proc standard data=_trans_ out=_stand_ m=0 s=1;
var col1;
label col1='Standard score'
_name_='Variable';
by _name_;
%* Exclude a numeric ID;
%if %length(&id) %then %do;
* where (_name_ ^= "&id");
%end;
run;
%let work=&work _stand_;
*splot(data=_stand_,class=_name_,var=col1,ls=&ls);
*options nonotes;
axis90 label=(a=90) minor=(number=1);
%boxplot(data=_stand_,class=_name_,var=col1, classlab=Variable,
vaxis=axis90, id=&id, olabel=far, idheight=&idheight);
/*
proc boxplot data=_stand_;
plot col1 * _name_ /
boxstyle = schematicidfar
cboxes=black idheight=&idheight
vaxis=axis90;
%if %length(&id) %then %do;
id &id;
%end;
run; quit;
*/
%end;
%if &nout>0 %then %do;
proc standard data=&data out=_new_ m=0 s=1;
var &var;
%if %length(&class) >0 %then %do;
by &class;
%end;
data _new_;
set _new_;
array vars{*} &var;
drop i;
_out_ = 0;
do i=1 to dim(vars);
_out_ = sum(_out_, abs(vars(i)) > &zout);
end;
proc sort;
by descending _out_;
proc print;
var &var;
id &id _out_;
where (_out_>=&nout);
format &var 5.2;
title2 "Observations with at least &nout |z-scores| > &zout";
%end;
proc datasets nolist nowarn;
delete _new_ _trup_ _trdn_ _stat_ _rank_ _id_ _trans_ _tr_ &work;
quit;
title2;
%done:;
%*-- Restore global options;
%if %sysevalf(&sysver >= 7) %then %do;
options &o1 &o2;
%end;
%else %do;
options notes;
%end;
%if &abort %then %put ERROR: The DATACHK macro ended abnormally.;
%mend;
/* ---------------------------------------------------------------*/
%macro hilo(in=, /* input data set */
out=, /* output data set */
var=col1, /* variable to analyze */
id=, /* id variable */
num=, /* number of extreme observations wanted */
class=, /* grouping variable */
dir=up /* which extreme: UP or DN */
);
%let dir=%upcase(&dir);
%if &dir=UP
%then %do;
%let pre=LO;
%let order=;
%end;
%else %do;
%let pre=HI;
%let order=DESCENDING;
%end;
*-- Find lowest or highest num observations;
proc rank data=&in out=_rank_ ties=low &ORDER;
where (&var > .Z);
var &var;
by &class _name_;
ranks _rank_;
proc sort data=_rank_;
by &class _name_ _rank_;
data _rank_;
set _rank_;
if _rank_ ^= .;
data _rank_;
set _rank_;
by &class _name_;
rename &id =_id_;
* rename col1=_lo_;
if first._name_ then _r_=0;
_r_+1;
if _r_ <= #
drop /*_label_*/ _rank_ _r_;
proc transpose data=_rank_ out=_tr_ prefix=&PRE;
var &var;
by &class _name_;
proc transpose data=_rank_ out=_id_ prefix=id&PRE;
var _id_;
by &class _name_;
data &out;
merge _tr_ _id_;
by &class _name_;
drop _label_;
*proc print;
%mend;