-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdistribution
executable file
·559 lines (502 loc) · 19.2 KB
/
distribution
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
#!/usr/bin/perl
# https://github.com/philovivero/distribution
# distribution - creates histograms based on input
#
# Version 1.0
#
# Written by Tim Ellis of Palomino
# Copyright (c) 2012 by Tim Ellis
# Released under the terms of GNU GPLv2:
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, visit http://www.gnu.org/licenses/gpl-2.0.html in your
# web browser.
#
# If you're still living in the 1800's, you can instead write to:
#
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
use warnings;
use strict;
# if your system is dying on this statement, it should be safe to simply comment
# it out. your timings with --verbose will be very course.
#use Time::HiRes qw( time );
# for determining how long this program ran
my $startTime = time() * 1000;
# defaults - overridden first by distributionrc then by commandline options
my $size = '';
my $width = 80;
my $height = 15;
my $widthArg = 0;
my $heightArg = 0;
my $logarithmic = 0;
my $numOnly = '';
# character for ease of eye to scan left/right/up/down
my $histogramChar = "+";
my $unicode = 0;
my $tokenize = 0;
# by default, everything matches (nothing is stripped out)
my $matchRegexp = ".";
# status and summary statistics
my $verbose = 0;
# how often to give status if verbose
my $statInterval = 0.5;
# whether or not to parse input into bins (if data is pre-summarised and we're
# just presenting results)
my $graphValues = "";
# variables to support colourised output
my $colourisedOutput = 0;
my $colourPalette = '0,0,32,35,34';
my $regularColour = "";
my $keyColour = "";
my $ctColour = "";
my $pctColour = "";
my $graphColour = "";
# we'll re-use inLine at various times for reading files
my $inLine;
# if they want an rcfile, must be first arg
my $rcFile = $ENV{"HOME"} . "/.distributionrc";
if ($ARGV && $ARGV[0] =~ /^-+r(cfile)*=(.+)$/) { $rcFile = $2; }
# read in rcfile if it exists - it'll just be args
if (open (IFILE, "< $rcFile")) {
while ($inLine = <IFILE>) {
chomp ($inLine);
# must put args from rcfile at start so options passed on the
# commandline will override them
unshift (@ARGV, $inLine);
}
}
# process input arguments -- any arguments that aren't known switches are stuck
# onto @extraArgs - note that you can have one or two dashes before the option,
# and only need the first character. also options that take values and switches
# can be overloaded, so -c=o -c means use the o character and colourise the
# output. this can be confusing.
my @extraArgs;
foreach my $arg (@ARGV) {
if ($arg =~ /^-+h(elp)*$/) { doArgs(); exit 0; }
elsif ($arg =~ /^-+w(idth)*=(.+)$/) { $widthArg = $2; }
elsif ($arg =~ /^-+h(eight)*=(.+)$/) { $heightArg = $2; }
# can pass --graph without option
elsif ($arg =~ /^-+g(raph)*=(.+)$/) { $graphValues = $2; }
elsif ($arg =~ /^-+g(raph)*$/) { $graphValues = 'vk'; }
# --char takes option...
elsif ($arg =~ /^-+c(har)*=(.+)$/) { $histogramChar = $2; }
# ...--color doesn't.
elsif ($arg =~ /^-+c(olor)*$/) { $colourisedOutput = 1; }
elsif ($arg =~ /^-+l(ogarithmic)*$/) { $logarithmic = 2; }
# can pass --numonly without option
elsif ($arg =~ /^-+n(umonly)$/) { $numOnly = 'abs'; }
elsif ($arg =~ /^-+n(umonly)*=(.+)$/) { $numOnly = $2; }
elsif ($arg =~ /^-+p(alette)*=(.+)$/) { $colourPalette = $2; $colourisedOutput = 1; }
elsif ($arg =~ /^-+s(ize)*=(.+)$/) { $size = $2; }
elsif ($arg =~ /^-+t(okenize)*=(.+)$/) { $tokenize = $2; }
elsif ($arg =~ /^-+m(atch)*=(.+)$/) { $matchRegexp = $2; }
elsif ($arg =~ /^-+v(erbose)*$/) { $verbose = 1; }
else { push (@extraArgs, $arg); }
}
# variables to support colourised output
if ($colourisedOutput) {
# input should be comma-separated list of numerals
my @cp = split (',', $colourPalette);
# add ANSI colour commands around the values - final looks like ^[[32m
for (my $i=0; $i < scalar @cp; $i++) {
$cp[$i] = chr(27) . '[' . $cp[$i] . 'm';
}
($regularColour, $keyColour, $ctColour, $pctColour, $graphColour) = @cp;
}
# for advanced graphing
my $verticalBlocks = ["▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"]; # for future?
my $partialBlocks = ["▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"]; # char=pb
my $partialCircles = ["◖", "●"]; # char=pc
my $partialLines = ["╸", "╾", "━"]; # char=hl
# some useful substitutions for prettiness
if ($histogramChar eq "ba") { $unicode = 1; $histogramChar = "▬"; }
elsif ($histogramChar eq "bl") { $unicode = 1; $histogramChar = "Ξ"; }
elsif ($histogramChar eq "em") { $unicode = 1; $histogramChar = "—"; }
elsif ($histogramChar eq "me") { $unicode = 1; $histogramChar = "⋯"; }
elsif ($histogramChar eq "di") { $unicode = 1; $histogramChar = "♦"; }
elsif ($histogramChar eq "dt") { $unicode = 1; $histogramChar = "•"; }
elsif ($histogramChar eq "sq") { $unicode = 1; $histogramChar = "□"; }
# high-bit set means we're not in ASCIIland anymore
if (ord (substr ($histogramChar, 0, 1)) >= 128) { $unicode = 1; }
# sub-full character width graphing systems
my $charWidth = 1;
my $graphChars = undef;
if ($histogramChar eq "pb") {
$charWidth = 0.125;
$graphChars = $partialBlocks;
} elsif ($histogramChar eq "pc") {
$charWidth = 0.5;
$graphChars = $partialCircles;
} elsif ($histogramChar eq "hl") {
$charWidth = 0.3334;
$graphChars = $partialLines;
}
# some useful regexp replacements
if ($matchRegexp eq 'word') { $matchRegexp = '^[A-Z,a-z]+$'; }
elsif ($matchRegexp eq 'num') { $matchRegexp = '^\d+$'; }
if ($tokenize eq 'word') { $tokenize = '[^\w]'; }
elsif ($tokenize eq 'white') { $tokenize = '\s'; }
# set the size of the histogram
if ($size =~ /^f/) {
$width = int (`tput cols` * 0.99);
if ($width < 80) { $width = 80; }
$height = int (`tput lines` - 3);
if ($verbose) { $height -= 5; }
if ($height < 25) { $height = 25; }
} elsif ($size =~ /^s/) {
$width = 60;
$height = 10;
} elsif ($size =~ /^m/) {
$width = 100;
$height = 20;
} elsif ($size =~ /^l/) {
$width = 140;
$height = 35;
}
# if they passed in --width or --height, they probably meant it moreso than
# defaults or the --size parameter - so code this last
if ($widthArg) { $width = $widthArg; }
if ($heightArg) { $height = $heightArg; }
# build the dict of input values
my $nextStat = time() + $statInterval;
my $valuesDict;
my $totalValues = 0;
my $totalObjects = 0;
my $graphVal = 0;
my $maxVal = 0;
# three modes of operation
if ($numOnly) {
readNumerics();
} elsif ($graphValues) {
readPretalliedTokens();
} else {
readLinesBuildHash();
}
# see if there was input
checkValuesObjects();
my $totalKeys;
my @sortedKeys;
# if we're just graphing a bunch of numbers, no need to sort
# the values dict
if ($numOnly) {
# here we sort on KEYS not VALUES like other cases
@sortedKeys = sortKeysByKey();
# we graph everything we're given - throw away nothing!
$height = $totalObjects;
} else {
# build the sorted dict
@sortedKeys = sortKeysByValueFrequency();
$maxVal = $valuesDict->{$sortedKeys[0]};
}
# if there aren't height # of distinct values, use less
$totalKeys = scalar @sortedKeys;
if ($totalKeys < $height) { $height = $totalKeys; }
# for logarithmic graphs
my $maxLog = 0;
if ($logarithmic) {
$maxLog = log($maxVal);
}
my $i;
my $j;
my $keyText;
my $ctText;
my $pctText;
my $preBarLen;
my $barWidth;
my $maxPreBarLen = 0;
my $maxKeyLen = 0;
# here is the complex part - read it carefully
for ($i = 0; $i < $height; $i++) {
# print the i'th most-common key
$keyText->[$i] = $sortedKeys[$i];
# how many times this key occurred in the input
my $count = $valuesDict->{$sortedKeys[$i]};
# determine the bar width based on key occurence
if ($logarithmic) {
$barWidth->[$i] = log($count) / $maxLog;
} else {
$barWidth->[$i] = $count / $maxVal;
}
# determine the percent of key frequency
my $percentile = $count / $totalValues * 100;
# graph axis labels, really
$ctText->[$i] = sprintf ("%d", $count);
$pctText->[$i] = sprintf ("(%3.02f%%)", $percentile);
$preBarLen = length ($ctText->[$i]) + length ($pctText->[$i]);
# determine the longest key name and longest count/percent text for
# aligning the output
if ($preBarLen > $maxPreBarLen) { $maxPreBarLen = $preBarLen; }
if (length ($sortedKeys[$i]) > $maxKeyLen) { $maxKeyLen = length ($sortedKeys[$i]); }
}
my $endTime = time() * 1000;
my $totalMillis = sprintf ("%.2f", ($endTime - $startTime));
if ($verbose) {
print STDERR "tokens/lines examined: $totalObjects\n";
print STDERR " tallied in histogram: $totalValues\n";
print STDERR " histogram entries: $totalKeys\n";
print STDERR " runtime: ${totalMillis}ms\n";
}
outputGraph();
exit 0;
# --------------------------------------------------------------------------- #
# subroutines
# --------------------------------------------------------------------------- #
# get the keys ordered - we'll only print the most common keys
sub sortKeysByValueFrequency {
my @sortedKeys = reverse sort { int($valuesDict->{$a}) <=> int($valuesDict->{$b}) } keys %{$valuesDict};
return @sortedKeys;
}
# get the keys ordered simply by key
sub sortKeysByKey {
my @sortedKeys = sort { int($a) <=> int($b) } keys %{$valuesDict};
return @sortedKeys;
}
# here we just pull in a stream of numerics and graph them, optionally graphing
# the difference between each pair of values
sub readNumerics {
# monotonically-increasing pairs, must have at least one value
my $lastVal = undef;
while ($inLine = <STDIN>) {
chomp ($inLine);
if ($numOnly =~ /^m/i) {
if (defined $lastVal) {
$graphVal= $inLine - $lastVal;
$totalValues += $graphVal;
$totalObjects++;
}
$lastVal = $inLine;
} else {
$graphVal= $inLine;
$totalValues += $inLine;
$totalObjects++;
}
if ($graphVal > $maxVal) { $maxVal = $graphVal; }
# we just build a list where the keys are line num and values
# are the value
$valuesDict->{$totalObjects} = $graphVal;
}
}
# read in lines and build hash object from it
sub readLinesBuildHash {
while ($inLine = <STDIN>) {
chomp($inLine);
if ($tokenize) {
# in this case we break the line into tokens and tally them
foreach my $lineToken (split (/$tokenize/, $inLine)) {
$totalObjects++;
if ($lineToken =~ /$matchRegexp/) {
$valuesDict->{$lineToken}++;
$totalValues++;
}
}
} else {
# in this case the entire line is the token to be tallied
$totalObjects++;
if ($inLine =~ /$matchRegexp/) {
$valuesDict->{$inLine}++;
$totalValues++;
}
}
if ($verbose && time() > $nextStat) {
print STDERR " Objects Processed: $totalObjects..." . chr(13);
$nextStat = time() + $statInterval;
}
}
if ($verbose) { print STDERR " Objects Processed: $totalObjects \n"; }
}
# here is the case where we don't need to put the input into
# bins and tally - the data is pre-tallied for us
sub readPretalliedTokens {
while ($inLine = <STDIN>) {
chomp ($inLine);
if ($graphValues eq 'vk') {
if ($inLine =~ /^\s*(\d+)\s+(.+)$/) {
$valuesDict->{$2} = $1;
$totalValues += $1;
if ($1 > $maxVal) { $maxVal = $1; }
$totalObjects++;
} else {
print STDERR " E Input line malformed and discarded: $inLine\n";
}
} elsif ($graphValues eq 'kv') {
if ($inLine =~ /^(.+?)\s+(\d+)$/) {
$valuesDict->{$1} = $2;
$totalValues += $2;
if ($2 > $maxVal) { $maxVal = $2; }
$totalObjects++;
} else {
print STDERR " E Input line malformed and discarded: $inLine\n";
}
}
}
}
# see if there was input
sub checkValuesObjects {
# the input may be empty. or we may have been too strict on the
# matching regexp passed in. either way, we were left with nothing.
if ($totalValues == 0) {
if ($totalObjects > 0) {
print STDERR "Not enough unfiltered data to produce a histogram.\n";
} else {
print STDERR "Not enough data to produce a histogram.\n";
}
exit 255;
}
}
# the arrays, hashes, variables must be all-correct for this to
# work, TODO: list out which ones they are, convert to functional
# keyText->[] - list of the keys
# pctText->[] - list of the percents
# ctText->[] - list of the counts
# barWidth->[] - list of the widths of the bars
sub outputGraph {
# print a header with alignment from key names
print STDERR "Val";
for ($j = 4; $j <= $maxKeyLen; $j++) { print STDERR " "; }
print STDERR "|Ct (Pct)";
for ($j = 7; $j <= $maxPreBarLen; $j++) { print STDERR " "; }
print STDERR "Histogram";
# get ready for the output, but sorting gets hosed if we print the
# colour code before the key, so put it on the line before
print STDERR "$keyColour\n";
# amount of other output reduces possible size of bar - alas
my $maxBarWidth = $width - $maxPreBarLen - $maxKeyLen - 4;
for ($i = 0; $i < $height; $i++) {
# first the key that we aggregated
print $keyText->[$i];
print $regularColour;
# alignment
for ($j = length ($keyText->[$i]); $j < $maxKeyLen; $j++) { print " "; }
# separater between keys and count/pct
print "|";
print $ctColour;
print $ctText->[$i] . " ";
print $pctColour;
print $pctText->[$i];
# spaces 'til time to print the bar
for ($j = length ($ctText->[$i]) + length ($pctText->[$i]); $j <= $maxPreBarLen; $j++) { print " "; }
print $graphColour;
for ($j = 0; $j < int ($barWidth->[$i] * $maxBarWidth); $j++) {
if ($charWidth < 1) {
# print out maximum-width character (always last in array)
print $graphChars->[scalar @$graphChars - 1];
} else {
# we're printing regular non-unicode characters
if (length ($histogramChar) > 1 && !$unicode) {
# but still we have >1 byte! so print initial byte
# for all but the last character (printed outside loop)
print substr ($histogramChar, 0, 1);
} else {
print $histogramChar;
}
}
}
# print one too many bar characters so <1% gets a single bar character
my $remainder = ($barWidth->[$i] * $maxBarWidth) - int ($barWidth->[$i] * $maxBarWidth);
my $whichChar = int ($remainder / $charWidth);
if ($charWidth < 1) {
# if we have partial-width characters, get higher resolution
if ($barWidth->[$i] * $maxBarWidth > $charWidth) {
# we have more than charWidth remainder, so print out a
# remainder portion
print $graphChars->[$whichChar];
} else {
# we had minimum remainder, so print minimum-width
# character just print the minimum-width portion
print $graphChars->[0];
}
} else {
# we're printing regular non-unicode characters
if (length ($histogramChar) > 1 && !$unicode) {
# but still we have >1 byte! so print final byte of input string
print substr ($histogramChar, -1, 1);
} else {
print $histogramChar;
}
}
# FIXME: even with all these colour-printing antics, still one key will
# be printed with the wrong colour on sorted output most of the time,
# but i have no idea how to fix this other than to implement sorting of
# the output within the script itself.
if ($i == $height - 1) {
# put the terminal back into a normal-colour mode on last entry
print "$regularColour\n";
} else {
# we do these antics of printing $keyColour on the line before
# the key so that piping output to sort will work
print "$keyColour\n";
}
}
}
# usage
sub doArgs {
print "\n";
print "usage: <commandWithOutput> | $0\n";
print " [--rcfile=<rcFile>]\n";
print " [--size={sm|med|lg|full} | --width=<width> --height=<height>]\n";
print " [--color] [--palette=r,k,c,p,g]\n";
print " [--tokenize=<tokenChar>]\n";
print " [--graph[=[kv|vk]] [--numonly[=mon|abs]]\n";
print " [--char=<barChars>|<substitutionString>]\n";
print " [--help] [--verbose]\n";
print " --char=C character(s) to use for histogram character, some substitutions follow:\n";
print " ba (▬) Bar\n";
print " bl (Ξ) Building\n";
print " em (—) Emdash\n";
print " me (⋯) Mid-Elipses\n";
print " di (♦) Diamond\n";
print " dt (•) Dot\n";
print " sq (□) Square\n";
print " hl Use 1/3-width unicode partial lines to simulate 3x actual terminal width\n";
print " pb Use 1/8-width unicode partial blocks to simulate 8x actual terminal width\n";
print " pc Use 1/2-width unicode partial circles to simulate 2x actual terminal width\n";
print " --color colourise the output\n";
print " --graph[=G] input is already key/value pairs. vk is default:\n";
print " kv input is ordered key then value\n";
print " vk input is ordered value then key\n";
print " --height=N height of histogram, headers non-inclusive, overrides --size\n";
print " --help get help\n";
print " --logarithmic logarithmic graph\n";
print " --match=RE only match lines (or tokens) that match this regexp, some substitutions follow:\n";
print " word ^[A-Z,a-z]+\$ - tokens/lines must be entirely alphabetic\n";
print " num ^\\d+\$ - tokens/lines must be entirely numeric\n";
print " --numonly[=N] input is numerics, simply graph values without labels\n";
print " abs input is absolute values (default)\n";
print " mon input monotonically-increasing, graph differences (of 2nd and later values)\n";
print " --palette=P comma-separated list of ANSI colour values for portions of the output\n";
print " in this order: regular, key, count, percent, graph. implies --color.\n";
print " --rcfile=F use this rcfile instead of \$HOME/.distributionrc - must be first argument!\n";
print " --size=S size of histogram, can abbreviate to single character, overridden by --width/--height\n";
print " small 40x10\n";
print " medium 80x20\n";
print " large 120x30\n";
print " full terminal width x terminal height (approximately)\n";
print " --tokenize=RE split input on regexp RE and make histogram of all resulting tokens\n";
print " word [^\\w] - split on non-word characters like colons, brackets, commas, etc\n";
print " white \\s - split on whitespace\n";
print " --width=N width of the histogram report, N characters, overrides --size\n";
print " --verbose be verbose\n";
print "\n";
print "You can use single-characters options, like so: -h=25 -w=20 -v. You must still include the =\n";
print "\n";
print "Samples:\n";
print " du -sb /etc/* | $0 --palette=0,37,34,33,32 --graph\n";
print " du -sk /etc/* | awk '{print \$2\" \"\$1}' | $0 --graph=kv\n";
print " zcat /var/log/syslog*gz | $0 --char=o --tokenize=white\n";
print " zcat /var/log/syslog*gz | awk '{print \$5}' | $0 --t=word --m-word --h=15 --c=/\n";
print " zcat /var/log/syslog*gz | cut -c 1-9 | $0 --width=60 --height=10 --char=em\n";
print " find /etc -type f | cut -c 6- | $0 --tokenize=/ --w=90 --h=35 --c=dt\n";
print " cat /usr/share/dict/words | awk '{print length(\$1)}' | $0 --c=* --w=50 --h=10 | sort -n\n";
print "\n";
}