-
Notifications
You must be signed in to change notification settings - Fork 2
/
qy
executable file
·204 lines (179 loc) · 6.93 KB
/
qy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/bin/bash
# Moonshine script for access log queries.
# Expects a set of referrer logs to live in the directory $QY_LOGDIR (or set with -r option).
# Writes output in $QY_QRYDIR (or set with -q option).
# Modes: see -h/EOH
# Naming of log files expected to be
# access.log
# access.log.1
# access.log.2.gz
# access.log.3.gz
# ..
# Todo
# . use of fifo means it cannot read from itself yet (name+pid + double fifo?)
# . allow filter-by-field (rather than e.g. grep 404 in everything)
# . generic set of tally modes / queries. Maybe datamash some queries.
# -> top pages add number of different hosts
# . for (new) mappable visitors, sum up what interested them
# . autofilter referrer spam and crawlers
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -o errexit -o nounset -o pipefail

# Work mode. Five exist; updating the IP map is the default.
mode="map"              # "dl" "ref" "tsv" "copy" are the others.

# Input selection. The current access.log is the default.
mode_take="curr"        # "week" "all" "stdin" "prev" are the others.
                        # "week" grabs access.log.1 up to 7.
take_n_tail=+1          # by default do everything in selection

# Switches toggled from the command line.
sayso=true
debug=false

# Environment-overridable settings; an unset or empty value gets the default.
: "${QY_MAPTRIGGER:=/}"
: "${QY_DLPATTERN:=.tar.gz}"
: "${QY_LOGDIR:=QY_LOGDIR_unset}"
: "${QY_QRYDIR:=QY_QRYDIR_unset}"
: "${QY_SITENAME:=QY_SITENAME_unset}"
: "${QY_WEEKANCHOR:=last Saturday}"

tag_dl=dl               # Current needs this.

# Filled in further down, once options and input selection are known.
taketag=
list=
base=
fnout=
# loglog MESSAGE
# Write MESSAGE to stderr, with backslash escapes such as \t interpreted.
# Does nothing (and still succeeds) when $sayso is "false", i.e. after -x.
loglog() {
  $sayso || return 0
  echo -e "$1" >&2
}
# Command-line parsing. The leading ':' in the option string puts getopts in
# silent mode: a missing argument is reported as opt=':' and an unknown flag
# as opt='?', with the offending flag letter in OPTARG either way.
# Usage errors are written to stderr so they never mix with query output
# (mode "tsv" writes its result to stdout).
while getopts :u:r:M:q:b:d:t:l:m:hx_ opt; do
  case "$opt" in
    _) debug=true ;;
    x) sayso=false ;;                 # silence loglog progress messages
    m) mode=$OPTARG ;;
    t) mode_take=$OPTARG ;;
    l) take_n_tail=$OPTARG ;;
    b) base=refs.$OPTARG ;;           # output tag; stored with the refs. prefix
    M) QY_MAPTRIGGER=$OPTARG ;;
    d) QY_DLPATTERN=$OPTARG ;;
    r) QY_LOGDIR=$OPTARG ;;
    q) QY_QRYDIR=$OPTARG ;;
    u) QY_SITENAME=$OPTARG ;;
    h)
      cat <<EOH
Default action: using selected input expand IP map on trigger ($QY_MAPTRIGGER)
-m <mode> one of "map" (default) "ref" "dl" "copy" "tsv"
"tsv": remap input to tsv: IP MAPPEDIP URL STATUS REFERRER METHOD SIZE
"ref": tally referrer URLs
"dl": tally downloads
"copy": copy input verbatim
-M <str> grep "<str>" in input to add IPs to map (default $QY_MAPTRIGGER)
Input selection:
Default query file 'access log' (-t curr)
-t <mode> one of "prev" "curr" (default) "week" "all" "stdin"
"week": query last 7 log files (currently hardcoded as access.log.1 access.log.[2-7].gz)
Output is indexed by week in year as it was: '$QY_WEEKANCHOR' (env QY_WEEKANCHOR)
(hook intended for use in weekly cron jobs)
-l <NUM> query last <NUM> lines in selected input
-d <str> grep "<str>" in input to query downloads (default $QY_DLPATTERN env QY_DLPATTERN)
-b <str> tag output with <str> (default depends on -t argument)
-q <str> set place for writing output (default $QY_QRYDIR, env QY_QRYDIR)
-r <str> set place for reading logs (default $QY_LOGDIR, env QY_LOGDIR)
-u <str> set site name (default $QY_SITENAME, env QY_SITENAME)
EOH
      exit 0
      ;;
    :)
      echo "Flag $OPTARG needs argument" >&2
      exit 1
      ;;
    \?)
      # Escaped so that it matches only the literal '?' getopts reports.
      echo "Flag $OPTARG unknown" >&2
      exit 1
      ;;
  esac
done
# Translate the -t selection into
#   taketag - label used to build the default output name
#   list    - space-separated log file names ("-" means standard input)
# Globs are expanded inside $QY_LOGDIR. Expanding them in the invoking
# directory (as happened before) picks up unrelated files whenever the caller's
# cwd happens to contain something matching access.log*.
# cd's stdout is discarded so a CDPATH hit cannot leak a path into the list.
case "$mode_take" in
  stdin)
    taketag=stdin
    list=-
    ;;
  week)
    # Week number (Sunday-based, %U) of the day named by $QY_WEEKANCHOR.
    taketag=w$(date -d "$QY_WEEKANCHOR" +%U)
    list=$(cd -- "$QY_LOGDIR" > /dev/null && echo access.log.1 access.log.[2-7].gz)
    ;;
  curr)
    taketag=curr
    list=access.log
    ;;
  prev)
    # Two-digit year and day-of-year of yesterday.
    taketag=$(date -d "yesterday" +%y-%j)
    list=access.log.1
    ;;
  all)
    taketag="z"
    list=$(cd -- "$QY_LOGDIR" > /dev/null && echo access.log*)
    ;;
  *)
    echo "Cannot take [$mode_take]" >&2
    false   # aborts the script through errexit, like the other sanity checks
    ;;
esac
# Default output tag: refs.<taketag>, unless -b supplied one.
if [[ -z $base ]]; then
  base=refs.$taketag
fi

# Reject unknown modes before any fifo or background reader is set up.
# The comparison is an exact string match against each known word; a regex
# match would let values such as "d." or "dl ref" slip through.
known_modes="dl ref copy map tsv"
mode_ok=false
for known in $known_modes; do   # word splitting of the list is intended
  if [[ $mode == "$known" ]]; then
    mode_ok=true
    break
  fi
done
if ! $mode_ok; then
  echo "Not a known mode [$mode] (choose from $known_modes)" >&2
  false   # aborts the script through errexit
fi
# Feed the selected log files into a named pipe that the mode handlers read.
# NOTE(review): a relative $QY_QRYDIR is resolved relative to $QY_LOGDIR here,
# because the fifo is created and entered after this cd - confirm that is wanted.
cd "$QY_LOGDIR"

# just because I wanted one.
source=$QY_QRYDIR/readfromlogs
trap 'rm -f -- "$source"' EXIT   # remove the fifo on every exit path
rm -f -- "$source"
mkfifo -- "$source"

loglog "-- files $list"
if [[ $mode_take != "stdin" ]]; then
  # Counting consumes the input, so it is skipped when reading stdin.
  # shellcheck disable=SC2086  # $list is a space-separated file list (may hold globs)
  loglog "-- $(zcat -f $list | wc -l) entries"
fi

# A background job started without job control gets /dev/null as its standard
# input unless redirected explicitly, which would make "-t stdin" read nothing.
# Hand the real stdin to the reader on fd 3; the other input modes keep /dev/null.
if [[ $mode_take == "stdin" ]]; then
  exec 3<&0
else
  exec 3< /dev/null
fi
# shellcheck disable=SC2086  # see above: splitting/globbing of $list is intended
zcat -f $list <&3 | tail -n "$take_n_tail" > "$source" &
exec 3<&-   # the background reader holds its own copy

cd "$QY_QRYDIR"
touch .map   # make sure the IP map exists before a mode tries to read it
# Run the selected mode. Input is read from "readfromlogs" in the current
# directory; the IP map is the file ".map" (lines "<ip> <name>").
# Modes that produce a result file record its name in $fnout.
if [[ $mode == "copy" ]]; then
  # Store the selected input verbatim, gzip-compressed.
  gzip < readfromlogs > "a.${base#refs.}.gz"

elif [[ $mode == "tsv" ]]; then
  perl -ne 'BEGIN{open("M",".map");%map=map{chomp;split}<M>;}/^\S+:\S+\s+(\S+)\s+\S+\s+\S+\s+\[.*?\]\s+"(\S+)\s+(.*?)\s+.*?"\s+(\d+)\s+(\d+)\s+"(.*?)"/&&(print "$1\t" . ($map{$1}?$map{$1}:"<>") . "\t$3\t$4\t$6\t$2\t$5\n")' < readfromlogs
  # parse input: host:port ip=1 id id date req=2 url=3 code=4 size=5 ref=6
  # our output: IP URL STATUS REFERRER TYPE SIZE
  #             1  3   4      6        2    5
  # host:port ip clientid userid [date] "REQUESTTYPE URL" CODE objectsize referrer useragent

elif [[ $mode == "dl" ]]; then
  # Output name: <tag_dl>.<tag>.txt
  fnout=$tag_dl.${base#refs.}.txt
  export QYDL=$QY_DLPATTERN   # read by the perl one-liner through %ENV
  perl -ne 'BEGIN{open("M",".map");%map=map{/^(\S+)\s+(\S+)/?($1,$2):()}<M>;}/\S+\s+(\S+).*?"GET (\S+$ENV{QYDL}).*?"/ && print "$2\t$1\t$map{$1}\n";' readfromlogs > "$fnout"
  # Second view of the same lines, ordered by their reversed text.
  rev "$fnout" | sort | rev > "$fnout.rev"

elif [[ $mode == "ref" ]]; then
  loglog "-- mode ref"
  fnout=$base.ext
  # Tally "<status>-<referrer>" pairs; $base.z holds the complete tally.
  perl -ne '/".*?".*?(\d+)\s+\S+\s+"(.*?)"/ && print "$1-$2\n";' readfromlogs | sort | uniq -c | sort -n | perl -pe 's/-/ /' > "$base.z"
  # Drop lines mentioning the own site. grep exits 1 when nothing is left,
  # which is a valid (empty) result and must not abort the script.
  grep -v -- "$QY_SITENAME" "$base.z" > "$fnout" || [[ $? -eq 1 ]]
  # grep -v "\<\(ua\|ru\|kz\)\>" $fnout | grep -v "^ *1\>"

elif [[ $mode == "map" ]]; then
  fnout=.add
  loglog "-- hook $QY_MAPTRIGGER (-M <hook>)"
  : > .add
  # Reverse-resolve every IP that hits the trigger and is not in .map yet.
  # grep exiting 1 (no matching line, e.g. an empty log right after rotation)
  # is not an error; under pipefail it would otherwise kill the run.
  { grep -- "$QY_MAPTRIGGER" < readfromlogs || [[ $? -eq 1 ]]; } \
    | perl -ne 'BEGIN{open(M,"<.map");%map=map{/^(\S+)\s+(\S+)/?($1,$2):()}<M>;close(M)}/^\S+\s+(\S+)/;($1&&defined($map{$1})&&next)||print"$1\n";' \
    | sort -u \
    | while read -r ip; do
        name=$(dig +short -x "$ip")
        # shellcheck disable=SC2086,SC2116  # unquoted on purpose: folds whitespace/newlines
        name=$(echo $name)                  # remove trailing ws
        name=${name%.}                      # remove trailing dot
        name=$(tr ' ' '-' <<< "$name")      # occasionally multi-line answer
        # printf keeps any backslash sequence in the dig answer literal.
        printf '%s\t[%s]\n' "$ip" "$name"
        loglog "$ip\t[$name]"
      done >> .add
  nadd=$(wc -l < .add)
  if (( nadd > 0 )); then
    >&2 echo "-- adding $nadd addresses"
    sort -u -V .map .add > .mad
    # Keep numbered backups of the map; rotate them when there is none yet or
    # the newest one is older than find's -mtime +1 threshold.
    if [[ ! -f .map.1.gz ]] || [[ $(find .map.1.gz -mtime +1 -print) ]]; then
      loglog "-- upcycling map copies"
      j=10
      for i in {9..1}; do
        if [[ -f .map.$i.gz ]]; then
          mv -f ".map.$i.gz" ".map.$j.gz"
        fi
        j=$i
      done
      gzip -c .map > .map.1.gz
    fi
    mv -f .mad .map
  fi

else
  loglog "-- mode $mode is not a mode, peculiarly"
  false
fi
# Report the result file and its line count on stderr. Modes that write no
# result file leave $fnout empty, and nothing is printed for them.
# $fnout is quoted so that a tag containing spaces (-b "my tag") does not turn
# the redirection into an "ambiguous redirect" error.
if [[ -n $fnout ]]; then
  printf '++ new %s (%s lines)\n' "$fnout" "$(wc -l < "$fnout")" >&2
fi
# notes:
# Background on why map mode may get a multi-line answer from dig (quoted):
# "I believe dig +short outputs two lines for you because the domain you query,
# smtp.mydomain.net is a CNAME for smtp.ggs.mydomain.net, and dig prints the
# intermediate resolution step."