-
Notifications
You must be signed in to change notification settings - Fork 1
/
mls_stat.c
403 lines (380 loc) · 16.1 KB
/
mls_stat.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
/****************************************************************************
MailListStat - print useful statistics on email messages
stats gathering & computing functions
Copyright (C) 2001-2003 Marek Podmaka <[email protected]>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
****************************************************************************/
#include "mls.h"
#include "mls_stat.h"
#include "mls_mime.h"
#include "mls_list.h"
/* Globals defined elsewhere in the program; the notes describe how this file uses them. */
extern int verbose;                 // non-zero enables the progress output of PrintProgress()
extern time_t t, t_oldest, t_newest; // t: set to the current time at the end of ParseInput();
                                     // t_oldest/t_newest: earliest/latest message date seen by AddStat()
extern long nSprav, nDlzka;         // nSprav: messages counted so far; nDlzka: summed length of all body lines
extern long nDen[31];               // message count per day of month (index = day-1)
extern long nHod[24];               // message count per hour of day
extern long nDOW[ 7];               // message count per day of week (0=Mon ... 6=Sun)
extern long nMon[12];               // message count per month (index = month-1)
extern nQptr msgQ;                  // record of the message with the highest quoted/total ratio
extern nDptr msgD;                  // record of the largest message
extern nTptr zozA, zozS, zozQ, zozM, zozAt, zozAa; // zozA/zozS/zozM are updated through AddEntry() for
                                                   // author/subject/mailer; the others are not used by
                                                   // the functions in this part of the file
extern FILE *fInp;                  // input stream read by ParseInput()
/* ***** GetEmail ******************************************************** */
void GetEmail(char *auth) { // return only email address
  /* Reduces a From: header value to sub-match 2 of the r_mail pattern
   * (presumably the bare address - the pattern itself is defined elsewhere),
   * rewriting `auth` in place.
   *
   * auth ... NUL-terminated, writable buffer of at least MAX_AUTH bytes.
   *
   * If the pattern does not match at all, `auth` is only stripped of its
   * trailing newline. If it matches but sub-match 2 did not participate,
   * `auth` becomes the empty string.
   */
  regmatch_t pm[MAX_REGMATCH];      // sub-match offsets filled in by regexec()
  char auth2[MAX_AUTH];             // scratch copy, needed because source and target overlap
  size_t len;

  RemoveCR(auth);
  if (regexec(&r_mail, auth, MAX_REGMATCH, pm, 0)) return; // should not happen

  // regexec() reports a sub-expression that took no part in the match as -1/-1;
  // using that as an offset would read in front of the buffer
  if (pm[2].rm_so < 0 || pm[2].rm_eo < pm[2].rm_so) {
    *auth = '\0';
    return;
  }

  len = (size_t)(pm[2].rm_eo - pm[2].rm_so) + 1;   // matched characters + '\0'
  if (len > (size_t)MAX_AUTH) len = (size_t)MAX_AUTH; // never write past auth2
  myCopy(auth2, auth + pm[2].rm_so, len);
  myCopy(auth, auth2, MAX_AUTH);
}
/* ***** RemoveCR ******************************************************** */
void RemoveCR(char *text) { // remove '\n' from end
  /* Cuts the string at a '\n' found in the second-to-last position and then
   * at a '\n' found in the (new) last position, so up to two trailing
   * newlines are removed. Strings shorter than the position being tested are
   * left alone instead of being indexed with a wrapped-around offset.
   *
   * text ... NUL-terminated, writable string; modified in place.
   */
  size_t len = strlen(text);

  if (len >= 2 && text[len-2] == '\n') {
    text[len-2] = '\0';
    len -= 2;                       // the string now ends where the '\n' was
  }
  if (len >= 1 && text[len-1] == '\n') text[len-1] = '\0';
}
/* ***** GetSubj ********************************************************* */
void GetSubj(char *subj) { // MIME-decode & remove "Re:" from begin of subj.
  /* Normalises a Subject: header value in place: strips the trailing newline,
   * runs it through Decode_mime_string() and then keeps only sub-match 3 of
   * the r_re pattern (presumably the text after any reply prefix - the
   * pattern itself is defined elsewhere).
   *
   * subj ... NUL-terminated, writable buffer of at least MAX_SUBJ bytes.
   *
   * If the pattern does not match, the decoded subject is kept unchanged.
   * If it matches but sub-match 3 did not participate, `subj` becomes the
   * empty string.
   */
  regmatch_t pm[MAX_REGMATCH];      // sub-match offsets filled in by regexec()
  char sub2[MAX_SUBJ];              // scratch copy, needed because source and target overlap
  size_t len;

  RemoveCR(subj);
  Decode_mime_string(subj, MAX_SUBJ);
  if (regexec(&r_re, subj, MAX_REGMATCH, pm, 0)) return; // should not happen

  // a sub-expression that took no part in the match is reported as -1/-1;
  // using that as an offset would read in front of the buffer
  if (pm[3].rm_so < 0 || pm[3].rm_eo < pm[3].rm_so) {
    *subj = '\0';
    return;
  }

  len = (size_t)(pm[3].rm_eo - pm[3].rm_so) + 1;   // matched characters + '\0'
  if (len > (size_t)MAX_SUBJ) len = (size_t)MAX_SUBJ; // never write past sub2
  myCopy(sub2, subj + pm[3].rm_so, len);
  myCopy(subj, sub2, MAX_SUBJ);
}
/* ***** GetMailer ******************************************************* */
/* Appends character c to the text that ends at dst[at] (dst holds MAX_SUBJ
 * bytes) and re-terminates it. Nothing is written when there is no room left.
 * Returns the new end index. */
static size_t MailerPutChar(char *dst, size_t at, char c) {
  if (at + 1 < (size_t)MAX_SUBJ) {
    dst[at++] = c;
    dst[at] = '\0';
  }
  return at;
}

/* Appends regex sub-match *m of src to dst starting at index `at`
 * (at < MAX_SUBJ), truncating so that the terminating '\0' still fits into
 * the MAX_SUBJ bytes of dst. A sub-match that did not participate (-1/-1)
 * counts as empty. Returns the new end index. */
static size_t MailerPutGroup(char *dst, size_t at, char *src, const regmatch_t *m) {
  size_t room = (size_t)MAX_SUBJ - 1 - at;
  size_t len = 0;

  if (m->rm_so >= 0 && m->rm_eo > m->rm_so) len = (size_t)(m->rm_eo - m->rm_so);
  if (len > room) len = room;
  if (len == 0) {
    dst[at] = '\0';
    return at;
  }
  myCopy(dst + at, src + m->rm_so, len + 1);   // len characters + '\0'
  return at + len;
}

void GetMailer(char *mail) { // remove versions from X-Mailer/User-Agent/X-Newsreader line
  /* Rewrites `mail` in place with a normalised mailer name.
   *
   * mail ... NUL-terminated, writable buffer of at least MAX_SUBJ bytes.
   *
   * Every pattern in the table below is tried, in order, against the
   * unmodified input; each one that matches replaces the result so far, so
   * the LAST matching rule wins. If nothing matches, the input is kept (minus
   * its trailing newline).
   *
   *   MLR_NAME   ... result = sub-match 1
   *   MLR_NAME_X ... result = sub-match 1 followed by a literal 'x'
   *                  (presumably standing in for the dropped version digits)
   *   MLR_IMS    ... result = sub-match 1, a space, sub-match 2, 'x'
   *   MLR_DROP   ... result = empty string
   *
   * All writes are bounded by MAX_SUBJ; an over-long result is truncated.
   */
  enum { MLR_NAME, MLR_NAME_X, MLR_IMS, MLR_DROP };
  static const struct {
    const regex_t *re;              // compiled pattern (defined elsewhere)
    int how;                        // one of the MLR_* actions above
  } rules[] = {
    { &r_m_bat, MLR_NAME   },       // The Bat!
    { &r_m_ims, MLR_IMS    },       // Internet Mail Service
    { &r_m_moz, MLR_NAME_X },       // Mozilla
    { &r_m_ope, MLR_NAME_X },       // Opera
    { &r_m_oue, MLR_NAME_X },       // Outlook Express
    { &r_m_ouc, MLR_NAME   },       // Outlook CWS
    { &r_m_oum, MLR_NAME   },       // Outlook IMO
    { &r_m_out, MLR_NAME   },       // Outlook
    { &r_m_lot, MLR_NAME   },       // Lotus Notes
    { &r_m_cal, MLR_NAME   },       // Calypso
    { &r_m_peg, MLR_NAME   },       // Pegasus4win
    { &r_m_opw, MLR_NAME   },       // Open Webmail
    { &r_m_eud, MLR_NAME   },       // Eudora
    { &r_m_pos, MLR_NAME   },       // POSTIE
    { &r_m_pob, MLR_NAME   },       // POBOX
    { &r_m_kma, MLR_NAME   },       // KMail
    { &r_m_mut, MLR_NAME   },       // Mutt
    { &r_m_imp, MLR_NAME   },       // IMP webmail
    { &r_m_syl, MLR_NAME_X },       // Sylpheed
    { &r_m_pin, MLR_NAME   },       // Pine
    // Pine work-around: remove Message-ID in case no mailer was found there
    // (Message-ID is used only if other headers are not present)
    { &r_m_pi2, MLR_DROP   },
    // (these contributed by Urke MMI <[email protected]>)
    { &r_m_pan, MLR_NAME_X },       // Pan
    { &r_m_4td, MLR_NAME_X },       // 40tude_Dialog
    { &r_m_fag, MLR_NAME_X },       // Forte Agent
    { &r_m_mpg, MLR_NAME_X },       // MicroPlanet Gravity
    { &r_m_xws, MLR_NAME_X },       // Xnews
    { &r_m_knd, MLR_NAME_X },       // KNode
    { &r_m_hst, MLR_NAME_X },       // Hamster
    { &r_m_nnr, MLR_NAME_X }        // Noworyta News Reader
  };
  regmatch_t pm[MAX_REGMATCH];      // sub-match offsets filled in by regexec()
  char sub2[MAX_SUBJ];              // result so far, copied back to mail at the end
  size_t i, end;

  RemoveCR(mail);
  myCopy(sub2, mail, MAX_SUBJ);     // keep the original in case no pattern matches

  for (i = 0; i < sizeof rules / sizeof rules[0]; i++) {
    if (regexec(rules[i].re, mail, MAX_REGMATCH, pm, 0)) continue; // no match

    if (rules[i].how == MLR_DROP) {
      *sub2 = '\0';
      continue;
    }

    end = MailerPutGroup(sub2, 0, mail, &pm[1]);
    if (rules[i].how == MLR_IMS) {
      end = MailerPutChar(sub2, end, ' ');
      end = MailerPutGroup(sub2, end, mail, &pm[2]);
    }
    if (rules[i].how != MLR_NAME) MailerPutChar(sub2, end, 'x');
  }

  myCopy(mail, sub2, MAX_SUBJ);
}
/* ***** PrintProgress *************************************************** */
void PrintProgress() { // print no. of messages processed
  /* When `verbose` is set, reports the running message count nSprav on
   * stderr, but only at round numbers: the step starts at 10 and is
   * multiplied by 10 for as long as nSprav/step exceeds 10, and a line is
   * printed when nSprav is a multiple of the resulting step
   * (10, 20 ... 100, 200 ... 1000, 2000 ...).
   */
  long delim = 10;                  // current reporting step

  if (!verbose) return;
  while (nSprav / delim > 10) delim *= 10;
  // nSprav is a signed long, so the conversion has to be %ld
  if (nSprav % delim == 0) fprintf(stderr,"(i) Processing message no.%5ld\n",nSprav);
}
/* ***** AddStat ********************************************************* */
void AddStat(char *A, char *S, char *D, char *M, long sQ, long sN) {
/* A ... Author (From: line)
S ... Subject line
D ... Date line
M ... X-Mailer/User-Agent/X-Newsreader line
sQ ... size of quoted text
sN ... size of non-quoted text
*/
regmatch_t pm[MAX_REGMATCH]; // buffer for matching substrings
long sT; // total size of message body
long day,mon,yea; // day, month, year of curr. mess.
time_t t_mess; // date of current message (seconds)
struct tm tm_mess; // date of curr. message
char time[6];
sT=sQ+sN; // sum of quoted + non-quoted
// quote
if ((float)sQ/sT > msgQ->perc) {
myCopy(msgQ->auth, A, MAX_AUTH);
myCopy(msgQ->subj, S, MAX_SUBJ);
myCopy(msgQ->date, D, MAX_DATE);
msgQ->size=sT;
msgQ->perc=(float)sQ/sT;
}
// size
if (sT > msgD->size) {
myCopy(msgD->auth, A, MAX_AUTH);
myCopy(msgD->subj, S, MAX_SUBJ);
myCopy(msgD->date, D, MAX_DATE);
msgD->size=sT;
}
zozA=AddEntry(zozA, A, sT, sQ, compCount); // author
zozS=AddEntry(zozS, S, sT, sQ, compCount); // subject
zozM=AddEntry(zozM, M, sT, sQ, compCount); // mailer
// time
if (!regexec(&r_time, D, MAX_REGMATCH, pm, 0)) { // if matched
sT=pm[1].rm_eo - pm[1].rm_so + 1;
myCopy(time, D + pm[1].rm_so, (sT>3)?3:sT); // it should not match more than 2 chars (+ \0)
sT=atoi(time);
if ((sT>=0) && (sT<24)) nHod[sT]++;
} else myVerb(" ! Invalid time: ",D);
// day of week (DOW)
if (!regexec(&r_dow, D, MAX_REGMATCH, pm, 0)) { // if matched
sT=pm[2].rm_eo - pm[2].rm_so + 1;
myCopy(time, D + pm[2].rm_so, (sT>4)?4:sT); // it should not match more than 3 chars (+ \0)
sT=9; // default (invalid) value
if (!strcmp(time, "Mon")) sT=0;
if (!strcmp(time, "Tue")) sT=1;
if (!strcmp(time, "Wed")) sT=2;
if (!strcmp(time, "Thu")) sT=3;
if (!strcmp(time, "Fri")) sT=4;
if (!strcmp(time, "Sat")) sT=5;
if (!strcmp(time, "Sun")) sT=6;
if ((sT>=0) && (sT<7)) nDOW[sT]++;
} else myVerb(" ! Invalid day of week: ",D);
// date
day=mon=yea=0;
if (!regexec(&r_date, D, MAX_REGMATCH, pm, 0)) { // if matched
sT=pm[2].rm_eo - pm[2].rm_so + 1;
if (sT > 0) {
myCopy(time, D + pm[2].rm_so, (sT>3)?3:sT); // it should not match more than 2 chars (+ \0)
sT=atoi(time)-1; // days in month begin from 1 (but our array from 0)
if ((sT>=0) && (sT<31)) nDen[sT]++;
}
// oldest/newest message - match whole date, convert it to seconds
day=sT+1;
sT=pm[3].rm_eo - pm[3].rm_so + 1; // month in text form
if (sT > 0) {
myCopy(time, D + pm[3].rm_so, (sT>4)?4:sT); // it should not match more than 3 chars (+ \0)
if (!strncmp(time,"Jan",3)) mon=1;
if (!strncmp(time,"Feb",3)) mon=2;
if (!strncmp(time,"Mar",3)) mon=3;
if (!strncmp(time,"Apr",3)) mon=4;
if (!strncmp(time,"May",3)) mon=5;
if (!strncmp(time,"Jun",3)) mon=6;
if (!strncmp(time,"Jul",3)) mon=7;
if (!strncmp(time,"Aug",3)) mon=8;
if (!strncmp(time,"Sep",3)) mon=9;
if (!strncmp(time,"Oct",3)) mon=10;
if (!strncmp(time,"Nov",3)) mon=11;
if (!strncmp(time,"Dec",3)) mon=12;
}
sT=pm[4].rm_eo - pm[4].rm_so + 1; // year
if (sT > 0) {
myCopy(time, D + pm[4].rm_so, (sT>5)?5:sT); // it should not match more than 4 chars (+ \0)
yea=atoi(time);
}
if (!day || !mon || !yea) myVerb(" ! Unable to get message date for oldest/newest: ",D);
tm_mess.tm_sec=tm_mess.tm_min=0;
tm_mess.tm_hour=12; // just to be sure that timezone won't change the day
tm_mess.tm_isdst=-1; // unknown timezone
tm_mess.tm_mday=day;
tm_mess.tm_mon =mon-1;
tm_mess.tm_year=yea-1900;
t_mess=mktime(&tm_mess); // convert it to seconds since epoch
if (mon) nMon[mon-1]++; else myVerb(" ! Invalid month: ",D);
if (strncmp("DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA",S,49)) {
if (!t_oldest) t_oldest=t_mess; // initialize oldest message to current
if (t_mess < t_oldest) t_oldest=t_mess;
if (t_mess > t_newest) t_newest=t_mess;
}
} else myVerb(" ! Invalid date: ",D);
}
/* ***** myCopy ********************************************************** */
void myCopy(char *dst, char *src, size_t max) {
  /* safe copy - will add \0 to end of destination region
   *
   * dst ... destination buffer with room for at least `max` bytes
   * src ... NUL-terminated source (or at least `max` readable bytes)
   * max ... size of the destination region INCLUDING the terminating '\0';
   *         at most max-1 characters of src are kept, the rest of the region
   *         is zero-filled by strncpy()
   *
   * A region of size 0 cannot hold even the terminator, so nothing is written.
   */
  if (max == 0) return;             // dst[max-1] would be dst[-1]
  strncpy(dst, src, max);
  dst[max-1] = '\0';
}
/* ***** ParseInput ****************************************************** */
void ParseInput() { // parse whole input file
  /* Reads fInp line by line and feeds every message found to AddStat().
   *
   * A small state machine tracks the position inside the mailbox:
   *   a_empt ... the previous line was empty (also true at start of file)
   *   a_head ... currently inside a message header
   *   a_body ... currently inside a message body
   * A line matching r_from that follows an empty line starts a new message;
   * the first empty line inside a header switches to the body. Body lines are
   * counted as quoted when '>' occurs within their first 9 characters.
   * Finally the global `t` is set to the current time.
   */
  long i;
  char string[MAX_LINE];            // temp. for reading lines from input
  char riadok[MAX_LINE];            // 1st part of line (to be processed)
  int a_empt=1;                     // finite state automat (empty,body,header)
  int a_head=0;
  int a_body=0;
  int nDlzRiad=0;                   // size of current line
  int nDlzQuot=0;                   // size of quoted lines of curr. email
  int nDlzNorm=0;                   // size of non-quoted lines of curr. email
  char sAuth[MAX_AUTH]="";          // From: header line
  char sSubj[MAX_SUBJ]="";          // Subject: header line
  char sDate[MAX_DATE]="";          // Date: header line
  char sMail[MAX_SUBJ]="";          // X-Mailer/User-Agent/X-Newsreader/Message-ID header line

  while (!feof(fInp) && fgets(string,MAX_LINE,fInp)) {
    // read beginning of line
    nDlzRiad=i=strlen(string);
    strcpy(riadok,string);          // both are < MAX_LINE chars, so it's safe
    while (!feof(fInp) && i>1 && string[i-1]!='\n') { // read up to the end of line
      // stop on EOF/read error: `string` was not refilled, so counting it
      // again would be wrong, and a persistent error would loop forever
      if (!fgets(string,MAX_LINE,fInp)) break;
      i=strlen(string);
      nDlzRiad+=i;
    } // end of line

    // parse line & change automat state
    if (a_empt && a_head) { a_head=0; a_body=1; a_empt=0; } // empty line ends the header
    if (a_empt) { // begin of file || end of body
      // find "From "
      if (regexec(&r_from, riadok, 0, NULL, 0)==0) {
        if (a_body) AddStat(sAuth,sSubj,sDate,sMail,nDlzQuot,nDlzNorm);
        // we're at beginning of new message
        a_head=1;
        a_body=0;
        nSprav++;
        PrintProgress();
        nDlzQuot=nDlzNorm=0;
        *sAuth='\0'; *sSubj='\0'; *sDate='\0'; *sMail='\0';
      }
    }
    if (nDlzRiad==1) a_empt=1; else a_empt=0; // a line holding just '\n'

    // parse body
    if (a_body) nDlzka+=nDlzRiad;
    if (a_body) riadok[9]='\0'; // search only in first 9 chars
    if (a_body && strstr(riadok,">")) nDlzQuot+=nDlzRiad;
    else if (a_body) nDlzNorm+=nDlzRiad;

    // parse header
    if (a_head) {
      if (!strncasecmp(riadok,"Subject: ",9)) {
        myCopy(sSubj,riadok+9,MAX_SUBJ);
        GetSubj(sSubj);
      }
      if (!strncasecmp(riadok,"Date: ",6)) {
        myCopy(sDate,riadok+6,MAX_DATE);
        RemoveCR(sDate);
      }
      if (!strncasecmp(riadok,"From: ",6)) {
        myCopy(sAuth,riadok+6,MAX_AUTH);
        GetEmail(sAuth);
      }
      // will search for mailers in these
      // Message-ID will be used only if no other of these is present
      // (the test must look at the first character; comparing the array
      // itself with '\0' is a pointer comparison that is never true)
      if (!strncasecmp(riadok,"Message-ID: ",12) && *sMail=='\0') { // for Pine mailer
        myCopy(sMail,riadok+12,MAX_SUBJ);
        GetMailer(sMail);
      }
      if (!strncasecmp(riadok,"X-Newsreader: ",14)) {
        myCopy(sMail,riadok+14,MAX_SUBJ);
        GetMailer(sMail);
      }
      if (!strncasecmp(riadok,"User-Agent: ",12)) {
        myCopy(sMail,riadok+12,MAX_SUBJ);
        GetMailer(sMail);
      }
      if (!strncasecmp(riadok,"X-Mailer: ",10)) {
        myCopy(sMail,riadok+10,MAX_SUBJ);
        GetMailer(sMail);
      }
    }
  } /* while fEOF */

  time(&t);
  // compute stats of last email if needed
  if (a_body) AddStat(sAuth,sSubj,sDate,sMail,nDlzQuot,nDlzNorm);
}