2
* Heirloom mailx - a mail user agent derived from Berkeley Mail.
4
* Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
8
* Gunnar Ritter. All rights reserved.
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
* 3. All advertising materials mentioning features or use of this software
19
* must display the following acknowledgement:
20
* This product includes software developed by Gunnar Ritter
21
* and his contributors.
22
* 4. Neither the name of Gunnar Ritter nor the names of his contributors
23
* may be used to endorse or promote products derived from this software
24
* without specific prior written permission.
26
* THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
* ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41
static char sccsid[] = "@(#)junk.c 1.73 (gritter) 3/4/06";
57
#else /* !HAVE_MMAP */
58
#define mmap(a, b, c, d, e, f) MAP_FAILED
60
#endif /* !HAVE_MMAP */
62
#define mremap(a, b, c, d) MAP_FAILED
63
#endif /* !HAVE_MREMAP */
66
#define MAP_FAILED ((void *)-1)
67
#endif /* !MAP_FAILED */
73
* Mail -- a mail program
75
* Junk classification, mostly according to Paul Graham's "A Plan for Spam",
76
* August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
77
* Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
79
* Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
80
* Classification using Chained Tokens", February 2004,
81
* <http://www.nuclearelephant.com/papers/chained.html>.
88
#define MAX2 0x0000ffff
89
#define MAX3 0x00ffffffUL
90
#define MAX4 0xffffffffUL
93
* The dictionary consists of two files forming a hash table. The hash
94
* consists of the first 56 bits of the result of applying MD5 to the
95
* input word. This scheme ensures that collisions are unlikely enough
96
* to make junk detection work; according to the birthday paradox, a
97
* 50 % probability for one single collision is reached at 2^28 entries.
99
* To make the chain structure independent from input, the MD5 input is
100
* xor'ed with a random number. This makes it impossible that someone uses
101
* a carefully crafted message for a denial-of-service attack against the
104
#define SIZEOF_node 17
105
#define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
106
#define OF_node_next 4 /* bit-negated table index of next node */
107
#define OF_node_good 8 /* number of times this appeared in good msgs */
108
#define OF_node_bad 11 /* number of times this appeared in bad msgs */
109
#define OF_node_prob_O 14 /* table_version<1: precomputed probability */
110
#define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
113
#define SIZEOF_super 262164
114
#define OF_super_size 0 /* allocated nodes in the chain file */
115
#define OF_super_used 4 /* used nodes in the chain file */
116
#define OF_super_ngood 8 /* number of good messages scanned so far */
117
#define OF_super_nbad 12 /* number of bad messages scanned so far */
118
#define OF_super_mangle 16 /* used to mangle the MD5 input */
119
#define OF_super_bucket 20 /* 65536 bit-negated node indices */
120
#define SIZEOF_entry 4
123
static size_t super_mmapped;
124
static size_t nodes_mmapped;
126
static int chained_tokens;
132
* 1 Fixed the mangling; it was ineffective in version 0.
133
* Hash extended to 56 bits.
135
static int table_version;
136
#define current_table_version 1
139
((unsigned)(((char *)(e))[0]&0377) + \
140
((unsigned)(((char *)(e))[1]&0377) << 8) + \
141
((unsigned)(((char *)(e))[2]&0377) << 16))
144
(((char *)(e))[0] = (n) & 0x0000ff, \
145
((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
146
((char *)(e))[2] = ((n) & 0xff0000) >> 16)
148
#define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
150
#define s2f(s) ((float)(s) / MAX3)
153
((unsigned long)(((char *)(p))[0]&0377) + \
154
((unsigned long)(((char *)(p))[1]&0377) << 8) + \
155
((unsigned long)(((char *)(p))[2]&0377) << 16) + \
156
((unsigned long)(((char *)(p))[3]&0377) << 24))
159
(((char *)(p))[0] = (n) & 0x000000ffUL, \
160
((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
161
((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
162
((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
183
char field[LINESIZE];
186
#define constituent(c, b, i, price, hadamp) \
187
((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
188
(c) == '$' || (c) == '!' || (c) == '_' || \
189
(c) == '#' || (c) == '%' || (c) == '&' || \
190
((c) == ';' && hadamp) || \
191
((c) == '-' && !(price)) || \
192
(((c) == '.' || (c) == ',' || (c) == '/') && \
193
(i) > 0 && digitchar((b)[(i)-1]&0377)))
195
#define url_xchar(c) \
196
(((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
197
!spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
198
(c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
199
(c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
200
(c) != '#' && (c) != '"')
212
static const char README1[] = "\
213
This is a junk mail database maintained by mailx(1). It does not contain any\n\
214
of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
215
used for lookup. It is thus possible to tell if some given word was likely\n\
216
contained in your mail from examining this data, at best.\n";
217
static const char README2[] = "\n\
218
The database files are stored in compress(1) format by default. This saves\n\
219
some space, but leads to higher processor usage when the database is read\n\
220
or updated. You can use uncompress(1) on these files if you prefer to store\n\
221
them in flat form.\n";
225
static FILE *sfp, *nfp;
226
static char *sname, *nname;
228
static enum okay getdb(int rw);
229
static void putdb(void);
230
static void relsedb(void);
231
static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
232
static char *lookup(unsigned long h1, unsigned long h2, int create);
233
static unsigned long grow(unsigned long size);
234
static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
235
struct lexstat *sp, int *stop);
236
static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
237
static void add(const char *word, enum entry entry, struct lexstat *sp,
239
static enum okay scan(struct message *m, enum entry entry,
240
void (*func)(const char *, enum entry, struct lexstat *, int),
242
static void recompute(void);
243
static float getprob(char *n);
244
static int insert(int *msgvec, enum entry entry, int incr);
245
static void clsf(struct message *m);
246
static void rate(const char *word, enum entry entry, struct lexstat *sp,
248
static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
249
static void mkmangle(void);
258
chained_tokens = value("chained-junk-tokens") != NULL;
259
if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
261
if (sfp && !compressed) {
262
super = mmap(NULL, SIZEOF_super,
263
rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
264
MAP_SHARED, fileno(sfp), 0);
265
if (super != MAP_FAILED) {
266
super_mmapped = SIZEOF_super;
271
super = smalloc(SIZEOF_super);
275
if ((compressed ? zread(zp, super, SIZEOF_super)
277
fread(super, 1, SIZEOF_super, sfp)
280
fprintf(stderr, "Error reading junk mail database.\n");
281
memset(super, 0, SIZEOF_super);
287
} else if (compressed)
290
memset(super, 0, SIZEOF_super);
293
skip: if ((n = getn(&super[OF_super_size])) == 0) {
295
putn(&super[OF_super_size], 1);
297
if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
298
if (nfp == (FILE *)-1) {
304
if (sfp && nfp && !compressed) {
305
nodes = mmap(NULL, n * SIZEOF_node,
306
rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
307
MAP_SHARED, fileno(nfp), 0);
308
if (nodes != MAP_FAILED) {
309
nodes_mmapped = n * SIZEOF_node;
314
nodes = smalloc(n * SIZEOF_node);
318
if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
320
fread(nodes, 1, n * SIZEOF_node, nfp)
321
!= n * SIZEOF_node) ||
323
fprintf(stderr, "Error reading junk mail database.\n");
324
memset(nodes, 0, n * SIZEOF_node);
325
memset(super, 0, SIZEOF_super);
327
putn(&super[OF_super_size], n);
334
memset(nodes, 0, n * SIZEOF_node);
348
if (!super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
349
== NULL || sfp == (FILE *)-1)
351
if (!nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
352
== NULL || nfp == (FILE *)-1)
354
if (super_mmapped == 0 || nodes_mmapped == 0)
357
* Use utime() with mmap() since Linux does not update st_mtime
358
* reliably otherwise.
364
zwrite(zp, super, SIZEOF_super);
368
fwrite(super, 1, SIZEOF_super, sfp);
373
zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
378
getn(&super[OF_super_size]) * SIZEOF_node, nfp);
379
if (super_mmapped == 0 || nodes_mmapped == 0)
387
munmap(super, super_mmapped);
392
munmap(nodes, nodes_mmapped);
396
if (sfp && sfp != (FILE *)-1) {
400
if (nfp && nfp != (FILE *)-1) {
407
dbfp(enum db db, int rw, int *compressed, char **fn)
413
{ "super", "nodes" },
414
{ "super1", "nodes1" }
418
{ "super.Z", "nodes.Z" },
419
{ "super1.Z", "nodes1.Z" }
424
if ((dir = value("junkdb")) == NULL) {
425
fprintf(stderr, "No junk mail database specified. "
426
"Set the junkdb variable.\n");
430
if (makedir(dir) == STOP) {
431
fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
435
table_version = current_table_version;
436
loop: sf = sfx[table_version];
437
zf = zfx[table_version];
438
*fn = salloc((n = strlen(dir)) + 40);
442
strcpy(&(*fn)[n+1], sf[db]);
443
if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
446
strcpy(&(*fn)[n+1], zf[db]);
447
if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
448
rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
449
fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
453
strcpy(&(*fn)[n+1], "README");
454
if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
459
} else if (fp == NULL) {
460
if (table_version > 0) {
464
table_version = current_table_version;
467
flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
470
flp.l_whence = SEEK_SET;
471
fcntl(fileno(fp), F_SETLKW, &flp);
477
lookup(unsigned long h1, unsigned long h2, int create)
479
char *n, *lastn = NULL;
480
unsigned long c, lastc = MAX4, used, size;
482
used = getn(&super[OF_super_used]);
483
size = getn(&super[OF_super_size]);
484
c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
485
n = &nodes[c*SIZEOF_node];
487
if (getn(&n[OF_node_hash]) == h1 &&
488
(table_version < 1 ? 1 :
489
get(&n[OF_node_hash2]) == h2))
493
c = ~getn(&n[OF_node_next]);
494
n = &nodes[c*SIZEOF_node];
498
if ((size = grow(size)) == 0)
500
lastn = &nodes[lastc*SIZEOF_node];
502
putn(&super[OF_super_used], used+1);
503
n = &nodes[used*SIZEOF_node];
504
putn(&n[OF_node_hash], h1);
505
put(&n[OF_node_hash2], h2);
507
putn(&lastn[OF_node_next], ~used);
509
putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
517
grow(unsigned long size)
519
unsigned long incr, newsize;
522
incr = size > MAX2 ? MAX2 : size;
523
newsize = size + incr;
524
if (newsize > MAX4-MAX2) {
525
oflo: fprintf(stderr, "Junk mail database overflow.\n");
529
if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
530
== (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
533
if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
534
MREMAP_MAYMOVE)) == MAP_FAILED) {
535
if ((nodes = mmap(NULL, newsize*SIZEOF_node,
537
PROT_READ|PROT_WRITE :
539
MAP_SHARED, fileno(nfp), 0))
544
munmap(onodes, nodes_mmapped);
546
nodes_mmapped = newsize*SIZEOF_node;
548
nodes = srealloc(nodes, newsize*SIZEOF_node);
549
memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
552
putn(&super[OF_super_size], size);
557
if (i+j >= (long)*bufsize-4) \
558
*buf = srealloc(*buf, *bufsize += 32); \
560
i += (*buf)[j+i] != '\0'; \
564
nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
565
struct lexstat *sp, int *stop)
574
for (cp = sp->save; *cp; cp++) {
582
if (sp->loc == FROM_LINE)
583
while (*count > 0 && (c = getc(fp)) != EOF) {
592
if (sp->loc == HEADER && sp->field[0]) {
593
field: cp = sp->field;
606
while (*count > 0 && (c = getc(fp)) != EOF) {
608
if (c == '\0' && table_version >= 1) {
614
if (c == '\b' && table_version >= 1) {
615
sp->html = HTML_TEXT;
618
if (c == '<' && sp->html == HTML_TEXT) {
623
if (sp->html == HTML_TAG) {
626
if (!asccasecmp(sp->tag, "a") ||
627
!asccasecmp(sp->tag, "img") ||
628
!asccasecmp(sp->tag, "font") ||
629
!asccasecmp(sp->tag, "span") ||
630
!asccasecmp(sp->tag, "meta") ||
631
!asccasecmp(sp->tag, "table") ||
632
!asccasecmp(sp->tag, "tr") ||
633
!asccasecmp(sp->tag, "td") ||
634
!asccasecmp(sp->tag, "p"))
635
sp->html = HTML_TEXT;
637
sp->html = HTML_SKIP;
638
} else if (c == '>') {
639
sp->html = HTML_TEXT;
642
if (sp->tagp - sp->tag < sizeof sp->tag - 1)
647
if (sp->html == HTML_SKIP) {
649
sp->html = HTML_TEXT;
652
if (c == '$' && i == 0)
654
if (sp->loc == HEADER && sp->lastc == '\n') {
657
while (k < sizeof sp->field - 3) {
660
(c = getc(fp)) == EOF)
662
if (spacechar(c) || c == ':') {
669
sp->field[k++] = '*';
674
} else if (c == '\n') {
677
sp->html = HTML_NONE;
684
cp = sp->save = smalloc(i+6);
685
for (cq = "HOST*"; *cq; cq++)
687
for (cq = &(*buf)[j]; *cq != ':'; cq++);
688
cq += 3; /* skip "://" */
689
while (cq < &(*buf)[i+j] &&
690
(alnumchar(*cq&0377) ||
691
*cq == '.' || *cq == '-'))
698
} else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
699
sp->loc == HEADER && c == '.' &&
700
asccasecmp(sp->field, "subject*")) {
704
} else if (i > 0 && c == ':' && *count > 2) {
705
if ((c = getc(fp)) != '/') {
710
if ((c = getc(fp)) != '/') {
719
for (cq = "URL*"; *cq; cq++) {
725
if (alnumchar(*cp&0377)) {
730
for (cq = "://"; *cq; cq++) {
733
} else if (i > 1 && ((*buf)[i+j-1] == ',' ||
734
(*buf)[i+j-1] == '.') && !digitchar(c)) {
748
for (k = 0; k < i; k++)
749
if (digitchar((*buf)[k+j]&0377))
751
else if (!alphachar((*buf)[k+j]&0377) &&
752
(*buf)[k+j] != '$') {
759
* Including the results of other filtering software (the
760
* 'X-Spam' fields) might seem tempting, but will also rate
761
* their false negatives good with this filter. Therefore
762
* these fields are ignored.
764
* Handling 'Received' fields is difficult since they include
765
* lots of both useless and interesting words for our purposes.
767
if (sp->loc == HEADER &&
768
(asccasecmp(sp->field, "message-id*") == 0 ||
769
asccasecmp(sp->field, "references*") == 0 ||
770
asccasecmp(sp->field, "in-reply-to*") == 0 ||
771
asccasecmp(sp->field, "status*") == 0 ||
772
asccasecmp(sp->field, "x-status*") == 0 ||
773
asccasecmp(sp->field, "date*") == 0 ||
774
asccasecmp(sp->field, "delivery-date*") == 0 ||
775
ascncasecmp(sp->field, "x-spam", 6) == 0 ||
776
ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
777
ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
778
asccasecmp(sp->field, "received*") == 0 &&
779
((2*c > i) || i < 4 ||
780
asccasestr(*buf, "localhost") != NULL)))
787
#define JOINCHECK if (i >= *bufsize) \
788
*buf = srealloc(*buf, *bufsize += 32)
790
join(char **buf, size_t *bufsize, const char *s1, const char *s2)
808
add(const char *word, enum entry entry, struct lexstat *sp, int incr)
811
unsigned long h1, h2;
814
dbhash(word, &h1, &h2);
815
if ((n = lookup(h1, h2, 1)) != NULL) {
818
c = get(&n[OF_node_good]);
819
if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
821
put(&n[OF_node_good], c);
825
c = get(&n[OF_node_bad]);
826
if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
828
put(&n[OF_node_bad], c);
836
scan(struct message *m, enum entry entry,
837
void (*func)(const char *, enum entry, struct lexstat *, int),
841
char *buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
842
size_t bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
846
if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
852
if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
858
sp = scalloc(1, sizeof *sp);
862
while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
863
(*func)(*bp, entry, sp, arg);
864
if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
865
join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
866
(*func)(buf2, entry, sp, arg);
868
bp = bp == &buf1 ? &buf0 : &buf1;
869
zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
882
unsigned long used, i;
887
used = getn(&super[OF_super_used]);
888
for (i = 0; i < used; i++) {
889
n = &nodes[i*SIZEOF_node];
892
put(&n[OF_node_prob_O], s);
899
unsigned long ngood, nbad;
903
ngood = getn(&super[OF_super_ngood]);
904
nbad = getn(&super[OF_super_nbad]);
905
if (ngood + nbad >= 18000) {
908
} else if (ngood + nbad >= 9000) {
915
g = get(&n[OF_node_good]) * 2;
916
b = get(&n[OF_node_bad]);
918
p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
919
(smin(1.0, ngood ? (float)g/ngood : 0.0) +
920
smin(1.0, nbad ? (float)b/nbad : 0.0));
923
} else if (g == 0 && b == 0)
931
insert(int *msgvec, enum entry entry, int incr)
936
verbose = value("verbose") != NULL;
937
if (getdb(O_RDWR) != OKAY)
941
u = getn(&super[OF_super_ngood]);
944
u = getn(&super[OF_super_nbad]);
947
for (ip = msgvec; *ip; ip++) {
948
setdot(&message[*ip-1]);
949
if (incr > 0 && u == MAX4-incr+1) {
950
fprintf(stderr, "Junk mail database overflow.\n");
952
} else if (incr < 0 && -incr > u) {
953
fprintf(stderr, "Junk mail database underflow.\n");
957
if (entry == GOOD && incr > 0 || entry == BAD && incr < 0)
958
message[*ip-1].m_flag &= ~MJUNK;
960
message[*ip-1].m_flag |= MJUNK;
961
scan(&message[*ip-1], entry, add, incr);
965
putn(&super[OF_super_ngood], u);
968
putn(&super[OF_super_nbad], u);
971
if (table_version < 1)
981
return insert(v, GOOD, 1);
987
return insert(v, BAD, 1);
993
return insert(v, GOOD, -1);
999
return insert(v, BAD, -1);
1005
int *msgvec = v, *ip;
1007
verbose = value("verbose") != NULL;
1008
_debug = debug || value("debug") != NULL;
1009
if (getdb(O_RDONLY) != OKAY)
1011
for (ip = msgvec; *ip; ip++) {
1012
setdot(&message[*ip-1]);
1013
clsf(&message[*ip-1]);
1024
unsigned long hash1;
1025
unsigned long hash2;
1030
clsf(struct message *m)
1033
float a = 1, b = 1, r;
1036
fprintf(stderr, "Examining message %d\n", m - &message[0] + 1);
1037
for (i = 0; i < BEST; i++) {
1041
if (scan(m, -1, rate, 0) != OKAY)
1043
if (best[0].prob == -1) {
1045
fprintf(stderr, "No information found.\n");
1046
m->m_flag &= ~MJUNK;
1049
for (i = 0; i < BEST; i++) {
1050
if (best[i].prob == -1)
1053
fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1054
"prob=%.4g dist=%.4g\n",
1055
i+1, prstr(best[i].word),
1056
best[i].hash1, best[i].hash2,
1057
best[i].prob, best[i].dist);
1059
b *= 1 - best[i].prob;
1061
r = a+b > 0 ? a / (a+b) : 0;
1063
fprintf(stderr, "Junk probability of message %d: %g\n",
1064
m - &message[0] + 1, r);
1068
m->m_flag &= ~MJUNK;
1073
rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1076
unsigned long h1, h2;
1080
dbhash(word, &h1, &h2);
1081
if ((n = lookup(h1, h2, 0)) != NULL) {
1086
fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1087
n ? get(&n[OF_node_good]) : 0,
1088
n ? get(&n[OF_node_bad]) : 0,
1092
d = p >= MID ? p - MID : MID - p;
1093
if (d >= best[BEST-1].dist)
1094
for (i = 0; i < BEST; i++) {
1095
if (h1 == best[i].hash1 && h2 == best[i].hash2)
1098
* This selection prefers words from the end of the
1099
* header and from the start of the body. It does
1100
* probably not matter much at all, but gives at
1101
* least the most interesting verbose output.
1103
if (d > best[i].dist || best[i].loc == HEADER &&
1104
d >= best[i].dist) {
1105
for (j = BEST-2; j >= i; j--)
1106
best[j+1] = best[j];
1109
best[i].word = savestr(word);
1112
best[i].loc = sp->loc;
1119
dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1121
unsigned char digest[16];
1125
MD5Update(&ctx, (unsigned char *)word, strlen(word));
1126
if (table_version >= 1)
1127
MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1128
MD5Final(digest, &ctx);
1130
if (table_version < 1) {
1131
*h1 ^= getn(&super[OF_super_mangle]);
1134
*h2 = get(&digest[4]);
1138
* The selection of the value for mangling is not critical. It is practically
1139
* impossible for any person to determine the exact time when the database
1140
* was created first (without looking at the database, which would reveal the
1141
* value anyway), so we just use this. The MD5 hash here ensures that each
1142
* single second gives a completely different mangling value (which is not
1143
* necessary anymore if table_version>=1, but does not hurt).
1153
unsigned char digest[16];
1156
memset(&u, 0, sizeof u);
1159
MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1160
MD5Final(digest, &ctx);
1162
putn(&super[OF_super_mangle], s);
1166
cprobability(void *v)
1169
unsigned long used, ngood, nbad;
1170
unsigned long h1, h2;
1175
if (*args == NULL) {
1176
fprintf(stderr, "No words given.\n");
1179
if (getdb(O_RDONLY) != OKAY)
1181
used = getn(&super[OF_super_used]);
1182
ngood = getn(&super[OF_super_ngood]);
1183
nbad = getn(&super[OF_super_nbad]);
1184
printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1187
dbhash(*args, &h1, &h2);
1188
printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1189
if ((n = lookup(h1, h2, 0)) != NULL) {
1190
g = get(&n[OF_node_good]);
1191
b = get(&n[OF_node_bad]);
1192
printf("good=%u bad=%u ", g, b);
1195
d = p >= MID ? p - MID : MID - p;
1196
printf("prob=%.4g dist=%.4g", p, d);
1198
printf("too infrequent");
1200
printf("not in database");