58
61
/* Function Prototypes */
60
static void rstats_print_histogram(size_t robn, rstats_t **rstats_array, size_t count);
61
static void rstats_print_rtable(rstats_t **rstats_array, size_t count);
63
static void rstats_print_histogram(size_t robn, rstats_t *rstats_head);
64
static void rstats_print_rtable(rstats_t *rstats_head);
63
66
/* Function Definitions */
67
70
if (stats_head == NULL) {
68
71
stats_head = xcalloc(1, sizeof(header_t));
69
stats_tail = (rstats_t *) xcalloc( 1, sizeof(rstats_t));
72
stats_tail = (rstats_t *)xcalloc(1, sizeof(rstats_t));
70
73
stats_head->list = stats_tail;
88
void rstats_add(const word_t *token, double prob, wordcnts_t *cnts)
91
void rstats_add(const word_t *token, double prob, bool used, wordcnts_t *cnts)
93
96
stats_head->count += 1;
94
97
stats_tail->next = NULL;
95
99
/* Using externally controlled data;
96
100
token must not be freed before calling rstats_cleanup()
98
102
stats_tail->token = token;
99
103
stats_tail->prob = prob;
104
stats_tail->used = used;
100
105
stats_tail->good = cnts->good;
101
106
stats_tail->bad = cnts->bad;
102
107
stats_tail->msgs_good = cnts->msgs_good;
103
108
stats_tail->msgs_bad = cnts->msgs_bad;
104
110
stats_tail->next = (rstats_t *)xcalloc(1, sizeof(rstats_t));
105
111
stats_tail = stats_tail->next;
108
static int compare_rstats_t(const void *const ir1, const void *const ir2)
114
/* compare_rstats_t - sort by ascending spamicity */
116
static int compare_rstats_t(const void *const pv1, const void *const pv2)
110
const rstats_t *r1 = *(const rstats_t *const *)ir1;
111
const rstats_t *r2 = *(const rstats_t *const *)ir2;
118
const rstats_t *r1 = (const rstats_t *)pv1;
119
const rstats_t *r2 = (const rstats_t *)pv2;
113
if (r1->prob - r2->prob > EPS) return 1;
114
if (r2->prob - r1->prob > EPS) return -1;
121
if (r1->prob > r2->prob) return 1;
122
if (r2->prob > r1->prob) return -1;
116
124
return word_cmp(r1->token, r2->token);
129
137
void rstats_print(bool unsure)
132
139
size_t robn = stats_head->robn;
133
size_t count = stats_head->count;
135
rstats_t **rstats_array = (rstats_t **) xcalloc(count, sizeof(rstats_t *));
137
for (r=0, cur=stats_head->list; r<count; r+=1, cur=cur->next)
138
rstats_array[r] = cur;
140
/* sort by ascending probability, then name */
141
qsort(rstats_array, count, sizeof(rstats_t *), compare_rstats_t);
141
/* sort by ascending spamicity */
142
stats_head->list = (rstats_t *)listsort((element *)stats_head->list, (fcn_compare *)&compare_rstats_t);
143
144
if (Rtable || verbose>=3)
144
rstats_print_rtable(rstats_array, count);
145
rstats_print_rtable(stats_head->list);
145
146
else if (verbose==2 || (unsure && verbose))
146
rstats_print_histogram(robn, rstats_array, count);
147
rstats_print_histogram(robn, stats_head->list);
151
static void rstats_print_histogram(size_t robn, rstats_t **rstats_array, size_t count)
150
static void rstats_print_histogram(size_t robn, rstats_t *rstats_head)
154
rstats_t *cur=rstats_head;
155
155
rhistogram_t hist[INTERVALS];
157
157
double invn = (double) robn;
163
163
(void)fprintf(fpo, "\n" );
165
165
/* Compute histogram */
166
for (i=r=0; i<INTERVALS; i+=1)
166
for (i=0; i<INTERVALS; i+=1)
168
168
rhistogram_t *h = &hist[i];
169
169
double fin = 1.0*(i+1)/INTERVALS;
172
173
h->spamicity=0.0;
175
double prob = rstats_array[r]->prob;
177
double prob = cur->prob;
179
if (fabs(EVEN_ODDS - prob) - min_dev >= EPS)
184
186
logsum += log(prob);
191
193
h->spamicity = robx;
194
196
double invproduct, product;
195
197
invproduct = 1.0 - exp(invlogsum / invn);
196
198
product = 1.0 - exp(logsum / invn);
197
h->spamicity = (invproduct + product < EPS)
199
h->spamicity = (invproduct + product < EPS)
199
201
: (1.0 + (invproduct - product) / (invproduct + product)) / 2.0;
219
221
if (maxcnt>48) cnt = (cnt * 48 + maxcnt - 1) / maxcnt;
221
223
/* display histogram */
222
for (r=0; r<cnt; r+=1)
223
225
(void)fputc( '#', fpo);
224
226
(void)fputc( '\n', fpo);
228
static void rstats_print_rtable(rstats_t **rstats_array, size_t count)
230
static void rstats_print_rtable(rstats_t *rstats_head)
231
232
const char *pfx = !stats_in_header ? "" : " ";
233
236
/* print header */
235
238
(void)fprintf(fpo, "%s%*s %6s %-6s %-6s %-6s %s\n",
236
pfx, max_token_len+2,"","n", "pgood", "pbad", "fw", "U");
239
pfx, max_token_len+2, "", "n", "pgood", "pbad", "fw", "U");
238
241
(void)fprintf(fpo, "%s%*s %6s %-6s %-6s %-6s %-6s %-6s %s\n",
239
pfx, max_token_len+2,"","n", "pgood", "pbad", "fw","invfwlog", "fwlog", "U");
242
pfx, max_token_len+2, "", "n", "pgood", "pbad", "fw", "invfwlog", "fwlog", "U");
241
244
/* Print 1 line per token */
242
for (r= 0; r<count; r+=1)
245
for (cur=rstats_head->next; cur != NULL; cur=cur->next)
244
rstats_t *cur = rstats_array[r];
245
247
int len = (cur->token->leng >= max_token_len) ? 0 : (max_token_len - cur->token->leng);
246
248
double fw = calc_prob(cur->good, cur->bad, cur->msgs_good, cur->msgs_bad);
247
char flag = (fabs(fw-EVEN_ODDS) - min_dev >= EPS) ? '+' : '-';
249
char flag = cur->used ? '+' : '-';
249
251
(void)fprintf(fpo, "%s\"", pfx);
250
252
(void)word_puts(cur->token, 0, fpo);