/* 
 * Copyright (C) 2005 Laird Breyer
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 * 
 * Author: Laird Breyer <laird@lbreyer.com>
 */
32
#if defined HAVE_UNISTD_H
38
#if defined HAVE_LANGINFO_H
41
/* on OpenBSD, CODESET doesn't seem to be defined -
42
we use 3, which should be US-ASCII, but it's not ideal... */
52
/* global variables */
54
extern hash_bit_count_t default_max_hash_bits;
55
extern hash_count_t default_max_tokens;
57
extern hash_bit_count_t default_max_grow_hash_bits;
58
extern hash_count_t default_max_grow_tokens;
60
hash_bit_count_t decimation;
62
extern options_t u_options;
63
extern options_t m_options;
66
extern token_order_t ngram_order; /* defaults to 1 */
68
/* for option processing */
70
extern int optind, opterr, optopt;
73
extern charbuf_len_t textbuf_len;
75
extern char *progname;
76
extern char *inputfile;
77
extern long inputline;
79
int exit_code = 0; /* default */
80
int overflow_warning = 0;
82
score_t default_stepsize = 0.1;
84
extern long system_pagesize;
86
extern void *in_iobuf;
87
extern void *out_iobuf;
90
/***********************************************************
 * MISCELLANEOUS FUNCTIONS                                 *
 ***********************************************************/
94
static void usage(char **argv) {
98
"hypex CATDUMP1 CATDUMP2...\n");
102
" computes the Chernoff information and related quantities for\n");
104
" a binary hypothesis test based on the category dump files CATDUMP1, CATDUMP2.\n");
112
" prints program version.\n");
115
/***********************************************************
 * CATEGORY PAIRED HASH TABLE FUNCTIONS                    *
 ***********************************************************/
118
cp_item_t *find_in_category_pair(category_pair_t *cp, hash_value_t id) {
119
register cp_item_t *i, *loop;
121
i = loop = &cp->hash[id & (cp->max_tokens - 1)];
123
while( FILLEDP(i) ) {
124
if( EQUALP(i->id,id) ) {
125
return i; /* found id */
129
i = (i >= &cp->hash[cp->max_tokens]) ? cp->hash : i;
131
return NULL; /* when hash table is full */
136
/* empty slot, so not found */
141
/* returns true if the hash could be grown, false otherwise.
142
When the hash is grown, the old values must be redistributed. */
143
bool_t grow_category_pair(category_pair_t *cp) {
144
hash_count_t c, new_size;
145
cp_item_t *i, temp_item;
147
if( !(u_options & (1<<U_OPTION_GROWHASH)) ) {
150
if( cp->max_hash_bits < default_max_grow_hash_bits ) {
152
/* grow the memory around the hash */
153
if( (i = (cp_item_t *)realloc(cp->hash,
155
(1<<(cp->max_hash_bits+1)))) == NULL ) {
157
"failed to grow hash table.\n");
160
/* we need the old value of learner->max_tokens for a while yet */
163
/* realloc doesn't initialize the memory */
164
memset(&((cp_item_t *)i)[cp->max_tokens], 0,
165
((1<<cp->max_hash_bits) - cp->max_tokens) *
169
/* now mark every used slot */
170
for(c = 0; c < cp->max_tokens; c++) {
171
if( FILLEDP(&cp->hash[c]) ) {
172
SETMARK(&cp->hash[c]);
176
/* now relocate each marked slot and clear it */
177
new_size = (1<<cp->max_hash_bits) - 1;
178
for(c = 0; c < cp->max_tokens; c++) {
179
while( MARKEDP(&cp->hash[c]) ) {
180
/* find where it belongs */
181
i = &cp->hash[cp->hash[c].id & new_size];
182
while( FILLEDP(i) && !MARKEDP(i) ) {
184
i = (i > &cp->hash[new_size]) ? cp->hash : i;
185
} /* guaranteed to exit since hash is larger than original */
187
/* clear the mark - this must happen after we look for i,
188
since it should be possible to find i == learner->hash[c] */
189
UNSETMARK(&cp->hash[c]);
192
if( i != &cp->hash[c] ) {
195
memcpy(&temp_item, i, sizeof(cp_item_t));
196
memcpy(i, &cp->hash[c], sizeof(cp_item_t));
197
memcpy(&cp->hash[c], &temp_item, sizeof(cp_item_t));
200
memcpy(i, &cp->hash[c], sizeof(cp_item_t));
201
memset(&cp->hash[c], 0, sizeof(cp_item_t));
204
/* now &cp->hash[c] is marked iff there was a swap */
207
cp->max_tokens = (1<<cp->max_hash_bits);
209
u_options &= ~(1<<U_OPTION_GROWHASH); /* it's the law */
211
"the token hash table is nearly full, slowing down.\n");
217
void init_category_pair(category_pair_t *cp) {
218
cp->max_tokens = default_max_tokens;
219
cp->max_hash_bits = default_max_hash_bits;
220
cp->unique_token_count = 0;
222
cp->hash = (cp_item_t *)malloc(cp->max_tokens * sizeof(cp_item_t));
224
errormsg(E_FATAL, "failed to allocate %li bytes for pair hash.\n",
225
(sizeof(cp_item_t) * ((long int)cp->max_tokens)));
229
void free_category_pair(category_pair_t *cp) {
236
/***********************************************************
 * DIVERGENCE CALCULATIONS                                 *
 ***********************************************************/
239
void edge_divergences(category_pair_t *cp, score_t *div_01, score_t *div_10) {
244
double safetycheck0 = 1.0;
245
double safetycheck1 = 1.0;
247
e = cp->hash + cp->max_tokens;
248
for(i = cp->hash; i != e; i++) {
250
sum01 += (i->lam[0] - i->lam[1]) *
251
exp(i->lam[0] + i->ref - cp->cat[0].logZ);
252
sum10 += (i->lam[1] - i->lam[0]) *
253
exp(i->lam[1] + i->ref - cp->cat[1].logZ);
255
safetycheck0 += (exp(i->lam[0]) - 1.0) * exp(i->ref);
256
safetycheck1 += (exp(i->lam[1]) - 1.0) * exp(i->ref);
259
*div_01 = cp->cat[1].logZ - cp->cat[0].logZ + sum01;
260
*div_10 = cp->cat[0].logZ - cp->cat[1].logZ + sum10;
262
if( u_options & (1<<U_OPTION_VERBOSE) ) {
263
/* this is a quick check to verify that the normalizing constants are
265
fprintf(stdout, "safetycheck: %f %f should both be close to zero\n",
266
log(safetycheck0) - cp->cat[0].logZ,
267
log(safetycheck1) - cp->cat[1].logZ);
270
fprintf(stdout, "# D(P0|P1) = %f\tD(P1|P0) = %f\n",
271
nats2bits(*div_01), nats2bits(*div_10));
275
void calculate_divergences(category_pair_t *cp, score_t beta,
276
score_t div_Qbeta_P[2],
287
e = cp->hash + cp->max_tokens;
288
for(i = cp->hash; i != e; i++) {
290
logQbetainc = beta * i->lam[0] + (1.0 - beta) * i->lam[1];
292
Zbeta += (exp(logQbetainc) - 1.0) * exp(i->ref);
294
sum0 += (logQbetainc - i->lam[0]) * exp(logQbetainc + i->ref);
295
sum1 += (logQbetainc - i->lam[1]) * exp(logQbetainc + i->ref);
297
sum2 += (i->lam[1] - i->lam[0]) * exp(logQbetainc + i->ref);
301
*logZbeta = log(Zbeta);
302
div_Qbeta_P[0] = sum0/Zbeta + cp->cat[0].logZ - (*logZbeta);
303
div_Qbeta_P[1] = sum1/Zbeta + cp->cat[1].logZ - (*logZbeta);
305
*Psibeta = sum2/Zbeta + cp->cat[0].logZ - cp->cat[1].logZ;
307
/* the divergences can be calculated two slightly different ways,
308
either by directly using the definition as above, or by using the
309
expression containing Psibeta. We do both and average, which
310
might hopefully make the result more accurate. */
314
cp->cat[0].logZ - (*logZbeta) + (1.0 - beta) * (*Psibeta))/2.0;
318
cp->cat[1].logZ - (*logZbeta) - beta * (*Psibeta))/2.0;
320
fprintf(stdout, "%f %f %f %f %f %f\n",
321
beta, nats2bits(*logZbeta), nats2bits(*Psibeta),
322
nats2bits(div_Qbeta_P[0]), nats2bits(div_Qbeta_P[1]),
323
nats2bits(div_Qbeta_P[1] - div_Qbeta_P[0]));
329
/***********************************************************
 * FILE MANAGEMENT FUNCTIONS                               *
 ***********************************************************/
332
bool_t load_category_dump(const char *filename, category_pair_t *cp, int c) {
334
char buf[MAGIC_BUFSIZE];
336
bool_t ok = (bool_t)1;
338
long unsigned int id;
339
weight_t lam, dig_ref;
343
hash_count_t linecount;
345
if( (c != 0) && (c != 1) ) {
346
errormsg(E_WARNING, "too many categories.\n");
351
inputfile = (char *)filename;
353
input = fopen(filename, "rb");
355
set_iobuf_mode(input);
357
cp->cat[c].fullfilename = strdup(filename);
358
ok = load_category_header(input, &(cp->cat[c]));
360
if( !fgets(buf, MAGIC_BUFSIZE, input) ||
361
strncmp(buf, MAGIC_DUMP, strlen(MAGIC_DUMP)) != 0 ) {
363
"%s is not a dbacl model dump! (see dbacl(1) -d switch)\n",
366
} else if( (cp->cat[c].max_order != 1) ) {
367
/* we don't support complicated models, only uniform reference
368
measures and unigram features */
369
errormsg(E_ERROR, "the category %s is not supported (model too complex)\n",
370
cp->cat[c].filename);
374
while( ok && fill_textbuf(input, &extra_lines) ) {
376
/* we don't parse the full line, only the numbers */
377
if( sscanf(textbuf, MAGIC_DUMPTBL_i,
378
&lam, &dig_ref, &count, &id) == 4 ) {
381
i = find_in_category_pair(cp, id);
383
if( i && !FILLEDP(i) &&
384
((100 * cp->unique_token_count) >=
385
(HASH_FULL * cp->max_tokens)) && grow_category_pair(cp) ) {
386
i = find_in_category_pair(cp,id);
387
/* new i, go through all tests again */
391
/* we cannot allow collisions, so we simply drop
392
the weight if the slot is full */
400
if( cp->unique_token_count < K_TOKEN_COUNT_MAX )
401
{ cp->unique_token_count++; } else { overflow_warning = 1; }
406
if( fabs(dig_ref - i->ref) > 0.00001 ) {
408
"unequal reference measures! New token ignored (id=%lx).\n",
419
if( cp->unique_token_count < K_TOKEN_COUNT_MAX )
420
{ cp->unique_token_count++; } else { overflow_warning = 1; }
426
errormsg(E_ERROR, "ran out of hashes.\n");
430
} else if( *textbuf ) {
431
errormsg(E_ERROR, "unable to parse line %ld of %s\n",
432
inputline, cp->cat[c].fullfilename);
436
if( ok && (linecount != cp->cat[c].model_unique_token_count) ) {
438
"incorrect number of features, expectig %ld, got %ld\n",
439
(long int)cp->cat[c].model_unique_token_count, inputline);
446
errormsg(E_ERROR, "couldn't open %s\n", filename);
453
bool_t process_dump_pair(const char *d1, const char *d2, category_pair_t *cp) {
455
score_t div01, div10;
457
score_t div_Qbeta_P[2] = {0.0, 0.0};
460
score_t chernoff_rate = 0.0;
461
score_t chernoff_beta = 0.0;
463
if( u_options & (1<<U_OPTION_VERBOSE) ) {
464
fprintf(stdout, "processing (%s, %s)\n", d1, d2);
467
if( load_category_dump(d1, cp, 0) && load_category_dump(d2, cp, 1) ) {
468
if( u_options & (1<<U_OPTION_VERBOSE) ) {
469
fprintf(stdout, "loaded successfully (%s, %s)\n", d1, d2);
472
edge_divergences(cp, &div01, &div10);
475
"# beta | logZ_beta | Psi_beta | D(Q_beta|P_0) | D(Q_beta|P_1) | t \n");
476
for(beta = 0.0; beta < 1.0; beta += default_stepsize) {
477
if( chernoff_beta <= 0.0 ) {
478
/* we haven't crossed the intersection yet, this occurs
479
the first time the divergence order is inverted */
480
chernoff_rate = div_Qbeta_P[0] + div_Qbeta_P[1];
481
calculate_divergences(cp, beta, div_Qbeta_P, &logZbeta, &Psibeta);
482
if( div_Qbeta_P[0] <= div_Qbeta_P[1] ) {
483
chernoff_beta = beta - default_stepsize/2;
484
chernoff_rate = (chernoff_rate + div_Qbeta_P[0] + div_Qbeta_P[1])/4;
487
calculate_divergences(cp, beta, div_Qbeta_P, &logZbeta, &Psibeta);
490
calculate_divergences(cp, 1.0, div_Qbeta_P, &logZbeta, &Psibeta);
493
"# chernoff_rate %f chernoff_beta %f\n",
494
nats2bits(chernoff_rate), chernoff_beta);
501
/***********************************************************
 * MAIN FUNCTIONS                                          *
 ***********************************************************/
504
int set_option(int op, char *optarg) {
508
fprintf(stdout, "hypex version %s\n", VERSION);
509
fprintf(stdout, COPYBLURB, "hypex");
512
case 'h': /* select memory size in powers of 2 */
513
default_max_hash_bits = atoi(optarg);
514
if( default_max_hash_bits > MAX_HASH_BITS ) {
516
"maximum hash size will be 2^%d\n",
518
default_max_hash_bits = MAX_HASH_BITS;
520
default_max_tokens = (1<<default_max_hash_bits);
523
case 'H': /* select memory size in powers of 2 */
524
default_max_grow_hash_bits = atoi(optarg);
525
if( default_max_grow_hash_bits > MAX_HASH_BITS ) {
527
"maximum hash size will be 2^%d\n",
529
default_max_grow_hash_bits = MAX_HASH_BITS;
531
default_max_grow_tokens = (1<<default_max_grow_hash_bits);
532
u_options |= (1<<U_OPTION_GROWHASH);
535
case 's': /* select stepsize */
536
default_stepsize = atof(optarg);
537
if( (default_stepsize <= 0.0) ||
538
(default_stepsize >= 1.0) ) {
540
"stepsize must be between 0.0 and 1.0, using 0.1\n");
541
default_stepsize = 0.1;
546
u_options |= (1<<U_OPTION_VERBOSE);
555
void sanitize_options() {
557
/* consistency checks */
559
/* decide if we need some options */
564
int main(int argc, char **argv) {
574
init_category_pair(&cp);
576
/* set up internationalization */
577
if( !setlocale(LC_ALL, "") ) {
579
"could not set locale, internationalization disabled\n");
581
if( u_options & (1<<U_OPTION_VERBOSE) ) {
583
"international locales not supported\n");
587
#if defined(HAVE_GETPAGESIZE)
588
system_pagesize = getpagesize();
590
if( system_pagesize == -1 ) { system_pagesize = BUFSIZ; }
592
/* parse the options */
593
while( (op = getopt(argc, argv,
594
"H:h:s:Vv")) > -1 ) {
595
set_option(op, optarg);
598
/* end option processing */
603
/* now process each pair of files on the command line */
604
if( (optind > -1) && *(argv + optind) ) {
605
for(i = optind; i < argc; i++) {
606
for(j = i + 1; j < argc; j++) {
607
process_dump_pair(argv[i], argv[j], &cp);
611
errormsg(E_ERROR, "missing model dumps! Use dbacl(1) with -d switch\n");
616
free_category_pair(&cp);