219
221
if( u_options & (1<<U_OPTION_VERBOSE) ) {
220
222
fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4.1f ",
222
-CVT_BITS(cat[i].score/cat[i].complexity),
224
-nats2bits(cat[i].score/cat[i].complexity),
223
225
cat[i].complexity);
225
227
fprintf(stdout, "%s %6.2" FMT_printf_score_t " ",
226
cat[i].filename, -CVT_BITS(cat[i].score));
228
cat[i].filename, -nats2bits(cat[i].score));
229
231
fprintf(stdout, "%s", textbuf);
322
325
fprintf(stdout, "%s ( %5.2" FMT_printf_score_t
323
326
" # %5.2" FMT_printf_score_t " )* %-.1f ",
325
-CVT_BITS(cat[i].score/cat[i].complexity),
326
CVT_BITS(sqrt(cat[i].score_s2/cat[i].complexity)),
328
-nats2bits(cat[i].score/cat[i].complexity),
329
nats2bits(sqrt(cat[i].score_s2/cat[i].complexity)),
327
330
cat[i].complexity);
329
332
fprintf(stdout, "%s %5.2" FMT_printf_score_t " * %-.1f ",
331
-CVT_BITS(cat[i].score/cat[i].complexity),
334
-nats2bits(cat[i].score/cat[i].complexity),
332
335
cat[i].complexity);
334
337
if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
339
342
fprintf(stdout, "%s %5.2" FMT_printf_score_t " ",
341
-CVT_BITS(cat[i].score));
344
-nats2bits(cat[i].score));
344
347
fprintf(stdout, "\n");
346
349
if( u_options & (1<<U_OPTION_APPEND) ) {
350
no_title = (bool_t)1;
348
351
for(i = 0; i < cat_count; i++) {
349
352
if( cat[i].model_num_docs > 0 ) {
351
354
fprintf(stdout, "# mean_complexity ");
355
no_title = (bool_t)0;
354
357
fprintf(stdout, "%s %5.2" FMT_printf_score_t " ",
368
371
fprintf(stdout, "%s ( %5.2" FMT_printf_score_t " + "
369
372
"%-5.2" FMT_printf_score_t,
371
CVT_BITS(cat[i].score_div),
372
CVT_BITS(cat[i].score_shannon));
374
nats2bits(cat[i].score_div),
375
nats2bits(cat[i].score_shannon));
373
376
if( u_options & (1<<U_OPTION_VAR) ) {
374
377
fprintf(stdout, " # %5.2" FMT_printf_score_t,
375
CVT_BITS(sqrt(cat[i].score_s2/cat[i].complexity)));
378
nats2bits(sqrt(cat[i].score_s2/cat[i].complexity)));
377
380
fprintf(stdout, " )* %-6.1f", cat[i].complexity);
378
381
if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
397
400
for(i = 0; i < cat_count; i++) {
398
if( i != exit_code ) {
401
if( (int)i != exit_code ) {
399
402
/* c is a standard normal variable */
400
403
c = (-cat[i].score/cat[i].complexity -
401
404
-cat[exit_code].score/cat[exit_code].complexity) /
429
432
* FILE MANAGEMENT FUNCTIONS *
430
433
***********************************************************/
432
bool_t check_magic_write(char *path, char *magic, int len) {
435
bool_t check_magic_write(char *path, char *magic, size_t len) {
434
437
char buf[MAGIC_BUFSIZE];
441
444
errormsg(E_ERROR,"the file %s is already used for something, "
442
445
"use another filename. Nothing written.\n", path);
446
449
/* it's an existing category file */
453
456
/* the standard tmpfile() call doesn't tell the filename,
458
461
3) if you want a particular directory, prepend it to tmplate.
459
462
4) file is opened for read/write, but truncated to zero.
461
FILE *mytmpfile(const char *tmplate, char **tmpname) {
465
FILE *mytmpfile(const char *tmplate, /*@out@*/ char **tmpname) {
462
466
FILE *result = NULL;
464
470
l = strlen(tmplate);
465
471
*tmpname = (char *)malloc(sizeof(char)*(l + 8));
492
498
bool_t myrename(const char *src, const char *dest) {
493
499
#if defined ATOMIC_CATSAVE
494
500
/* the rename is atomic on posix */
495
return (rename(src, dest) == 0);
501
return (bool_t)(rename(src, dest) == 0);
497
return 1; /* just pretend */
503
return (bool_t)1; /* just pretend */
514
520
(m_options & (1<<M_OPTION_REFMODEL)) ? "(ref)" : ""));
516
522
(0 < fprintf(output,
517
MAGIC2_o, learner->divergence, learner->logZ, learner->max_order,
523
MAGIC2_o, learner->divergence, learner->logZ,
524
(short int)learner->max_order,
518
525
(m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" ));
520
527
(0 < fprintf(output, MAGIC3,
532
539
/* print out any regexes we might need */
533
540
for(c = 0; c < regex_count; c++) {
534
541
/* write the bitmap */
535
543
for(p = smb, s = 1; s <= MAX_SUBMATCH; s++) {
536
544
if( re[c].submatches & (1<<s) ) {
545
*p++ = (char)s + '0';
599
607
Also, we don't try to create the file - if the file doesn't exist,
600
608
we won't gain much time by using mmap on that single occasion. */
601
609
if( *online && (u_options & (1<<U_OPTION_MMAP)) ) {
603
611
output = fopen(learner->filename, "r+b");
606
614
if( out_iobuf ) {
607
setvbuf(output, (char *)out_iobuf, _IOFBF, BUFFER_MAG * system_pagesize);
615
setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
610
618
ok = ok && write_category_headers(learner, output);
614
622
/* now mmap the file and write out the arrays real quick */
615
623
mmap_offset = ftell(output);
616
if( mmap_offset == -1 ) {
624
if( mmap_offset == (long)-1 ) {
620
mmap_length = mmap_offset + (ASIZE * ASIZE * SIZEOF_DIGRAMS) +
628
mmap_length = (size_t)mmap_offset + (ASIZE * ASIZE * SIZEOF_DIGRAMS) +
621
629
learner->max_tokens * sizeof(c_item_t);
623
ok = ok && (-1 != ftruncate(fileno(output), mmap_length));
631
ok = ok && (-1 != ftruncate(fileno(output), (off_t)mmap_length));
628
636
mmap_start = (byte_t *)MMAP(0, mmap_length,
629
637
PROT_READ|PROT_WRITE, MAP_SHARED, fileno(output), 0);
638
if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
630
639
if( !mmap_start ) {
683
692
/* this keeps track to see if writing is successful,
684
693
it's not foolproof, but probably good enough */
687
696
output = mytmpfile(learner->filename, &tempname);
690
699
if( out_iobuf ) {
691
setvbuf(output, (char *)out_iobuf, _IOFBF, BUFFER_MAG * system_pagesize);
700
setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
694
703
ok = ok && write_category_headers(learner, output);
707
716
for(j = 0; j < ASIZE; j++) {
708
717
shval = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
709
718
for(n = 0; n < 1; ) {
710
if( 0 > (n = fwrite(&shval, SIZEOF_DIGRAMS, 1, output)) ) {
719
if( 0 > (n = fwrite(&shval, SIZEOF_DIGRAMS, (size_t)1, output)) ) {
712
721
goto skip_remaining;
728
737
ci.lam = HTON_LAMBDA(ci.lam);
730
739
for(n = 0; n < 1; ) {
731
if( 0 > (n = fwrite(&ci, sizeof(ci), 1, output)) ) {
740
if( 0 > (n = fwrite(&ci, sizeof(ci), (size_t)1, output)) ) {
733
742
goto skip_remaining;
780
789
(xcat->max_order == learner->max_order) &&
781
790
(xcat->max_hash_bits == learner->max_hash_bits) ) {
783
793
/* we only overwrite the header if there's exactly enough space */
784
794
/* we must overwrite 3 lines which all start with # */
785
795
q = p = strchr((char *)xcat->mmap_start + 1, '#');
787
n = snprintf(buf, REPLBUF - max, MAGIC2_o,
788
learner->divergence, learner->logZ, learner->max_order,
797
n = snprintf(buf, (size_t)(REPLBUF - max), MAGIC2_o,
798
learner->divergence, learner->logZ,
799
(short int)learner->max_order,
789
800
(m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );
792
803
q = strchr(q + 1, '#');
793
804
if( q && (max < REPLBUF) ) {
794
n = snprintf(buf + max, REPLBUF - max, MAGIC3,
805
n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC3,
795
806
(short int)learner->max_hash_bits,
796
807
(long int)learner->full_token_count,
797
808
(long int)learner->unique_token_count,
801
812
q = strchr(q + 1, '#');
802
813
if( q && (max < REPLBUF) ) {
803
n = snprintf(buf + max, REPLBUF - max, MAGIC8_o,
814
n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC8_o,
804
815
learner->shannon,
805
816
learner->alpha, learner->beta,
806
817
learner->mu, learner->s2);
857
868
bool_t tmp_seek_start(learner_t *learner) {
858
869
if( learner->tmp.mmap_start ) {
859
870
learner->tmp.mmap_cursor = learner->tmp.mmap_offset;
861
872
} else if( learner->tmp.file ) {
862
873
clearerr(learner->tmp.file);
863
874
return (fseek(learner->tmp.file, learner->tmp.offset, SEEK_SET) == 0);
868
879
bool_t tmp_seek_end(learner_t *learner) {
869
880
if( learner->tmp.mmap_start ) {
870
881
learner->tmp.mmap_cursor = learner->tmp.mmap_offset + learner->tmp.used;
872
883
} else if( learner->tmp.file ) {
873
884
return (fseek(learner->tmp.file,
874
885
learner->tmp.offset + learner->tmp.used, SEEK_SET) == 0);
879
890
long tmp_get_pos(learner_t *learner) {
882
893
} else if( learner->tmp.file ) {
883
894
return ftell(learner->tmp.file) - learner->tmp.offset;
888
899
size_t tmp_read_block(learner_t *learner, byte_t *buf, size_t bufsiz,
889
900
const byte_t **startp) {
890
size_t left = learner->tmp.used - tmp_get_pos(learner);
891
if( bufsiz > left ) { bufsiz = (left >= 0) ? left : 0; }
901
long left = learner->tmp.used - tmp_get_pos(learner);
902
if( bufsiz > (size_t)left ) { bufsiz = (left >= 0) ? (size_t)left : 0; }
892
903
if( learner->tmp.mmap_start ) {
893
904
/* memcpy(buf, learner->tmp.mmap_start + learner->tmp.mmap_cursor, bufsiz); */
894
905
*startp = learner->tmp.mmap_start + learner->tmp.mmap_cursor;
931
942
(byte_t *)MMAP(learner->tmp.mmap_start, learner->tmp.mmap_length,
932
943
PROT_READ|PROT_WRITE, MAP_SHARED,
933
944
fileno(learner->tmp.file), offset);
945
if( learner->tmp.mmap_start == MAP_FAILED ) { learner->tmp.mmap_start = NULL; }
934
946
if( !learner->tmp.mmap_start ) {
935
947
if( u_options & (1<<U_OPTION_VERBOSE) ) {
936
948
errormsg(E_WARNING, "could not mmap token file after resize\n");
1111
1123
(byte_t *)MMAP(0, mmap_hash_offset,
1112
1124
PROT_READ|PROT_WRITE, MAP_SHARED, fileno(input), 0);
1125
if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
1113
1126
if( mmap_start ) {
1114
1127
/* first we overwrite the learner struct with the contents of
1115
1128
the mmapped region */
1121
1134
(byte_t *)MMAP(mmap_start,
1122
1135
mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens,
1123
1136
PROT_READ|PROT_WRITE, MAP_SHARED, fileno(input), 0);
1137
if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
1124
1138
if( mmap_start ) {
1125
1139
/* now fill some member variables */
1126
1140
learner->mmap_start = mmap_start;
1320
1334
if( out_iobuf ) {
1321
setvbuf(input, (char *)out_iobuf, _IOFBF, BUFFER_MAG * system_pagesize);
1335
setvbuf(input, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
1324
1338
if( !fgets(buf, MAGIC_BUFSIZE, input) ||
1375
1389
(byte_t *)MMAP(0, learner->tmp.mmap_length,
1376
1390
PROT_READ|PROT_WRITE, MAP_SHARED,
1377
1391
fileno(learner->tmp.file), offset);
1392
if( learner->tmp.mmap_start == MAP_FAILED ) { learner->tmp.mmap_start = NULL; }
1378
1393
if( learner->tmp.mmap_start ) {
1379
1394
MLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
1380
1395
MADVISE(learner->tmp.mmap_start, learner->tmp.mmap_length,
1472
1487
if( out_iobuf ) {
1473
setvbuf(output, (char *)out_iobuf, _IOFBF, BUFFER_MAG * system_pagesize);
1488
setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
1749
1764
" alpha %" FMT_printf_score_t
1750
1765
" beta %" FMT_printf_score_t "\n",
1751
1766
(long int)effective_count,
1752
CVT_BITS(learner->shannon),
1753
CVT_BITS(learner->mu),
1754
CVT_BITS(learner->s2),
1755
CVT_BITS(learner->alpha),
1756
CVT_BITS(learner->beta));
1767
nats2bits(learner->shannon),
1768
nats2bits(learner->mu),
1769
nats2bits(learner->s2),
1770
nats2bits(learner->alpha),
1771
nats2bits(learner->beta));
1903
1918
if( i->count < K_TOKEN_COUNT_MAX ) {
1920
if( learner->t_max < i->count ) {
1921
learner->t_max = i->count;
1905
1923
if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
1906
1924
if( (learner->doc.emp.top < learner->doc.emp.max) ||
1907
1925
emplist_grow(&learner->doc.emp) ) {
2057
2077
learner->tmpiobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
2059
2079
if( learner->tmp.iobuf ) {
2060
setvbuf(learner->tmp.file, (char *)learner->tmp.iobuf, _IOFBF,
2061
BUFFER_MAG * system_pagesize);
2080
setvbuf(learner->tmp.file, (char *)learner->tmp.iobuf, (int)_IOFBF,
2081
(size_t)(BUFFER_MAG * system_pagesize));
2104
2124
learner->doc.reservoir[i].stack = NULL;
2128
if( learner->tmp.file ) {
2129
fclose(learner->tmp.file);
2130
learner->tmp.file = NULL;
2133
cleanup_tempfiles();
2109
2136
/* calculates the most probable Dirichlet parameters
2911
2938
old_lam = UNPACK_LAMBDA(i->lam);
2914
/* "iterative scaling" lower bound */
2915
new_lam = (log((score_t)i->count) - logXi -
2916
UNPACK_RWEIGHTS(i->tmp.min.dref))/R + logzonr -
2917
UNPACK_LWEIGHTS(i->tmp.min.ltrms);
2940
if( (i->typ.order == 1) || (i->count > ftreshold) ) {
2941
/* "iterative scaling" lower bound */
2942
new_lam = (log((score_t)i->count) - logXi -
2943
UNPACK_RWEIGHTS(i->tmp.min.dref))/R + logzonr -
2944
UNPACK_LWEIGHTS(i->tmp.min.ltrms);
2919
2949
if( isnan(new_lam) ) {
2920
2950
/* precision problem, just ignore, don't change lambda */
3009
3039
learner->alpha, learner->beta,
3010
3040
learner->mu, learner->s2);
3042
fprintf(out, MAGIC9, (long int)learner->t_max, (long int)learner->b_count);
3012
3044
/* print out any regexes we might need */
3013
3045
for(c = 0; c < regex_count; c++) {
3014
3046
/* write the bitmap */
3046
3078
/* now write weight in hash */
3047
3079
id = hash_full_token(tok);
3048
3080
k = find_in_learner(learner, id); /* guaranteed to be found */
3049
fprintf(out, "%9.3f %9.3f %7d %8lx ",
3081
fprintf(out, MAGIC_DUMPTBL_o,
3050
3082
(weight_t)UNPACK_LAMBDA(k->lam),
3051
3083
UNPACK_RWEIGHTS(k->tmp.min.dref), k->count,
3052
3084
(long unsigned int)k->id);
3385
3417
m_options |= (1<<M_OPTION_CHAR_GRAPH);
3386
3418
} else if( !strcasecmp(optarg, "adp") ) {
3387
3419
m_options |= (1<<M_OPTION_CHAR_ADP);
3420
} else if( !strcasecmp(optarg, "char") ) {
3421
m_options |= (1<<M_OPTION_CHAR_CHAR);
3389
3423
errormsg(E_WARNING,
3390
3424
"unrecognized option \"%s\", ignoring.\n",
3412
3446
"maximum reached, random text category omitted\n");
3413
3447
} else if( u_options & (1<<U_OPTION_LEARN) ) {
3414
3448
errormsg(E_ERROR,"cannot use options -l and -R together\n");
3417
3451
u_options |= (1<<U_OPTION_CLASSIFY);
3512
3546
fprintf(stdout, "Feature memory requirements: %d bytes (classifying), %d bytes (learning)\n",
3513
3547
(int)sizeof(c_item_t), (int)sizeof(l_item_t));
3514
3548
fprintf(stdout, "To change these settings, recompile from source.\n");
3518
3552
m_options |= (1<<M_OPTION_CALCENTROPY);
3556
3590
"maximum reached, category ignored\n");
3557
3591
} else if( u_options & (1<<U_OPTION_LEARN) ) {
3558
3592
errormsg(E_ERROR, "cannot use options -l and -c together\n");
3561
3595
u_options |= (1<<U_OPTION_CLASSIFY);
3639
3673
errormsg(E_WARNING, "maximum reached, filter ignored\n");
3640
3674
} else if( u_options & (1<<U_OPTION_LEARN) ) {
3641
3675
errormsg(E_ERROR, "cannot use options -l and -f together\n");
3643
3677
} else if( !*optarg ) {
3644
3678
errormsg(E_ERROR, "filter must be category name or number\n");
3647
3681
u_options |= (1<<U_OPTION_FILTER);
3648
3682
filter[filter_count] = -1;
3686
3720
if( u_options & (1<<U_OPTION_CLASSIFY) ) {
3687
3721
errormsg(E_ERROR,
3688
3722
"cannot use options -l and -c together\n");
3690
3724
} else if( u_options & (1<<U_OPTION_LEARN) ) {
3691
3725
errormsg(E_ERROR,
3692
3726
"option -l can only occur once\n");
3694
3728
} else if( !*optarg ) {
3695
3729
errormsg(E_ERROR, "category name must not be empty\n");
3743
3781
if( ((u_options>>U_OPTION_CLASSIFY) & 1) +
3744
3782
((u_options>>U_OPTION_LEARN) & 1) != 1 ) {
3745
3783
errormsg(E_ERROR, "please use either -c or -l option.\n");
3749
3787
if( *online && (m_options & (1<<M_OPTION_CALCENTROPY)) ) {
3777
3815
((m_options>>M_OPTION_MBOX_FORMAT) & 1) > 1 ) {
3778
3816
errormsg(E_ERROR,
3779
3817
"please use only one of either -T text or -T email options.\n");
3783
3821
if( ((m_options>>M_OPTION_XML) & 1) +
3784
3822
((m_options>>M_OPTION_HTML) & 1) > 1 ) {
3785
3823
errormsg(E_ERROR,
3786
3824
"please use only one of either -T xml or -T html options.\n");
3793
3831
if( !(m_options & (1<<M_OPTION_XML)) ) {
3794
3832
m_options |= (1<<M_OPTION_HTML);
3796
3835
/* for mboxes, only compute ngrams for each line individually */
3797
m_options &= ~(1<<M_OPTION_NGRAM_STRADDLE_NL);
3836
/* m_options &= ~(1<<M_OPTION_NGRAM_STRADDLE_NL); */
3798
3838
/* always pretend the -X switch was used */
3799
3839
/* if( u_options & (1<<U_OPTION_LEARN) ) { */
3800
3840
/* m_options |= (1<<M_OPTION_CALCENTROPY); */
3854
3894
!(m_options & (1<<M_OPTION_CHAR_ALNUM)) &&
3855
3895
!(m_options & (1<<M_OPTION_CHAR_CEF)) &&
3856
3896
!(m_options & (1<<M_OPTION_CHAR_ADP)) &&
3897
!(m_options & (1<<M_OPTION_CHAR_CHAR)) &&
3857
3898
!(m_options & (1<<M_OPTION_CHAR_GRAPH)) ) {
3858
3899
if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
3859
3900
m_options |= (1<<M_OPTION_CHAR_ADP);