100
101
strcat(buf, MOPTION(M_OPTION_SHOW_SCRIPT, opt));
101
102
strcat(buf, MOPTION(M_OPTION_SHOW_STYLE, opt));
102
103
strcat(buf, MOPTION(M_OPTION_SHOW_HTML_COMMENTS, opt));
103
strcat(buf, MOPTION(M_OPTION_USE_REGEX, opt));
104
strcat(buf, MOPTION(M_OPTION_USE_STDTOK, opt));
104
105
strcat(buf, MOPTION(M_OPTION_ATTACHMENTS, opt));
105
106
strcat(buf, MOPTION(M_OPTION_WARNING_BAD, opt));
106
107
strcat(buf, MOPTION(M_OPTION_NGRAM_STRADDLE_NL, opt));
123
124
strcat(buf, MOPTION(U_OPTION_DECIMATE, opt));
124
125
strcat(buf, MOPTION(U_OPTION_GROWHASH, opt));
125
126
strcat(buf, MOPTION(U_OPTION_INDENTED, opt));
126
strcat(buf, MOPTION(U_OPTION_ZEROLEARN, opt));
127
strcat(buf, MOPTION(U_OPTION_NOZEROLEARN, opt));
127
128
strcat(buf, MOPTION(U_OPTION_LAPLACE, opt));
128
129
strcat(buf, MOPTION(U_OPTION_DIRICHLET, opt));
129
130
strcat(buf, MOPTION(U_OPTION_JAYNES, opt));
268
271
if( c >= regex_count ) { /* not found */
269
272
/* add it to our list */
270
re[regex_count].string = strdup(buf);
273
if( strchr(buf, '(') ) {
274
re[regex_count].string = strdup(buf);
276
char *dup = (char *)malloc(strlen(buf)+2);
278
sprintf(dup, "(%s)", buf);
280
"no captures found in regex, converting to '%s'\n",
283
re[regex_count].string = dup;
285
if( !re[regex_count].string ) {
287
"could not prepare regular expression '%s'.\n",
271
290
/* and compile the regex */
272
291
if( regcomp(&re[regex_count].regex,
273
292
re[regex_count].string, REG_EXTENDED) != 0 ) {
434
455
cat->c_options = 0;
435
456
cat->hash = NULL;
436
457
cat->mmap_offset = 0;
440
/* frees the resources associated with a category */
441
void free_category(category_t *cat) {
458
cat->mmap_start = NULL;
461
bool_t create_category_hash(category_t *cat, FILE *input, int protf) {
464
if( u_options & (1<<U_OPTION_MMAP) ) {
465
cat->mmap_offset = ftell(input);
466
if( cat->mmap_offset > 0 ) {
468
(byte_t *)MMAP(0, sizeof(c_item_t) * cat->max_tokens +
470
protf, MAP_SHARED, fileno(input), 0);
471
if( cat->mmap_start == MAP_FAILED ) { cat->mmap_start = NULL; }
472
if( cat->mmap_start ) {
473
cat->hash = (c_item_t *)(cat->mmap_start + cat->mmap_offset);
474
MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
475
MADV_SEQUENTIAL|MADV_WILLNEED);
476
/* lock the pages to prevent swapping - on Linux, this
477
works without root privs so long as the user limits
478
are big enough - mine are unlimited ;-)
479
On other OSes, root may be necessary. If we can't
480
lock, it doesn't really matter, but cross validations
481
and multiple classifications are a _lot_ faster with locking. */
482
MLOCK(cat->hash, sizeof(c_item_t) * cat->max_tokens);
483
cat->c_options |= (1<<C_OPTION_MMAPPED_HASH);
489
cat->c_options &= ~(1<<C_OPTION_MMAPPED_HASH);
490
/* allocate hash table */
491
cat->hash = (c_item_t *)malloc(sizeof(c_item_t) * cat->max_tokens);
493
errormsg(E_ERROR, "not enough memory for category %s\n",
498
MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
501
/* read in hash table */
504
while(!ferror(input) && !feof(input) && (j < i) ) {
505
j += fread(cat->hash + j, sizeof(c_item_t), i - j, input);
509
errormsg(E_ERROR, "corrupt category? %s\n",
520
void free_category_hash(category_t *cat) {
442
521
if( cat->hash ) {
443
if( cat->c_options & (1<<C_OPTION_MMAPPED_HASH) ) {
522
if( cat->mmap_start != NULL ) {
444
523
MUNMAP(cat->mmap_start, cat->max_tokens * sizeof(c_item_t) +
445
524
cat->mmap_offset);
446
525
cat->mmap_start = NULL;
526
cat->mmap_offset = 0;
447
527
cat->hash = NULL;
449
529
if( cat->hash ) {
612
703
oldscore = cat[i].score;
617
/* see if this is for us */
618
if( ((re == 0) && (tt.order <= cat[i].max_order)) ||
619
((re > 0) && (cat[i].retype & (1<<(re-1)))) ) {
707
/* see if this token is for us. The rule is: a category either
708
uses the standard tokenizer (in that case re = INVALID_RE),
709
or it uses only those regexes which are listed in the retype
710
bitmap. Since re = 0 is taken by the standard tokenizer,
711
this occurs when re > 0 and we have to subtract 1 to check
712
the bitmap. Simple, really ;-) */
713
apply = ( ((re == INVALID_RE) &&
714
(tt.order <= cat[i].max_order) && !cat[i].retype) ||
716
(cat[i].retype & (1<<(re-1)))) );
621
719
/* if token found, add its lambda weight */
622
720
k = find_in_category(&cat[i], id);
642
740
ref = UNPACK_RWEIGHTS(PACK_RWEIGHTS(ref));
644
/* don't forget the normalizing constant, and update
645
complexity for this category - note that by updating the
646
complexity on order 1 tokens instead of max_order tokens,
647
we slightly overestimate (by max_order - 1), but a
648
benefit is that we start off well away from zero, instead
649
of hovering above and below zero. */
650
renorm = cat[i].logZ;
744
/* update the complexity */
745
/* this is actually very simple in hindsight, but took
746
me a long time to get right. Different versions of dbacl
747
compute the complexity in different ways, and I kept changing
748
the method because I wasn't happy.
750
In previous versions, complexity is an integer, which begs
751
the question "what does it count?". For simple models
752
(max_order = 1) this is easy: we count the number of
753
tokens. But for max_order > 1, it's not obvious, because we
754
need to divide by 1/max_order asymptotically.
756
One way is to increment the complexity if we encounter a
757
token of order max_order. This is correct for Markovian
758
models and corresponds to the dbacl.ps writeup, but causes
759
trouble in some edge cases. For example, if we classify a
760
very short document, there might not be enough tokens to
761
make sense. This actually occurs when dbacl must classify
762
individual lines, and some lines contain one or two tokens
763
only. Worse, dbacl used to renormalize at the same time as
764
updating the complexity, which increases the likelihood of
765
having a negative divergence score estimate in the first
766
few iterations - very bad. Finally, the complexity is
767
nearly meaningless for models built up from regular
768
expressions, because both the start and the end of each
769
line contains incomplete n-grams (recall regexes can't
772
So to solve these problems, some previous versions of dbacl
773
always counted the order 1 tokens. Asymptotically, this
774
makes no difference, but again it fails on edge
775
cases. Firstly, doing this means that the complexity for a
776
simple model is the same as the complexity for an n-gram
777
model for any n, so that makes it hard to compare mixed
778
models because n-gram model scores are consistently biased
779
for n > 1. Another problem is again with regexes, because
780
the incomplete n-gram tokens at the start and end of each
781
line add up to a pretty large error over thousands of
784
The solution to the above problems is twofold: first, we
785
renormalize after each token, regardless of its order. Of
786
course this means we must divide logZ by the number of
787
tokens per complexity unit, ie renorm = delta * logZ with
788
delta = 1/max_order. Once I realized this it was obvious
789
that the complexity should be also incremented by delta for
790
every token. As a side effect, the complexity is now a real
791
number, and actually measures not just the max_order token
792
count, but also the fraction of incomplete n-grams. This
793
seems like the right way to go, especially for models based
794
on regexes, since now we also count the incomplete n-grams
795
at both ends of the line, which adds up to quite a bit over
798
cat[i].fcomplexity++; /* don't actually need this, but nice to have */
799
cat[i].complexity += cat[i].delta;
654
801
/* now adjust the score */
655
802
switch(cat[i].model_type) {
657
804
multinomial_correction = h ?
658
805
(log((weight_t)cat[i].complexity) - log((weight_t)h->count)) : 0.0;
659
cat[i].score += lambda + multinomial_correction + ref - renorm;
806
cat[i].score += lambda + multinomial_correction + ref - cat[i].renorm;
663
cat[i].score += lambda + ref - renorm;
810
cat[i].score += lambda + ref - cat[i].renorm;
664
811
if( tt.order == cat[i].max_order ) {
665
812
cat[i].score_shannon += shannon_correction;
730
881
/***********************************************************
731
882
* FILE MANAGEMENT FUNCTIONS *
732
883
***********************************************************/
734
/* loads a category hash
735
returns 0 on failure, you should free the category in that case */
736
error_code_t load_category(category_t *cat) {
884
error_code_t load_category_header(FILE *input, category_t *cat) {
738
885
char buf[MAGIC_BUFSIZE];
739
886
char scratchbuf[MAGIC_BUFSIZE];
741
887
short int shint_val;
742
888
long int lint_val1, lint_val2, lint_val3;
746
input = fopen(cat->fullfilename, "rb");
748
891
if( !fgets(buf, MAGIC_BUFSIZE, input) ||
749
892
strncmp(buf, MAGIC1, MAGIC1_LEN) ) {
750
893
errormsg(E_ERROR,
751
894
"not a dbacl " SIGNATURE " category file [%s]\n",
752
895
cat->fullfilename);
818
959
/* if this category did not register a regex, it wants
819
960
the default processing, so we flag this */
820
961
if( !cat->retype ) {
821
cat->m_options &= ~(1<<M_OPTION_USE_REGEX);
962
cat->m_options |= (1<<M_OPTION_USE_STDTOK);
824
965
/* if we haven't read a character class, use alpha */
825
966
if( !(cat->m_options & (1<<M_OPTION_CHAR_ALPHA)) &&
826
967
!(cat->m_options & (1<<M_OPTION_CHAR_ALNUM)) &&
827
968
!(cat->m_options & (1<<M_OPTION_CHAR_CEF)) &&
969
!(cat->m_options & (1<<M_OPTION_CHAR_CHAR)) &&
828
970
!(cat->m_options & (1<<M_OPTION_CHAR_ADP)) &&
829
971
!(cat->m_options & (1<<M_OPTION_CHAR_GRAPH)) ) {
830
972
if( cat->m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
833
975
cat->m_options |= (1<<M_OPTION_CHAR_ALPHA);
978
/* if we're here, success! */
985
error_code_t explicit_load_category(category_t *cat, char *openf, int protf) {
990
/* this is needed in case we try to open with write permissions,
991
which would otherwise create the file */
993
input = fopen(cat->fullfilename, "rb");
994
if( input && (strcmp(openf, "rb") != 0) ) {
995
input = freopen(cat->fullfilename, openf, input);
1000
if( !load_category_header(input, cat) ) {
837
1005
/* read character frequencies */
838
1006
i = ASIZE * ASIZE;
858
if( u_options & (1<<U_OPTION_MMAP) ) {
859
cat->mmap_offset = ftell(input);
860
if( cat->mmap_offset > 0 ) {
862
(byte_t *)MMAP(0, sizeof(c_item_t) * cat->max_tokens +
864
PROT_READ, MAP_SHARED, fileno(input), 0);
865
if( cat->mmap_start ) {
866
cat->hash = (c_item_t *)(cat->mmap_start + cat->mmap_offset);
867
MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
868
MADV_SEQUENTIAL|MADV_WILLNEED);
869
/* lock the pages to prevent swapping - on Linux, this
870
works without root privs so long as the user limits
871
are big enough - mine are unlimited ;-)
872
On other OSes, root may be necessary. If we can't
873
lock, it doesn't really matter, but cross validations
874
and multiple classifications are a _lot_ faster with locking. */
875
MLOCK(cat->hash, sizeof(c_item_t) * cat->max_tokens);
876
cat->c_options |= (1<<C_OPTION_MMAPPED_HASH);
882
cat->c_options &= ~(1<<C_OPTION_MMAPPED_HASH);
883
/* allocate hash table */
884
cat->hash = (c_item_t *)malloc(sizeof(c_item_t) * cat->max_tokens);
886
errormsg(E_ERROR, "not enough memory for category %s\n",
892
MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
895
/* read in hash table */
898
while(!ferror(input) && !feof(input) && (j < i) ) {
899
j += fread(cat->hash + j, sizeof(c_item_t), i - j, input);
903
errormsg(E_ERROR, "corrupt category? %s\n",
1026
if( !create_category_hash(cat, input, protf) ) {
1040
/* loads a category hash
1041
returns 0 on failure, you should free the category in that case */
1042
error_code_t load_category(category_t *cat) {
1043
return explicit_load_category(cat, "rb", PROT_READ);
1046
/* loads a category file for potential read/write */
1047
error_code_t open_category(category_t *cat) {
1048
return explicit_load_category(cat, "r+b", PROT_READ|PROT_WRITE);
921
1051
error_code_t reload_category(category_t *cat) {
923
1053
/* free the hash, but keep the cat->fullfilename */
925
if( cat->c_options & (1<<C_OPTION_MMAPPED_HASH) ) {
926
MUNMAP(cat->mmap_start, cat->max_tokens * sizeof(c_item_t));
927
cat->mmap_start = NULL;
1054
free_category_hash(cat);
936
1055
return load_category(cat) &&
937
1056
sanitize_model_options(&m_options,cat);