~ubuntu-branches/ubuntu/hardy/dbacl/hardy

« back to all changes in this revision

Viewing changes to src/catfun.c

Committer: Bazaar Package Importer
Author(s): Zak B. Elep
Date: 2006-03-26 22:35:35 UTC
mto: (2.1.1 etch) (1.1.2 upstream)
mto: This revision was merged to the branch mainline in revision 4.
Revision ID: james.westby@ubuntu.com-20060326223535-icwiulpkzesds4mq

Import upstream version 1.12

files added:
TREC

TREC/Makefile.am

TREC/Makefile.in

TREC/OPTIONS

TREC/OPTIONS.TREC2005.1cefhuj

TREC/OPTIONS.TREC2005.2adphu

TREC/OPTIONS.TREC2005.3adphd

TREC/OPTIONS.TREC2005.4adp

TREC/OPTIONS.adp-dir-d

TREC/OPTIONS.adp-u-d

TREC/OPTIONS.adp-unif-d

TREC/OPTIONS.bi-adp-unif-d

TREC/OPTIONS.bi-simple-d

TREC/OPTIONS.cef-dir-d

TREC/OPTIONS.cef-unif-d

TREC/OPTIONS.puretext-d

TREC/OPTIONS.simple-d

TREC/OPTIONS.simple-v

TREC/README

TREC/SFX

TREC/TREC2005.txt

TREC/basic-email

TREC/classify

TREC/finalize

TREC/initialize

TREC/train

TREC/verify-stderr

config/depcomp

contrib

contrib/Makefile.am

contrib/Makefile.in

contrib/README

contrib/clint_adams-patch-dbacl-1.9.gz

doc/chess

doc/chess/Makefile.am

doc/chess/Makefile.in

doc/chess/combine_half_moves.sh

doc/chess/csfpc1.png

doc/chess/csfpc2.png

doc/chess/csfpc3.png

doc/chess/dce-1.sh

doc/chess/dce-2.sh

doc/chess/dce-3.sh

doc/chess/dce-basic.sh

doc/chess/dce.sh

doc/chess/down.png

doc/chess/randomizer.awk

doc/chess/renorm.awk

doc/chess/spam_chess.html

doc/chess/spoiler.png

doc/is_it_working.html

man/hypex.1in

src/hypex.c

src/hypex.h

src/lint-check.sh

src/splintrc

src/tests/email-style.shin

src/tests/sample.spam-11

src/tests/verify.email-style

files removed:
src/stamp-h.in

files modified:
COPYING

ChangeLog

INSTALL

Makefile.am

Makefile.in

NEWS

aclocal.m4

config/config.guess

config/config.sub

configure

configure.in

doc/Makefile.am

doc/Makefile.in

doc/email.html

doc/tutorial.html

man/Makefile.am

man/Makefile.in

man/dbacl.1in

man/hmine.1in

src/Makefile.am

src/Makefile.in

src/bayesol.c

src/catfun.c

src/config.h.in

src/dbacl.c

src/dbacl.h

src/fh.c

src/hmine.c

src/hmine.h

src/hparse.c

src/icheck.c

src/mailcross.in

src/mailinspect.c

src/mbw.c

src/mbw.h

src/probs.c

src/rfc2822.c

src/rfc822.c

src/risk-lexer.c

src/risk-parser.c

src/risk-parser.h

src/risk-parser.y

src/tests/Makefile.am

src/tests/Makefile.in

src/tests/dbacl-a.shin

src/tests/dbacl-g.shin

src/tests/dbacl-jap.shin

src/tests/dbacl-o.shin

src/tests/email-badmime1.shin

src/tests/email-badmime2.shin

src/tests/email-forms.shin

src/tests/email-headers.shin

src/tests/email-l.shin

src/tests/email-maildir.shin

src/tests/email-mbox.shin

src/tests/email-pgp.shin

src/tests/email-scripts.shin

src/tests/email-theaders.shin

src/tests/email-uri.shin

src/tests/email-uu.shin

src/tests/email-xheaders.shin

src/tests/html-alt.shin

src/tests/html-links.shin

src/tests/html.shin

src/tests/icheck.shin

src/tests/lscheck.shin

src/tests/model-sum1.shin

src/tests/pcheck-2821b.shin

src/tests/pcheck-2821g.shin

src/tests/pcheck-2822b.shin

src/tests/pcheck-2822g.shin

src/tests/pcheck-821b.shin

src/tests/pcheck-821g.shin

src/tests/pcheck-822b.shin

src/tests/pcheck-822g.shin

src/tests/reservoir.shin

src/tests/score-1.shin

src/tests/score-2.shin

src/tests/shannon-1.shin

src/tests/shannon-2.shin

src/tests/verify.email-forms

src/tests/verify.email-pgp

src/tests/verify.email-scripts

src/tests/verify.email-theaders

src/tests/verify.email-xheaders

src/tests/xml.shin

src/util.c

src/util.h

ts/Makefile.in

ts/dbaclA

ts/dbaclB

ts/dbaclC

ts/dbaclL

Show diffs side-by-side

added added

removed removed

src/catfun.c

strcat(buf, MOPTION(M_OPTION_CHAR_GRAPH, opt));

strcat(buf, MOPTION(M_OPTION_CHAR_ADP, opt));

strcat(buf, MOPTION(M_OPTION_CHAR_CEF, opt));

strcat(buf, MOPTION(M_OPTION_CHAR_CHAR, opt));

strcat(buf, MOPTION(M_OPTION_HEADERS, opt));

strcat(buf, MOPTION(M_OPTION_PLAIN, opt));

strcat(buf, MOPTION(M_OPTION_NOPLAIN, opt));

100

101

strcat(buf, MOPTION(M_OPTION_SHOW_SCRIPT, opt));

101

102

strcat(buf, MOPTION(M_OPTION_SHOW_STYLE, opt));

102

103

strcat(buf, MOPTION(M_OPTION_SHOW_HTML_COMMENTS, opt));

103

strcat(buf, MOPTION(M_OPTION_USE_REGEX, opt));

104

strcat(buf, MOPTION(M_OPTION_USE_STDTOK, opt));

104

105

strcat(buf, MOPTION(M_OPTION_ATTACHMENTS, opt));

105

106

strcat(buf, MOPTION(M_OPTION_WARNING_BAD, opt));

106

107

strcat(buf, MOPTION(M_OPTION_NGRAM_STRADDLE_NL, opt));

123

124

strcat(buf, MOPTION(U_OPTION_DECIMATE, opt));

124

125

strcat(buf, MOPTION(U_OPTION_GROWHASH, opt));

125

126

strcat(buf, MOPTION(U_OPTION_INDENTED, opt));

126

strcat(buf, MOPTION(U_OPTION_ZEROLEARN, opt));

127

strcat(buf, MOPTION(U_OPTION_NOZEROLEARN, opt));

127

128

strcat(buf, MOPTION(U_OPTION_LAPLACE, opt));

128

129

strcat(buf, MOPTION(U_OPTION_DIRICHLET, opt));

129

130

strcat(buf, MOPTION(U_OPTION_JAYNES, opt));

154

155

(1<<M_OPTION_REFMODEL)|

155

156

(1<<M_OPTION_I18N)|

156

157

(1<<M_OPTION_CASEN)|

158

(1<<M_OPTION_USE_STDTOK)|

157

159

(1<<M_OPTION_CALCENTROPY)|

158

160

(1<<M_OPTION_MULTINOMIAL);

159

161

*mopt |= (cat->m_options & mask);

211

213

mask =

212

214

(1<<M_OPTION_CHAR_ALPHA)|

213

215

(1<<M_OPTION_CHAR_ALNUM)|

216

(1<<M_OPTION_CHAR_CHAR)|

214

217

(1<<M_OPTION_CHAR_GRAPH)|

215

218

(1<<M_OPTION_CHAR_ADP)|

216

219

(1<<M_OPTION_CHAR_CEF);

217

220

218

221

if( (*mopt & mask) && ((cat->m_options & mask) != (*mopt & mask)) ) {

219

errormsg(E_ERROR,

222

errormsg(E_FATAL,

220

223

"category %s has incompatible token set (check -e switch)\n",

221

224

cat->filename);

222

225

return 0;

267

270

}

268

271

if( c >= regex_count ) { /* not found */

269

272

/* add it to our list */

270

re[regex_count].string = strdup(buf);

273

if( strchr(buf, '(') ) {

274

re[regex_count].string = strdup(buf);

275

} else {

276

char *dup = (char *)malloc(strlen(buf)+2);

277

if( dup ) {

278

sprintf(dup, "(%s)", buf);

279

errormsg(E_WARNING,

280

"no captures found in regex, converting to '%s'\n",

281

dup);

282

}

283

re[regex_count].string = dup;

284

}

285

if( !re[regex_count].string ) {

286

errormsg(E_FATAL,

287

"could not prepare regular expression '%s'.\n",

288

buf);

289

}

271

290

/* and compile the regex */

272

291

if( regcomp(&re[regex_count].regex,

273

292

re[regex_count].string, REG_EXTENDED) != 0 ) {

410

429

cat->score_div = 0.0;

411

430

cat->score_s2 = 0.0;

412

431

cat->score_shannon = 0.0;

432

cat->score_exp = 0.0;

413

433

cat->shannon = 0.0;

414

434

cat->alpha = 0.0;

415

435

cat->beta = 0.0;

416

436

cat->mu = 0.0;

417

437

cat->s2 = 0.0;

418

cat->complexity = 0;

438

cat->complexity = 0.0;

439

cat->fcomplexity = 0;

419

440

cat->max_order = 0;

420

441

p = strrchr(cat->fullfilename, '/');

421

442

if( p ) {

434

455

cat->c_options = 0;

435

456

cat->hash = NULL;

436

457

cat->mmap_offset = 0;

437

cat->mmap_start = 0;

438

}

439

440

/* frees the resources associated with a category */

441

void free_category(category_t *cat) {

458

cat->mmap_start = NULL;

459

}

460

461

bool_t create_category_hash(category_t *cat, FILE *input, int protf) {

462

hash_count_t i, j;

463

464

if( u_options & (1<<U_OPTION_MMAP) ) {

465

cat->mmap_offset = ftell(input);

466

if( cat->mmap_offset > 0 ) {

467

cat->mmap_start =

468

(byte_t *)MMAP(0, sizeof(c_item_t) * cat->max_tokens +

469

cat->mmap_offset,

470

protf, MAP_SHARED, fileno(input), 0);

471

if( cat->mmap_start == MAP_FAILED ) { cat->mmap_start = NULL; }

472

if( cat->mmap_start ) {

473

cat->hash = (c_item_t *)(cat->mmap_start + cat->mmap_offset);

474

MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,

475

MADV_SEQUENTIAL|MADV_WILLNEED);

476

/* lock the pages to prevent swapping - on Linux, this

477

works without root privs so long as the user limits

478

are big enough - mine are unlimited ;-)

479

On other OSes, root may be necessary. If we can't

480

lock, it doesn't really matter, but cross validations

481

and multiple classifications are a _lot_ faster with locking. */

482

MLOCK(cat->hash, sizeof(c_item_t) * cat->max_tokens);

483

cat->c_options |= (1<<C_OPTION_MMAPPED_HASH);

484

}

485

}

486

}

487

488

if( !cat->hash ) {

489

cat->c_options &= ~(1<<C_OPTION_MMAPPED_HASH);

490

/* allocate hash table */

491

cat->hash = (c_item_t *)malloc(sizeof(c_item_t) * cat->max_tokens);

492

if( !cat->hash ) {

493

errormsg(E_ERROR, "not enough memory for category %s\n",

494

cat->filename);

495

return 0;

496

}

497

498

MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,

499

MADV_SEQUENTIAL);

500

501

/* read in hash table */

502

i = cat->max_tokens;

503

j = 0;

504

while(!ferror(input) && !feof(input) && (j < i) ) {

505

j += fread(cat->hash + j, sizeof(c_item_t), i - j, input);

506

}

507

508

if( j < i ) {

509

errormsg(E_ERROR, "corrupt category? %s\n",

510

cat->fullfilename);

511

free(cat->hash);

512

return 0;

513

}

514

515

}

516

return 1;

517

}

518

519

520

void free_category_hash(category_t *cat) {

442

521

if( cat->hash ) {

443

if( cat->c_options & (1<<C_OPTION_MMAPPED_HASH) ) {

522

if( cat->mmap_start != NULL ) {

444

523

MUNMAP(cat->mmap_start, cat->max_tokens * sizeof(c_item_t) +

445

524

cat->mmap_offset);

446

525

cat->mmap_start = NULL;

526

cat->mmap_offset = 0;

447

527

cat->hash = NULL;

448

528

}

449

529

if( cat->hash ) {

451

531

cat->hash = NULL;

452

532

}

453

533

}

534

}

535

536

/* frees the resources associated with a category */

537

void free_category(category_t *cat) {

538

free_category_hash(cat);

454

539

if( cat->filename ) { free(cat->filename); }

455

540

if( cat->fullfilename ) { free(cat->fullfilename); }

456

541

}

495

580

#endif

496

581

497

582

cat->logZ = 0.0;

583

cat->divergence = 0.0;

584

cat->delta = 0.0;

585

cat->renorm = 0.0;

498

586

cat->hash = NULL;

587

cat->mmap_start = NULL;

588

cat->mmap_offset = 0;

499

589

cat->model_type = simple;

500

590

cat->max_order = 1;

501

591

cat->m_options = 0;

533

623

category_count_t i = 0;

534

624

weight_t multinomial_correction = 0.0;

535

625

weight_t shannon_correction = 0.0;

536

weight_t renorm, lambda, ref, oldscore;

626

weight_t lambda, ref, oldscore;

627

bool_t apply;

537

628

alphabet_size_t pp, pc;

538

629

hash_value_t id;

539

630

char *q;

612

703

oldscore = cat[i].score;

613

704

lambda = 0.0;

614

705

ref = 0.0;

615

renorm = 0.0;

616

706

617

/* see if this is for us */

618

if( ((re == 0) && (tt.order <= cat[i].max_order)) ||

619

((re > 0) && (cat[i].retype & (1<<(re-1)))) ) {

707

/* see if this token is for us. The rule is: a category either

708

uses the standard tokenizer (in that case re = INVALID_RE),

709

or it uses only those regexes which are listed in the retype

710

bitmap. Since re = 0 is taken by the standard tokenizer,

711

this occurs when re > 0 and we have to subtract 1 to check

712

the bitmap. Simple, really ;-) */

713

apply = ( ((re == INVALID_RE) &&

714

(tt.order <= cat[i].max_order) && !cat[i].retype) ||

715

((re > 0) &&

716

(cat[i].retype & (1<<(re-1)))) );

717

if( apply ) {

620

718

621

719

/* if token found, add its lambda weight */

622

720

k = find_in_category(&cat[i], id);

641

739

}

642

740

ref = UNPACK_RWEIGHTS(PACK_RWEIGHTS(ref));

643

741

644

/* don't forget the normalizing constant, and update

645

complexity for this category - note that by updating the

646

complexity on order 1 tokens instead of max_order tokens,

647

we slightly overestimate (by max_order - 1), but a

648

benefit is that we start off well away from zero, instead

649

of hovering above and below zero. */

650

renorm = cat[i].logZ;

651

cat[i].complexity++;

652

742

}

653

743

744

/* update the complexity */

745

/* this is actually very simple in hindsight, but took

746

me a long time to get right. Different versions of dbacl

747

compute the complexity in different ways, and I kept changing

748

the method because I wasn't happy.

749

750

In previous versions, complexity is an integer, which begs

751

the question "what does it count?". For simple models

752

(max_order = 1) this is easy: we count the number of

753

tokens. But for max_order > 1, it's not obvious, because we

754

need to scale by 1/max_order asymptotically.

755

756

One way is to increment the complexity if we encounter a

757

token of order max_order. This is correct for Markovian

758

models and corresponds to the dbacl.ps writeup, but causes

759

trouble in some edge cases. For example, if we classify a

760

very short document, there might not be enough tokens to

761

make sense. This actually occurs when dbacl must classify

762

individual lines, and some lines contain one or two tokens

763

only. Worse, dbacl used to renormalize at the same time as

764

updating the complexity, which increases the likelihood of

765

having a negative divergence score estimate in the first

766

few iterations - very bad. Finally, the complexity is

767

nearly meaningless for models built up from regular

768

expressions, because both the start and the end of each

769

line contains incomplete n-grams (recall regexes can't

770

straddle newlines).

771

772

So to solve these problems, some previous versions of dbacl

773

always counted the order 1 tokens. Asymptotically, this

774

makes no difference, but again it fails on edge

775

cases. Firstly, doing this means that the complexity for a

776

simple model is the same as the complexity for an n-gram

777

model for any n, so that makes it hard to compare mixed

778

models because n-gram model scores are consistently biased

779

for n > 1. Another problem is again with regexes, because

780

the incomplete n-gram tokens at the start and end of each

781

line add up to a pretty large error over thousands of

782

tokens.

783

784

The solution to the above problems is twofold: first, we

785

renormalize after each token, regardless of its order. Of

786

course this means we must divide logZ by the number of

787

tokens per complexity unit, ie renorm = delta * logZ with

788

delta = 1/max_order. Once I realized this it was obvious

789

that the complexity should be also incremented by delta for

790

every token. As a side effect, the complexity is now a real

791

number, and actually measures not just the max_order token

792

count, but also the fraction of incomplete n-grams. This

793

seems like the right way to go, especially for models based

794

on regexes, since now we also count the incomplete n-grams

795

at both ends of the line, which adds up to quite a bit over

796

many lines. */

797

798

cat[i].fcomplexity++; /* don't actually need this, but nice to have */

799

cat[i].complexity += cat[i].delta;

800

654

801

/* now adjust the score */

655

802

switch(cat[i].model_type) {

656

803

case simple:

657

804

multinomial_correction = h ?

658

805

(log((weight_t)cat[i].complexity) - log((weight_t)h->count)) : 0.0;

659

cat[i].score += lambda + multinomial_correction + ref - renorm;

806

cat[i].score += lambda + multinomial_correction + ref - cat[i].renorm;

660

807

break;

661

808

case sequential:

662

809

default:

663

cat[i].score += lambda + ref - renorm;

810

cat[i].score += lambda + ref - cat[i].renorm;

664

811

if( tt.order == cat[i].max_order ) {

665

812

cat[i].score_shannon += shannon_correction;

666

813

}

676

823

677

824

if( u_options & (1<<U_OPTION_DUMP) ) {

678

825

if( u_options & (1<<U_OPTION_SCORES) ) {

679

fprintf(stdout, " %8.2f * %-5d\t",

826

fprintf(stdout, " %8.2f * %-6.1f\t",

680

827

-cat[i].score/cat[i].complexity,

681

828

cat[i].complexity);

682

829

} else if( u_options & (1<<U_OPTION_POSTERIOR) ) {

684

831

} else {

685

832

fprintf(stdout,

686

833

"%7.2f %7.2f %7.2f %7.2f %8lx\t",

687

lambda, ref, -renorm,

688

multinomial_correction,

689

(long unsigned int)(k ? NTOH_ID(k->id) : 0));

834

lambda, ref, apply ? -cat[i].renorm : 0.0,

835

multinomial_correction,

836

(long unsigned int)((k && apply) ? NTOH_ID(k->id) : 0));

690

837

}

691

838

}

692

839

694

841

695

842

if( u_options & (1<<U_OPTION_DUMP) ) {

696

843

print_token(stdout, tok);

697

fprintf(stdout, "\n");

844

if( re > 0 ) {

845

fprintf(stdout, "<re=%d>\n", re);

846

} else {

847

fprintf(stdout, "\n");

848

}

698

849

}

699

850

700

851

}

730

881

/***********************************************************

731

882

* FILE MANAGEMENT FUNCTIONS *

732

883

***********************************************************/

733

734

/* loads a category hash

735

returns 0 on failure, you should free the category in that case */

736

error_code_t load_category(category_t *cat) {

737

hash_count_t i, j;

884

error_code_t load_category_header(FILE *input, category_t *cat) {

738

885

char buf[MAGIC_BUFSIZE];

739

886

char scratchbuf[MAGIC_BUFSIZE];

740

741

887

short int shint_val;

742

888

long int lint_val1, lint_val2, lint_val3;

743

889

744

FILE *input;

745

746

input = fopen(cat->fullfilename, "rb");

747

890

if( input ) {

748

891

if( !fgets(buf, MAGIC_BUFSIZE, input) ||

749

892

strncmp(buf, MAGIC1, MAGIC1_LEN) ) {

750

893

errormsg(E_ERROR,

751

894

"not a dbacl " SIGNATURE " category file [%s]\n",

752

895

cat->fullfilename);

753

fclose(input);

754

896

return 0;

755

897

}

756

898

760

902

(sscanf(buf, MAGIC2_i, &cat->divergence, &cat->logZ,

761

903

&shint_val, scratchbuf) < 4) ) {

762

904

errormsg(E_ERROR, "bad category file [2]\n");

763

fclose(input);

764

905

return 0;

765

906

}

766

907

cat->max_order = (token_order_t)shint_val;

908

cat->delta = 1.0/(score_t)(cat->max_order);

909

cat->renorm = cat->delta * cat->logZ;

767

910

if( scratchbuf[0] == 'm' ) {

768

911

cat->model_type = simple;

769

912

} else {

777

920

&lint_val2,

778

921

&lint_val3) < 4) ) {

779

922

errormsg(E_ERROR, "bad category file [3]\n");

780

fclose(input);

781

923

return 0;

782

924

}

783

925

cat->max_hash_bits = (token_order_t)shint_val;

793

935

&cat->alpha, &cat->beta,

794

936

&cat->mu, &cat->s2) < 5) ) {

795

937

errormsg(E_ERROR, "bad category file [8]\n");

796

fclose(input);

797

938

return 0;

798

939

}

799

940

818

959

/* if this category did not register a regex, it wants

819

960

the default processing, so we flag this */

820

961

if( !cat->retype ) {

821

cat->m_options &= ~(1<<M_OPTION_USE_REGEX);

962

cat->m_options |= (1<<M_OPTION_USE_STDTOK);

822

963

}

823

964

824

965

/* if we haven't read a character class, use alpha */

825

966

if( !(cat->m_options & (1<<M_OPTION_CHAR_ALPHA)) &&

826

967

!(cat->m_options & (1<<M_OPTION_CHAR_ALNUM)) &&

827

968

!(cat->m_options & (1<<M_OPTION_CHAR_CEF)) &&

969

!(cat->m_options & (1<<M_OPTION_CHAR_CHAR)) &&

828

970

!(cat->m_options & (1<<M_OPTION_CHAR_ADP)) &&

829

971

!(cat->m_options & (1<<M_OPTION_CHAR_GRAPH)) ) {

830

972

if( cat->m_options & (1<<M_OPTION_MBOX_FORMAT) ) {

833

975

cat->m_options |= (1<<M_OPTION_CHAR_ALPHA);

834

976

}

835

977

}

978

/* if we're here, success! */

979

return 1;

980

}

981

return 0;

982

}

983

984

985

error_code_t explicit_load_category(category_t *cat, char *openf, int protf) {

986

hash_count_t i, j;

987

988

FILE *input;

989

990

/* this is needed in case we try to open with write permissions,

991

which would otherwise create the file */

992

993

input = fopen(cat->fullfilename, "rb");

994

if( input && (strcmp(openf, "rb") != 0) ) {

995

input = freopen(cat->fullfilename, openf, input);

996

}

997

998

if( input ) {

999

1000

if( !load_category_header(input, cat) ) {

1001

fclose(input);

1002

return 0;

1003

}

836

1004

837

1005

/* read character frequencies */

838

1006

i = ASIZE * ASIZE;

855

1023

}

856

1024

#endif

857

1025

858

if( u_options & (1<<U_OPTION_MMAP) ) {

859

cat->mmap_offset = ftell(input);

860

if( cat->mmap_offset > 0 ) {

861

cat->mmap_start =

862

(byte_t *)MMAP(0, sizeof(c_item_t) * cat->max_tokens +

863

cat->mmap_offset,

864

PROT_READ, MAP_SHARED, fileno(input), 0);

865

if( cat->mmap_start ) {

866

cat->hash = (c_item_t *)(cat->mmap_start + cat->mmap_offset);

867

MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,

868

MADV_SEQUENTIAL|MADV_WILLNEED);

869

/* lock the pages to prevent swapping - on Linux, this

870

works without root privs so long as the user limits

871

are big enough - mine are unlimited ;-)

872

On other OSes, root may be necessary. If we can't

873

lock, it doesn't really matter, but cross validations

874

and multiple classifications are a _lot_ faster with locking. */

875

MLOCK(cat->hash, sizeof(c_item_t) * cat->max_tokens);

876

cat->c_options |= (1<<C_OPTION_MMAPPED_HASH);

877

}

878

}

879

}

880

881

if( !cat->hash ) {

882

cat->c_options &= ~(1<<C_OPTION_MMAPPED_HASH);

883

/* allocate hash table */

884

cat->hash = (c_item_t *)malloc(sizeof(c_item_t) * cat->max_tokens);

885

if( !cat->hash ) {

886

errormsg(E_ERROR, "not enough memory for category %s\n",

887

cat->filename);

888

fclose(input);

889

return 0;

890

}

891

892

MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,

893

MADV_SEQUENTIAL);

894

895

/* read in hash table */

896

i = cat->max_tokens;

897

j = 0;

898

while(!ferror(input) && !feof(input) && (j < i) ) {

899

j += fread(cat->hash + j, sizeof(c_item_t), i - j, input);

900

}

901

902

if( j < i ) {

903

errormsg(E_ERROR, "corrupt category? %s\n",

904

cat->fullfilename);

905

free(cat->hash);

906

fclose(input);

907

return 0;

908

}

909

1026

if( !create_category_hash(cat, input, protf) ) {

1027

fclose(input);

1028

return 0;

910

1029

}

911

1030

912

1031

fclose(input);

918

1037

}

919

1038

920

1039

1040

/* loads a category hash

1041

returns 0 on failure, you should free the category in that case */

1042

error_code_t load_category(category_t *cat) {

1043

return explicit_load_category(cat, "rb", PROT_READ);

1044

}

1045

1046

/* loads a category file for potential read/write */

1047

error_code_t open_category(category_t *cat) {

1048

return explicit_load_category(cat, "r+b", PROT_READ|PROT_WRITE);

1049

}

1050

921

1051

error_code_t reload_category(category_t *cat) {

922

1052

if( cat ) {

923

1053

/* free the hash, but keep the cat->fullfilename */

924

if( cat->hash ) {

925

if( cat->c_options & (1<<C_OPTION_MMAPPED_HASH) ) {

926

MUNMAP(cat->mmap_start, cat->max_tokens * sizeof(c_item_t));

927

cat->mmap_start = NULL;

928

cat->hash = NULL;

929

}

930

931

if( cat->hash ) {

932

free(cat->hash);

933

cat->hash = NULL;

934

}

935

}

1054

free_category_hash(cat);

936

1055

return load_category(cat) &&

937

1056

sanitize_model_options(&m_options,cat);

938

1057

}

Older »