~ubuntu-branches/ubuntu/hardy/dbacl/hardy

« back to all changes in this revision

Viewing changes to src/fh.c

Committer: Bazaar Package Importer
Author(s): Zak B. Elep
Date: 2006-03-26 22:35:35 UTC
mto: (2.1.1 etch) (1.1.2 upstream)
mto: This revision was merged to the branch mainline in revision 4.
Revision ID: james.westby@ubuntu.com-20060326223535-icwiulpkzesds4mq

Import upstream version 1.12

files added:
TREC

TREC/Makefile.am

TREC/Makefile.in

TREC/OPTIONS

TREC/OPTIONS.TREC2005.1cefhuj

TREC/OPTIONS.TREC2005.2adphu

TREC/OPTIONS.TREC2005.3adphd

TREC/OPTIONS.TREC2005.4adp

TREC/OPTIONS.adp-dir-d

TREC/OPTIONS.adp-u-d

TREC/OPTIONS.adp-unif-d

TREC/OPTIONS.bi-adp-unif-d

TREC/OPTIONS.bi-simple-d

TREC/OPTIONS.cef-dir-d

TREC/OPTIONS.cef-unif-d

TREC/OPTIONS.puretext-d

TREC/OPTIONS.simple-d

TREC/OPTIONS.simple-v

TREC/README

TREC/SFX

TREC/TREC2005.txt

TREC/basic-email

TREC/classify

TREC/finalize

TREC/initialize

TREC/train

TREC/verify-stderr

config/depcomp

contrib

contrib/Makefile.am

contrib/Makefile.in

contrib/README

contrib/clint_adams-patch-dbacl-1.9.gz

doc/chess

doc/chess/Makefile.am

doc/chess/Makefile.in

doc/chess/combine_half_moves.sh

doc/chess/csfpc1.png

doc/chess/csfpc2.png

doc/chess/csfpc3.png

doc/chess/dce-1.sh

doc/chess/dce-2.sh

doc/chess/dce-3.sh

doc/chess/dce-basic.sh

doc/chess/dce.sh

doc/chess/down.png

doc/chess/randomizer.awk

doc/chess/renorm.awk

doc/chess/spam_chess.html

doc/chess/spoiler.png

doc/is_it_working.html

man/hypex.1in

src/hypex.c

src/hypex.h

src/lint-check.sh

src/splintrc

src/tests/email-style.shin

src/tests/sample.spam-11

src/tests/verify.email-style

files removed:
src/stamp-h.in

files modified:
COPYING

ChangeLog

INSTALL

Makefile.am

Makefile.in

NEWS

aclocal.m4

config/config.guess

config/config.sub

configure

configure.in

doc/Makefile.am

doc/Makefile.in

doc/email.html

doc/tutorial.html

man/Makefile.am

man/Makefile.in

man/dbacl.1in

man/hmine.1in

src/Makefile.am

src/Makefile.in

src/bayesol.c

src/catfun.c

src/config.h.in

src/dbacl.c

src/dbacl.h

src/fh.c

src/hmine.c

src/hmine.h

src/hparse.c

src/icheck.c

src/mailcross.in

src/mailinspect.c

src/mbw.c

src/mbw.h

src/probs.c

src/rfc2822.c

src/rfc822.c

src/risk-lexer.c

src/risk-parser.c

src/risk-parser.h

src/risk-parser.y

src/tests/Makefile.am

src/tests/Makefile.in

src/tests/dbacl-a.shin

src/tests/dbacl-g.shin

src/tests/dbacl-jap.shin

src/tests/dbacl-o.shin

src/tests/email-badmime1.shin

src/tests/email-badmime2.shin

src/tests/email-forms.shin

src/tests/email-headers.shin

src/tests/email-l.shin

src/tests/email-maildir.shin

src/tests/email-mbox.shin

src/tests/email-pgp.shin

src/tests/email-scripts.shin

src/tests/email-theaders.shin

src/tests/email-uri.shin

src/tests/email-uu.shin

src/tests/email-xheaders.shin

src/tests/html-alt.shin

src/tests/html-links.shin

src/tests/html.shin

src/tests/icheck.shin

src/tests/lscheck.shin

src/tests/model-sum1.shin

src/tests/pcheck-2821b.shin

src/tests/pcheck-2821g.shin

src/tests/pcheck-2822b.shin

src/tests/pcheck-2822g.shin

src/tests/pcheck-821b.shin

src/tests/pcheck-821g.shin

src/tests/pcheck-822b.shin

src/tests/pcheck-822g.shin

src/tests/reservoir.shin

src/tests/score-1.shin

src/tests/score-2.shin

src/tests/shannon-1.shin

src/tests/shannon-2.shin

src/tests/verify.email-forms

src/tests/verify.email-pgp

src/tests/verify.email-scripts

src/tests/verify.email-theaders

src/tests/verify.email-xheaders

src/tests/xml.shin

src/util.c

src/util.h

ts/Makefile.in

ts/dbaclA

ts/dbaclB

ts/dbaclC

ts/dbaclL

Show diffs side-by-side

added added

removed removed

src/fh.c

348

/* the token class is a common label for a subset of features,

349

such as e.g. all features which appear in the header. The label

350

should be a number greater than AMIN. If all tokens have the same

351

class, then we effectively obtain he dbacl 1.7 and earlier behaviour.

351

class, then we effectively obtain the dbacl 1.7 and earlier behaviour.

352

IT DOESN'T MAKE SENSE to have multiple classes and multiple orders.

353

It's one or the other, otherwise we need several normalizing constants.

354

355

token_type_t get_token_type(token_order_t o) {

356

token_type_t tt;

357

tt.order = o;

358

tt.mark = 0;

358

359

360

if( (m_options & (1<<M_OPTION_MBOX_FORMAT)) &&

360

361

!(m_options & (1<<M_OPTION_NGRAM_STRADDLE_NL)) ) {

383

384

}

384

385

break;

385

386

case msUNDEF:

387

tt.cls = AMIN + 3;

388

break;

386

389

case msBODY:

387

tt.cls = AMIN + 3;

390

tt.cls = AMIN + 3;

388

391

break;

389

392

case msATTACH:

390

393

tt.cls = AMIN + 2;

391

394

break;

395

default:

396

tt.cls = AMIN + 1;

392

397

}

393

398

} else {

394

399

tt.cls = AMIN + 1;

441

446

if( stat(fullp, &statinfo) == 0 ) {

442

447

switch(statinfo.st_mode & S_IFMT) {

443

448

case S_IFREG:

444

input = fopen(fullp, "r");

449

input = fopen(fullp, "rb");

445

450

if( input ) {

446

451

inputfile = fullp;

447

452

/* set some initial options */

466

471

}

467

472

}

468

473

474

void reset_current_token(char *tokbuf, char **q, token_order_t *how_many) {

475

tokbuf[0] = DIAMOND;

476

tokbuf[1] = '\0';

477

*q = tokbuf + 1;

478

*how_many = 0;

479

}

480

469

481

/* reads a text file as input and applies several filters. */

470

482

void process_file(FILE *input,

471

483

int (*line_filter)(MBOX_State *, char *),

482

494

int extra_lines = 2;

483

495

e = 0;

484

496

/* initialize the norex state */

485

tokbuf[0] = DIAMOND;

486

tokbuf[1] = '\0';

487

q = tokbuf + 1;

488

how_many = 0;

489

497

reset_current_token(tokbuf, &q, &how_many);

490

498

491

499

set_iobuf_mode(input);

492

500

493

501

inputline = 0;

494

502

503

/* extra lines are used to flush data conversion caches, but not

504

needed for plain text */

505

if( u_options & (1<<U_OPTION_FILTER) ) { extra_lines = 0; }

506

495

507

/* now start processing */

496

508

while( fill_textbuf(input, &extra_lines) ) {

497

509

inputline++;

523

535

524

536

/* default processing: reads tokens and passes them to

525

537

the word_fun */

526

if( !(m_options & (1<<M_OPTION_USE_REGEX)) ) {

538

if( (m_options & (1<<M_OPTION_USE_STDTOK)) ) {

527

539

std_tokenizer(pptextbuf, &q, tokbuf, &how_many, ngram_order,

528

540

word_fun, get_token_type);

529

541

}

533

545

/* now summarize this line if required */

534

546

if( post_line_fun ) { (*post_line_fun)(pptextbuf); }

535

547

548

if( !(m_options & (1<<M_OPTION_NGRAM_STRADDLE_NL)) ) {

549

reset_current_token(tokbuf, &q, &how_many);

550

}

551

536

552

if( cmd & (1<<CMD_RELOAD_CATS) ) {

537

553

538

554

reload_all_categories();

542

558

543

559

}

544

560

/* since std_tokenizer tokens can straddle lines, we should

545

flush the last token fragment */

546

if( !(m_options & (1<<M_OPTION_USE_REGEX)) ) {

561

flush the last token fragment - note this has nothing to do with

562

the M_OPTION_NGRAM_STRADDLE_NL flag, it's an issue caused by caching

563

decoders such as the base64 and qp line filters. */

564

if( (m_options & (1<<M_OPTION_USE_STDTOK)) ) {

547

565

std_tokenizer(NULL, &q, tokbuf, &how_many, ngram_order,

548

566

word_fun, get_token_type);

549

567

if( post_line_fun ) { (*post_line_fun)(NULL); }

584

602

if( stat(fullp, &statinfo) == 0 ) {

585

603

switch(statinfo.st_mode & S_IFMT) {

586

604

case S_IFREG:

587

input = fopen(fullp, "r");

605

input = fopen(fullp, "rb");

588

606

if( input ) {

589

607

inputfile = fullp;

590

608

/* set some initial options */

631

649

set_iobuf_mode(input);

632

650

633

651

/* initialize the norex state */

634

tokbuf[0] = DIAMOND;

635

tokbuf[1] = '\0';

636

q = tokbuf + 1;

637

how_many = 0;

652

reset_current_token(tokbuf, &q, &how_many);

638

653

639

654

memset(&input_shiftstate, 0, sizeof(mbstate_t));

640

655

inputline = 0;

656

/* extra lines are used to flush data conversion caches, but not

657

needed for plain text */

658

if( u_options & (1<<U_OPTION_FILTER) ) { extra_lines = 0; }

659

641

660

while( fill_textbuf(input, &extra_lines) ) {

642

661

inputline++;

643

662

/* preprocesses textbuf, optionally censors it */

673

692

674

693

/* default processing: reads tokens and passes them to

675

694

the word_fun */

676

if( !(m_options & (1<<M_OPTION_USE_REGEX)) ) {

695

if( (m_options & (1<<M_OPTION_USE_STDTOK)) ) {

677

696

w_std_tokenizer(wc_textbuf, &q, tokbuf, &how_many, ngram_order,

678

697

word_fun, get_token_type);

679

698

}

683

702

/* now summarize this line if required */

684

703

if( post_line_fun ) { (*post_line_fun)(pptextbuf); }

685

704

705

if( !(m_options & (1<<M_OPTION_NGRAM_STRADDLE_NL)) ) {

706

reset_current_token(tokbuf, &q, &how_many);

707

}

708

686

709

if( cmd & (1<<CMD_RELOAD_CATS) ) {

687

710

688

711

reload_all_categories();

693

716

}

694

717

/* since w_std_tokenizer tokens can straddle lines, we should

695

718

flush the last token fragment */

696

if( !(m_options & (1<<M_OPTION_USE_REGEX)) ) {

719

if( (m_options & (1<<M_OPTION_USE_STDTOK)) ) {

697

720

w_std_tokenizer(NULL, &q, tokbuf, &how_many, ngram_order,

698

721

word_fun, get_token_type);

699

722

if( post_line_fun ) { (*post_line_fun)(NULL); }

Older »