~ubuntu-branches/ubuntu/trusty/swish-e/trusty

« back to all changes in this revision

Viewing changes to src/rank.c

Committer: Bazaar Package Importer
Author(s): Ludovic Drolez
Date: 2009-11-05 16:23:33 UTC
mfrom: (1.2.3 upstream)
Revision ID: james.westby@ubuntu.com-20091105162333-9xf7dmhhhvt97bvw

Tags: 2.4.7-1

http://bugs.debian.org/543187

http://bugs.debian.org/512987

* New upstream release
* Added Japanese and Russian debconf translations. Closes: #543187, #512987

files added:
debian/po/ja.po

debian/po/ru.po

src/snowball/stem_hu.c

src/snowball/stem_hu.h

src/snowball/stem_ro.c

src/snowball/stem_ro.h

files removed:
data

data/home

data/home/bmoseley

data/home/bmoseley/swish

data/home/bmoseley/swish/swish_release_build

data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29

data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29/swish-e-2.4.5

man/data

man/data/home

man/data/home/bmoseley

man/data/home/bmoseley/swish

man/data/home/bmoseley/swish/swish_release_build

man/data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29

man/data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29/swish-e-2.4.5

man/data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29/swish-e-2.4.5/man

perl/Makefile.PL.deb

files modified:
INSTALL

Makefile.in

README

aclocal.m4

conf/Makefile.in

config/config.guess

config/config.sub

config/ltmain.sh

configure

configure.in

debian/changelog

debian/compat

debian/control

example/Makefile.in

example/SWISH-Stemmer-0.05.tar.gz

example/modules/SWISH/DateRanges.pm

example/modules/SWISH/DefaultHighlight.pm

example/modules/SWISH/PhraseHighlight.pm

example/modules/SWISH/SimpleHighlight.pm

example/modules/SWISH/TemplateDefault.pm

example/modules/SWISH/TemplateDumper.pm

example/modules/SWISH/TemplateFrame.pm

example/modules/SWISH/TemplateHTMLTemplate.pm

example/modules/SWISH/TemplateToolkit.pm

example/search.cgi.in

example/swish.cgi.in

example/swish.gif

example/swish.tmpl

example/swish.tt

filter-bin/Makefile.in

filters/Makefile.in

filters/SWISH/Makefile.in

filters/swish-filter-test.in

html/Makefile.in

html/api.html

html/changes.html

html/filter.html

html/index.html

html/install.html

html/readme.html

html/search.cgi.html

html/spider.html

html/swish-3.0.html

html/swish-bugs.html

html/swish-config.html

html/swish-faq.html

html/swish-library.html

html/swish-run.html

html/swish-search.html

html/swish.cgi.html

html/swish.css

man/Makefile.in

man/SWISH-CONFIG.1

man/SWISH-FAQ.1

man/SWISH-LIBRARY.1

man/SWISH-RUN.1

man/swish-e.1

perl/API.pm

perl/API.xs

perl/Changes

perl/Makefile.PL

perl/Makefile.mingw

perl/README

perl/t/test.t

perl/typemap

pod/CHANGES.pod

pod/INSTALL.pod

pod/Makefile.in

pod/README.pod

pod/SWISH-3.0.pod

pod/SWISH-BUGS.pod

pod/SWISH-CONFIG.pod

pod/SWISH-FAQ.pod

pod/SWISH-LIBRARY.pod

pod/SWISH-RUN.pod

pod/SWISH-SEARCH.pod

prog-bin/Makefile.in

prog-bin/doc2txt.pm

prog-bin/pdf2html.pm

prog-bin/pdf2xml.pm

prog-bin/spider.pl.in

rpm/swish-e.spec.in

src/Makefile.am

src/Makefile.in

src/acconfig.h.in

src/array.c

src/array.h

src/bash.h

src/btree.c

src/btree.h

src/check.h

src/compress.c

src/compress.h

src/config.h

src/date_time.c

src/date_time.h

src/db.h

src/db_native.c

src/db_native.h

src/db_read.c

src/db_write.c

src/docprop.c

src/docprop.h

src/docprop_write.c

src/double_metaphone.c

src/double_metaphone.h

src/dump.h

src/entities.c

src/entities.h

src/error.c

src/expat/Makefile.in

src/expat/expat.dsw

src/expat/xmlparse/xmlparse.dsp

src/expat/xmltok/xmltok.dsp

src/extprog.c

src/fhash.c

src/fhash.h

src/file.c

src/file.h

src/filter.c

src/filter.h

src/fs.c

src/fs.h

src/getruntime.h

src/hash.h

src/headers.c

src/headers.h

src/html.c

src/html.h

src/http.h

src/httpserver.c

src/httpserver.h

src/index.c

src/index.h

src/keychar_out.c

src/keychar_out.h

src/libtest.c

src/mem.c

src/mem.h

src/merge.c

src/metanames.h

src/methods.c

src/parse_conffile.c

src/parse_conffile.h

src/parser.c

src/parser.h

src/pre_sort.c

src/proplimit.c

src/proplimit.h

src/ramdisk.c

src/ramdisk.h

src/rank.c

src/rank.h

src/replace/Makefile.am

src/replace/Makefile.in

src/result_output.c

src/result_output.h

src/result_sort.c

src/result_sort.h

src/search.c

src/search.h

src/snowball/Makefile.am

src/snowball/Makefile.in

src/snowball/api.c

src/snowball/api.h

src/snowball/header.h

src/snowball/stem_de.c

src/snowball/stem_de.h

src/snowball/stem_dk.c

src/snowball/stem_dk.h

src/snowball/stem_en1.c

src/snowball/stem_en1.h

src/snowball/stem_en2.c

src/snowball/stem_en2.h

src/snowball/stem_es.c

src/snowball/stem_es.h

src/snowball/stem_fi.c

src/snowball/stem_fi.h

src/snowball/stem_fr.c

src/snowball/stem_fr.h

src/snowball/stem_it.c

src/snowball/stem_it.h

src/snowball/stem_nl.c

src/snowball/stem_nl.h

src/snowball/stem_no.c

src/snowball/stem_no.h

src/snowball/stem_pt.c

src/snowball/stem_pt.h

src/snowball/stem_ru.c

src/snowball/stem_ru.h

src/snowball/stem_se.c

src/snowball/stem_se.h

src/snowball/utilities.c

src/soundex.c

src/soundex.h

src/stemmer.c

src/stemmer.h

src/swish-e.h

src/swish.c

src/swish.h

src/swish2.c

src/swish_qsort.c

src/swish_qsort.h

src/swish_words.c

src/swish_words.h

src/swishspider

src/swregex.c

src/swregex.h

src/swstring.c

src/txt.c

src/txt.h

src/vms/regexpr.h

src/win32/acconfig.h

src/win32/dirent.h

src/win32/filebase.nsh

src/win32/libswishe.dsp

src/win32/libswishindex.dsp

src/win32/swishe.dsp

src/win32/swishe.dsw

src/worddata.c

src/worddata.h

src/xml.c

src/xml.h

tests/Makefile.in

Show diffs side-by-side

added added

removed removed

src/rank.c

$Id: rank.c,v 1.25 2006/10/18 02:58:02 karman Exp $

$Id: rank.c 2291 2009-03-31 01:56:00Z karpet $

This file is part of Swish-e.

#include <stdlib.h>

#include "swish.h"

#include "db.h"

#include "rank.h"

291

292

sw->structure_map_set = 1; /* flag */

292

293

}

293

294

295

int

296

getrank ( RESULT *r )

297

{

299

IndexFILE *indexf;

300

int scheme;

301

302

303

302

indexf = r->db_results->indexf;

304

303

sw = indexf->sw;

305

304

scheme = sw->RankScheme;

306

305

307

#ifdef DEBUG_RANK

308

fprintf( stderr, "Ranking Scheme: %d \n", scheme );

309

#endif

310

306

if( DEBUG_RANK )

307

{

308

fprintf( stderr, "-----------------------------------------------------------------\n");

309

fprintf( stderr, "Ranking Scheme: %d \n", scheme );

310

}

311

312

switch ( scheme )

313

{

344

** from double to int that degrades performance search

345

** I have switched to integer computations.

346

347

** To avoid the loss of precission I use rank *10000,

347

** To avoid the loss of precision I use rank *10000,

348

** reduction *10000, factor *10000, etc...

349

350

367

SWISH *sw;

368

int metaID;

369

int freq;

370

#ifdef DEBUG_RANK

371

int struct_tally[256];

372

for ( i = 0; i <= 255; i++ )

373

struct_tally[i] = 0;

374

#endif

370

int struct_tally[256];

371

372

if( DEBUG_RANK )

373

{

374

for ( i = 0; i <= 255; i++ )

375

struct_tally[i] = 0;

376

}

375

377

376

378

/* has rank already been calculated? */

377

379

if ( r->rank >= 0 )

395

397

/* pre-build the structure map array */

396

398

/* this maps a word's structure to a rank value */

397

399

if ( !sw->structure_map_set )

400

{

398

401

build_struct_map( sw );

399

402

}

400

403

401

404

402

405

/* Add up the raw word values for each word found in the current document. */

404

407

405

408

/* Might also bias words with low position values, for example */

406

409

/* Should really consider r->tfrequency, which is the number of files that have */

407

/* this word. If the word is not found in many files then it should be ranked higher */

410

/* this word. If the word is not found in many files then it should be ranked higher */

408

411

409

412

rank = 1;

410

413

freq = r->frequency;

415

418

{

416

419

/* GET_STRUCTURE must return value in range! */

417

420

rank += sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias;

418

#ifdef DEBUG_RANK

419

fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );

420

struct_tally[ GET_STRUCTURE(posdata[i]) ]++;

421

#endif

422

423

}

424

425

#ifdef DEBUG_RANK

426

fprintf( stderr, "File num: %d. Raw Rank: %d. Frequency: %d ", r->filenum, rank, r->frequency );

427

#endif

421

if( DEBUG_RANK > 1 )

422

{

423

fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );

424

struct_tally[ GET_STRUCTURE(posdata[i]) ]++;

425

}

426

}

427

428

if( DEBUG_RANK )

429

{

430

fprintf( stderr, "File num: %d. Raw Rank: %d. Frequency: %d ", r->filenum, rank, r->frequency );

431

}

428

432

429

433

/* Ranks could end up less than zero -- but since the *final* rank is calcualted here */

430

434

/* we can't know the *lowest* value to use an offset. It might be better to track */

437

441

rank = scale_word_score( rank );

438

442

439

443

440

#ifdef DEBUG_RANK

444

if( DEBUG_RANK > 1 )

445

{

441

446

fprintf( stderr, "scaled rank: %d\n Structure tally:\n", rank );

442

447

443

448

for ( i = 0; i <= 255; i++ )

449

{

444

450

if ( struct_tally[i] )

445

451

{

446

452

fprintf( stderr, " struct 0x%x = count of %2d (", i, struct_tally[i] );

454

460

if ( i & IN_FILE ) fprintf(stderr," FILE");

455

461

fprintf(stderr," ) x rank map of %d = %d\n\n", sw->structure_map[i], sw->structure_map[i] * struct_tally[i]);

456

462

}

457

#endif

463

}

464

}

458

465

459

466

460

467

476

483

{

477

484

if(words >= 100000) /* log10(10000) is 5 */

478

485

reduction = 50000; /* As it was in previous version (5 * 10000) */

479

else /* rare case - do not overrun the static arrays (tehy only have 1000 entries) */

486

487

/* rare case - do not overrun the static arrays (they only have 1000 entries) */

488

else

480

489

reduction = (int) (10000 * (floor(log10((double)words) + 0.5)));

481

490

}

482

491

else

488

497

}

489

498

490

499

491

/* multiple ranking schemes allow for more fine-tuning as users' require.

500

/* multiple ranking schemes allow for more fine-tuning as users require.

492

501

493

502

Use the -R <num> command line option or RankScheme() API method.

494

503

495

Default is to use getrank() -- the same as -R 0

504

Default is to use getrankDEF() -- the same as -R 0

496

505

497

506

IDF ranking uses the total word frequency across all searched indexes

498

507

and a normalizing formula to negate effect of docs with different sizes.

531

540

int total_word_freq;

532

541

int word_weight;

533

542

int word_score;

543

534

544

/* int density_magic = 2; */

535

545

536

546

/* the value named 'rank' in getrank() is here named 'word_score'.

537

547

it's largely semantic, but helps emphasize that *docs* are ranked,

538

548

but *words* are scored. The doc rank is calculated based on the accrued word scores.

539

549

540

541

550

However, the hash key name 'rank' is preserved in the r (RESULT) object

542

551

for compatibility with getrank()

543

552

545

554

546

555

/* this first part is identical to getrankDEF -- could be optimized as a single function */

547

556

548

549

#ifdef DEBUG_RANK

550

557

int struct_tally[256];

551

for ( i = 0; i <= 255; i++ )

552

struct_tally[i] = 0;

553

#endif

558

if( DEBUG_RANK )

559

{

560

for ( i = 0; i <= 255; i++ )

561

struct_tally[i] = 0;

562

}

554

563

555

564

556

565

if ( r->rank >= 0 )

565

574

meta_bias = indexf->header.metaEntryArray[ metaID - 1 ]->rank_bias;

566

575

567

576

if ( !sw->structure_map_set )

577

{

568

578

build_struct_map( sw );

569

579

}

580

570

581

/* here we start to diverge */

571

582

572

583

574

585

freq = r->frequency;

575

586

576

587

577

#ifdef DEBUG_RANK

578

fprintf( stderr, "File num: %d Word Score: %d Frequency: %d ", r->filenum, word_score, freq );

579

#endif

588

if( DEBUG_RANK )

589

{

590

fprintf( stderr, "File num: %d Word Score: %d Frequency: %d ", r->filenum, word_score, freq );

591

}

580

592

581

593

582

594

/* don't do this here; let density calc do it

598

610

total_word_freq = r->tfrequency;

599

611

idf = (int) ( log( total_files / total_word_freq ) * 1000 );

600

612

601

/* take 3 significant digits of the IDF.

602

this helps create a wider spread

613

/* *1000 helps create a wider spread

603

614

between the most common words and the rest of the pack:

604

615

"word frequencies in natural language obey a power-law distribution" -- Maciej Ceglowski

605

616

607

618

if ( idf < 1 )

608

619

idf = 1;

609

620

/* only ubiquitous words like 'the' get idfs < 1.

610

these should probably be stopwords anyway... */

611

612

613

#ifdef DEBUG_RANK

621

these should probably be stopwords anyway...

622

623

624

625

if( DEBUG_RANK )

626

{

614

627

fprintf(stderr, "Total files: %d Total word freq: %d IDF: %d \n",

615

628

total_files, total_word_freq, idf );

616

#endif

629

}

617

630

618

631

/* calc word density. this normalizes document length so that longer docs

619

632

don't rank higher out of sheer quantity. Hopefully this is a little more

625

638

total_words = sw->TotalWordPos;

626

639

average_words = total_words / total_files;

627

640

628

#ifdef DEBUG_RANK

629

fprintf(stderr, "Total words: %d Average words: %d Indexed words in this doc: %d ",

641

if( DEBUG_RANK )

642

{

643

fprintf(stderr, "Total words: %d Average words: %d Indexed words in this doc: %d ",

630

644

total_words, average_words, words );

631

#endif

645

}

632

646

633

647

634

648

/* normalizing term density in a collection.

679

693

}

680

694

681

695

682

density = ( ( average_words * 100 ) / words ) * freq;

696

density = ( ( average_words * 1000 ) / words ) * freq;

683

697

684

698

/* minimum density */

685

699

if (density < 1)

690

704

word_weight = ( density * idf ) / 100;

691

705

692

706

693

#ifdef DEBUG_RANK

694

fprintf(stderr, "Density: %d Word Weight: %d \n", density, word_weight );

695

#endif

707

if( DEBUG_RANK )

708

{

709

fprintf(stderr, "Density: %d Word Weight: %d \n", density, word_weight );

710

}

696

711

697

712

698

713

for(i = 0; i < freq; i++)

701

716

702

717

word_score += word_weight * ( sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias );

703

718

704

#ifdef DEBUG_RANK

705

fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );

719

if( DEBUG_RANK > 1 )

720

{

721

fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );

706

722

707

struct_tally[ GET_STRUCTURE(posdata[i]) ]++;

708

#endif

723

struct_tally[ GET_STRUCTURE(posdata[i]) ]++;

724

}

709

725

710

726

}

711

727

716

732

717

733

718

734

719

#ifdef DEBUG_RANK

720

fprintf(stderr, "Rank after IDF weighting: %d \n", word_score );

721

#endif

722

723

/* scaling word_score?? */

724

725

/* Scale the rank - this was originally based on frequency */

726

/* Uses lookup tables for values <= 1000, otherwise calculate */

727

728

word_score = scale_word_score( word_score );

729

730

731

732

#ifdef DEBUG_RANK

733

fprintf( stderr, "scaled rank: %d\n Structure tally:\n", word_score );

734

735

for ( i = 0; i <= 255; i++ )

736

if ( struct_tally[i] )

737

{

738

fprintf( stderr, " struct 0x%x = count of %2d (", i, struct_tally[i] );

739

if ( i & IN_EMPHASIZED ) fprintf(stderr," EM");

740

if ( i & IN_HEADER ) fprintf(stderr," HEADING");

741

if ( i & IN_COMMENTS ) fprintf(stderr," COMMENT");

742

if ( i & IN_META ) fprintf(stderr," META");

743

if ( i & IN_BODY ) fprintf(stderr," BODY");

744

if ( i & IN_HEAD ) fprintf(stderr," HEAD");

745

if ( i & IN_TITLE ) fprintf(stderr," TITLE");

746

if ( i & IN_FILE ) fprintf(stderr," FILE");

747

fprintf(stderr," ) x rank map of %d = %d\n\n", sw->structure_map[i], sw->structure_map[i] * struct_tally[i]);

748

}

749

#endif

750

751

752

return ( r->rank = word_score / 100 );

753

735

if( DEBUG_RANK )

736

{

737

fprintf(stderr, "Raw score after IDF weighting: %d \n", word_score );

738

}

739

740

word_score = scale_word_score( word_score );

741

742

if( DEBUG_RANK > 1 )

743

{

744

fprintf( stderr, "scaled rank: %d\n Structure tally:\n", word_score );

745

746

for ( i = 0; i <= 255; i++ )

747

{

748

if ( struct_tally[i] )

749

{

750

fprintf( stderr, " struct 0x%x = count of %2d (", i, struct_tally[i] );

751

if ( i & IN_EMPHASIZED ) fprintf(stderr," EM");

752

if ( i & IN_HEADER ) fprintf(stderr," HEADING");

753

if ( i & IN_COMMENTS ) fprintf(stderr," COMMENT");

754

if ( i & IN_META ) fprintf(stderr," META");

755

if ( i & IN_BODY ) fprintf(stderr," BODY");

756

if ( i & IN_HEAD ) fprintf(stderr," HEAD");

757

if ( i & IN_TITLE ) fprintf(stderr," TITLE");

758

if ( i & IN_FILE ) fprintf(stderr," FILE");

759

fprintf(stderr," ) x rank map of %d = %d\n\n", sw->structure_map[i], sw->structure_map[i] * struct_tally[i]);

760

}

761

}

762

}

763

764

if ( DEBUG_RANK )

765

{

766

fprintf(stderr, "Scaled score: %d \n", word_score );

767

768

}

769

770

return ( r->rank = word_score );

754

771

}

755

772

756

773

int

Older »