~ubuntu-branches/ubuntu/trusty/swish-e/trusty

« back to all changes in this revision

Viewing changes to src/rank.c

Committer: Bazaar Package Importer
Author(s): Ludovic Drolez
Date: 2008-09-25 21:52:31 UTC
mfrom: (4.1.4 intrepid)
Revision ID: james.westby@ubuntu.com-20080925215231-vk46pq42o533syg2

Tags: 2.4.5-5

http://bugs.debian.org/500154

swish.cgi was not working. Fixed with a 1 char patch. Closes: #500154

files added:
COPYING

data

data/home

data/home/bmoseley

data/home/bmoseley/swish

data/home/bmoseley/swish/swish_release_build

data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29

data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29/swish-e-2.4.5

debian/po/de.po

debian/swish.config

filters/SWISH/Filters/pp2html.pm

html/api.html

html/changes.html

html/filter.html

html/install.html

html/readme.html

html/search.cgi.html

html/swish-3.0.html

html/swish-bugs.html

html/swish-config.html

html/swish-faq.html

html/swish-library.html

html/swish-run.html

html/swish-search.html

html/swish.cgi.html

html/swish.css

man/data

man/data/home

man/data/home/bmoseley

man/data/home/bmoseley/swish

man/data/home/bmoseley/swish/swish_release_build

man/data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29

man/data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29/swish-e-2.4.5

man/data/home/bmoseley/swish/swish_release_build/swish-e-2007-01-29/swish-e-2.4.5/man

perl/Makefile.mingw

pod/Makefile.am

pod/Makefile.in

tests/check_fuzzy

tests/test.fuzzy.config

files removed:
doc

doc/Makefile.in

doc/Pod

doc/Pod/HtmlPsPdf

doc/Pod/HtmlPsPdf.pm

doc/Pod/HtmlPsPdf/Book.pm

doc/Pod/HtmlPsPdf/Chapter.pm

doc/Pod/HtmlPsPdf/Common.pm

doc/Pod/HtmlPsPdf/Config.pm

doc/Pod/HtmlPsPdf/Html.pm

doc/Pod/HtmlPsPdf/RunTime.pm

doc/Version.pm

doc/bin

doc/bin/build

doc/bin/html2ps

doc/bin/pod2hpp

doc/conf

doc/conf/html2ps-global.conf

doc/conf/html2ps-slides.conf

doc/conf/html2ps.conf

doc/conf/html2ps.html

doc/conf/hyphen.tex

doc/split.conf

doc/swish.conf

doc/tmpl

doc/tmpl/index.tmpl

doc/tmpl/indexps.tmpl

doc/tmpl/page.tmpl

doc/tmpl/pageps.tmpl

doc/tmpl/splitpage.tmpl

html/.htaccess

html/.swishcgi.conf

html/API.html

html/CHANGES.html

html/Filter.html

html/INSTALL.html

html/README.html

html/SWISH-3.0.html

html/SWISH-BUGS.html

html/SWISH-CONFIG.html

html/SWISH-FAQ.html

html/SWISH-LIBRARY.html

html/SWISH-RUN.html

html/SWISH-SEARCH.html

html/images

html/images/dotrule1.gif

html/images/swish.gif

html/images/swish2.gif

html/images/swish2b.gif

html/images/swishbanner1.gif

html/index_long.html

html/search.html

html/searchdoc.html

html/split.pl

html/style.css

html/swish.conf

html/swish.html

html/timestamp

pod/.htaccess

pod/images

pod/images/dotrule1.gif

pod/images/swish.gif

pod/images/swish2.gif

pod/images/swish2b.gif

pod/images/swishbanner1.gif

pod/search.cgi

pod/style.css

src/win32/installer.nsi

files modified:
INSTALL

Makefile.am

Makefile.in

README

README.cvs

TODO

aclocal.m4

conf/Makefile.in

conf/README

config/config.guess

config/config.sub

config/ltmain.sh

configure

configure.in

debian/README.Debian

debian/changelog

debian/control

debian/po/cs.po

debian/po/fr.po

debian/po/nl.po

debian/po/pt.po

debian/po/sv.po

debian/po/templates.pot

debian/po/vi.po

debian/rules

debian/swish-e.doc-base

debian/swish-e.examples

debian/swish-e.templates

example/Makefile.am

example/Makefile.in

example/modules/SWISH/TemplateFrame.pm *

example/modules/SWISH/TemplateToolkit.pm

example/search.cgi.in

example/swish.cgi.in

filter-bin/Makefile.am

filter-bin/Makefile.in

filter-bin/swish_filter.pl.in

filters/Makefile.am

filters/Makefile.in

filters/SWISH/Filter.pm.in

filters/SWISH/Filters/XLtoHTML.pm

filters/SWISH/Makefile.am

filters/SWISH/Makefile.in

filters/swish-filter-test.in

html/Makefile.am

html/Makefile.in

html/index.html

html/spider.html

man/Makefile.am

man/Makefile.in

man/SWISH-CONFIG.1

man/SWISH-FAQ.1

man/SWISH-LIBRARY.1

man/SWISH-RUN.1

man/swish-e.1

perl/API.pm

perl/API.xs

perl/Changes

perl/MANIFEST

perl/Makefile.PL

perl/t/test.conf

perl/t/test.t

pod/CHANGES.pod

pod/INSTALL.pod

pod/README.pod

pod/SWISH-3.0.pod

pod/SWISH-BUGS.pod

pod/SWISH-CONFIG.pod

pod/SWISH-FAQ.pod

pod/SWISH-LIBRARY.pod

pod/SWISH-RUN.pod

pod/SWISH-SEARCH.pod

prog-bin/DirTree.pl.in

prog-bin/Makefile.am

prog-bin/Makefile.in

prog-bin/spider.pl.in

rpm/swish-e.spec.in

src/Makefile.am

src/Makefile.in

src/acconfig.h.in

src/array.c

src/array.h

src/bash.c

src/bash.h

src/btree.c

src/btree.h

src/check.c

src/check.h

src/compress.c

src/compress.h

src/config.h

src/date_time.c

src/date_time.h

src/db.h

src/db_native.c

src/db_native.h

src/db_read.c

src/db_write.c

src/docprop.c

src/docprop.h

src/docprop_write.c

src/double_metaphone.c

src/double_metaphone.h

src/dump.c

src/dump.h

src/entities.c

src/entities.h

src/error.c

src/error.h

src/expat/Makefile.in

src/extprog.c

src/extprog.h

src/fhash.c

src/fhash.h

src/file.c

src/file.h

src/filter.c

src/filter.h

src/fs.c

src/fs.h

src/getruntime.c

src/getruntime.h

src/hash.c

src/hash.h

src/headers.c

src/headers.h

src/html.c

src/html.h

src/http.c

src/http.h

src/httpserver.c

src/httpserver.h

src/index.c

src/index.h

src/keychar_out.c

src/keychar_out.h

src/libtest.c

src/list.c

src/list.h

src/mem.c

src/mem.h

src/merge.c

src/merge.h

src/metanames.c

src/metanames.h

src/methods.c

src/parse_conffile.c

src/parse_conffile.h

src/parser.c

src/parser.h

src/pre_sort.c

src/proplimit.c

src/proplimit.h

src/ramdisk.c

src/ramdisk.h

src/rank.c

src/rank.h

src/replace/Makefile.in

src/result_output.c

src/result_output.h

src/result_sort.c

src/result_sort.h

src/search.c

src/search.h

src/snowball/Makefile.in

src/soundex.c

src/soundex.h

src/stemmer.c

src/stemmer.h

src/swish-e.h

src/swish.c

src/swish.h

src/swish2.c

src/swish_qsort.c

src/swish_qsort.h

src/swish_words.c

src/swish_words.h

src/swregex.c

src/swregex.h

src/swstring.c

src/swstring.h

src/sys.h

src/txt.c

src/txt.h

src/win32/acconfig.h

src/win32/build.sh

src/win32/dist.sh

src/win32/filebase.nsh

src/win32/fixperl.pl

src/win32/libswishe.dsp

src/win32/libswishindex.dsp

src/win32/release.nsi

src/win32/swishe.dsp

src/win32/swishe.dsw

src/worddata.c

src/worddata.h

src/xml.c

src/xml.h

tests/Makefile.am

tests/Makefile.in

Show diffs side-by-side

added added

removed removed

src/rank.c

$Id: rank.c,v 1.25 2006/10/18 02:58:02 karman Exp $

This file is part of Swish-e.

Swish-e is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation; either version 2 of the License, or

(at your option) any later version.

Swish-e is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with Swish-e; if not, write to the Free Software

Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

See the COPYING file that accompanies the Swish-e distribution for details

of the GNU GPL and the special exception available for linking against

the Swish-e library.

** Mon May 9 14:51:21 CDT 2005

** added GPL (nothing previously)

#include "swish.h"

#include "db.h"

251

280

for ( structure = 0; structure < array_size; structure++ )

252

281

{

253

282

int factor = 1; /* All words are of value 1 */

254

283

255

284

for (i = 0; i < (int)numRanks; i++)

256

285

if (ranks[i].mask & structure)

257

286

factor += ranks[i].rank;

261

290

262

291

sw->structure_map_set = 1; /* flag */

263

292

}

264

265

293

294

266

295

int

267

296

getrank ( RESULT *r )

268

297

{

269

SWISH *sw;

270

IndexFILE *indexf;

271

int scheme;

272

273

274

indexf = r->db_results->indexf;

275

sw = indexf->sw;

276

scheme = sw->RankScheme;

277

298

SWISH *sw;

299

IndexFILE *indexf;

300

int scheme;

301

302

303

indexf = r->db_results->indexf;

304

sw = indexf->sw;

305

scheme = sw->RankScheme;

306

278

307

#ifdef DEBUG_RANK

279

308

fprintf( stderr, "Ranking Scheme: %d \n", scheme );

280

309

#endif

281

310

282

311

283

switch ( scheme )

284

{

285

286

case 0:

287

{

288

return ( getrankDEF( r ) );

289

}

290

291

case 1:

292

{

293

if ( indexf->header.ignoreTotalWordCountWhenRanking ) {

294

fprintf(stderr, "IgnoreTotalWordCountWhenRanking must be 0 to use IDF ranking\n");

295

exit(1);

296

}

297

298

return ( getrankIDF( r ) );

299

}

300

301

default:

302

{

303

return ( getrankDEF( r ) );

304

}

305

306

}

312

switch ( scheme )

313

{

314

315

case 0:

316

{

317

return ( getrankDEF( r ) );

318

}

319

320

case 1:

321

{

322

if ( indexf->header.ignoreTotalWordCountWhenRanking ) {

323

fprintf(stderr, "IgnoreTotalWordCountWhenRanking must be 0 to use IDF ranking\n");

324

exit(1);

325

}

326

327

return ( getrankIDF( r ) );

328

}

329

330

default:

331

{

332

return ( getrankDEF( r ) );

333

}

334

335

}

307

336

308

337

}

309

338

310

339

311

/* 2001-11 jmruiz With thousands results (>1000000) this routine is a bottleneck.

340

/* 2001-11 jmruiz With thousands results (>1000000) this routine is a bottleneck.

312

341

** (it is called thousands of times) Trying to avoid

313

342

** this I have added some optimizations.

314

343

** To avoid the annoying conversion in "return (int)rank"

328

357

int

329

358

getrankDEF( RESULT *r )

330

359

{

331

int *posdata;

360

unsigned int *posdata;

332

361

int meta_bias;

333

362

IndexFILE *indexf;

334

363

int rank;

335

364

int reduction;

336

int words;

365

int words; /* number of word positions (total words, not unique) in the given file -- probably should be per metaname */

337

366

int i;

338

367

SWISH *sw;

339

368

int metaID;

379

408

380

409

rank = 1;

381

410

freq = r->frequency;

382

if ( freq > 100 )

411

if ( freq > 100 )

383

412

freq = 100;

384

413

385

414

for(i = 0; i < freq; i++)

431

460

432

461

/* Return if IgnoreTotalWordCountWhenRanking is true (the default) */

433

462

if ( indexf->header.ignoreTotalWordCountWhenRanking )

434

return ( r->rank = rank / 100);

435

463

return ( r->rank = rank / 100);

464

436

465

437

466

438

467

439

468

/* bias by total words in the file -- this is off by default */

440

469

441

470

/* if word count is significant, reduce rank by a number between 1.0 and 5.0 */

442

#ifdef USE_BTREE

443

getTotalWordsPerFile(sw, indexf, r->filenum-1, &words);

444

#else

445

getTotalWordsPerFile(indexf, r->filenum-1, &words);

446

#endif

471

words = getTotalWordsInFile( indexf, r->filenum );

447

472

448

473

if (words <= 10)

449

474

reduction = 10000; /* 10000 * log10(10) = 10000 */

456

481

}

457

482

else

458

483

reduction = swish_log10[words];

459

484

460

485

r->rank = (rank * 100) / reduction;

461

486

462

487

return r->rank;

490

515

getrankIDF( RESULT *r )

491

516

{

492

517

493

int *posdata;

518

unsigned int *posdata;

494

519

int meta_bias;

495

520

IndexFILE *indexf;

496

521

int words;

498

523

SWISH *sw;

499

524

int metaID;

500

525

int freq;

501

int total_files;

502

int total_words;

503

int average_words;

504

int density;

505

int idf;

506

int total_word_freq;

507

int word_weight;

508

int word_score;

509

/* int density_magic = 2; */

510

526

int total_files;

527

int total_words;

528

int average_words;

529

int density;

530

int idf;

531

int total_word_freq;

532

int word_weight;

533

int word_score;

534

/* int density_magic = 2; */

535

511

536

/* the value named 'rank' in getrank() is here named 'word_score'.

512

537

it's largely semantic, but helps emphasize that *docs* are ranked,

513

538

but *words* are scored. The doc rank is calculated based on the accrued word scores.

514

515

539

540

516

541

However, the hash key name 'rank' is preserved in the r (RESULT) object

517

542

for compatibility with getrank()

518

543

519

544

520

521

/* this first part is identical to getrankDEF -- could be optimized as a single function */

522

523

545

546

/* this first part is identical to getrankDEF -- could be optimized as a single function */

547

548

524

549

#ifdef DEBUG_RANK

525

550

int struct_tally[256];

526

551

for ( i = 0; i <= 255; i++ )

534

559

indexf = r->db_results->indexf;

535

560

sw = indexf->sw;

536

561

posdata = r->posdata;

537

538

562

563

539

564

metaID = r->rank * -1;

540

565

meta_bias = indexf->header.metaEntryArray[ metaID - 1 ]->rank_bias;

541

566

545

570

/* here we start to diverge */

546

571

547

572

548

word_score = 1;

549

freq = r->frequency;

550

551

573

word_score = 1;

574

freq = r->frequency;

575

576

552

577

#ifdef DEBUG_RANK

553

578

fprintf( stderr, "File num: %d Word Score: %d Frequency: %d ", r->filenum, word_score, freq );

554

579

#endif

555

580

556

581

557

582

/* don't do this here; let density calc do it

558

if ( freq > 100 )

583

if ( freq > 100 )

559

584

freq = 100;

560

585

561

586

562

563

564

IDF is the Inverse Document Frequency, or, the weight of the word in relationship to the

587

588

589

IDF is the Inverse Document Frequency, or, the weight of the word in relationship to the

565

590

collection of documents as a whole. Multiply the weight against the rank to give

566

591

greater weight to words that appear less often in the collection.

567

592

568

593

The biggest impact should be seen when OR'ing words together instead of AND'ing them.

569

594

570

595

571

572

total_files = sw->TotalFiles;

573

total_word_freq = r->tfrequency;

574

idf = (int) ( log( total_files / total_word_freq ) * 1000 );

575

596

597

total_files = sw->TotalFiles;

598

total_word_freq = r->tfrequency;

599

idf = (int) ( log( total_files / total_word_freq ) * 1000 );

600

576

601

/* take 3 significant digits of the IDF.

577

602

this helps create a wider spread

578

603

between the most common words and the rest of the pack:

579

604

"word frequencies in natural language obey a power-law distribution" -- Maciej Ceglowski

580

605

581

606

582

607

if ( idf < 1 )

583

idf = 1;

584

/* only ubiquitous words like 'the' get idfs < 1.

585

these should probably be stopwords anyway... */

586

587

608

idf = 1;

609

/* only ubiquitous words like 'the' get idfs < 1.

610

these should probably be stopwords anyway... */

611

612

588

613

#ifdef DEBUG_RANK

589

614

fprintf(stderr, "Total files: %d Total word freq: %d IDF: %d \n",

590

total_files, total_word_freq, idf );

615

total_files, total_word_freq, idf );

591

616

#endif

592

617

593

618

/* calc word density. this normalizes document length so that longer docs

594

619

don't rank higher out of sheer quantity. Hopefully this is a little more

595

620

scientific than the out-of-the-air calc in getrankDEF() -- though effectiveness

596

621

is likely the same... */

597

598

#ifdef USE_BTREE

599

getTotalWordsPerFile(sw, indexf, r->filenum-1, &words);

600

#else

601

getTotalWordsPerFile(indexf, r->filenum-1, &words);

602

#endif

603

604

total_words = sw->TotalWordPos;

605

average_words = total_words / total_files;

606

622

623

words = getTotalWordsInFile( indexf, r->filenum );

624

625

total_words = sw->TotalWordPos;

626

average_words = total_words / total_files;

627

607

628

#ifdef DEBUG_RANK

608

629

fprintf(stderr, "Total words: %d Average words: %d Indexed words in this doc: %d ",

609

total_words, average_words, words );

630

total_words, average_words, words );

610

631

#endif

611

632

612

613

633

614

634

/* normalizing term density in a collection.

615

635

616

Amati & Van Rijsbergen "normalization 2" from

636

Amati & Van Rijsbergen "normalization 2" from

617

637

A Study of Parameter Tuning for Term Frequency Normalization

618

638

Ben HE

619

639

Department of Computing Science

621

641

Glasgow, UK

622

642

ben@dcs.gla.ac.uk

623

643

624

term_freq_density = term_freq * log( 1 + c * av_doc_leng/doc_leng)

644

term_freq_density = term_freq * log( 1 + c * av_doc_leng/doc_leng)

625

645

626

646

where c > 0 (optimized at 2 ... we think...)

627

647

628

648

629

649

630

650

631

632

density = freq * log( 1 + ( density_magic * ( average_words / words ) ) ); */

633

651

652

density = freq * log( 1 + ( density_magic * ( average_words / words ) ) ); */

653

634

654

/* doesn't work that well with int values. Use below (cruder) instead.

635

655

NOTE that there is likely a sweet spot for density. A word like 'the' will always have a high

636

656

density in normal language, but it is not very relevant. A word like 'foo' might have a very

637

657

low density in doc A but slightly higher in doc B -- doc B is likely more relevant. So low

638

658

density is not a good indicator and neither is high density. Instead, something like:

639

659

640

660

| density scale ---> |

641

661

0 X 100

642

662

useless sweet useless

643

663

644

645

646

density = ( ( average_words * 100 ) / words ) * freq;

647

664

665

666

/* if for some reason there are 0 words in this document, we'll get a divide by zero

667

error in this next calculation. why might a doc have no words in it, and yet be in the

668

set of matches we're evaluating? maybe because stopwords aren't counted?

669

it's a mystery, just like mankind...

670

set words=1 if <1 so that we at least avoid the core dump -- would it be better to somehow

671

skip it altogether? throw an error? let's warn on stderr for now, just to alert the user

672

that something is awry. */

673

674

if ( words < 1 ) {

675

676

fprintf(stderr, "Word count for document %d is zero\n", r->filenum );

677

words = 1;

678

679

}

680

681

682

density = ( ( average_words * 100 ) / words ) * freq;

683

648

684

/* minimum density */

649

685

if (density < 1)

650

density = 1;

651

686

density = 1;

687

652

688

/* scale word_weight back down by 100 or so, just to make it a little saner */

653

654

word_weight = ( density * idf ) / 100;

655

689

690

word_weight = ( density * idf ) / 100;

691

656

692

657

693

#ifdef DEBUG_RANK

658

694

fprintf(stderr, "Density: %d Word Weight: %d \n", density, word_weight );

664

700

/* GET_STRUCTURE must return value in range! */

665

701

666

702

word_score += word_weight * ( sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias );

667

703

668

704

#ifdef DEBUG_RANK

669

705

fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );

670

706

671

707

struct_tally[ GET_STRUCTURE(posdata[i]) ]++;

672

708

#endif

673

709

678

714

if ( word_score < 1 )

679

715

word_score = 1;

680

716

681

682

717

718

683

719

#ifdef DEBUG_RANK

684

fprintf(stderr, "Rank after IDF weighting: %d \n", word_score );

720

fprintf(stderr, "Rank after IDF weighting: %d \n", word_score );

685

721

#endif

686

722

687

/* scaling word_score?? */

688

689

/* Scale the rank - this was originally based on frequency */

690

/* Uses lookup tables for values <= 1000, otherwise calculate */

691

692

word_score = scale_word_score( word_score );

723

/* scaling word_score?? */

724

725

/* Scale the rank - this was originally based on frequency */

726

/* Uses lookup tables for values <= 1000, otherwise calculate */

727

728

word_score = scale_word_score( word_score );

693

729

694

730

695

731

696

732

#ifdef DEBUG_RANK

697

733

fprintf( stderr, "scaled rank: %d\n Structure tally:\n", word_score );

698

734

699

735

for ( i = 0; i <= 255; i++ )

700

736

if ( struct_tally[i] )

701

737

{

713

749

#endif

714

750

715

751

716

return ( r->rank = word_score / 100 );

717

752

return ( r->rank = word_score / 100 );

753

718

754

}

719

755

720

756

int

725

761

/* Uses lookup tables for values <= 1000, otherwise calculate */

726

762

727

763

728

return score > 1000

729

? (int) floor( (log((double)score) * 10000 ) + 0.5)

730

: swish_log[score];

731

764

return score > 1000

765

? (int) floor( (log((double)score) * 10000 ) + 0.5)

766

: swish_log[score];

767

732

768

}

Older »