3
$Id: rank.c,v 1.25 2006/10/18 02:58:02 karman Exp $
6
This file is part of Swish-e.
8
Swish-e is free software; you can redistribute it and/or modify
9
it under the terms of the GNU General Public License as published by
10
the Free Software Foundation; either version 2 of the License, or
11
(at your option) any later version.
13
Swish-e is distributed in the hope that it will be useful,
14
but WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
GNU General Public License for more details.
18
You should have received a copy of the GNU General Public License
19
along with Swish-e; if not, write to the Free Software
20
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22
See the COPYING file that accompanies the Swish-e distribution for details
23
of the GNU GPL and the special exception available for linking against
26
** Mon May 9 14:51:21 CDT 2005
27
** added GPL (nothing previously)
262
291
sw->structure_map_set = 1; /* flag */
267
296
getrank ( RESULT *r )
274
indexf = r->db_results->indexf;
276
scheme = sw->RankScheme;
303
indexf = r->db_results->indexf;
305
scheme = sw->RankScheme;
278
307
#ifdef DEBUG_RANK
279
308
fprintf( stderr, "Ranking Scheme: %d \n", scheme );
288
return ( getrankDEF( r ) );
293
if ( indexf->header.ignoreTotalWordCountWhenRanking ) {
294
fprintf(stderr, "IgnoreTotalWordCountWhenRanking must be 0 to use IDF ranking\n");
298
return ( getrankIDF( r ) );
303
return ( getrankDEF( r ) );
317
return ( getrankDEF( r ) );
322
if ( indexf->header.ignoreTotalWordCountWhenRanking ) {
323
fprintf(stderr, "IgnoreTotalWordCountWhenRanking must be 0 to use IDF ranking\n");
327
return ( getrankIDF( r ) );
332
return ( getrankDEF( r ) );
311
/* 2001-11 jmruiz With thousands results (>1000000) this routine is a bottleneck.
340
/* 2001-11 jmruiz With thousands results (>1000000) this routine is a bottleneck.
312
341
** (it is called thousands of times) Trying to avoid
313
342
** this I have added some optimizations.
314
343
** To avoid the annoying conversion in "return (int)rank"
432
461
/* Return if IgnoreTotalWordCountWhenRanking is true (the default) */
433
462
if ( indexf->header.ignoreTotalWordCountWhenRanking )
434
return ( r->rank = rank / 100);
463
return ( r->rank = rank / 100);
439
468
/* bias by total words in the file -- this is off by default */
441
470
/* if word count is significant, reduce rank by a number between 1.0 and 5.0 */
443
getTotalWordsPerFile(sw, indexf, r->filenum-1, &words);
445
getTotalWordsPerFile(indexf, r->filenum-1, &words);
471
words = getTotalWordsInFile( indexf, r->filenum );
449
474
reduction = 10000; /* 10000 * log10(10) = 10000 */
509
/* int density_magic = 2; */
534
/* int density_magic = 2; */
511
536
/* the value named 'rank' in getrank() is here named 'word_score'.
512
537
it's largely semantic, but helps emphasize that *docs* are ranked,
513
538
but *words* are scored. The doc rank is calculated based on the accrued word scores.
516
541
However, the hash key name 'rank' is preserved in the r (RESULT) object
517
542
for compatibility with getrank()
521
/* this first part is identical to getrankDEF -- could be optimized as a single function */
546
/* this first part is identical to getrankDEF -- could be optimized as a single function */
524
549
#ifdef DEBUG_RANK
525
550
int struct_tally[256];
526
551
for ( i = 0; i <= 255; i++ )
545
570
/* here we start to diverge */
552
577
#ifdef DEBUG_RANK
553
578
fprintf( stderr, "File num: %d Word Score: %d Frequency: %d ", r->filenum, word_score, freq );
557
582
/* don't do this here; let density calc do it
564
IDF is the Inverse Document Frequency, or, the weight of the word in relationship to the
589
IDF is the Inverse Document Frequency, or, the weight of the word in relationship to the
565
590
collection of documents as a whole. Multiply the weight against the rank to give
566
591
greater weight to words that appear less often in the collection.
568
593
The biggest impact should be seen when OR'ing words together instead of AND'ing them.
572
total_files = sw->TotalFiles;
573
total_word_freq = r->tfrequency;
574
idf = (int) ( log( total_files / total_word_freq ) * 1000 );
597
total_files = sw->TotalFiles;
598
total_word_freq = r->tfrequency;
599
idf = (int) ( log( total_files / total_word_freq ) * 1000 );
576
601
/* take 3 significant digits of the IDF.
577
602
this helps create a wider spread
578
603
between the most common words and the rest of the pack:
579
604
"word frequencies in natural language obey a power-law distribution" -- Maciej Ceglowski
584
/* only ubiquitous words like 'the' get idfs < 1.
585
these should probably be stopwords anyway... */
609
/* only ubiquitous words like 'the' get idfs < 1.
610
these should probably be stopwords anyway... */
588
613
#ifdef DEBUG_RANK
589
614
fprintf(stderr, "Total files: %d Total word freq: %d IDF: %d \n",
590
total_files, total_word_freq, idf );
615
total_files, total_word_freq, idf );
593
618
/* calc word density. this normalizes document length so that longer docs
594
619
don't rank higher out of sheer quantity. Hopefully this is a little more
595
620
scientific than the out-of-the-air calc in getrankDEF() -- though effectiveness
596
621
is likely the same... */
599
getTotalWordsPerFile(sw, indexf, r->filenum-1, &words);
601
getTotalWordsPerFile(indexf, r->filenum-1, &words);
604
total_words = sw->TotalWordPos;
605
average_words = total_words / total_files;
623
words = getTotalWordsInFile( indexf, r->filenum );
625
total_words = sw->TotalWordPos;
626
average_words = total_words / total_files;
607
628
#ifdef DEBUG_RANK
608
629
fprintf(stderr, "Total words: %d Average words: %d Indexed words in this doc: %d ",
609
total_words, average_words, words );
630
total_words, average_words, words );
614
634
/* normalizing term density in a collection.
616
Amati & Van Rijsbergen "normalization 2" from
636
Amati & Van Rijsbergen "normalization 2" from
617
637
A Study of Parameter Tuning for Term Frequency Normalization
619
639
Department of Computing Science
622
642
ben@dcs.gla.ac.uk
624
term_freq_density = term_freq * log( 1 + c * av_doc_leng/doc_leng)
644
term_freq_density = term_freq * log( 1 + c * av_doc_leng/doc_leng)
626
646
where c > 0 (optimized at 2 ... we think...)
632
density = freq * log( 1 + ( density_magic * ( average_words / words ) ) ); */
652
density = freq * log( 1 + ( density_magic * ( average_words / words ) ) ); */
634
654
/* doesn't work that well with int values. Use below (cruder) instead.
635
655
NOTE that there is likely a sweet spot for density. A word like 'the' will always have a high
636
656
density in normal language, but it is not very relevant. A word like 'foo' might have a very
637
657
low density in doc A but slightly higher in doc B -- doc B is likely more relevant. So low
638
658
density is not a good indicator and neither is high density. Instead, something like:
640
660
| density scale ---> |
642
662
useless sweet useless
646
density = ( ( average_words * 100 ) / words ) * freq;
666
/* if for some reason there are 0 words in this document, we'll get a divide by zero
667
error in this next calculation. why might a doc have no words in it, and yet be in the
668
set of matches we're evaluating? maybe because stopwords aren't counted?
669
it's a mystery, just like mankind...
670
set words=1 if <1 so that we at least avoid the core dump -- would it be better to somehow
671
skip it altogether? throw an error? let's warn on stderr for now, just to alert the user
672
that something is awry. */
676
fprintf(stderr, "Word count for document %d is zero\n", r->filenum );
682
density = ( ( average_words * 100 ) / words ) * freq;
648
684
/* minimum density */
652
688
/* scale word_weight back down by 100 or so, just to make it a little saner */
654
word_weight = ( density * idf ) / 100;
690
word_weight = ( density * idf ) / 100;
657
693
#ifdef DEBUG_RANK
658
694
fprintf(stderr, "Density: %d Word Weight: %d \n", density, word_weight );
678
714
if ( word_score < 1 )
683
719
#ifdef DEBUG_RANK
684
fprintf(stderr, "Rank after IDF weighting: %d \n", word_score );
720
fprintf(stderr, "Rank after IDF weighting: %d \n", word_score );
687
/* scaling word_score?? */
689
/* Scale the rank - this was originally based on frequency */
690
/* Uses lookup tables for values <= 1000, otherwise calculate */
692
word_score = scale_word_score( word_score );
723
/* scaling word_score?? */
725
/* Scale the rank - this was originally based on frequency */
726
/* Uses lookup tables for values <= 1000, otherwise calculate */
728
word_score = scale_word_score( word_score );
696
732
#ifdef DEBUG_RANK
697
733
fprintf( stderr, "scaled rank: %d\n Structure tally:\n", word_score );
699
735
for ( i = 0; i <= 255; i++ )
700
736
if ( struct_tally[i] )