416
419
/* GET_STRUCTURE must return value in range! */
417
420
rank += sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias;
419
fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );
420
struct_tally[ GET_STRUCTURE(posdata[i]) ]++;
426
fprintf( stderr, "File num: %d. Raw Rank: %d. Frequency: %d ", r->filenum, rank, r->frequency );
423
fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );
424
struct_tally[ GET_STRUCTURE(posdata[i]) ]++;
430
fprintf( stderr, "File num: %d. Raw Rank: %d. Frequency: %d ", r->filenum, rank, r->frequency );
429
433
/* Ranks could end up less than zero -- but since the *final* rank is calculated here */
430
434
/* we can't know the *lowest* value to use an offset. It might be better to track */
491
/* multiple ranking schemes allow for more fine-tuning as users' require.
500
/* multiple ranking schemes allow for more fine-tuning as users require.
493
502
Use the -R <num> command line option or RankScheme() API method.
495
Default is to use getrank() -- the same as -R 0
504
Default is to use getrankDEF() -- the same as -R 0
497
506
IDF ranking uses the total word frequency across all searched indexes
498
507
and a normalizing formula to negate effect of docs with different sizes.
531
540
int total_word_freq;
534
544
/* int density_magic = 2; */
536
546
/* the value named 'rank' in getrank() is here named 'word_score'.
537
547
it's largely semantic, but helps emphasize that *docs* are ranked,
538
548
but *words* are scored. The doc rank is calculated based on the accrued word scores.
541
550
However, the hash key name 'rank' is preserved in the r (RESULT) object
542
551
for compatibility with getrank()
598
610
total_word_freq = r->tfrequency;
599
611
idf = (int) ( log( total_files / total_word_freq ) * 1000 );
601
/* take 3 significant digits of the IDF.
602
this helps create a wider spread
613
/* *1000 helps create a wider spread
603
614
between the most common words and the rest of the pack:
604
615
"word frequencies in natural language obey a power-law distribution" -- Maciej Ceglowski
609
620
/* only ubiquitous words like 'the' get idfs < 1.
610
these should probably be stopwords anyway... */
621
these should probably be stopwords anyway...
614
627
fprintf(stderr, "Total files: %d Total word freq: %d IDF: %d \n",
615
628
total_files, total_word_freq, idf );
618
631
/* calc word density. this normalizes document length so that longer docs
619
632
don't rank higher out of sheer quantity. Hopefully this is a little more
625
638
total_words = sw->TotalWordPos;
626
639
average_words = total_words / total_files;
629
fprintf(stderr, "Total words: %d Average words: %d Indexed words in this doc: %d ",
643
fprintf(stderr, "Total words: %d Average words: %d Indexed words in this doc: %d ",
630
644
total_words, average_words, words );
634
648
/* normalizing term density in a collection.
702
717
word_score += word_weight * ( sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias );
705
fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );
721
fprintf(stderr, "Word entry %d at position %d has struct %d\n", i, GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i]) );
707
struct_tally[ GET_STRUCTURE(posdata[i]) ]++;
723
struct_tally[ GET_STRUCTURE(posdata[i]) ]++;
720
fprintf(stderr, "Rank after IDF weighting: %d \n", word_score );
723
/* scaling word_score?? */
725
/* Scale the rank - this was originally based on frequency */
726
/* Uses lookup tables for values <= 1000, otherwise calculate */
728
word_score = scale_word_score( word_score );
733
fprintf( stderr, "scaled rank: %d\n Structure tally:\n", word_score );
735
for ( i = 0; i <= 255; i++ )
736
if ( struct_tally[i] )
738
fprintf( stderr, " struct 0x%x = count of %2d (", i, struct_tally[i] );
739
if ( i & IN_EMPHASIZED ) fprintf(stderr," EM");
740
if ( i & IN_HEADER ) fprintf(stderr," HEADING");
741
if ( i & IN_COMMENTS ) fprintf(stderr," COMMENT");
742
if ( i & IN_META ) fprintf(stderr," META");
743
if ( i & IN_BODY ) fprintf(stderr," BODY");
744
if ( i & IN_HEAD ) fprintf(stderr," HEAD");
745
if ( i & IN_TITLE ) fprintf(stderr," TITLE");
746
if ( i & IN_FILE ) fprintf(stderr," FILE");
747
fprintf(stderr," ) x rank map of %d = %d\n\n", sw->structure_map[i], sw->structure_map[i] * struct_tally[i]);
752
return ( r->rank = word_score / 100 );
737
fprintf(stderr, "Raw score after IDF weighting: %d \n", word_score );
740
word_score = scale_word_score( word_score );
744
fprintf( stderr, "scaled rank: %d\n Structure tally:\n", word_score );
746
for ( i = 0; i <= 255; i++ )
748
if ( struct_tally[i] )
750
fprintf( stderr, " struct 0x%x = count of %2d (", i, struct_tally[i] );
751
if ( i & IN_EMPHASIZED ) fprintf(stderr," EM");
752
if ( i & IN_HEADER ) fprintf(stderr," HEADING");
753
if ( i & IN_COMMENTS ) fprintf(stderr," COMMENT");
754
if ( i & IN_META ) fprintf(stderr," META");
755
if ( i & IN_BODY ) fprintf(stderr," BODY");
756
if ( i & IN_HEAD ) fprintf(stderr," HEAD");
757
if ( i & IN_TITLE ) fprintf(stderr," TITLE");
758
if ( i & IN_FILE ) fprintf(stderr," FILE");
759
fprintf(stderr," ) x rank map of %d = %d\n\n", sw->structure_map[i], sw->structure_map[i] * struct_tally[i]);
766
fprintf(stderr, "Scaled score: %d \n", word_score );
770
return ( r->rank = word_score );