  * @param query the query sequence [in]
  * @param location locations on the query to be indexed in table [in]
  * @param mb_lt the (already allocated) megablast lookup table structure [in|out]
- * @param width number of bytes in megablast word (2 or 3) [in]
  * @param lookup_options specifies the word_size [in]
  * @return zero on success, negative number on failure.
-s_FillContigMBTable(BLAST_SequenceBlk* query, BlastSeqLoc* location,
-        BlastMBLookupTable* mb_lt, Uint1 width,
+s_FillContigMBTable(BLAST_SequenceBlk* query,
+        BlastSeqLoc* location,
+        BlastMBLookupTable* mb_lt,
         const LookupTableOptions* lookup_options)
    BlastSeqLoc* loc;
    const Uint1 kNucMask = 0xfc;
    /* 12-mers (or perhaps 8-mers) are used to build the lookup table
-      and this is what kWordLength specifies. */
-   const Int4 kWordLength = COMPRESSION_RATIO*width;
+      and this is what kLutWordLength specifies. */
+   const Int4 kLutWordLength = mb_lt->lut_word_length;
+   const Int4 kLutMask = mb_lt->hashsize - 1;
    /* The user probably specified a much larger word size (like 28)
       and this is what full_word_size is. */
+   Int4 full_word_size = mb_lt->word_length;
-   PV_ARRAY_TYPE *pv_array=NULL;
+   PV_ARRAY_TYPE *pv_array;
    Int4 pv_array_bts;
-   const Int4 kMask = (1 << (8*width - 2)) - 1;
    /* The calculation of the longest chain can be cpu intensive for
       long queries or sets of queries. So we use a helper_array to
       keep track of this, but compress it by kCompressionFactor so
        Since sequence pointer points to the end of the word, subtract
        word length from the loop boundaries. */
      Int4 from = loc->ssr->left;
-     Int4 to = loc->ssr->right - kWordLength;
+     Int4 to = loc->ssr->right - kLutWordLength;
      Int4 last_offset;
-     /* case of unmasked region >= kWordLength but < full_word_size,
+     /* case of unmasked region >= kLutWordLength but < full_word_size,
         so no hits should be generated. */
      if (full_word_size > (loc->ssr->right - loc->ssr->left + 1))
      seq = query->sequence_start + from;
-     pos = seq + kWordLength;
+     pos = seq + kLutWordLength;
      /* Also add 1 to all indices, because lookup table indices count
-     from -= kWordLength - 2;
+     from -= kLutWordLength - 2;
      last_offset = to + 2;
      for (index = from; index <= last_offset; index++) {
-        if ((val & kNucMask) != 0) { /* ambiguity or gap */
+        /* if an ambiguity is encountered, do not add
+           any words that would contain it */
+        if ((val & kNucMask) != 0) {
-           pos = seq + kWordLength;
+           pos = seq + kLutWordLength;
         /* get next base */
-        ecode = ((ecode & kMask) << 2) + val;
+        ecode = ((ecode << 2) & kLutMask) + val;
+#ifdef LOOKUP_VERBOSE
+        mb_lt->num_words_added++;
         if (mb_lt->hashtable[ecode] == 0) {
+#ifdef LOOKUP_VERBOSE
            mb_lt->num_unique_pos_added++;
            pv_array[ecode>>pv_array_bts] |=
                   (((PV_ARRAY_TYPE) 1)<<(ecode&PV_ARRAY_MASK));
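The fill loop above folds each new base into a rolling 2-bit index: the previous index is shifted left by two bits, masked down to the lookup word width, and the code of the incoming base is added. A minimal standalone sketch of that update, assuming a 12-base lookup word and bases already encoded as 0..3 (Int4/Uint1 replaced by stdint types); this is illustrative only, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const int32_t lut_word_length = 12;                   /* assumed width */
        const int32_t lut_mask = (1 << (2 * lut_word_length)) - 1;
        const uint8_t bases[] = {0,1,2,3, 0,1,2,3, 0,1,2,3, 1,2};
        int32_t ecode = 0;
        size_t i;

        for (i = 0; i < sizeof(bases); i++) {
            /* shift in the next base; the mask drops the base that just
               fell off the left-hand end of the 12-base window */
            ecode = ((ecode << 2) & lut_mask) + bases[i];
            if (i + 1 >= (size_t)lut_word_length)
                printf("word ending at base %zu -> index %d\n", i, ecode);
        }
        return 0;
    }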
-   if ((mb_lt = (BlastMBLookupTable*) calloc(1, sizeof(BlastMBLookupTable))) == NULL)
+   mb_lt = (BlastMBLookupTable*)calloc(1, sizeof(BlastMBLookupTable));
-   bytes_in_word = (lookup_options->word_size + 1)/ 4;
-   if (bytes_in_word < 3)
-      mb_lt->hashsize = 1 << 16;
-   Int4 table_entries=0;
-   /* determine the approximate number of hashtable entries */
-   for (loc = location; loc; loc = loc->next) {
-      from = loc->ssr->left;
-      to = loc->ssr->right;
-      table_entries += (to - from);
-   /* To fit in the external cache of latter-day micro-
-      processors, the PV array must be compressed. pv_shift
-      below is the power of two that the array size is
-      divided by. The target PV array size is 128 kBytes.
-      If the query is too small or too large, the compression
-      should be higher. Small queries don't reuse the PV array,
-      and large queries saturate it. In either case, cache
-      is better used on other data. */
-   if (lookup_options->word_size == 11 && lookup_options->mb_template_length > 0) {
-      mb_lt->hashsize = 1 << 22;
-      if( table_entries <= kSmallQueryCutoff ||
-          table_entries >= kLargeQueryCutoff )
-      mb_lt->hashsize = 1 << 24;
-      if( table_entries <= kSmallQueryCutoff ||
-          table_entries >= kLargeQueryCutoff )
-   if (lookup_options->mb_template_length)
-      mb_lt->full_byte_scan = lookup_options->full_byte_scan; /* Only applies to discontiguous megablast. */
-      /* The next two are not allowed in discontiguous megablast. */
-      mb_lt->ag_scanning_mode = FALSE;
-      mb_lt->variable_wordsize = FALSE;
-      mb_lt->scan_step = 0;
-      mb_lt->ag_scanning_mode = TRUE; /* FIXME, are there wordsize where this should be FALSE? */
-      mb_lt->scan_step = CalculateBestStride(lookup_options->word_size, lookup_options->variable_wordsize,
-                                             lookup_options->lut_type);
-      mb_lt->variable_wordsize = lookup_options->variable_wordsize;
-   if ((mb_lt->hashtable = (Int4*)
-        calloc(mb_lt->hashsize, sizeof(Int4))) == NULL) {
-      MBLookupTableDestruct(mb_lt);
-   mb_lt->pv_array_bts = PV_ARRAY_BTS + pv_shift;
-   pv_size = mb_lt->hashsize>>(mb_lt->pv_array_bts);
-   /* Size measured in PV_ARRAY_TYPE's */
-   if ((mb_lt->pv_array = calloc(PV_ARRAY_BYTES, pv_size)) == NULL)
-      MBLookupTableDestruct(mb_lt);
-   if (lookup_options->mb_template_length)
-      status = s_FillDiscMBTable(query, location, mb_lt, width, lookup_options);
-      status = s_FillContigMBTable(query, location, mb_lt, width, lookup_options);
+   /* first determine how 'wide' the lookup table is,
+      i.e. how many nucleotide letters are in a single entry */
+   if (lookup_options->mb_template_length > 0) {
+      /* for discontiguous megablast the choice is fixed */
+      if (lookup_options->word_size == 11)
+         mb_lt->lut_word_length = 11;
+         mb_lt->lut_word_length = 12;
+      /* Choose the smallest lookup table that will not
+         be congested; this is a compromise between good
+         cache performance and few table accesses. The
+         number of entries where one table width becomes
+         better than another is probably machine-dependent */
+      ASSERT(lookup_options->word_size >= 10);
+      switch(lookup_options->word_size) {
+         if (approx_table_entries < 18000)
+            mb_lt->lut_word_length = 9;
+            mb_lt->lut_word_length = 10;
+         if (approx_table_entries < 18000)
+            mb_lt->lut_word_length = 9;
+         else if (approx_table_entries < 180000)
+            mb_lt->lut_word_length = 10;
+            mb_lt->lut_word_length = 11;
+         if (approx_table_entries < 18000)
+            mb_lt->lut_word_length = 9;
+         else if (approx_table_entries < 60000)
+            mb_lt->lut_word_length = 10;
+         else if (approx_table_entries < 900000)
+            mb_lt->lut_word_length = 11;
+            mb_lt->lut_word_length = 12;
+         if (approx_table_entries < 300000)
+            mb_lt->lut_word_length = 11;
+            mb_lt->lut_word_length = 12;
+      mb_lt->lut_word_length = MIN(lookup_options->word_size,
+                                   mb_lt->lut_word_length);
+   mb_lt->word_length = lookup_options->word_size;
+   mb_lt->hashsize = iexp(kAlphabetSize, mb_lt->lut_word_length);
+   mb_lt->hashtable = (Int4*)calloc(mb_lt->hashsize, sizeof(Int4));
+   if (mb_lt->hashtable == NULL) {
+      MBLookupTableDestruct(mb_lt);
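The comment above motivates choosing the smallest width that will not be congested: since the backbone is allocated as one Int4 per entry (the calloc above), each extra base of width quadruples the table. A throwaway sketch of the sizes involved, assuming kAlphabetSize is 4 and Int4 is 4 bytes (both assumptions here, not stated in the excerpt):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int width;
        for (width = 9; width <= 12; width++) {
            int64_t hashsize = (int64_t)1 << (2 * width);   /* 4^width entries */
            printf("lut_word_length %2d -> %8lld entries, %5.1f MB backbone\n",
                   width, (long long)hashsize,
                   (double)(hashsize * 4) / (1024 * 1024));
        }
        return 0;
    }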
+   /* Allocate the PV array. To fit in the external cache of
+      latter-day microprocessors, the PV array cannot have one
+      bit for every lookup table entry. Instead we choose
+      a size that should fit in cache and make a single bit
+      of the PV array handle multiple hashtable entries if
+      If the query is too small or too large, the compression
+      should be higher. Small queries don't reuse the PV array,
+      and large queries saturate it. In either case, cache
+      is better used on something else */
+   if (mb_lt->hashsize <= 8 * kTargetPVSize)
+      pv_size = mb_lt->hashsize >> PV_ARRAY_BTS;
+      pv_size = kTargetPVSize / PV_ARRAY_BYTES;
+   if(approx_table_entries <= kSmallQueryCutoff ||
+      approx_table_entries >= kLargeQueryCutoff) {
+      pv_size = pv_size / 2;
+   mb_lt->pv_array_bts = ilog2(mb_lt->hashsize / pv_size);
+   mb_lt->pv_array = calloc(PV_ARRAY_BYTES, pv_size);
+   if (mb_lt->pv_array == NULL) {
+      MBLookupTableDestruct(mb_lt);
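To make the sizing concrete, here is a hedged worked example of the pv_size / pv_array_bts computation above, assuming a 32-bit PV_ARRAY_TYPE and the 128 kB cache target mentioned in the old comment (both values are assumptions, not taken from this patch), and ignoring the halving applied to very small or very large queries:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const int64_t target_pv_bytes = 128 * 1024;  /* assumed kTargetPVSize       */
        const int64_t pv_word_bytes   = 4;           /* assumed sizeof(PV_ARRAY_TYPE) */
        const int64_t pv_word_bits    = 8 * pv_word_bytes;
        const int64_t hashsize = (int64_t)1 << (2 * 12);   /* a 4^12-entry table */
        int64_t pv_size, pv_array_bts = 0;

        if (hashsize <= 8 * target_pv_bytes)
            pv_size = hashsize / pv_word_bits;           /* one PV bit per entry */
        else
            pv_size = target_pv_bytes / pv_word_bytes;   /* capped at the target */

        /* pv_array_bts = ilog2(hashsize / pv_size) in the code above */
        while (((int64_t)1 << pv_array_bts) < hashsize / pv_size)
            pv_array_bts++;

        printf("pv_size = %lld words, pv_array_bts = %lld, %lld entries per PV bit\n",
               (long long)pv_size, (long long)pv_array_bts,
               (long long)(hashsize / (pv_size * pv_word_bits)));
        return 0;
    }

For a 4^12-entry table this gives pv_size = 32768 words and pv_array_bts = 9, so roughly 16 hashtable entries share each presence bit.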
+   if (lookup_options->mb_template_length > 0) {
+      /* discontiguous megablast */
+      mb_lt->full_byte_scan = lookup_options->full_byte_scan;
+      status = s_FillDiscMBTable(query, location, mb_lt, lookup_options);
+      /* contiguous megablast */
+      mb_lt->scan_step = mb_lt->word_length - mb_lt->lut_word_length + 1;
+      status = s_FillContigMBTable(query, location, mb_lt, lookup_options);
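The stride chosen above for contiguous megablast, scan_step = word_length - lut_word_length + 1, is the largest stride for which every window of word_length consecutive bases still contains at least one sampled lookup word. A small self-check of that claim, assuming the common defaults word_length = 28 and lut_word_length = 12 (assumed values, for illustration only):

    #include <stdio.h>

    int main(void)
    {
        const int word_length = 28, lut_word_length = 12;        /* assumed sizes */
        const int scan_step = word_length - lut_word_length + 1; /* = 17 */
        int start;

        /* subject offsets 0, scan_step, 2*scan_step, ... are sampled; check every
           possible alignment of a word_length-base match against that grid */
        for (start = 0; start < scan_step; start++) {
            int first_sample = ((start + scan_step - 1) / scan_step) * scan_step;
            int covered = (first_sample + lut_word_length <= start + word_length);
            printf("match at offset %2d: sampled word at %2d -> %s\n",
                   start, first_sample, covered ? "covered" : "MISSED");
        }
        return 0;
    }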
    if (status > 0) {
       MBLookupTableDestruct(mb_lt);
    max_hits -= mb_lt->longest_chain;
    abs_start = subject->sequence;
-   s = abs_start + start_offset/COMPRESSION_RATIO;
-   compressed_scan_step = mb_lt->scan_step / COMPRESSION_RATIO;
-   compressed_wordsize = mb_lt->compressed_wordsize;
-   word_size = compressed_wordsize*COMPRESSION_RATIO;
-   /* NB: s in this function always points to the start of the word!
-   if (mb_lt->scan_step % COMPRESSION_RATIO == 0) {
-      Uint1* s_end = abs_start + subject->length/COMPRESSION_RATIO -
-      for ( ; s <= s_end; s += compressed_scan_step) {
-         BlastNaLookupInitIndex(compressed_wordsize, s, &index);
-         if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
-            if (total_hits >= max_hits)
-            q_off = mb_lt->hashtable[index];
-            s_off = (s - abs_start)*COMPRESSION_RATIO;
-               offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
-               offset_pairs[total_hits++].qs_offsets.s_off = s_off;
-               q_off = mb_lt->next_pos[q_off];
-      *end_offset = (s - abs_start)*COMPRESSION_RATIO;
-      Int4 last_offset = subject->length - word_size;
-      for (s_off = start_offset; s_off <= last_offset;
-           s_off += mb_lt->scan_step) {
+   lut_word_length = mb_lt->lut_word_length;
+   mask = mb_lt->hashsize - 1;
+   scan_step = mb_lt->scan_step;
+   last_offset = subject->length - lut_word_length;
+   ASSERT(lut_word_length == 9 ||
+          lut_word_length == 10 ||
+          lut_word_length == 11 ||
+          lut_word_length == 12);
+   if (lut_word_length > 9) {
+      /* perform scanning for lookup tables of width 10, 11, and 12.
+         These widths require three bytes of the compressed subject
+         sequence, and possibly a fourth if the word is not known
+         to be aligned on a 4-base boundary */
+      if (scan_step % COMPRESSION_RATIO == 0) {
+         /* for strides that are a multiple of 4, words are
+            always aligned and three bytes of the subject sequence
+            will always hold a complete word (plus possible extra bases
+            that must be shifted away). s_end below points to either
+            the second to last or third to last byte of the subject
+            sequence; all subject sequences must therefore have a
+         Uint1* s_end = abs_start + (subject->length - lut_word_length) /
+         Int4 shift = 2 * (12 - lut_word_length);
+         s = abs_start + start_offset / COMPRESSION_RATIO;
+         scan_step = scan_step / COMPRESSION_RATIO;
+         for ( ; s <= s_end; s += scan_step) {
+            index = s[0] << 16 | s[1] << 8 | s[2];
+            index = index >> shift;
+            if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
+               if (total_hits >= max_hits)
+               q_off = mb_lt->hashtable[index];
+               s_off = (s - abs_start)*COMPRESSION_RATIO;
+                  offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
+                  offset_pairs[total_hits++].qs_offsets.s_off = s_off;
+                  q_off = mb_lt->next_pos[q_off];
+         *end_offset = (s - abs_start)*COMPRESSION_RATIO;
+         /* when the stride is not a multiple of 4, extra bases
+            may occur both before and after every word read from
+            the subject sequence. The portion of each 16-base
+            region that contains the actual word depends on the
+            offset of the word and the lookup table width, and
+            must be recalculated for each 16-base region */
+         for (s_off = start_offset; s_off <= last_offset; s_off += scan_step) {
+            Int4 shift = 2*(16 - (s_off % COMPRESSION_RATIO + lut_word_length));
+            s = abs_start + (s_off / COMPRESSION_RATIO);
+            index = s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3];
+            index = (index >> shift) & mask;
+            if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
+               if (total_hits >= max_hits)
+               q_off = mb_lt->hashtable[index];
+                  offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
+                  offset_pairs[total_hits++].qs_offsets.s_off = s_off;
+                  q_off = mb_lt->next_pos[q_off];
+      /* perform scanning for lookup tables of width 9.
+         Here the stride will never be a multiple of 4
+         (this table is only used for very small word sizes) */
+      for (s_off = start_offset; s_off <= last_offset; s_off += scan_step) {
+         Int4 shift = 2*(12 - (s_off % COMPRESSION_RATIO + lut_word_length));
          s = abs_start + (s_off / COMPRESSION_RATIO);
-         bit = 2*(s_off % COMPRESSION_RATIO);
-         /* Compute index for a word made of full bytes */
-         s = BlastNaLookupInitIndex(compressed_wordsize, s, &index);
-         /* Adjust the word index by the base within a byte */
-         adjusted_index = BlastNaLookupAdjustIndex(s, index, mb_lt->mask, bit);
-         if (NA_PV_TEST(pv_array, adjusted_index, pv_array_bts)) {
+         index = s[0] << 16 | s[1] << 8 | s[2];
+         index = (index >> shift) & mask;
+         if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
            if (total_hits >= max_hits)
-           q_off = mb_lt->hashtable[adjusted_index];
+           q_off = mb_lt->hashtable[index];
               offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
               offset_pairs[total_hits++].qs_offsets.s_off = s_off;
    return total_hits;
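For readers working through the shift arithmetic in the scan loops above: a standalone sketch of how an 11-base index is pulled out of the 2-bit-packed subject at an arbitrary base offset, mirroring the unaligned-stride branch (COMPRESSION_RATIO is assumed to be 4, with the first base stored in the high-order bits of each byte); illustrative only, not part of the patch:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* 32 bases packed 4 per byte, first base in the two most significant bits */
        const uint8_t packed[8] = { 0x1B, 0x72, 0xC4, 0x9E, 0x3A, 0x55, 0xF0, 0x27 };
        const int lut_word_length = 11;                       /* assumed width */
        const uint32_t mask = (1u << (2 * lut_word_length)) - 1;
        int s_off;

        for (s_off = 0; s_off < 8; s_off++) {
            const uint8_t *s = packed + s_off / 4;
            /* same shift as the loop above: drop the bases that follow the word */
            int shift = 2 * (16 - (s_off % 4 + lut_word_length));
            uint32_t index = (uint32_t)s[0] << 24 | (uint32_t)s[1] << 16 |
                             (uint32_t)s[2] << 8  | (uint32_t)s[3];
            index = (index >> shift) & mask;
            printf("s_off %d -> index 0x%06x\n", s_off, (unsigned)index);
        }
        return 0;
    }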
-Int4 MB_ScanSubject(const LookupTableWrap* lookup,
-       const BLAST_SequenceBlk* subject, Int4 start_offset,
-       BlastOffsetPair* NCBI_RESTRICT offset_pairs, Int4 max_hits,
-   Uint1* abs_start,* s_end;
-   BlastMBLookupTable* mb_lt;
-   PV_ARRAY_TYPE *pv_array;
-   Int4 compressed_wordsize;
-   ASSERT(lookup->lut_type == MB_LOOKUP_TABLE);
-   mb_lt = (BlastMBLookupTable*) lookup->lut;
-   pv_array = mb_lt->pv_array;
-   pv_array_bts = mb_lt->pv_array_bts;
-   compressed_wordsize = mb_lt->compressed_wordsize;
-   /* Since the test for number of hits here is done after adding them,
-      subtract the longest chain length from the allowed offset array size.
-   max_hits -= mb_lt->longest_chain;
-   abs_start = subject->sequence;
-   s = abs_start + start_offset/COMPRESSION_RATIO;
-   s_end = abs_start + (*end_offset)/COMPRESSION_RATIO;
-   s = BlastNaLookupInitIndex(mb_lt->compressed_wordsize, s, &index);
-   /* In the following code, s points to the byte right after the end of
-   if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
-      if (total_hits >= max_hits)
-      q_off = mb_lt->hashtable[index];
-      s_off = ((s - abs_start) - compressed_wordsize)*COMPRESSION_RATIO;
-         offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
-         offset_pairs[total_hits++].qs_offsets.s_off = s_off;
-         q_off = mb_lt->next_pos[q_off];
-   /* Compute the next value of the lookup index
-      (shifting sequence by a full byte) */
-   index = BlastNaLookupComputeIndex(FULL_BYTE_SHIFT, mb_lt->mask,
-      ((s - abs_start) - compressed_wordsize)*COMPRESSION_RATIO;
 Int4 MB_DiscWordScanSubject(const LookupTableWrap* lookup,
        const BLAST_SequenceBlk* subject, Int4 start_offset,
        BlastOffsetPair* NCBI_RESTRICT offset_pairs, Int4 max_hits,