363
362
ungapped_data->length = q_end - q_beg;
364
363
ungapped_data->score = score;
366
/** Perform ungapped extension of a word hit. Use an approximate method
367
* and revert to rigorous ungapped alignment if the approximate score
369
* @param query The query sequence [in]
370
* @param subject The subject sequence [in]
371
* @param matrix The scoring matrix [in]
372
* @param q_off The offset of a word in query [in]
373
* @param s_match_end The first offset in the subject sequence that
374
* is not known to exactly match the query sequence [in]
375
* @param s_off The offset of a word in subject [in]
376
* @param X The drop-off parameter for the ungapped extension [in]
377
* @param ungapped_data The ungapped extension information [out]
378
* @param score_table Array containing precompute sums of
379
* match and mismatch scores [in]
380
* @param reduced_cutoff Score beyond which a rigorous extension is used [in]
383
s_NuclUngappedExtend(BLAST_SequenceBlk* query,
384
BLAST_SequenceBlk* subject, Int4** matrix,
385
Int4 q_off, Int4 s_match_end, Int4 s_off,
386
Int4 X, BlastUngappedData* ungapped_data,
387
const Int4 *score_table, Int4 reduced_cutoff)
389
Uint1* q_start = query->sequence;
390
Uint1* s_start = subject->sequence;
398
/* The left extension begins behind (q_ext,s_ext); this
399
is the first 4-base boundary after s_off. */
401
len = (COMPRESSION_RATIO - (s_off % COMPRESSION_RATIO)) %
406
s = s_start + s_ext / COMPRESSION_RATIO;
407
len = MIN(q_ext, s_ext) / COMPRESSION_RATIO;
413
for (i = 0; i < len; s--, q -= 4, i++) {
414
Uint1 s_byte = s[-1];
415
Uint1 q_byte = (q[-4]<<6) | (q[-3]<<4) | (q[-2]<<2) | q[-1];
417
sum += score_table[q_byte ^ s_byte];
419
new_q = q - 4; score += sum; sum = 0;
426
/* record the start point of the extension */
428
ungapped_data->q_start = new_q - q_start;
429
ungapped_data->s_start = s_ext - (q_ext - ungapped_data->q_start);
431
/* the right extension begins at the first bases not
432
examined by the previous loop */
435
s = s_start + s_ext / COMPRESSION_RATIO;
436
len = MIN(query->length - q_ext, subject->length - s_ext) /
441
for (i = 0; i < len; s++, q += 4, i++) {
443
Uint1 q_byte = (q[0]<<6) | (q[1]<<4) | (q[2]<<2) | q[3];
445
sum += score_table[q_byte ^ s_byte];
447
new_q = q + 3; score += sum; sum = 0;
454
if (score > reduced_cutoff) {
455
/* the current score is high enough; throw away the
456
alignment and recompute it rigorously */
457
s_NuclUngappedExtendExact(query, subject, matrix, q_off,
458
s_off, X, ungapped_data);
461
/* record the length and score of the extension. Make
462
sure the alignment extends at least to s_match_end */
463
ungapped_data->score = score;
464
ungapped_data->length = MAX(s_match_end - ungapped_data->s_start,
465
(new_q - q_start) - ungapped_data->q_start + 1);
369
469
/** Mask for encoding in one integer a hit offset and whether that hit has
802
908
Int4** matrix, Blast_ExtendWord* ewp,
803
909
BlastInitHitList* init_hitlist)
912
Uint4 query_length = query->length;
913
Uint4 subject_length = subject->length;
914
Uint1* q_start = query->sequence;
805
915
Uint1* s_start = subject->sequence;
806
Uint1* q_start = query->sequence;
810
Uint4 word_length, compressed_wordsize, compressed_word_length;
811
Uint4 reduced_word_length, reduced_wordsize;
812
Uint4 extra_bytes_needed;
813
Uint4 extra_bases, left, right;
916
Uint4 word_length, lut_word_length;
918
Uint4 max_bases_left, max_bases_right;
919
Uint4 extended_left, extended_right;
814
923
Int4 hits_extended = 0;
815
Boolean extend_partial_byte = FALSE;
817
924
Uint1 template_length = 0;
819
if (lookup_wrap->lut_type == MB_LOOKUP_TABLE) {
820
BlastMBLookupTable* mb_lt = (BlastMBLookupTable*)lookup_wrap->lut;
821
word_length = mb_lt->word_length;
823
(word_length - COMPRESSION_RATIO + 1)/COMPRESSION_RATIO;
824
compressed_wordsize = mb_lt->compressed_wordsize;
825
extend_partial_byte = !mb_lt->variable_wordsize;
826
/* When ungapped extension is not performed, the hit will be new only if
827
it more than scan_step away from the previous hit. */
828
if(!word_params->options->ungapped_extension)
829
min_step = mb_lt->scan_step;
830
template_length = mb_lt->template_length;
832
BlastLookupTable* lookup = (BlastLookupTable*)lookup_wrap->lut;
833
word_length = lookup->word_length;
834
reduced_wordsize = lookup->wordsize;
835
compressed_wordsize = lookup->reduced_wordsize;
836
extend_partial_byte = !lookup->variable_wordsize;
837
/* When ungapped extension is not performed, the hit will be new only if
838
it more than scan_step away from the previous hit. */
839
if(!word_params->options->ungapped_extension)
840
min_step = lookup->scan_step;
843
extra_bytes_needed = reduced_wordsize - compressed_wordsize;
845
if (!extend_partial_byte) {
846
/* If partial bytes are not checked, we need to check one full extra byte
847
at the end of the word. */
848
++extra_bytes_needed;
851
reduced_word_length = COMPRESSION_RATIO*reduced_wordsize;
852
compressed_word_length = COMPRESSION_RATIO*compressed_wordsize;
853
extra_bases = word_length - reduced_word_length;
855
for (i = 0; i < num_hits; ++i) {
856
Uint4 q_offset = offset_pairs[i].qs_offsets.q_off;
857
Uint4 s_offset = offset_pairs[i].qs_offsets.s_off;
858
/* Here it is guaranteed that subject offset is divisible by 4,
859
because we only extend to the right, so scanning stride must be
861
s = s_start + s_offset/COMPRESSION_RATIO;
926
BlastLookupTable* lut;
928
ASSERT(lookup_wrap->lut_type != MB_LOOKUP_TABLE);
929
lut = (BlastLookupTable*)lookup_wrap->lut;
930
word_length = lut->word_length;
931
lut_word_length = lut->lut_word_length;
932
extra_bases = word_length - lut_word_length;
934
for (index = 0; index < num_hits; ++index) {
935
Uint4 s_offset = offset_pairs[index].qs_offsets.s_off;
936
Uint4 q_offset = offset_pairs[index].qs_offsets.q_off;
938
/* begin with the left extension; the initialization
939
is slightly faster. q below points to the first base
940
of the lookup table hit and s points to the first
941
four bases of the hit (which is guaranteed to be
942
aligned on a byte boundary) */
944
max_bases_left = MIN(extra_bases, MIN(q_offset, s_offset));
862
945
q = q_start + q_offset;
864
/* Check for extra bytes if required for longer words. */
865
if (extra_bytes_needed &&
866
!BlastNaCompareExtraBytes(q+compressed_word_length,
867
s+compressed_wordsize, extra_bytes_needed))
870
if (extend_partial_byte) {
871
/* mini extension to the left */
872
Uint1 max_bases = MIN(COMPRESSION_RATIO, MIN(q_offset, s_offset));
873
left = BlastNaMiniExtendLeft(q, s-1, max_bases);
875
/* mini extension to the right */
877
MIN(COMPRESSION_RATIO,
878
MIN(subject->length - s_offset - reduced_word_length,
879
query->length - q_offset - reduced_word_length));
883
s += reduced_wordsize;
884
q += reduced_word_length;
885
right = BlastNaMiniExtendRight(q, s, max_bases);
946
s = s_start + s_offset / COMPRESSION_RATIO;
948
for (extended_left = 0; extended_left < max_bases_left;
949
s--, q -= 4, extended_left++) {
952
if ((byte & 3) != q[-1] || ++extended_left == max_bases_left)
954
if (((byte>>2) & 3) != q[-2] || ++extended_left == max_bases_left)
956
if (((byte>>4) & 3) != q[-3] || ++extended_left == max_bases_left)
958
if ((byte>>6) != q[-4])
962
/* do the right extension if the left extension did not
963
find all the bases required. Begin at the first base
964
past the lookup table hit and move forwards */
966
s_off = s_offset + lut_word_length;
969
if (extended_left < extra_bases) {
970
q_off = q_offset + lut_word_length;
972
s = s_start + s_off / COMPRESSION_RATIO;
973
max_bases_right = MIN(extra_bases - extended_left,
974
MIN(query_length - q_off,
975
subject_length - s_off));
977
for (; extended_right < max_bases_right;
978
s++, q += 4, extended_right++) {
981
if ((byte>>6) != q[0] || ++extended_right == max_bases_right)
983
if (((byte>>4) & 3) != q[1] || ++extended_right == max_bases_right)
985
if (((byte>>2) & 3) != q[2] || ++extended_right == max_bases_right)
987
if ((byte & 3) != q[3])
991
/* check if enough extra matches were found */
993
if (extended_left + extended_right < extra_bases)
997
/* check the diagonal on which the hit lies. The boundaries
998
extend from the first match of the hit to one beyond the
1001
if (word_params->container_type == eWordStacks) {
1002
hit_ready = s_BlastnStacksExtendInitialHit(query, subject, min_step,
1003
template_length, word_params,
1004
matrix, ewp->stack_table,
1005
q_offset - extended_left,
1006
s_off + extended_right,
1007
s_offset - extended_left,
889
right = COMPRESSION_RATIO;
892
if (left + right >= extra_bases) {
893
Boolean hit_ready = FALSE;
894
/* Check if this diagonal has already been explored. */
895
if (word_params->container_type == eWordStacks) {
897
s_BlastnStacksExtendInitialHit(query, subject, min_step, template_length,
898
word_params, matrix, ewp->stack_table, q_offset,
899
s_offset + reduced_word_length + right,
900
s_offset, init_hitlist);
903
s_BlastnDiagExtendInitialHit(query, subject, min_step, template_length,
904
word_params, matrix, ewp->diag_table, q_offset,
905
s_offset + reduced_word_length + right,
906
s_offset, init_hitlist);
1010
hit_ready = s_BlastnDiagExtendInitialHit(query, subject, min_step,
1011
template_length, word_params,
1012
matrix, ewp->diag_table,
1013
q_offset - extended_left,
1014
s_off + extended_right,
1015
s_offset - extended_left,
912
1021
return hits_extended;
962
/** Extend an exact match in both directions up to the provided
964
* @param q_start Pointer to the start of the extension in query [in]
965
* @param s_start Pointer to the start of the extension in subject [in]
966
* @param max_bases_left At most how many bases to extend to the left [in]
967
* @param max_bases_right At most how many bases to extend to the right [in]
968
* @param max_length The length of the required exact match [in]
969
* @param extend_partial_byte Should partial byte extension be perfomed?[in]
970
* @param extended_right How far has extension succeeded to the right? [out]
971
* @return TRUE if extension successful
974
s_BlastNaExactMatchExtend(Uint1* q_start, Uint1* s_start,
975
Uint4 max_bases_left, Uint4 max_bases_right, Uint4 max_length,
976
Boolean extend_partial_byte, Uint4* extended_right)
985
/* Extend to the right; start from the firstt byte (it must be the
986
first one that's guaranteed to match by the lookup table hit). */
990
while (length < max_length && max_bases_right >= COMPRESSION_RATIO) {
991
if (*s != PACK_WORD(q))
993
length += COMPRESSION_RATIO;
995
q += COMPRESSION_RATIO;
996
max_bases_right -= COMPRESSION_RATIO;
998
if (extend_partial_byte) {
999
if (max_bases_right > 0) {
1000
length += BlastNaMiniExtendRight(q, s,
1001
(Uint1) MIN(max_bases_right, COMPRESSION_RATIO));
1005
*extended_right = length;
1007
if (length >= max_length)
1010
if (max_bases_left < max_length - length)
1013
max_bases_left = max_length - length;
1015
/* Extend to the left; start with the byte just before the first. */
1016
q = q_start - COMPRESSION_RATIO;
1018
while (length < max_length && max_bases_left >= COMPRESSION_RATIO) {
1019
if (*s != PACK_WORD(q))
1021
length += COMPRESSION_RATIO;
1023
q -= COMPRESSION_RATIO;
1024
max_bases_left -= COMPRESSION_RATIO;
1026
if (length >= max_length)
1028
if (extend_partial_byte && max_bases_left > 0) {
1029
length += BlastNaMiniExtendLeft(q+COMPRESSION_RATIO, s,
1030
(Uint1) MIN(max_bases_left, COMPRESSION_RATIO));
1033
return (length >= max_length);
1037
1072
BlastNaExtendRightAndLeft(const BlastOffsetPair* offset_pairs, Int4 num_hits,
1038
1073
const BlastInitialWordParameters* word_params,
1046
1081
Uint4 subject_length = subject->length;
1047
1082
Uint1* q_start = query->sequence;
1048
1083
Uint1* s_start = subject->sequence;
1049
Uint4 word_length = 0;
1084
Uint4 word_length, lut_word_length;
1050
1085
Uint4 q_off, s_off;
1051
1086
Uint4 max_bases_left, max_bases_right;
1052
Uint4 extended_right;
1087
Uint4 extended_left, extended_right;
1055
1090
Uint4 min_step = 0;
1056
Boolean variable_wordsize = FALSE;
1057
1091
Int4 hits_extended = 0;
1058
1092
Uint1 template_length = 0;
1060
1095
if (lookup_wrap->lut_type == MB_LOOKUP_TABLE) {
1061
1096
BlastMBLookupTable* lut = (BlastMBLookupTable*)lookup_wrap->lut;
1062
1097
word_length = lut->word_length;
1098
lut_word_length = lut->lut_word_length;
1063
1099
/* When ungapped extension is not performed, the hit will be new only if
1064
1100
it more than scan_step away from the previous hit. */
1065
1101
if(!word_params->options->ungapped_extension)
1066
1102
min_step = lut->scan_step;
1067
variable_wordsize = lut->variable_wordsize;
1068
1103
template_length = lut->template_length;
1070
1105
BlastLookupTable* lut = (BlastLookupTable*)lookup_wrap->lut;
1071
1106
word_length = lut->word_length;
1072
variable_wordsize = lut->variable_wordsize;
1107
lut_word_length = lut->lut_word_length;
1073
1108
if(!word_params->options->ungapped_extension)
1074
1109
min_step = lut->scan_step;
1077
for (index = 0; index < num_hits; ++index) {
1078
Uint4 s_offset = offset_pairs[index].qs_offsets.s_off;
1079
Uint4 q_offset = offset_pairs[index].qs_offsets.q_off;
1080
/* Adjust offsets to the start of the next full byte in the
1082
shift = (COMPRESSION_RATIO - s_offset%COMPRESSION_RATIO)
1083
% COMPRESSION_RATIO;
1084
q_off = q_offset + shift;
1085
s_off = s_offset + shift;
1086
s = s_start + s_off/COMPRESSION_RATIO;
1087
q = q_start + q_off;
1090
MIN(word_length, MIN(q_off, s_off));
1091
max_bases_right = MIN(word_length,
1092
MIN(query_length-q_off, subject_length-s_off));
1094
if (s_BlastNaExactMatchExtend(q, s, max_bases_left,
1095
max_bases_right, word_length,
1096
(Boolean) !variable_wordsize, &extended_right))
1098
/* Check if this diagonal has already been explored and save
1099
the hit if needed. */
1100
Boolean hit_ready = FALSE;
1101
/* Check if this diagonal has already been explored. */
1102
if (word_params->container_type == eWordStacks) {
1104
s_BlastnStacksExtendInitialHit(query, subject, min_step, template_length,
1105
word_params, matrix, ewp->stack_table, q_offset,
1106
s_off + extended_right, s_offset, init_hitlist);
1109
s_BlastnDiagExtendInitialHit(query, subject, min_step, template_length,
1110
word_params, matrix, ewp->diag_table, q_offset,
1111
s_off + extended_right, s_offset, init_hitlist);
1111
extra_bases = word_length - lut_word_length;
1113
if (extra_bases == 0) {
1114
/* if the lookup table is the width of a full word,
1115
no extensions are needed. Immediately check if the
1116
diagonal has been explored */
1118
for (index = 0; index < num_hits; ++index) {
1119
Uint4 s_offset = offset_pairs[index].qs_offsets.s_off;
1120
Uint4 q_offset = offset_pairs[index].qs_offsets.q_off;
1121
if (word_params->container_type == eWordStacks) {
1122
hit_ready = s_BlastnStacksExtendInitialHit(query, subject, min_step,
1123
template_length, word_params,
1124
matrix, ewp->stack_table,
1126
s_offset + lut_word_length,
1127
s_offset, init_hitlist);
1129
hit_ready = s_BlastnDiagExtendInitialHit(query, subject, min_step,
1130
template_length, word_params,
1131
matrix, ewp->diag_table,
1133
s_offset + lut_word_length,
1134
s_offset, init_hitlist);
1141
/* The initial lookup table hit must be extended. We
1142
trust that the bases of the hit itself are exact matches,
1143
and look only for exact matches before and after the hit.
1145
Most of the time, the lookup table width is close to the
1146
word size so only a few bases need examining. Also, most of
1147
the time (for random sequences) extensions will fail
1148
almost immediately (the first base examined will not
1149
match about 3/4 of the time). Thus it is critical to reduce
1150
as much as possible all work that is not the actual
1151
examination of sequence data */
1153
for (index = 0; index < num_hits; ++index) {
1154
Uint4 s_offset = offset_pairs[index].qs_offsets.s_off;
1155
Uint4 q_offset = offset_pairs[index].qs_offsets.q_off;
1157
/* begin with the left extension; the initialization
1158
is slightly faster. Point to the first base of the
1159
lookup table hit and work backwards */
1161
max_bases_left = MIN(extra_bases, MIN(q_offset, s_offset));
1162
q = q_start + q_offset;
1163
s = s_start + s_offset / COMPRESSION_RATIO;
1166
for (extended_left = 0; extended_left < max_bases_left;
1169
if (s_off % COMPRESSION_RATIO == 3)
1171
if (((Uint1)(*s << (2*(s_off % COMPRESSION_RATIO))) >> 6) != *q)
1175
/* do the right extension if the left extension did not
1176
find all the bases required. Begin at the first base
1177
beyond the lookup table hit and move forwards */
1179
s_off = s_offset + lut_word_length;
1181
if (extended_left < extra_bases) {
1182
q_off = q_offset + lut_word_length;
1183
q = q_start + q_off;
1184
s = s_start + s_off / COMPRESSION_RATIO;
1185
max_bases_right = MIN(extra_bases - extended_left,
1186
MIN(query_length - q_off,
1187
subject_length - s_off));
1189
for (extended_right = 0; extended_right < max_bases_right;
1191
if (((Uint1)(*s << (2*(s_off % COMPRESSION_RATIO))) >> 6) != *q)
1194
if (s_off % COMPRESSION_RATIO == 0)
1198
/* check if enough extra matches were found */
1200
if (extended_left + extended_right < extra_bases)
1204
/* check the diagonal on which the hit lies. The boundaries
1205
extend from the first match of the hit to one beyond the
1208
if (word_params->container_type == eWordStacks) {
1209
hit_ready = s_BlastnStacksExtendInitialHit(query, subject, min_step,
1210
template_length, word_params,
1211
matrix, ewp->stack_table,
1212
q_offset - extended_left,
1214
s_offset - extended_left,
1217
hit_ready = s_BlastnDiagExtendInitialHit(query, subject, min_step,
1218
template_length, word_params,
1219
matrix, ewp->diag_table,
1220
q_offset - extended_left,
1222
s_offset - extended_left,
1114
1226
++hits_extended;
1137
1249
Int4 subject_length = subject->length;
1138
1250
Int4 hits_extended = 0;
1252
ASSERT(mb_lt->discontiguous ||
1253
word_params->extension_method == eRightAndLeft);
1140
1255
start_offset = 0;
1141
1256
if (mb_lt->discontiguous) {
1142
1257
last_start = subject_length - mb_lt->template_length;
1143
1258
last_end = subject_length;
1145
1260
last_end = subject_length;
1146
switch (word_params->extension_method) {
1147
case eRightAndLeft: case eUpdateDiag:
1148
/* Look for matches all the way to the end of the sequence. */
1150
last_end - COMPRESSION_RATIO*mb_lt->compressed_wordsize;
1153
/* Need to leave word_length to the end of the sequence for extension
1155
if (mb_lt->variable_wordsize)
1156
last_end -= last_end%COMPRESSION_RATIO;
1157
last_start = last_end - mb_lt->word_length;
1158
last_end = last_start + mb_lt->compressed_wordsize*COMPRESSION_RATIO;
1161
abort(); /* All the relevant cases should be handled above, otherwise variables may not be properly set
1162
for the rest of the funciton. */
1261
last_start = last_end - mb_lt->lut_word_length;
1167
1264
/* start_offset points to the beginning of the word */
1168
1265
while ( start_offset <= last_start ) {
1169
1267
/* Set the last argument's value to the end of the last word,
1170
1268
without the extra bases for the discontiguous case */
1171
1269
next_start = last_end;
1172
1270
if (mb_lt->discontiguous) {
1173
1271
hitsfound = MB_DiscWordScanSubject(lookup_wrap, subject, start_offset,
1174
1272
offset_pairs, max_hits, &next_start);
1175
} else if (word_params->extension_method == eRightAndLeft) {
1176
1274
hitsfound = MB_AG_ScanSubject(lookup_wrap, subject, start_offset,
1177
1275
offset_pairs, max_hits, &next_start);
1179
hitsfound = MB_ScanSubject(lookup_wrap, subject, start_offset,
1180
offset_pairs, max_hits, &next_start);
1183
1278
switch (word_params->extension_method) {