352
340
/* Point all pointers to the beginning of the alignment. */
353
341
best_q_start = best_q_end = current_q_start = query;
354
342
best_s_start = best_s_end = current_s_start = subject;
355
best_start_esp = best_end_esp = current_start_esp = current_esp =
357
343
/* There are no previous edit scripts at the beginning. */
358
best_prev_esp = current_prev_esp = current_start_prev_esp = NULL;
359
best_end_esp_num = 0;
362
while (current_esp) {
363
/* Process substitutions one operation at a time, full gaps in one
365
if (current_esp->op_type == eGapAlignSub) {
366
sum += factor*matrix[*query & kResidueMask][*subject];
370
} else if (current_esp->op_type == eGapAlignDel) {
371
sum -= gap_open + gap_extend * current_esp->num;
372
subject += current_esp->num;
373
op_index += current_esp->num;
374
} else if (current_esp->op_type == eGapAlignIns) {
375
sum -= gap_open + gap_extend * current_esp->num;
376
query += current_esp->num;
377
op_index += current_esp->num;
345
best_end_esp_num = -1;
347
for (index=0; index<esp->size; index++)
349
int op_index = 0; /* Index of an operation within a single edit script. */
350
for (op_index=0; op_index<esp->num[index]; )
352
/* Process substitutions one operation at a time, full gaps in one step. */
353
if (esp->op_type[index] == eGapAlignSub) {
354
sum += factor*matrix[*query & kResidueMask][*subject];
358
} else if (esp->op_type[index] == eGapAlignDel) {
359
sum -= gap_open + gap_extend * esp->num[index];
360
subject += esp->num[index];
361
op_index += esp->num[index];
362
} else if (esp->op_type[index] == eGapAlignIns) {
363
sum -= gap_open + gap_extend * esp->num[index];
364
query += esp->num[index];
365
op_index += esp->num[index];
381
369
/* Point current edit script chain start to the new place.
382
370
If we are in the middle of an edit script, reduce its length and
383
371
point operation index to the beginning of a modified edit script;
384
372
if we are at the end, move to the next edit script. */
385
if (op_index < current_esp->num) {
386
current_esp->num -= op_index;
387
current_start_esp = current_esp;
388
current_start_prev_esp = current_prev_esp;
391
current_start_esp = current_esp->next;
392
current_start_prev_esp = current_esp;
394
/* Set sum to 0, to start a fresh count. */
396
/* Set current starting positions in sequences to the new start. */
397
current_q_start = query;
398
current_s_start = subject;
373
if (op_index < esp->num[index]) {
374
esp->num[index] -= op_index;
375
current_start_esp_index = index;
378
current_start_esp_index = index + 1;
380
/* Set sum to 0, to start a fresh count. */
382
/* Set current starting positions in sequences to the new start. */
383
current_q_start = query;
384
current_s_start = subject;
400
/* If score has passed the cutoff at some point, leave the best score
401
and edit scripts positions information untouched, otherwise reset
402
the best score to 0 and point the best edit script positions to
404
if (score < hit_params->cutoff_score) {
405
/* Start from new offset; discard all previous information. */
406
best_q_start = query;
407
best_s_start = subject;
386
/* If score has passed the cutoff at some point, leave the best score
387
and edit scripts positions information untouched, otherwise reset
388
the best score to 0 and point the best edit script positions to
390
if (score < hit_params->cutoff_score) {
391
/* Start from new offset; discard all previous information. */
392
best_q_start = query;
393
best_s_start = subject;
410
/* Set best start and end edit script pointers to new start. */
411
best_start_esp = current_start_esp;
412
best_end_esp = current_start_esp;
413
best_prev_esp = current_prev_esp;
415
} else if (sum > score) {
416
/* Remember this point as the best scoring end point, and the current
396
/* Set best start and end edit script pointers to new start. */
397
best_start_esp_index = current_start_esp_index;
398
best_end_esp_index = current_start_esp_index;
400
/* break; */ /* start on next GapEditScript. */
401
} else if (sum > score) {
402
/* Remember this point as the best scoring end point, and the current
417
403
start of the alignment as the start of the best alignment. */
420
best_q_start = current_q_start;
421
best_s_start = current_s_start;
423
best_s_end = subject;
425
best_start_esp = current_start_esp;
426
best_end_esp = current_esp;
427
best_prev_esp = current_start_prev_esp;
428
best_end_esp_num = op_index;
430
/* If operation index has reached the end of the current edit script,
431
move on to the next link in the edit script chain. */
432
if (op_index >= current_esp->num) {
434
current_prev_esp = current_esp;
435
current_esp = current_esp->next;
406
best_q_start = current_q_start;
407
best_s_start = current_s_start;
409
best_s_end = subject;
411
best_start_esp_index = current_start_esp_index;
412
best_end_esp_index = index;
413
best_end_esp_num = op_index;
437
416
} /* loop on edit scripts */
1059
1045
(h2->query.offset - h2->subject.offset);
1062
1049
/** An auxiliary structure used for merging HSPs */
1063
1050
typedef struct BlastHSPSegment {
1064
Int4 q_start, /**< Start of segment in query. */
1065
q_end; /**< End of segment in query. */
1066
Int4 s_start, /**< Start of segment in subject. */
1067
s_end; /**< End of segment in subject. */
1068
1051
struct BlastHSPSegment* next; /**< Next link in the chain. */
1052
Int4 q_start, /**< Query start of segment. */
1053
s_start; /**< Subject start of segment. */
1054
Int4 length; /**< length of segments. */
1055
Int4 diagonal; /**< diagonal (q_start - s_start). */
1056
GapEditScript* esp; /**< pointer to edit script, not owned! */
1057
Int4 esp_offset; /**< element of above esp this node refers to */
1069
1058
} BlastHSPSegment;
1071
/** Maximal diagonal distance between HSP starting offsets, within which HSPs
1072
* from search of different chunks of subject sequence are considered for
1075
#define OVERLAP_DIAG_CLOSE 10
1077
/** Merge the two HSPs if they intersect.
1078
* @param hsp1 The first HSP; also contains the result of merge. [in] [out]
1079
* @param hsp2 The second HSP [in]
1080
* @param start The starting offset of the subject coordinates where the
1081
* intersection is possible [in]
1084
s_BlastHSPsMerge(BlastHSP* hsp1, BlastHSP* hsp2, Int4 start)
1086
BlastHSPSegment* segments1,* segments2,* new_segment1,* new_segment2;
1087
GapEditScript* esp1,* esp2,* esp;
1088
Int4 end = start + DBSEQ_CHUNK_OVERLAP - 1;
1089
Int4 min_diag, max_diag, num1, num2, dist = 0, next_dist = 0;
1090
Int4 diag1_start, diag1_end, diag2_start, diag2_end;
1092
Uint1 intersection_found;
1093
EGapAlignOpType op_type = eGapAlignSub;
1095
if (!hsp1->gap_info || !hsp2->gap_info) {
1061
/** function to create and populate a BlastHSPSegement.
1062
* The new segemnt is added to an existing chain unless NULL
1064
* @param segment object to be allocated and populated [in|out]
1065
* @param q_start start of match on query [in]
1066
* @param s_start start of match on subject [in]
1067
* @param esp GapEditScript for this alignment [in]
1068
* @param esp_offset relevant element of GapEditScript [in]
1071
s_AddHSPSegment(BlastHSPSegment** segment, int q_start, int s_start, GapEditScript* esp, int esp_offset)
1073
BlastHSPSegment* new = malloc(sizeof(BlastHSPSegment));
1075
new->q_start = q_start;
1076
new->s_start = s_start;
1077
new->length = esp->num[esp_offset];
1078
new->diagonal = q_start - s_start;
1080
new->esp_offset = esp_offset;
1083
BlastHSPSegment* var = *segment;
1094
/** Deallocates all segments in chain.
1095
* Does not free underlying data
1096
* @param segments linked list of segments
1098
static BlastHSPSegment*
1099
s_DeleteAllHSPSegments(BlastHSPSegment* segments)
1101
BlastHSPSegment* var = segments;
1102
BlastHSPSegment* next;
1115
BlastMergeTwoHSPs(BlastHSP* hsp1, BlastHSP* hsp2, Int4 start)
1117
BlastHSPSegment *segment1=NULL, *segment2=NULL;
1118
Boolean found = FALSE;
1120
Int4 q_start, s_start; /* start on query and subject sequences. */
1121
Int4 max_s_end = -1, max_q_end = -1; /* furthest extent of first HSP. */
1122
Int4 index; /* loop index. */
1124
if (!hsp1->gap_info || !hsp2->gap_info)
1096
1126
/* Assume that this is an ungapped alignment, hence simply compare
1097
1127
diagonals. Do not merge if they are on different diagonals */
1098
1128
if (s_DiagCompareHSPs(&hsp1, &hsp2) == 0 &&
1106
/* Find whether these HSPs have an intersection point */
1107
segments1 = (BlastHSPSegment*) calloc(1, sizeof(BlastHSPSegment));
1109
esp1 = hsp1->gap_info;
1110
esp2 = hsp2->gap_info;
1112
segments1->q_start = hsp1->query.offset;
1113
segments1->s_start = hsp1->subject.offset;
1114
while (segments1->s_start < start) {
1115
if (esp1->op_type == eGapAlignIns)
1116
segments1->q_start += esp1->num;
1117
else if (segments1->s_start + esp1->num < start) {
1118
if (esp1->op_type == eGapAlignSub) {
1119
segments1->s_start += esp1->num;
1120
segments1->q_start += esp1->num;
1121
} else if (esp1->op_type == eGapAlignDel)
1122
segments1->s_start += esp1->num;
1127
/* Current esp is the first segment within the overlap region */
1128
segments1->s_end = segments1->s_start + esp1->num - 1;
1129
if (esp1->op_type == eGapAlignSub)
1130
segments1->q_end = segments1->q_start + esp1->num - 1;
1132
segments1->q_end = segments1->q_start;
1134
new_segment1 = segments1;
1136
for (esp = esp1->next; esp; esp = esp->next) {
1137
new_segment1->next = (BlastHSPSegment*)
1138
calloc(1, sizeof(BlastHSPSegment));
1139
new_segment1->next->q_start = new_segment1->q_end + 1;
1140
new_segment1->next->s_start = new_segment1->s_end + 1;
1141
new_segment1 = new_segment1->next;
1142
if (esp->op_type == eGapAlignSub) {
1143
new_segment1->q_end += esp->num - 1;
1144
new_segment1->s_end += esp->num - 1;
1145
} else if (esp->op_type == eGapAlignIns) {
1146
new_segment1->q_end += esp->num - 1;
1147
new_segment1->s_end = new_segment1->s_start;
1149
new_segment1->s_end += esp->num - 1;
1150
new_segment1->q_end = new_segment1->q_start;
1154
/* Now create the second segments list */
1156
segments2 = (BlastHSPSegment*) calloc(1, sizeof(BlastHSPSegment));
1157
segments2->q_start = hsp2->query.offset;
1158
segments2->s_start = hsp2->subject.offset;
1159
segments2->q_end = segments2->q_start + esp2->num - 1;
1160
segments2->s_end = segments2->s_start + esp2->num - 1;
1162
new_segment2 = segments2;
1164
for (esp = esp2->next; esp && new_segment2->s_end < end;
1166
new_segment2->next = (BlastHSPSegment*)
1167
calloc(1, sizeof(BlastHSPSegment));
1168
new_segment2->next->q_start = new_segment2->q_end + 1;
1169
new_segment2->next->s_start = new_segment2->s_end + 1;
1170
new_segment2 = new_segment2->next;
1171
if (esp->op_type == eGapAlignIns) {
1172
new_segment2->s_end = new_segment2->s_start;
1173
new_segment2->q_end = new_segment2->q_start + esp->num - 1;
1174
} else if (esp->op_type == eGapAlignDel) {
1175
new_segment2->s_end = new_segment2->s_start + esp->num - 1;
1176
new_segment2->q_end = new_segment2->q_start;
1177
} else if (esp->op_type == eGapAlignSub) {
1178
new_segment2->s_end = new_segment2->s_start + esp->num - 1;
1179
new_segment2->q_end = new_segment2->q_start + esp->num - 1;
1183
new_segment1 = segments1;
1184
new_segment2 = segments2;
1185
intersection_found = 0;
1187
while (new_segment1 && new_segment2 && !intersection_found) {
1188
if (new_segment1->s_end < new_segment2->s_start ||
1189
new_segment1->q_end < new_segment2->q_start) {
1190
new_segment1 = new_segment1->next;
1194
if (new_segment2->s_end < new_segment1->s_start ||
1195
new_segment2->q_end < new_segment1->q_start) {
1196
new_segment2 = new_segment2->next;
1200
diag1_start = new_segment1->s_start - new_segment1->q_start;
1201
diag2_start = new_segment2->s_start - new_segment2->q_start;
1202
diag1_end = new_segment1->s_end - new_segment1->q_end;
1203
diag2_end = new_segment2->s_end - new_segment2->q_end;
1205
if (diag1_start == diag1_end && diag2_start == diag2_end &&
1206
diag1_start == diag2_start) {
1207
/* Both segments substitutions, on same diagonal */
1208
intersection_found = 1;
1209
dist = new_segment2->s_end - new_segment1->s_start + 1;
1211
} else if (diag1_start != diag1_end && diag2_start != diag2_end) {
1212
/* Both segments gaps - must intersect */
1213
intersection_found = 3;
1215
op_type = eGapAlignIns;
1216
dist = new_segment2->s_end - new_segment1->s_start + 1;
1217
next_dist = new_segment2->q_end - new_segment1->q_start - dist + 1;
1218
if (new_segment2->q_end - new_segment1->q_start < dist) {
1219
dist = new_segment2->q_end - new_segment1->q_start + 1;
1220
op_type = eGapAlignDel;
1221
next_dist = new_segment2->s_end - new_segment1->s_start - dist + 1;
1224
} else if (diag1_start != diag1_end) {
1225
max_diag = MAX(diag1_start, diag1_end);
1226
min_diag = MIN(diag1_start, diag1_end);
1227
if (diag2_start >= min_diag && diag2_start <= max_diag) {
1228
intersection_found = 2;
1229
dist = diag2_start - min_diag + 1;
1230
if (new_segment1->s_end == new_segment1->s_start)
1231
next_dist = new_segment2->s_end - new_segment1->s_end + 1;
1233
next_dist = new_segment2->q_end - new_segment1->q_end + 1;
1236
} else if (diag2_start != diag2_end) {
1237
max_diag = MAX(diag2_start, diag2_end);
1238
min_diag = MIN(diag2_start, diag2_end);
1239
if (diag1_start >= min_diag && diag1_start <= max_diag) {
1240
intersection_found = 2;
1241
next_dist = max_diag - diag1_start + 1;
1242
if (new_segment2->s_end == new_segment2->s_start)
1243
dist = new_segment2->s_start - new_segment1->s_start + 1;
1245
dist = new_segment2->q_start - new_segment1->q_start + 1;
1249
if (new_segment1->s_end <= new_segment2->s_end) {
1250
new_segment1 = new_segment1->next;
1253
new_segment2 = new_segment2->next;
1258
/* Free the segments linked lists that are no longer needed. */
1259
for ( ; segments1; segments1 = new_segment1) {
1260
new_segment1 = segments1->next;
1264
for ( ; segments2; segments2 = new_segment2) {
1265
new_segment2 = segments2->next;
1269
if (intersection_found) {
1271
for (index = 0; index < num1-1; index++)
1273
for (index = 0; index < num2-1; index++) {
1277
if (intersection_found < 3) {
1285
switch (intersection_found) {
1288
esp1->next = esp2->next;
1293
esp2->num = next_dist;
1300
esp2->op_type = op_type;
1301
esp2->num = next_dist;
1308
hsp1->query.end = hsp2->query.end;
1309
hsp1->subject.end = hsp2->subject.end;
1312
return (Boolean) intersection_found;
1137
esp = hsp1->gap_info;
1138
q_start = hsp1->query.offset;
1139
s_start = hsp1->subject.offset;
1140
for (index=0; index<esp->size; index++)
1142
if (esp->op_type[index] == eGapAlignSub)
1144
if (s_start+(esp->num[index]) > start) /* End of segment within overlap region. */
1146
s_AddHSPSegment(&segment1, q_start, s_start, esp, index);
1147
max_q_end = MAX(max_q_end, q_start+esp->num[index]);
1148
max_s_end = MAX(max_s_end, s_start+esp->num[index]);
1150
q_start += esp->num[index];
1151
s_start += esp->num[index];
1153
else if (esp->op_type[index] == eGapAlignIns)
1154
q_start += esp->num[index];
1155
else if (esp->op_type[index] == eGapAlignDel)
1156
s_start += esp->num[index];
1159
esp = hsp2->gap_info;
1160
q_start = hsp2->query.offset;
1161
s_start = hsp2->subject.offset;
1162
for (index=0; index<esp->size; index++)
1164
if (esp->op_type[index] == eGapAlignSub)
1166
if (max_s_end > s_start)
1167
s_AddHSPSegment(&segment2, q_start, s_start, esp, index);
1171
q_start += esp->num[index];
1172
s_start += esp->num[index];
1174
else if (esp->op_type[index] == eGapAlignIns)
1175
q_start += esp->num[index];
1176
else if (esp->op_type[index] == eGapAlignDel)
1177
s_start += esp->num[index];
1180
if (segment1 && segment2)
1182
BlastHSPSegment *segment1_var=segment1, *segment2_var=segment2;
1183
while (segment1_var)
1185
BlastHSPSegment* segment2_var = segment2;
1186
while (segment2_var)
1188
if (segment2_var->diagonal == segment1_var->diagonal)
1190
if (segment1_var->q_start <= segment2_var->q_start
1191
&& (segment1_var->q_start+segment1_var->length) >= segment2_var->q_start)
1192
found = TRUE; /* start of 2 in extent of 1. */
1193
else if (segment1_var->q_start >= segment2_var->q_start
1194
&& (segment1_var->q_start) <= segment2_var->q_start+segment2_var->length)
1195
found = TRUE; /* start of 1 in extent of 2. */
1199
segment2_var = segment2_var->next;
1203
segment1_var = segment1_var->next;
1208
int end_1 = segment1_var->q_start + segment1_var->length;
1209
int end_2 = segment2_var->q_start + segment2_var->length;
1210
GapEditScript* esp_temp_1 = segment1_var->esp;
1211
GapEditScript* esp_temp_2 = segment2_var->esp;
1212
GapEditScript* new_esp = NULL;
1215
/* we use the GapEditScript element pointed to be segment1_var, ignoring the one
1216
pointed to by segment2_var as it overlaps with the first. */
1217
esp_temp_1->num[segment1_var->esp_offset] += (end_2 - end_1);
1218
new_size = segment1_var->esp_offset + esp_temp_2->size - segment2_var->esp_offset;
1219
new_esp = GapEditScriptNew(new_size);
1220
GapEditScriptPartialCopy(new_esp, 0, esp_temp_1, 0, segment1_var->esp_offset);
1221
GapEditScriptPartialCopy(new_esp, 1+segment1_var->esp_offset, esp_temp_2,
1222
segment2_var->esp_offset+1, esp_temp_2->size-1);
1223
hsp1->gap_info = GapEditScriptDelete(hsp1->gap_info);
1224
hsp1->gap_info = new_esp;
1225
hsp1->query.end = hsp2->query.end;
1226
hsp1->subject.end = hsp2->subject.end;
1227
hsp1->score = MAX(hsp1->score, hsp2->score); /* Neither score may be correct, so we use the highest one. */
1231
segment1 = s_DeleteAllHSPSegments(segment1);
1232
segment2 = s_DeleteAllHSPSegments(segment2);
1237
/** Maximal diagonal distance between HSP starting offsets, within which HSPs
1238
* from search of different chunks of subject sequence are considered for
1241
#define OVERLAP_DIAG_CLOSE 10
1315
1242
/********************************************************************************
1316
1243
Functions manipulating BlastHSPList's
1317
1244
********************************************************************************/
1870
1806
hsp_count = hsp_list->hspcnt;
1872
1808
qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryOffsetCompareHSPs);
1873
while (index < hsp_count-increment)
1875
if (hsp_array[index+increment] == NULL)
1881
if (hsp_array[index] && hsp_array[index]->query.offset == hsp_array[index+increment]->query.offset &&
1882
hsp_array[index]->subject.offset == hsp_array[index+increment]->subject.offset &&
1883
hsp_array[index]->context == hsp_array[index+increment]->context)
1885
if (hsp_array[index]->score > hsp_array[index+increment]->score) {
1886
hsp_array[index+increment] =
1887
Blast_HSPFree(hsp_array[index+increment]);
1890
hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1810
while (i < hsp_count) {
1812
while (i+j < hsp_count &&
1813
hsp_array[i] && hsp_array[i+j] &&
1814
hsp_array[i]->context == hsp_array[i+j]->context &&
1815
hsp_array[i]->query.offset == hsp_array[i+j]->query.offset &&
1816
hsp_array[i]->subject.offset == hsp_array[i+j]->subject.offset) {
1817
hsp_array[i+j] = Blast_HSPFree(hsp_array[i+j]);
1900
1823
qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryEndCompareHSPs);
1903
while (index < hsp_count-increment)
1905
if (hsp_array[index+increment] == NULL)
1911
if (hsp_array[index] &&
1912
hsp_array[index]->query.end == hsp_array[index+increment]->query.end &&
1913
hsp_array[index]->subject.end == hsp_array[index+increment]->subject.end &&
1914
hsp_array[index]->context == hsp_array[index+increment]->context)
1916
if (hsp_array[index]->score > hsp_array[index+increment]->score) {
1917
hsp_array[index+increment] =
1918
Blast_HSPFree(hsp_array[index+increment]);
1921
hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1825
while (i < hsp_count) {
1827
while (i+j < hsp_count &&
1828
hsp_array[i] && hsp_array[i+j] &&
1829
hsp_array[i]->context == hsp_array[i+j]->context &&
1830
hsp_array[i]->query.end == hsp_array[i+j]->query.end &&
1831
hsp_array[i]->subject.end == hsp_array[i+j]->subject.end) {
1832
hsp_array[i+j] = Blast_HSPFree(hsp_array[i+j]);
1931
1838
retval = Blast_HSPListPurgeNullHSPs(hsp_list);
1935
1842
return hsp_list->hspcnt;
1938
/** Status values returned by an HSP inclusion test function. */
1939
typedef enum EHSPInclusionStatus {
1940
eEqual = 0, /**< Identical */
1941
eFirstInSecond, /**< First included in rectangle formed by second */
1942
eSecondInFirst, /**< Second included in rectangle formed by first */
1943
eDiagNear, /**< Diagonals are near, but neither HSP is included in
1945
eDiagDistant /**< Diagonals are far apart, or different contexts */
1946
} EHSPInclusionStatus;
1948
1845
/** Diagonal distance between HSPs, outside of which one HSP cannot be
1949
1846
* considered included in the other.
1951
1848
#define MIN_DIAG_DIST 60
1953
/** HSP inclusion criterion for megablast: one HSP must be included in a
1954
* diagonal strip of a certain width around the other, and also in a rectangle
1955
* formed by the other HSP's endpoints.
1957
static EHSPInclusionStatus
1958
s_BlastHSPInclusionTest(BlastHSP* hsp1, BlastHSP* hsp2)
1960
if (hsp1->context != hsp2->context ||
1961
!MB_HSP_CLOSE(hsp1->query.offset, hsp2->query.offset,
1962
hsp1->subject.offset, hsp2->subject.offset,
1964
return eDiagDistant;
1966
if (hsp1->query.offset == hsp2->query.offset &&
1967
hsp1->query.end == hsp2->query.end &&
1968
hsp1->subject.offset == hsp2->subject.offset &&
1969
hsp1->subject.end == hsp2->subject.end &&
1970
hsp1->score == hsp2->score) {
1972
} else if (hsp1->query.offset >= hsp2->query.offset &&
1973
hsp1->query.end <= hsp2->query.end &&
1974
hsp1->subject.offset >= hsp2->subject.offset &&
1975
hsp1->subject.end <= hsp2->subject.end &&
1976
hsp1->score <= hsp2->score) {
1977
return eFirstInSecond;
1978
} else if (hsp1->query.offset <= hsp2->query.offset &&
1979
hsp1->query.end >= hsp2->query.end &&
1980
hsp1->subject.offset <= hsp2->subject.offset &&
1981
hsp1->subject.end >= hsp2->subject.end &&
1982
hsp1->score >= hsp2->score) {
1983
return eSecondInFirst;
1988
/** How many HSPs to check for inclusion for each new HSP? */
1989
#define MAX_NUM_CHECK_INCLUSION 20
1991
1850
/** Remove redundant HSPs in an HSP list based on a diagonal inclusion test: if
1992
1851
* an HSP is within a certain diagonal distance of another HSP, and its endpoints
1993
1852
* are contained in a rectangle formed by another HSP, then it is removed.
1994
1853
* Performed only after a single-phase greedy gapped extension, when there is no
1995
1854
* extra traceback stage that could fix the inclusions.
1996
* @param hsp_list HSP list to check [in] [out]
1855
* @param query_blk Query sequence for the HSPs
1856
* @param subject_blk Subject sequence for the HSPs
1857
* @param query_info Used to map HSPs uniquely onto the complete
1858
* set of query sequences
1859
* @param hsp_list HSP list to check (must be in standard sorted order) [in/out]
1999
s_BlastHSPListCheckDiagonalInclusion(BlastHSPList* hsp_list)
1862
s_BlastHSPListCheckDiagonalInclusion(BLAST_SequenceBlk* query_blk,
1863
BLAST_SequenceBlk* subject_blk,
1864
const BlastQueryInfo* query_info,
1865
BlastHSPList* hsp_list)
2001
Int4 index, new_hspcnt, index1, index2;
2002
1868
BlastHSP** hsp_array = hsp_list->hsp_array;
2003
Boolean shift_needed = FALSE;
2004
EHSPInclusionStatus inclusion_status = eDiagNear;
1869
BlastIntervalTree *tree;
2006
1871
if (hsp_list->hspcnt <= 1)
2009
qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
2012
for (index=1, new_hspcnt=0; index<hsp_list->hspcnt; index++) {
2013
if (!hsp_array[index])
2015
inclusion_status = eDiagNear;
2016
for (index1 = new_hspcnt; inclusion_status != eDiagDistant &&
2017
index1 >= 0 && new_hspcnt-index1 < MAX_NUM_CHECK_INCLUSION;
2020
s_BlastHSPInclusionTest(hsp_array[index], hsp_array[index1]);
2021
if (inclusion_status == eFirstInSecond ||
2022
inclusion_status == eEqual) {
2023
/* Free the new HSP and break out of the inclusion test loop */
2024
hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2026
} else if (inclusion_status == eSecondInFirst) {
2027
hsp_array[index1] = Blast_HSPFree(hsp_array[index1]);
2028
shift_needed = TRUE;
2032
/* If some lower indexed HSPs have been removed, shift the subsequent
2035
/* Find the first non-NULL HSP, going backwards */
2036
while (index1 >= 0 && !hsp_array[index1])
2038
/* Go forward, and shift any non-NULL HSPs */
2039
for (index2 = ++index1; index1 <= new_hspcnt; index1++) {
2040
if (hsp_array[index1])
2041
hsp_array[index2++] = hsp_array[index1];
2043
new_hspcnt = index2 - 1;
2044
shift_needed = FALSE;
2046
if (hsp_array[index] != NULL)
2047
hsp_array[++new_hspcnt] = hsp_array[index];
1874
/* Remove any HSPs that are contained within other HSPs.
1875
Since the list is sorted by score already, any HSP
1876
contained by a previous HSP is guaranteed to have a
1877
lower score, and may be purged. */
1879
tree = Blast_IntervalTreeInit(0, query_blk->length + 1,
1880
0, subject_blk->length + 1);
1882
for (index = 0; index < hsp_list->hspcnt; index++) {
1883
BlastHSP *hsp = hsp_array[index];
1885
if (BlastIntervalTreeContainsHSP(tree, hsp, query_info,
1887
hsp_array[index] = Blast_HSPFree(hsp);
1890
BlastIntervalTreeAddHSP(hsp, tree, query_info,
2049
/* Set all HSP pointers that will not be used any more to NULL */
2050
memset(&hsp_array[new_hspcnt+1], 0,
2051
(hsp_list->hspcnt - new_hspcnt - 1)*sizeof(BlastHSP*));
2052
hsp_list->hspcnt = new_hspcnt + 1;
2054
/* Sort the HSP array by score again. */
2055
Blast_HSPListSortByScore(hsp_list);
1894
tree = Blast_IntervalTreeFree(tree);
1895
Blast_HSPListPurgeNullHSPs(hsp_list);
3176
3051
return phi_results;
3055
Blast_HSPResultsFromHSPStream(BlastHSPStream* hsp_stream,
3057
const BlastHitSavingOptions* hit_options,
3058
const BlastExtensionOptions* ext_options,
3059
const BlastScoringOptions* scoring_options)
3061
BlastHSPResults* retval = NULL;
3062
SBlastHitsParameters* bhp = NULL;
3063
BlastHSPList* hsp_list = NULL;
3065
SBlastHitsParametersNew(hit_options, ext_options, scoring_options, &bhp);
3067
retval = Blast_HSPResultsNew((Int4) num_queries);
3069
while (BlastHSPStreamRead(hsp_stream, &hsp_list) != kBlastHSPStream_Eof) {
3070
Blast_HSPResultsInsertHSPList(retval, hsp_list,
3071
bhp->prelim_hitlist_size);
3073
bhp = SBlastHitsParametersFree(bhp);
3077
/** Comparison function for sorting HSP lists in increasing order of the
3078
* number of HSPs in a hit. Needed for s_TrimResultsByTotalHSPLimit below.
3079
* @param v1 Pointer to the first HSP list [in]
3080
* @param v2 Pointer to the second HSP list [in]
3083
s_CompareHsplistHspcnt(const void* v1, const void* v2)
3085
BlastHSPList* r1 = *((BlastHSPList**) v1);
3086
BlastHSPList* r2 = *((BlastHSPList**) v2);
3088
if (r1->hspcnt < r2->hspcnt)
3090
else if (r1->hspcnt > r2->hspcnt)
3096
/** Removes extra results if a limit is imposed on the total number of HSPs
3097
* returned. If the search involves multiple query sequences, the total HSP
3098
* limit is applied separately to each query.
3099
* The trimming algorithm makes sure that at least 1 HSP is returned for each
3100
* database sequence hit. Suppose results for a given query consist of HSP
3101
* lists for N database sequences, and the limit is T. HSP lists are sorted in
3102
* order of increasing number of HSPs in each list. Then the algorithm proceeds
3103
* by leaving at most i*T/N HSPs for the first i HSP lists, for every
3105
* @param results Results after preliminary stage of a BLAST search [in|out]
3106
* @param total_hsp_limit Limit on total number of HSPs [in]
3107
* @return TRUE if HSP limit has been exceeded, FALSE otherwise.
3110
s_TrimResultsByTotalHSPLimit(BlastHSPResults* results, Uint4 total_hsp_limit)
3113
Boolean hsp_limit_exceeded = FALSE;
3115
if (total_hsp_limit == 0) {
3116
return hsp_limit_exceeded;
3119
for (query_index = 0; query_index < results->num_queries; ++query_index) {
3120
BlastHitList* hit_list = NULL;
3121
BlastHSPList** hsplist_array = NULL;
3122
Int4 hsplist_count = 0;
3125
if ( !(hit_list = results->hitlist_array[query_index]) )
3127
/* The count of HSPs is separate for each query. */
3128
hsplist_count = hit_list->hsplist_count;
3129
hsplist_array = (BlastHSPList**)
3130
malloc(hsplist_count*sizeof(BlastHSPList*));
3132
for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3133
hsplist_array[subj_index] = hit_list->hsplist_array[subj_index];
3136
qsort((void*)hsplist_array, hsplist_count,
3137
sizeof(BlastHSPList*), s_CompareHsplistHspcnt);
3140
Int4 tot_hsps = 0; /* total number of HSPs */
3141
Uint4 hsp_per_seq = MAX(1, total_hsp_limit/hsplist_count);
3142
for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3143
Int4 allowed_hsp_num = ((subj_index+1)*hsp_per_seq) - tot_hsps;
3144
BlastHSPList* hsp_list = hsplist_array[subj_index];
3145
if (hsp_list->hspcnt > allowed_hsp_num) {
3146
/* Free the extra HSPs */
3148
for (hsp_index = allowed_hsp_num;
3149
hsp_index < hsp_list->hspcnt; ++hsp_index) {
3150
Blast_HSPFree(hsp_list->hsp_array[hsp_index]);
3152
hsp_list->hspcnt = allowed_hsp_num;
3153
hsp_limit_exceeded = TRUE;
3155
tot_hsps += hsp_list->hspcnt;
3158
sfree(hsplist_array);
3161
return hsp_limit_exceeded;
3165
Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream* hsp_stream,
3167
const BlastHitSavingOptions* hit_options,
3168
const BlastExtensionOptions* ext_options,
3169
const BlastScoringOptions* scoring_options,
3171
Boolean* removed_hsps)
3173
Boolean rm_hsps = FALSE;
3174
BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3180
rm_hsps = s_TrimResultsByTotalHSPLimit(retval, max_num_hsps);
3182
*removed_hsps = rm_hsps;