1
/* ===========================================================================
4
* National Center for Biotechnology Information
6
* This software/database is a "United States Government Work" under the
7
* terms of the United States Copyright Act. It was written as part of
8
* the author's official duties as a United States Government employee and
9
* thus cannot be copyrighted. This software/database is freely available
10
* to the public for use. The National Library of Medicine and the U.S.
11
* Government have not placed any restriction on its use or reproduction.
13
* Although all reasonable efforts have been taken to ensure the accuracy
14
* and reliability of the software and data, the NLM and the U.S.
15
* Government do not and cannot warrant the performance or results that
16
* may be obtained by using this software or data. The NLM and the U.S.
17
* Government disclaim all warranties, express or implied, including
18
* warranties of performance, merchantability or fitness for any particular
21
* Please cite the author in any work or product based on this material.
23
* ===========================================================================*/
26
* @file smith_waterman.c
27
* Routines for computing rigorous, Smith-Waterman alignments.
29
* @author Alejandro Schaffer, E. Michael Gertz
31
#ifndef SKIP_DOXYGEN_PROCESSING
32
static char const rcsid[] =
33
"$Id: smith_waterman.c,v 1.4 2006/06/29 16:50:07 gertz Exp $";
34
#endif /* SKIP_DOXYGEN_PROCESSING */
36
#include <algo/blast/core/ncbi_std.h>
37
#include <algo/blast/composition_adjustment/composition_constants.h>
38
#include <algo/blast/composition_adjustment/smith_waterman.h>
40
/** A structure used internally by the Smith-Waterman algorithm to
42
typedef struct SwGapInfo {
43
int noGap; /**< score if opening a gap */
44
int gapExists; /**< score if continuing a gap */
49
* Compute the score and right-hand endpoints of the locally optimal
50
* Smith-Waterman alignment. Called by Blast_SmithWatermanScoreOnly
51
* when there are no forbidden ranges. nonempty. See
52
* Blast_SmithWatermanScoreOnly for the meaning of the parameters to
56
BLbasicSmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd,
57
const Uint1 * matchSeq, int matchSeqLength,
58
const Uint1 * query, int queryLength,
59
int **matrix, int gapOpen, int gapExtend,
62
int bestScore; /* best score seen so far */
63
int newScore; /* score of next entry */
64
int bestMatchSeqPos, bestQueryPos; /* position ending best score in
65
matchSeq and query sequences */
66
SwGapInfo *scoreVector; /* keeps one row of the
67
Smith-Waterman matrix overwrite
68
old row with new row */
69
int *matrixRow; /* one row of score matrix */
70
int newGapCost; /* cost to have a gap of one character */
71
int prevScoreNoGapMatchSeq; /* score one row and column up with
73
int prevScoreGapMatchSeq; /* score if a gap already started in
75
int continueGapScore; /* score for continuing a gap in matchSeq */
76
int matchSeqPos, queryPos; /* positions in matchSeq and query */
78
scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
79
if (scoreVector == NULL) {
85
newGapCost = gapOpen + gapExtend;
86
for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
87
scoreVector[matchSeqPos].noGap = 0;
88
scoreVector[matchSeqPos].gapExists = -gapOpen;
90
for (queryPos = 0; queryPos < queryLength; queryPos++) {
92
matrixRow = matrix[queryPos];
94
matrixRow = matrix[query[queryPos]];
96
prevScoreNoGapMatchSeq = 0;
97
prevScoreGapMatchSeq = -(gapOpen);
98
for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
99
/* testing scores with a gap in matchSeq, either starting a
100
* new gap or extending an existing gap*/
101
if ((newScore = newScore - newGapCost) >
102
(prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
103
prevScoreGapMatchSeq = newScore;
104
/* testing scores with a gap in query, either starting a
105
* new gap or extending an existing gap*/
106
if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
108
scoreVector[matchSeqPos].gapExists - gapExtend))
109
continueGapScore = newScore;
110
/* compute new score extending one position in matchSeq
113
prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
115
newScore = 0; /*Smith-Waterman locality condition*/
116
/*test two alternatives*/
117
if (newScore < prevScoreGapMatchSeq)
118
newScore = prevScoreGapMatchSeq;
119
if (newScore < continueGapScore)
120
newScore = continueGapScore;
121
prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
122
scoreVector[matchSeqPos].noGap = newScore;
123
scoreVector[matchSeqPos].gapExists = continueGapScore;
124
if (newScore > bestScore) {
125
bestScore = newScore;
126
bestQueryPos = queryPos;
127
bestMatchSeqPos = matchSeqPos;
134
*matchSeqEnd = bestMatchSeqPos;
135
*queryEnd = bestQueryPos;
143
* Find the left-hand endpoints of the locally optimal Smith-Waterman
144
* alignment. Called by Blast_SmithWatermanFindStart when there are no
145
* forbidden ranges. See Blast_SmithWatermanFindStartfor the meaning
146
* of the parameters to this routine.
149
BLSmithWatermanFindStart(int *score_out,
150
int *matchSeqStart, int *queryStart,
151
const Uint1 * matchSeq, int matchSeqLength,
153
int **matrix, int gapOpen, int gapExtend,
154
int matchSeqEnd, int queryEnd, int score_in,
155
int positionSpecific)
157
int bestScore; /* best score seen so far*/
158
int newScore; /* score of next entry*/
159
int bestMatchSeqPos, bestQueryPos; /*position starting best score in
160
matchSeq and database sequences */
161
SwGapInfo *scoreVector; /* keeps one row of the Smith-Waterman
162
matrix overwrite old row with new row */
163
int *matrixRow; /* one row of score matrix */
164
int newGapCost; /* cost to have a gap of one character */
165
int prevScoreNoGapMatchSeq; /* score one row and column up
167
int prevScoreGapMatchSeq; /* score if a gap already started in
169
int continueGapScore; /* score for continuing a gap in query */
170
int matchSeqPos, queryPos; /* positions in matchSeq and query */
172
scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
173
if (scoreVector == NULL) {
179
newGapCost = gapOpen + gapExtend;
180
for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
181
scoreVector[matchSeqPos].noGap = 0;
182
scoreVector[matchSeqPos].gapExists = -(gapOpen);
184
for (queryPos = queryEnd; queryPos >= 0; queryPos--) {
185
if (positionSpecific)
186
matrixRow = matrix[queryPos];
188
matrixRow = matrix[query[queryPos]];
190
prevScoreNoGapMatchSeq = 0;
191
prevScoreGapMatchSeq = -(gapOpen);
192
for (matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) {
193
/* testing scores with a gap in matchSeq, either starting
194
* a new gap or extending an existing gap */
195
if ((newScore = newScore - newGapCost) >
196
(prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
197
prevScoreGapMatchSeq = newScore;
198
/* testing scores with a gap in query, either starting a
199
* new gap or extending an existing gap */
200
if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
202
scoreVector[matchSeqPos].gapExists - gapExtend))
203
continueGapScore = newScore;
204
/* compute new score extending one position in matchSeq
207
prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
209
newScore = 0; /* Smith-Waterman locality condition */
210
/* test two alternatives */
211
if (newScore < prevScoreGapMatchSeq)
212
newScore = prevScoreGapMatchSeq;
213
if (newScore < continueGapScore)
214
newScore = continueGapScore;
215
prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
216
scoreVector[matchSeqPos].noGap = newScore;
217
scoreVector[matchSeqPos].gapExists = continueGapScore;
218
if (newScore > bestScore) {
219
bestScore = newScore;
220
bestQueryPos = queryPos;
221
bestMatchSeqPos = matchSeqPos;
223
if (bestScore >= score_in)
226
if (bestScore >= score_in)
232
*matchSeqStart = bestMatchSeqPos;
233
*queryStart = bestQueryPos;
234
*score_out = bestScore;
241
* Compute the score and right-hand endpoints of the locally optimal
242
* Smith-Waterman alignment, subject to the restriction that some
243
* ranges are forbidden. Called by Blast_SmithWatermanScoreOnly when
244
* forbiddenRanges is nonempty. See Blast_SmithWatermanScoreOnly for
245
* the meaning of the parameters to this routine.
248
BLspecialSmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd,
249
const Uint1 * matchSeq, int matchSeqLength,
250
const Uint1 *query, int queryLength,
251
int **matrix, int gapOpen, int gapExtend,
252
const int *numForbidden,
253
int ** forbiddenRanges,
254
int positionSpecific)
256
int bestScore; /* best score seen so far */
257
int newScore; /* score of next entry*/
258
int bestMatchSeqPos, bestQueryPos; /*position ending best score in
259
matchSeq and database sequences */
260
SwGapInfo *scoreVector; /* keeps one row of the Smith-Waterman
261
matrix overwrite old row with new row */
262
int *matrixRow; /* one row of score matrix */
263
int newGapCost; /* cost to have a gap of one character */
264
int prevScoreNoGapMatchSeq; /* score one row and column up
266
int prevScoreGapMatchSeq; /* score if a gap already started in
268
int continueGapScore; /* score for continuing a gap in query */
269
int matchSeqPos, queryPos; /* positions in matchSeq and query */
270
int forbidden; /* is this position forbidden? */
271
int f; /* index over forbidden positions */
273
scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
274
if (scoreVector == NULL) {
280
newGapCost = gapOpen + gapExtend;
281
for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
282
scoreVector[matchSeqPos].noGap = 0;
283
scoreVector[matchSeqPos].gapExists = -(gapOpen);
285
for (queryPos = 0; queryPos < queryLength; queryPos++) {
286
if (positionSpecific)
287
matrixRow = matrix[queryPos];
289
matrixRow = matrix[query[queryPos]];
291
prevScoreNoGapMatchSeq = 0;
292
prevScoreGapMatchSeq = -(gapOpen);
293
for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
294
/* testing scores with a gap in matchSeq, either starting
295
* a new gap or extending an existing gap */
296
if ((newScore = newScore - newGapCost) >
297
(prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
298
prevScoreGapMatchSeq = newScore;
299
/* testing scores with a gap in query, either starting a
300
* new gap or extending an existing gap */
301
if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
303
scoreVector[matchSeqPos].gapExists - gapExtend))
304
continueGapScore = newScore;
305
/* compute new score extending one position in matchSeq
308
for (f = 0; f < numForbidden[queryPos]; f++) {
309
if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) &&
310
(matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) {
316
newScore = COMPO_SCORE_MIN;
319
prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
321
newScore = 0; /* Smith-Waterman locality condition */
322
/* test two alternatives */
323
if (newScore < prevScoreGapMatchSeq)
324
newScore = prevScoreGapMatchSeq;
325
if (newScore < continueGapScore)
326
newScore = continueGapScore;
327
prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
328
scoreVector[matchSeqPos].noGap = newScore;
329
scoreVector[matchSeqPos].gapExists = continueGapScore;
330
if (newScore > bestScore) {
331
bestScore = newScore;
332
bestQueryPos = queryPos;
333
bestMatchSeqPos = matchSeqPos;
340
*matchSeqEnd = bestMatchSeqPos;
341
*queryEnd = bestQueryPos;
349
* Find the left-hand endpoints of the locally optimal Smith-Waterman
350
* alignment, subject to the restriction that certain ranges may not
351
* be aligned. Called by Blast_SmithWatermanFindStart if
352
* forbiddenRanges is nonempty. See Blast_SmithWatermanFindStartfor
353
* the meaning of the parameters to this routine.
356
BLspecialSmithWatermanFindStart(int * score_out,
357
int *matchSeqStart, int *queryStart,
358
const Uint1 * matchSeq, int matchSeqLength,
359
const Uint1 *query, int **matrix,
360
int gapOpen, int gapExtend, int matchSeqEnd,
361
int queryEnd, int score_in,
362
const int *numForbidden,
363
int ** forbiddenRanges,
364
int positionSpecific)
366
int bestScore; /* best score seen so far */
367
int newScore; /* score of next entry */
368
int bestMatchSeqPos, bestQueryPos; /* position starting best score in
369
matchSeq and database sequences */
370
SwGapInfo *scoreVector; /* keeps one row of the
371
Smith-Waterman matrix; overwrite
372
old row with new row*/
373
int *matrixRow; /* one row of score matrix */
374
int newGapCost; /* cost to have a gap of one character */
375
int prevScoreNoGapMatchSeq; /* score one row and column up
377
int prevScoreGapMatchSeq; /* score if a gap already started in
379
int continueGapScore; /* score for continuing a gap in query */
380
int matchSeqPos, queryPos; /* positions in matchSeq and query */
381
int forbidden; /* is this position forbidden? */
382
int f; /* index over forbidden positions */
384
scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
385
if (scoreVector == NULL) {
391
newGapCost = gapOpen + gapExtend;
392
for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
393
scoreVector[matchSeqPos].noGap = 0;
394
scoreVector[matchSeqPos].gapExists = -(gapOpen);
396
for (queryPos = queryEnd; queryPos >= 0; queryPos--) {
397
if (positionSpecific)
398
matrixRow = matrix[queryPos];
400
matrixRow = matrix[query[queryPos]];
402
prevScoreNoGapMatchSeq = 0;
403
prevScoreGapMatchSeq = -(gapOpen);
404
for (matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) {
405
/* testing scores with a gap in matchSeq, either starting a
406
* new gap or extending an existing gap*/
407
if ((newScore = newScore - newGapCost) >
408
(prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
409
prevScoreGapMatchSeq = newScore;
410
/* testing scores with a gap in query, either starting a
411
* new gap or extending an existing gap*/
412
if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
414
scoreVector[matchSeqPos].gapExists - gapExtend))
415
continueGapScore = newScore;
416
/* compute new score extending one position in matchSeq
419
for (f = 0; f < numForbidden[queryPos]; f++) {
420
if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) &&
421
(matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) {
427
newScore = COMPO_SCORE_MIN;
430
prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
432
newScore = 0; /* Smith-Waterman locality condition */
433
/* test two alternatives */
434
if (newScore < prevScoreGapMatchSeq)
435
newScore = prevScoreGapMatchSeq;
436
if (newScore < continueGapScore)
437
newScore = continueGapScore;
438
prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
439
scoreVector[matchSeqPos].noGap = newScore;
440
scoreVector[matchSeqPos].gapExists = continueGapScore;
441
if (newScore > bestScore) {
442
bestScore = newScore;
443
bestQueryPos = queryPos;
444
bestMatchSeqPos = matchSeqPos;
446
if (bestScore >= score_in)
449
if (bestScore >= score_in)
455
*matchSeqStart = bestMatchSeqPos;
456
*queryStart = bestQueryPos;
457
*score_out = bestScore;
463
/* Documented in smith_waterman.h. */
465
Blast_ForbiddenRangesRelease(Blast_ForbiddenRanges * self)
469
for (f = 0; f < self->capacity; f++) free(self->ranges[f]);
471
free(self->ranges); self->ranges = NULL;
472
free(self->numForbidden); self->numForbidden = NULL;
476
/* Documented in smith_waterman.h. */
478
Blast_ForbiddenRangesInitialize(Blast_ForbiddenRanges * self,
482
self->capacity = capacity;
483
self->numForbidden = NULL;
485
self->isEmpty = TRUE;
487
self->numForbidden = (int *) calloc(capacity, sizeof(int));
488
if (self->numForbidden == NULL)
490
self->ranges = (int **) calloc(capacity, sizeof(int *));
491
if (self->ranges == NULL)
493
for (f = 0; f < capacity; f++) {
494
self->numForbidden[f] = 0;
495
self->ranges[f] = (int *) malloc(2 * sizeof(int));
496
if (self->ranges[f] == NULL)
498
self->ranges[f][0] = 0;
499
self->ranges[f][1] = 0;
503
Blast_ForbiddenRangesRelease(self);
508
/* Documented in smith_waterman.h. */
509
void Blast_ForbiddenRangesClear(Blast_ForbiddenRanges * self)
512
for (f = 0; f < self->capacity; f++) {
513
self->numForbidden[f] = 0;
515
self->isEmpty = TRUE;
519
/* Documented in smith_waterman.h. */
521
Blast_ForbiddenRangesPush(Blast_ForbiddenRanges * self,
528
for (f = queryStart; f < queryEnd; f++) {
529
int last = 2 * self->numForbidden[f];
530
if (0 != last) { /* we must resize the array */
532
realloc(self->ranges[f], (last + 2) * sizeof(int));
533
if (new_ranges == NULL)
535
self->ranges[f] = new_ranges;
537
self->ranges[f][last] = matchStart;
538
self->ranges[f][last + 1] = matchEnd;
540
self->numForbidden[f]++;
542
self->isEmpty = FALSE;
548
/* Documented in smith_waterman.h. */
550
Blast_SmithWatermanScoreOnly(int *score,
551
int *matchSeqEnd, int *queryEnd,
552
const Uint1 * subject_data, int subject_length,
553
const Uint1 * query_data, int query_length,
557
int positionSpecific,
558
const Blast_ForbiddenRanges * forbiddenRanges )
560
if (forbiddenRanges->isEmpty) {
561
return BLbasicSmithWatermanScoreOnly(score, matchSeqEnd,
562
queryEnd, subject_data,
564
query_data, query_length,
569
return BLspecialSmithWatermanScoreOnly(score, matchSeqEnd,
570
queryEnd, subject_data,
573
query_length, matrix,
575
forbiddenRanges->numForbidden,
576
forbiddenRanges->ranges,
582
/* Documented in smith_waterman.h. */
584
Blast_SmithWatermanFindStart(int * score_out,
587
const Uint1 * subject_data, int subject_length,
588
const Uint1 * query_data,
595
int positionSpecific,
596
const Blast_ForbiddenRanges * forbiddenRanges)
598
if (forbiddenRanges->isEmpty) {
599
return BLSmithWatermanFindStart(score_out, matchSeqStart,
600
queryStart, subject_data,
601
subject_length, query_data,
602
matrix, gapOpen, gapExtend,
603
matchSeqEnd, queryEnd,
604
score_in, positionSpecific);
606
return BLspecialSmithWatermanFindStart(score_out,
613
matchSeqEnd, queryEnd,
615
forbiddenRanges->numForbidden,
616
forbiddenRanges->ranges,