1
/* $Id: greedy_align.c,v 1.24 2004/10/07 14:05:53 papadopo Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* Author: Webb Miller and Co. Adopted for NCBI libraries by Sergey Shavirin
28
* Initial Creation Date: 10/27/1999
32
/** @file greedy_align.c
33
* Greedy gapped alignment functions
36
static char const rcsid[] =
37
"$Id: greedy_align.c,v 1.24 2004/10/07 14:05:53 papadopo Exp $";
39
#include <algo/blast/core/greedy_align.h>
40
#include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE macros */
42
/** Constants used during greedy alignment */
44
MAX_SPACE = 1000000, /**< The number of SThreeVal structures available
45
in a single link of the greedy memory pool */
46
LARGE = 100000000, /**< used to represent infinity */
49
/** Ensures that an edit script has enough memory allocated
50
to hold a given number of edit operations
52
@param script The script to examine [in/modified]
53
@param total_ops Number of operations the script must support [in]
54
@return 0 if successful, nonzero otherwise
57
edit_script_realloc(MBGapEditScript *script, Uint4 total_ops)
59
if (script->num_ops_allocated <= total_ops) {
60
Uint4 new_size = total_ops * 3 / 2;
63
new_ops = realloc(script->edit_ops, new_size *
68
script->edit_ops = new_ops;
69
script->num_ops_allocated = new_size;
74
/** Add an edit operation to an edit script
76
@param script The edit script to update [in/modified]
77
@param op_type The edit operation to add [in]
78
@param num_ops The number of operations of the specified type [in]
79
@return 0 on success, nonzero otherwise
82
edit_script_add_new(MBGapEditScript *script,
83
enum EOpType op_type, Uint4 num_ops)
85
if (edit_script_realloc(script, script->num_ops + 2) != 0)
88
ASSERT(op_type != eEditOpError);
90
script->last_op = op_type;
91
script->edit_ops[script->num_ops].op_type = op_type;
92
script->edit_ops[script->num_ops].num_ops = num_ops;
98
/** Initialize an edit script structure
100
@param script Pointer to an uninitialized edit script
101
@return The initialized edit script
103
static MBGapEditScript *
104
edit_script_init(MBGapEditScript *script)
106
script->edit_ops = NULL;
107
script->num_ops_allocated = 0;
109
script->last_op = eEditOpError;
110
edit_script_realloc(script, 8);
114
/** Add a new operation to an edit script, possibly combining
115
it with the last operation if the two operations are identical
117
@param script The edit script to update [in/modified]
118
@param op_type The operation type to add [in]
119
@param num_ops The number of the specified type of operation to add [in]
120
@return 0 on success, nonzero otherwise
123
edit_script_add(MBGapEditScript *script,
124
enum EOpType op_type, Uint4 num_ops)
126
if (op_type == eEditOpError) {
127
#ifdef ERR_POST_EX_DEFINED
128
ErrPostEx(SEV_FATAL, 1, 0,
129
"edit_script_more: bad opcode %d:%d", op_type, num_ops);
134
if (script->last_op == op_type)
135
script->edit_ops[script->num_ops-1].num_ops += num_ops;
137
edit_script_add_new(script, op_type, num_ops);
142
/** See greedy_align.h for description */
144
MBGapEditScriptAppend(MBGapEditScript *dest_script,
145
MBGapEditScript *src_script)
149
for (i = 0; i < src_script->num_ops; i++) {
150
MBEditOp *edit_op = src_script->edit_ops + i;
152
edit_script_add(dest_script, edit_op->op_type, edit_op->num_ops);
158
/** See greedy_align.h for description */
160
MBGapEditScriptNew(void)
162
MBGapEditScript *script = calloc(1, sizeof(MBGapEditScript));
164
return edit_script_init(script);
168
/** See greedy_align.h for description */
170
MBGapEditScriptFree(MBGapEditScript *script)
175
if (script->edit_ops)
176
sfree(script->edit_ops);
182
/** Reverse the order of the operations in an edit script
184
@param script The script to be reversed [in/modified]
185
@return Pointer to the updated edit script
187
static MBGapEditScript *
188
edit_script_reverse_inplace(MBGapEditScript *script)
191
const Uint4 num_ops = script->num_ops;
192
const Uint4 mid = num_ops / 2;
193
const Uint4 end = num_ops - 1;
195
for (i = 0; i < mid; ++i) {
196
const MBEditOp t = script->edit_ops[i];
197
script->edit_ops[i] = script->edit_ops[end - i];
198
script->edit_ops[end - i] = t;
203
/** see greedy_align.h for description */
209
/** @todo FIXME: Later code assumes that a request will
210
never be made for more than MAX_SPACE structures at once */
212
new_space = (SMBSpace*)malloc(sizeof(SMBSpace));
213
if (new_space == NULL)
216
new_space->space_array = (SThreeVal*)malloc(
217
MAX_SPACE * sizeof(SThreeVal));
218
if (new_space->space_array == NULL) {
222
new_space->space_used = 0;
223
new_space->space_allocated = MAX_SPACE;
224
new_space->next = NULL;
229
/** Mark the input collection of space structures as unused
230
@param space The space to mark
233
refresh_mb_space(SMBSpace* space)
235
while (space != NULL) {
236
space->space_used = 0;
241
/** see greedy_align.h for description */
242
void MBSpaceFree(SMBSpace* space)
244
SMBSpace* next_space;
246
while (space != NULL) {
247
next_space = space->next;
248
sfree(space->space_array);
254
/** Allocate a specified number of SThreeVal structures from
257
@param pool The memory pool [in]
258
@param num_alloc The number of structures to allocate [in]
259
@return Pointer to the allocated memory, or NULL in case of error
262
get_mb_space(SMBSpace* pool, Int4 num_alloc)
268
while (pool->space_used + num_alloc > pool->space_allocated) {
269
if (pool->next == NULL) {
270
pool->next = MBSpaceNew();
271
if (pool->next == NULL) {
272
#ifdef ERR_POST_EX_DEFINED
273
ErrPostEx(SEV_WARNING, 0, 0, "Cannot get new space for greedy extension");
281
out_ptr = pool->space_array + pool->space_used;
282
pool->space_used += num_alloc;
287
/** Compute the greatest common divisor of two numbers
288
@param a First number [in]
289
@param b Second number [in]
290
@return The computed gcd
300
while ((c = a % b) != 0) {
307
/** Compute the greatest common divisor of three numbers,
308
divide that quantity out of each number, and return the computed GCD
309
@param a Pointer to first number [in/modified]
310
@param b Pointer to second number [in/modified]
311
@param c Pointer to third number [in/modified]
312
@return The computed gcd
315
gdb3(Int4* a, Int4* b, Int4* c)
321
g = gcd(*a, gcd(*b, *c));
330
/** During the traceback for a greedy affine alignment,
331
determine the state of the traceback that results
332
from moving a specified number of diagonals upwards
333
in the traceback array from a match
335
@param flast_d 2-D Array of traceback scores [in]
336
@param lower Array of per-diagonal lower bounds [in]
337
@param upper Array of per-diagonal upper bounds [in]
338
@param d Starting score / ending score(?) [in/modified]
339
@param diag Starting diagonal(?) [in]
340
@param Mis_cost Cost of a mismatch [in]
341
@param row1 The row of the traceback array where
342
the next score occurs(?) [out]
343
@return The state for the next traceback operation
346
get_lastC(SThreeVal** flast_d, Int4* lower, Int4* upper,
347
Int4* d, Int4 diag, Int4 Mis_cost, Int4* row1)
351
if (diag >= lower[(*d)-Mis_cost] &&
352
diag <= upper[(*d)-Mis_cost]) {
354
row = flast_d[(*d)-Mis_cost][diag].C;
355
if (row >= MAX(flast_d[*d][diag].I, flast_d[*d][diag].D)) {
358
return eEditOpReplace;
361
if (flast_d[*d][diag].I > flast_d[*d][diag].D) {
362
*row1 = flast_d[*d][diag].I;
363
return eEditOpInsert;
366
*row1 = flast_d[*d][diag].D;
367
return eEditOpDelete;
371
/** During the traceback for a greedy affine alignment,
372
determine the state of the traceback that results
373
from moving a specified number of diagonals upwards
374
in the traceback array from an insertion or deletion
376
@param flast_d 2-D Array of scores [in]
377
@param lower Array of per-diagonal lower bounds [in]
378
@param upper Array of per-diagonal upper bounds [in]
379
@param d Starting score / ending score(?) [in/modified]
380
@param diag Starting diagonal(?) [in]
381
@param GO_cost Cost to open a gap [in]
382
@param GE_cost Cost to extend a gap [in]
383
@param IorD The state of the traceback at present [in]
384
@return The state for the next traceback operation
387
get_last_ID(SThreeVal** flast_d, Int4* lower, Int4* upper,
388
Int4* d, Int4 diag, Int4 GO_cost,
389
Int4 GE_cost, enum EOpType IorD)
394
if (IorD == eEditOpInsert)
399
if (ndiag >= lower[(*d)-GE_cost] &&
400
ndiag <= upper[(*d)-GE_cost]) {
402
if (IorD == eEditOpInsert)
403
row = flast_d[(*d)-GE_cost][ndiag].I;
405
row = flast_d[(*d)-GE_cost][ndiag].D;
411
if (ndiag >= lower[(*d)-GO_cost-GE_cost] &&
412
ndiag <= upper[(*d)-GO_cost-GE_cost] &&
413
row < flast_d[(*d)-GO_cost-GE_cost][ndiag].C) {
415
*d = (*d) - GO_cost - GE_cost;
416
return eEditOpReplace;
422
/** During the traceback for a non-affine greedy alignment,
423
compute the diagonal that the next traceback operation
426
@param flast_d 2-D Array of scores [in]
427
@param d Starting score(?) [in]
428
@param diag Starting diagonal(?) [in]
429
@param row1 The next traceback row to examine(?) [out]
430
@return The next diagonal in the traceback
433
get_last(Int4 **flast_d, Int4 d, Int4 diag, Int4 *row1)
435
if (flast_d[d-1][diag-1] > MAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
436
*row1 = flast_d[d-1][diag-1];
439
if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
440
*row1 = flast_d[d-1][diag];
443
*row1 = flast_d[d-1][diag+1];
447
/** see greedy_align.h for description */
448
Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1,
449
const Uint1* seq2, Int4 len2,
450
Boolean reverse, Int4 xdrop_threshold,
451
Int4 match_cost, Int4 mismatch_cost,
452
Int4* extent1, Int4* extent2,
453
SGreedyAlignMem* aux_data,
454
MBGapEditScript *script, Uint1 rem)
456
Int4 col; /* column number */
457
Int4 row; /* row number */
458
Int4 d; /* current distance */
459
Int4 k; /* current diagonal */
460
Int4 flower, fupper; /* boundaries for searching diagonals */
461
Int4 MAX_D; /* maximum cost */
464
Int4** flast_d = aux_data->flast_d; /* rows containing the last d */
465
Int4* max_row; /* reached for cost d=0, ... len1. */
467
Int4 match_cost_half = match_cost / 2;
468
Int4 op_cost = mismatch_cost + match_cost;
469
Int4 d_dropoff = ICEIL(xdrop_threshold + match_cost_half, op_cost);
473
Int4 best_diag = INT4_MAX / 2;
474
Int4* max_row_free = aux_data->max_row_free;
476
SMBSpace* mem_pool = aux_data->space;
479
MAX_D = (Int4) (len1 / GREEDY_MAX_COST_FRACTION + 1);
484
/* find the offset of the first mismatch between
490
while (row < len1 && row < len2) {
491
if (seq2[len2-1-row] != seq1[len1-1-row])
497
while (row < len1 && row < len2) {
498
if (seq2[len2-1-row] !=
499
NCBI2NA_UNPACK_BASE(seq1[(len1-1-row) / 4],
500
3 - (len1-1-row) % 4))
508
while (row < len1 && row < len2) {
509
if (seq2[row] != seq1[row])
515
while (row < len1 && row < len2) {
516
if (seq2[row] != NCBI2NA_UNPACK_BASE(seq1[(row+rem) / 4],
524
/* update the extents of the alignment, and bail out
525
early if no further work is needed. */
529
if (row == len1 || row == len2) {
531
edit_script_add(script, eEditOpReplace, row);
535
/* set up the memory pool */
537
if (script == NULL) {
540
else if (mem_pool == NULL) {
541
aux_data->space = mem_pool = MBSpaceNew();
544
refresh_mb_space(mem_pool);
547
max_row = max_row_free + d_dropoff;
548
for (k = 0; k < d_dropoff; k++)
551
flast_d[0][ORIGIN] = row;
552
max_row[0] = row * match_cost;
559
Int4 x, flower0, fupper0;
561
flast_d[d - 1][flower-1] = -1;
562
flast_d[d - 1][flower] = -1;
563
flast_d[d - 1][fupper] = -1;
564
flast_d[d - 1][fupper+1] = -1;
566
x = max_row[d - d_dropoff] + op_cost * d - xdrop_threshold;
567
x = ICEIL(x, match_cost_half);
572
for (k = flower0; k <= fupper0; k++) {
573
row = MAX(flast_d[d - 1][k + 1], flast_d[d - 1][k]) + 1;
574
row = MAX(row, flast_d[d - 1][k - 1]);
575
col = row + k - ORIGIN;
576
if (row + col >= x) {
587
if (row > max_len || row < 0) {
592
/* Slide down the diagonal. */
595
while (row < len2 && col < len1 &&
596
seq2[len2-1-row] == seq1[len1-1-col]) {
601
while (row < len2 && col < len1 && seq2[len2-1-row] ==
602
NCBI2NA_UNPACK_BASE(seq1[(len1-1-col) / 4],
603
3 - (len1-1-col) % 4)) {
610
while (row < len2 && col < len1 &&
611
seq2[row] == seq1[col]) {
616
while (row < len2 && col < len1 && seq2[row] ==
617
NCBI2NA_UNPACK_BASE(seq1[(col+rem) / 4],
618
3 - (col+rem) % 4)) {
625
if (row + col > cur_max) {
639
k = cur_max * match_cost_half - d * op_cost;
640
if (max_row[d - 1] < k) {
644
*extent2 = flast_d[d][b_diag];
645
*extent1 = (*extent2) + b_diag - ORIGIN;
648
max_row[d] = max_row[d - 1];
658
if (script == NULL) {
659
flast_d[d] = flast_d[d - 2];
662
/* space array consists of SThreeVal structures which are
663
3 times larger than Int4, so divide requested amount by 3
665
flast_d[d] = (Int4*) get_mb_space(mem_pool,
666
(fupper - flower + 7) / 3);
667
if (flast_d[d] != NULL)
668
flast_d[d] = flast_d[d] - flower + 2;
674
/* perform traceback if desired */
676
if (script != NULL) {
685
Int4 row1, col1, diag1;
686
diag1 = get_last(flast_d, d, diag, &row1);
687
col1 = row1 + diag1 - ORIGIN;
689
if (row - row1 > 0) {
690
edit_script_add(script, eEditOpReplace, row - row1);
693
else if (diag1 < diag) {
694
if (row - row1 > 0) {
695
edit_script_add(script, eEditOpReplace, row - row1);
697
edit_script_add(script, eEditOpInsert, 1);
700
if (row - row1 - 1 > 0) {
701
edit_script_add(script, eEditOpReplace, row - row1 - 1);
703
edit_script_add(script, eEditOpDelete, 1);
710
edit_script_add(script, eEditOpReplace, flast_d[0][ORIGIN]);
712
edit_script_reverse_inplace(script);
717
/** See greedy_align.h for description */
718
Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1,
719
const Uint1* seq2, Int4 len2,
720
Boolean reverse, Int4 xdrop_threshold,
721
Int4 match_score, Int4 mismatch_score,
722
Int4 gap_open, Int4 gap_extend,
723
Int4* extent1, Int4* extent2,
724
SGreedyAlignMem* aux_data,
725
MBGapEditScript *script, Uint1 rem)
727
Int4 col; /* column number */
728
Int4 row; /* row number */
729
Int4 d; /* current distance */
730
Int4 k; /* current diagonal */
731
Int4 flower, fupper; /* boundaries for searching diagonals */
732
Int4 MAX_D; /* maximum cost */
734
Int4 final_score = 0;
735
SThreeVal** flast_d; /* rows containing the last d */
737
Int4* max_row; /* reached for cost d=0, ... len1. */
738
Int4 Mis_cost, GO_cost, GE_cost;
740
Int4 match_score_half;
746
Int4 best_diag = INT4_MAX/2;
747
char nlower = 0, nupper = 0;
748
SMBSpace* mem_pool = aux_data->space;
754
if (match_score % 2 == 1) {
757
xdrop_threshold *= 2;
762
if (gap_open == 0 && gap_extend == 0) {
763
return BLAST_GreedyAlign(seq1, len1, seq2, len2, reverse,
764
xdrop_threshold, match_score,
765
mismatch_score, extent1, extent2,
766
aux_data, script, rem);
769
match_score_half = match_score / 2;
770
Mis_cost = mismatch_score + match_score;
772
GE_cost = gap_extend + match_score_half;
773
gd = gdb3(&Mis_cost, &GO_cost, &GE_cost);
774
D_diff = ICEIL(xdrop_threshold + match_score_half, gd);
776
MAX_D = (Int4) (len1/GREEDY_MAX_COST_FRACTION + 1);
777
max_d = MAX_D * GE_cost;
779
max_cost = MAX(Mis_cost, GO_cost + GE_cost);
783
/* find the offset of the first mismatch between
789
while (row < len1 && row < len2) {
790
if (seq2[len2-1-row] != seq1[len1-1-row])
796
while (row < len1 && row < len2) {
797
if (seq2[len2-1-row] !=
798
NCBI2NA_UNPACK_BASE(seq1[(len1-1-row) / 4],
799
3 - (len1-1-row) % 4))
807
while (row < len1 && row < len2) {
808
if (seq2[row] != seq1[row])
814
while (row < len1 && row < len2) {
815
if (seq2[row] != NCBI2NA_UNPACK_BASE(seq1[(row+rem) / 4],
823
/* update the extents of the alignment, and bail out
824
early if no further work is needed */
828
if (row == len1 || row == len2) {
830
edit_script_add(script, eEditOpReplace, row);
831
return row * match_score;
834
/* set up the memory pool */
836
if (script == NULL) {
839
else if (!mem_pool) {
840
aux_data->space = mem_pool = MBSpaceNew();
843
refresh_mb_space(mem_pool);
846
flast_d = aux_data->flast_d_affine;
847
max_row_free = aux_data->max_row_free;
848
max_row = max_row_free + D_diff;
849
for (k = 0; k < D_diff; k++)
852
uplow_free = aux_data->uplow_free;
854
upper = uplow_free + max_d + 1 + max_cost;
856
/* set boundary for -1,-2,...,-max_cost+1*/
857
for (k = 0; k < max_cost; k++) {
864
flast_d[0][ORIGIN].C = row;
865
flast_d[0][ORIGIN].I = -2;
866
flast_d[0][ORIGIN].D = -2;
867
max_row[0] = row * match_score;
876
Int4 x, flower0, fupper0;
878
x = max_row[d - D_diff] + gd * d - xdrop_threshold;
879
x = ICEIL(x, match_score_half);
886
for (k = flower0; k <= fupper0; k++) {
888
if (k+1 <= upper[d-GO_cost-GE_cost] &&
889
k+1 >= lower[d-GO_cost-GE_cost]) {
890
row = flast_d[d-GO_cost-GE_cost][k+1].C;
892
if (k+1 <= upper[d-GE_cost] && k+1 >= lower[d-GE_cost] &&
893
row < flast_d[d-GE_cost][k+1].D) {
894
row = flast_d[d-GE_cost][k+1].D;
898
if (2 * row + k - ORIGIN >= x) {
899
flast_d[d][k].D = row;
902
flast_d[d][k].D = -2;
906
if (k-1 <= upper[d-GO_cost-GE_cost] &&
907
k-1 >= lower[d-GO_cost-GE_cost]) {
908
row = flast_d[d-GO_cost-GE_cost][k-1].C;
910
if (k-1 <= upper[d-GE_cost] &&
911
k-1 >= lower[d-GE_cost] &&
912
row < flast_d[d-GE_cost][k-1].I) {
913
row = flast_d[d-GE_cost][k-1].I;
915
if (2 * row + k - ORIGIN >= x) {
916
flast_d[d][k].I = row;
919
flast_d[d][k].I = -2;
922
row = MAX(flast_d[d][k].I, flast_d[d][k].D);
923
if (k <= upper[d-Mis_cost] &&
924
k >= lower[d-Mis_cost]) {
925
row = MAX(flast_d[d-Mis_cost][k].C+1,row);
928
col = row + k - ORIGIN;
929
if (row + col >= x) {
936
flast_d[d][k].C = -2;
940
if (row > max_len || row < -2) {
941
flower = k; nlower = k+1;
944
/* slide down the diagonal */
947
while (row < len2 && col < len1 &&
948
seq2[len2-1-row] == seq1[len1-1-col]) {
953
while (row < len2 && col < len1 && seq2[len2-1-row] ==
954
NCBI2NA_UNPACK_BASE(seq1[(len1-1-col) / 4],
955
3 - (len1-1-col) % 4)) {
962
while (row < len2 && col < len1 &&
963
seq2[row] == seq1[col]) {
968
while (row < len2 && col < len1 && seq2[row] ==
969
NCBI2NA_UNPACK_BASE(seq1[(col+rem) / 4],
970
3 - (col+rem) % 4)) {
976
flast_d[d][k].C = row;
977
if (row + col > cur_max) {
991
k = cur_max * match_score_half - d * gd;
992
if (max_row[d - 1] < k) {
996
*extent2 = flast_d[d][b_diag].C;
997
*extent1 = (*extent2) + b_diag - ORIGIN;
1000
max_row[d] = max_row[d - 1];
1003
if (flower <= fupper) {
1013
if (lower[d-max_cost] <= upper[d-max_cost])
1015
if (stop_condition == 0)
1019
flower = MIN(lower[d-Mis_cost],
1020
MIN(lower[d-GO_cost-GE_cost], lower[d-GE_cost])-1);
1022
flower = MAX(flower, nlower);
1024
fupper = MAX(upper[d-Mis_cost],
1025
MAX(upper[d-GO_cost-GE_cost], upper[d-GE_cost])+1);
1027
fupper = MIN(fupper, nupper);
1030
if (script == NULL) {
1031
flast_d[d] = flast_d[d - max_cost-1];
1034
flast_d[d] = get_mb_space(mem_pool, fupper-flower+1)-flower;
1035
if (flast_d[d] == NULL)
1041
/* compute the traceback if desired */
1043
if (script != NULL) {
1050
state = eEditOpReplace;
1052
if (state == eEditOpReplace) {
1053
/* diag unchanged */
1054
state = get_lastC(flast_d, lower, upper,
1055
&d, diag, Mis_cost, &row1);
1057
edit_script_add(script, eEditOpReplace, row-row1);
1061
if (state == eEditOpInsert) {
1063
state = get_last_ID(flast_d, lower, upper, &d,
1064
diag, GO_cost, GE_cost, eEditOpInsert);
1066
edit_script_add(script, eEditOpInsert, 1);
1069
edit_script_add(script, eEditOpDelete, 1);
1070
state = get_last_ID(flast_d, lower, upper, &d,
1071
diag, GO_cost, GE_cost, eEditOpDelete);
1077
edit_script_add(script, eEditOpReplace, flast_d[0][ORIGIN].C);
1079
edit_script_reverse_inplace(script);
1081
final_score = max_row[final_score];