1
/* $Id: blast_tune.c,v 1.1 2006/04/19 17:40:28 papadopo Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* Author: Jason Papadopoulos
30
/** @file blast_tune.c
31
* Routines that compute a blastn word size appropriate for finding,
32
* with high probability, alignments with specified length and
36
#ifndef SKIP_DOXYGEN_PROCESSING
37
static char const rcsid[] =
38
"$Id: blast_tune.c,v 1.1 2006/04/19 17:40:28 papadopo Exp $";
39
#endif /* SKIP_DOXYGEN_PROCESSING */
41
#include <algo/blast/core/blast_def.h>
42
#include <algo/blast/core/blast_tune.h>
44
/** structure containing intermediate data to be processed */
45
typedef struct MatrixData {
46
Int4 matrix_dim_alloc; /**< max matrix size */
47
Int4 matrix_dim; /**< current matrix size */
48
double hit_probability; /**< for the current Markov chain, the
49
probability that blastn will find
50
a hit of specified length with
52
double percent_identity; /**< the target percent identity, used
53
to choose the blastn word size */
54
double *power_matrix; /**< space for iterated Markov chain */
55
double *prod_matrix; /**< scratch space for matrix multiply */
58
/** the probability that a random alignment will be found.
59
Given particulars about the alignment, we will attempt
60
to compute the largest blastn word size that has at least
61
this probability of finding a random alignment */
62
#define TARGET_HIT_PROB 0.98
64
/** initialize intermediate state. Note that memory for
65
* the matrices gets allocated later.
66
* @param m pointer to intermediate state [in][out]
67
* @return -1 if m is NULL, zero otherwise
69
static Int2 s_MatrixDataInit(MatrixData *m)
74
memset(m, 0, sizeof(MatrixData));
78
/** Free previously allocated scratch data
79
* @param m pointer to intermediate state [in][out]
81
static void s_MatrixDataFree(MatrixData *m)
84
sfree(m->power_matrix);
85
sfree(m->prod_matrix);
89
/** Set up for the next calculation of hit probability.
90
* @param m Space for the Markov chain calculation [in][out]
91
* @param new_word_size The blastn word size to be used
92
* for the current test. The internally generated
93
* matrix has dimension one larger than this [in]
94
* @param percent_identity The desired amount of identity in
95
* alignments. A fractional number (0...1) [in]
96
* @return 0 if successful
98
static Int2 s_MatrixDataReset(MatrixData *m,
100
double percent_identity)
105
m->hit_probability = 0.0;
106
m->percent_identity = percent_identity;
107
m->matrix_dim = new_word_size + 1;
109
/* reallocate the two scratch matrices only if the new
110
matrix dimension exceeds the amount of space previously
112
if (m->matrix_dim > m->matrix_dim_alloc) {
114
Int4 num_cells = m->matrix_dim * m->matrix_dim;
115
m->matrix_dim_alloc = m->matrix_dim;
116
m->power_matrix = (double *)realloc(m->power_matrix,
117
num_cells * sizeof(double));
118
m->prod_matrix = (double *)realloc(m->prod_matrix,
119
num_cells * sizeof(double));
121
if (m->power_matrix == NULL || m->prod_matrix == NULL) {
122
sfree(m->power_matrix);
123
sfree(m->prod_matrix);
130
/** Loads the initial value for matrix exponentiation. This is
131
* the starting Markov chain described in the reference.
132
* @param matrix The matrix to be initialized [in][out]
133
* @param matrix_dim Dimension of the matrix [in]
134
* @param identity The desired amount of identity in
135
* alignments. A fractional number (0...1) [in]
137
static void s_SetInitialMatrix(double *matrix,
144
memset(matrix, 0, matrix_dim * matrix_dim * sizeof(double));
146
for (i = 0, row = matrix; i < matrix_dim - 1;
147
i++, row += matrix_dim) {
148
row[0] = 1.0 - identity;
154
/** Multiply the current exponentiated matrix by the original
155
* state transition matrix. Since the latter is very sparse and
156
* has a regular structure, this operation is essentially
157
* instantaneous compared to an ordinary matrix-matrix multiply
158
* @param a Matrix to multiply [in]
159
* @param identity The desired amount of identity in
160
* alignments. A fractional number (0...1). Note that
161
* this is the only information needed to create the
162
* state transition matrix, and its structure is sufficiently
163
* regular that the matrix can be implicitly used [in]
164
* @param prod space for the matrix product [out]
165
* @param dim The dimension of all matrices [in]
167
static void s_MatrixMultiply(double *a,
169
double *prod, Int4 dim)
174
double comp_identity = 1.0 - identity;
176
/* compute the first column of the product */
179
for (i = 0; i < dim; i++) {
182
for (j = 0; j < dim - 1; j++)
185
prod_row[0] = comp_identity * accum;
190
/* computed the second to the last columns */
193
for (i = 0; i < dim; i++) {
194
for (j = 1; j < dim; j++) {
195
prod_row[j] = identity * a_row[j-1];
201
/* modify the last column slightly */
203
prod_row = prod + dim - 1;
204
for (i = 0; i < dim; i++) {
205
prod_row[0] += a_row[0];
211
/** Multiply a square matrix by itself
212
* @param a The matrix [in]
213
* @param prod Space to store the product [out]
214
* @param dim The matrix dimesnion [in]
216
static void s_MatrixSquare(double *a, double *prod, Int4 dim)
219
double *prod_row = prod;
221
Int4 full_entries = dim & ~3;
223
/* matrix multiplication is probably the most heavily
224
studied computational problem, and there are many
225
high-quality implementations for computing matrix
226
products. All of them 1) are enormously faster than
227
this implementation, 2) are far more complicated than
228
is practical, 3) are optimized for matrix sizes much
229
larger than are dealt with here, and 4) are not worth
230
adding a dependency on a BLAS implementation just for
231
this application. The following is 'fast enough' */
233
for (i = 0; i < dim; i++, prod_row += dim, a_row += dim) {
235
for (j = 0; j < dim; j++) {
237
double *a_col = a + j;
239
for (k = 0; k < full_entries; k += 4, a_col += 4 * dim) {
240
accum += a_row[k] * a_col[0] +
241
a_row[k+1] * a_col[dim] +
242
a_row[k+2] * a_col[2*dim] +
243
a_row[k+3] * a_col[3*dim];
245
for (; k < dim; k++, a_col += dim) {
246
accum += a_row[k] * a_col[0];
254
/** swap two matrices by swapping pointers to them */
255
#define SWAP_MATRIX(a,b) { \
261
/** For fixed word size and alignment properties, compute
262
* the probability that blastn with that word size will
263
* find a seed within a random alignment.
264
* @param m Space for the Markov chain calculation [in][out]
265
* @param word_size The blastn word size [in]
266
* @param min_percent_identity How much identity is expected in
267
* random alignments. Less identity means the probability of
268
* finding such alignments is decreased [in]
269
* @param min_align_length The smallest alignment length desired.
270
* Longer length gives blastn more leeway to find seeds
271
* and increases the computed probability that alignments
273
* @return 0 if the probability was successfully computed
275
static Int2 s_FindHitProbability(MatrixData *m,
277
double min_percent_identity,
278
Int4 min_align_length)
281
Int4 num_squares = 0;
283
if (min_align_length == 0)
286
if (s_MatrixDataReset(m, word_size, min_percent_identity))
289
/* initialize the matrix of state transitions */
290
s_SetInitialMatrix(m->power_matrix, m->matrix_dim,
291
min_percent_identity);
293
/* Exponentiate the starting matrix. The probability desired
294
is the top right entry of the resulting matrix. Use left-to-
295
right binary exponentiation, since this allows the original
296
(very sparse) transition matrix to be used throughout the
297
exponentiation process */
299
mask = (Uint4)(0x80000000);
300
while (!(min_align_length & mask))
303
for (mask = mask / 2, num_squares = 0; mask;
304
mask = mask / 2, num_squares++) {
306
if (num_squares == 0)
307
s_MatrixMultiply(m->power_matrix, m->percent_identity,
308
m->prod_matrix, m->matrix_dim);
310
s_MatrixSquare(m->power_matrix, m->prod_matrix, m->matrix_dim);
311
SWAP_MATRIX(m->prod_matrix, m->power_matrix);
313
if (min_align_length & mask) {
314
s_MatrixMultiply(m->power_matrix, m->percent_identity,
315
m->prod_matrix, m->matrix_dim);
316
SWAP_MATRIX(m->prod_matrix, m->power_matrix);
320
m->hit_probability = m->power_matrix[m->matrix_dim - 1];
325
/** For specified alignment properties, compute the blastn word size
326
* that will cause random alignments with those properties to be
327
* found with specified (high) probability.
328
* @param m Space for the Markov chain calculation [in][out]
329
* @param min_percent_identity How much identity is expected in
330
* random alignments [in]
331
* @param min_align_length The smallest alignment length desired [in]
332
* @return The optimal word size, or zero if the optimization
335
static Int4 s_FindWordSize(MatrixData *m,
336
double min_percent_identity,
337
Int4 min_align_length)
339
const double k_min_w = 4; /* minimum acceptable word size */
340
const double k_max_w = 110; /* maximum acceptable word size */
344
/* we treat the optimization problem as an exercise in
345
rootfinding, and use bisection. Bisection is appropriate
346
here because the root does not need to be found to
347
high accuracy (since the final word size must be an
348
integer) and because the function described by
349
s_FindHitProbability is monotonically decreasing but
350
can drop off very sharply, i.e. can still be badly behaved.
352
Begin by bracketing the target probability. The initial range
353
should be appropriate for common searches */
356
if (s_FindHitProbability(m, (Int4)(w1 + 0.5),
357
min_percent_identity,
358
min_align_length) != 0) {
361
p1 = m->hit_probability - TARGET_HIT_PROB;
364
if (s_FindHitProbability(m, (Int4)(w0 + 0.5),
365
min_percent_identity,
366
min_align_length) != 0) {
369
p0 = m->hit_probability - TARGET_HIT_PROB;
371
/* modify the initial range if it does not bracket the
372
target probability */
375
/* push the range to the right. Progressively double
376
the word size until the root is bracketed or the
377
maximum word size is reached */
379
while (p1 > 0 && w1 < k_max_w) {
381
w1 = MIN(2 * w1, k_max_w);
382
if (s_FindHitProbability(m, (Int4)(w1 + 0.5),
383
min_percent_identity,
384
min_align_length) != 0) {
387
p1 = m->hit_probability - TARGET_HIT_PROB;
390
/* if the root is still not bracketed, return the
391
largest possible word size */
394
return (Int4)(w1 + 0.5);
398
/* push the range to the left. The smallest word size
399
is reached much sooner, so choose it immediately */
403
if (s_FindHitProbability(m, (Int4)(w0 + 0.5),
404
min_percent_identity,
405
min_align_length) != 0) {
408
p0 = m->hit_probability - TARGET_HIT_PROB;
410
/* and return that word size if it's still not enough */
412
return (Int4)(w0 + 0.5);
415
/* bisect the initial range until the bounds have
416
converged to each other */
417
while (fabs(w1 - w0) > 1) {
418
double p2, w2 = (w0 + w1) / 2;
420
if (s_FindHitProbability(m, (Int4)(w2 + 0.5),
421
min_percent_identity,
422
min_align_length) != 0) {
425
p2 = m->hit_probability - TARGET_HIT_PROB;
435
/* conservatively return the lower bound, since that gives
436
a more accurate word size */
437
return (Int4)(w0 + 0.5);
440
/* see blast_tune.h */
441
Int4 BLAST_FindBestNucleotideWordSize(double min_percent_identity,
442
Int4 min_align_length)
447
/* perform sanity checks */
449
if (min_percent_identity >= 1.0 || min_percent_identity < 0.6)
452
if (min_align_length > 10000)
453
min_align_length = 10000;
454
else if (min_align_length < 0)
456
else if (min_align_length < 8)
459
/* find the best word size */
460
s_MatrixDataInit(&m);
461
retval = s_FindWordSize(&m, min_percent_identity,
463
s_MatrixDataFree(&m);