1
/* $Id: blast_stat.h,v 1.57 2004/10/05 21:33:23 camacho Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
30
/** @file blast_stat.h
31
* Definitions and prototypes used by blast_stat.c to calculate BLAST
32
* statistics. @todo FIXME: needs doxygen comments
35
#ifndef __BLAST_STAT__
36
#define __BLAST_STAT__
38
#include <algo/blast/core/blast_def.h>
39
#include <algo/blast/core/blast_message.h>
46
* Defines for the matrix 'preferences' (as specified by S. Altschul).
47
* Used in arrays such as blosum45_prefs in blast_stat.c
49
#define BLAST_MATRIX_NOMINAL 0 /**< acceptable values, not recommended. */
50
#define BLAST_MATRIX_PREFERRED 1 /**< These value are preferred over others. */
51
#define BLAST_MATRIX_BEST 2 /**< This is the best value, only one per matrix. */
54
#define BLASTMAT_DIR "/usr/ncbi/blast/matrix" /**< Default location for blast databases. */
57
Structure to hold the Karlin-Altschul parameters.
59
typedef struct Blast_KarlinBlk {
60
double Lambda; /**< Lambda value used in statistics */
61
double K; /**< K value used in statistics */
62
double logK; /**< natural log of K value used in statistics */
63
double H; /**< H value used in statistics */
64
double paramC; /**< for use in seed. */
70
/********************************************************************
72
Structures relating to scoring or the BlastScoreBlk
74
********************************************************************/
76
#define BLAST_SCORE_MIN INT2_MIN /**< minimum allowed score (for one letter comparison). */
77
#define BLAST_SCORE_MAX INT2_MAX /**< maximum allowed score (for one letter comparison). */
80
/** Holds score frequencies used in calculation
81
of Karlin-Altschul parameters for an ungapped search.
83
typedef struct Blast_ScoreFreq {
84
Int4 score_min; /**< lowest allowed scores */
85
Int4 score_max; /**< highest allowed scores */
86
Int4 obs_min; /**< lowest observed (actual) scores */
87
Int4 obs_max; /**< highest observed (actual) scores */
88
double score_avg; /**< average score, must be negative for local alignment. */
89
double* sprob0; /**< arrays for frequency of given score */
90
double* sprob; /**< arrays for frequency of given score, shifted down by score_min. */
93
#define BLAST_MATRIX_SIZE 32
96
typedef struct SBLASTMatrixStructure {
97
Int4 *matrix[BLAST_MATRIX_SIZE];
98
Int4 long_matrix[BLAST_MATRIX_SIZE*BLAST_MATRIX_SIZE]; /* not used */
99
} SBLASTMatrixStructure;
101
/** Structure used for scoring calculations.
103
typedef struct BlastScoreBlk {
104
Boolean protein_alphabet; /**< TRUE if alphabet_code is for a
105
protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */
106
Uint1 alphabet_code; /**< NCBI alphabet code. */
107
Int2 alphabet_size; /**< size of alphabet. */
108
Int2 alphabet_start; /**< numerical value of 1st letter. */
109
SBLASTMatrixStructure* matrix_struct; /**< Holds info about matrix. */
110
Int4 **matrix; /**< Substitution matrix */
111
Int4 **posMatrix; /**< Sub matrix for position depend BLAST. */
112
Int2 mat_dim1; /**< dimensions of matrix. */
113
Int2 mat_dim2; /**< dimensions of matrix. */
114
Int4 loscore; /**< Min. substitution scores */
115
Int4 hiscore; /**< Max. substitution scores */
116
Int4 penalty; /**< penalty for mismatch in blastn. */
117
Int4 reward; /**< reward for match in blastn. */
118
double scale_factor; /**< multiplier for all cutoff and dropoff scores */
119
Boolean read_in_matrix; /**< If TRUE, matrix is read in, otherwise
120
produce one from penalty and reward above. */
121
Blast_ScoreFreq** sfp; /**< score frequencies. */
122
double **posFreqs; /**<matrix of position specific frequencies*/
123
/* kbp & kbp_gap are ptrs that should be set to kbp_std, kbp_psi, etc. */
124
Blast_KarlinBlk** kbp; /**< Karlin-Altschul parameters. Actually just a placeholder. */
125
Blast_KarlinBlk** kbp_gap; /**< K-A parameters for gapped alignments. Actually just a placeholder. */
126
/* Below are the Karlin-Altschul parameters for non-position based ('std')
127
and position based ('psi') searches. */
128
Blast_KarlinBlk **kbp_std, /**< K-A parameters for ungapped alignments */
129
**kbp_psi, /**< K-A parameters for position-based alignments. */
130
**kbp_gap_std, /**< K-A parameters for std (not position-based) alignments */
131
**kbp_gap_psi; /**< K-A parameters for psi alignments. */
132
Blast_KarlinBlk* kbp_ideal; /**< Ideal values (for query with average database composition). */
133
Int4 number_of_contexts; /**< Used by sfp and kbp, how large are these*/
134
char* name; /**< name of matrix. */
135
Uint1* ambiguous_res; /**< Array of ambiguous res. (e.g, 'X', 'N')*/
136
Int2 ambig_size, /**< size of array above. FIXME: not needed here? */
137
ambig_occupy; /**< How many occupied? */
138
ListNode* comments; /**< Comments about matrix. */
139
Int4 query_length; /**< the length of the query. */
140
Int8 effective_search_sp; /**< product of above two */
144
Stores the letter frequency of a sequence or database.
146
typedef struct Blast_ResFreq {
147
Uint1 alphabet_code; /**< indicates alphabet. */
148
double* prob; /**< letter probs, (possible) non-zero offset. */
149
double* prob0; /**< probs, zero offset. */
153
* Allocates and initializes BlastScoreBlk
154
* @param alphabet either BLASTAA_SEQ_CODE or BLASTNA_SEQ_CODE [in]
155
* @param number_of_contexts how many strands or sequences [in]
156
* @return BlastScoreBlk*
158
BlastScoreBlk* BlastScoreBlkNew (Uint1 alphabet, Int4 number_of_contexts);
160
/** Deallocates BlastScoreBlk as well as all associated structures.
161
* @param sbp BlastScoreBlk to be deallocated [in]
162
* @return NULL pointer.
164
BlastScoreBlk* BlastScoreBlkFree (BlastScoreBlk* sbp);
166
/** Sets sbp->matrix field using sbp->name field using
167
* the matrices in the toolkit (util/tables/raw_scoremat.h).
168
* @param sbp the object containing matrix and name [in|out]
169
* @return 0 on success, 1 if matrix could not be loaded
171
Int2 BlastScoreBlkMatrixLoad(BlastScoreBlk* sbp);
173
/** Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
174
* Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
176
* @param sbp the object to be modified [in|out]
177
* @param ambiguous_res the residue to be set on the BlastScoreBlk
178
* @return zero on success, others on error
180
Int2 BLAST_ScoreSetAmbigRes (BlastScoreBlk* sbp, char ambiguous_res);
183
/** Calculate the ungapped Karlin parameters. This function should be called
184
* once for each context, or frame translated.
185
* @param sbp the object to be modified [in|out]
186
* @param string the query sequence [in]
187
* @param length length of above sequence [in]
188
* @param context_number which element in various arrays [in]
189
* @return zero on success.
191
Int2 BLAST_ScoreBlkFill (BlastScoreBlk* sbp, char* string, Int4 length, Int4 context_number);
193
/** This function fills in the BlastScoreBlk structure.
195
* -read in the matrix
197
* @param sbp Scoring block [in] [out]
198
* @param matrix Full path to the matrix in the directory structure [in]
200
Int2 BLAST_ScoreBlkMatFill (BlastScoreBlk* sbp, char* matrix);
202
/** Callocs a Blast_KarlinBlk
203
* @return pointer to the Blast_KarlinBlk
205
Blast_KarlinBlk* Blast_KarlinBlkCreate (void);
207
/** Deallocates the KarlinBlk
208
* @param kbp KarlinBlk to be deallocated [in]
211
Blast_KarlinBlk* Blast_KarlinBlkDestruct(Blast_KarlinBlk* kbp);
213
/** Fills in lambda, H, and K values, as calculated by Stephen Altschul
214
* in Methods in Enzy. (vol 266, page 474).
215
* @param kbp object to be filled in [in|out]
216
* @param gap_open cost of gap existence [in]
217
* @param gap_extend cost to extend a gap one letter [in]
218
* @param decline_align cost of declining to align a letter [in]
219
* @param matrix_name name of the matrix to be used [in]
220
* @param error_return filled in with error message if needed [out]
221
* @return zero on success
223
Int2 Blast_KarlinBlkGappedCalc (Blast_KarlinBlk* kbp, Int4 gap_open,
224
Int4 gap_extend, Int4 decline_align, const char* matrix_name,
225
Blast_Message** error_return);
228
/** Calculates the standard Karlin parameters. This is used
229
* if the query is translated and the calculated (real) Karlin
230
* parameters are bad, as they're calculated for non-coding regions.
231
* @param sbp ScoreBlk used to calculate "ideal" values. [in]
233
Blast_KarlinBlk* Blast_KarlinBlkIdealCalc(const BlastScoreBlk* sbp);
236
/** Fills KarlinBlk pointers in BlastScoreBlk with "ideal" values if the
237
* ideal Lambda is less than the actual Lambda. This happens if
238
* if the query is translated and the calculated (real) Karlin
240
* parameters are bad, as they're calculated for non-coding regions.
241
* @param sbp the object to be modified [in|out]
242
* @param context_start first context to start with [in]
243
* @param context_end last context to work on [in]
244
* @return zero on success
246
Int2 Blast_KarlinBlkStandardCalc(BlastScoreBlk* sbp, Int4 context_start,
249
/** Attempts to fill KarlinBlk for given gap opening, extensions etc.
251
* @param kbp object to be filled in [in|out]
252
* @param gap_open gap existence cost [in]
253
* @param gap_extend gap extension cost [in]
254
* @param decline_align cost to not align part of a sequence [in]
255
* @param matrix_name name of the matrix used [in]
256
* @return -1 if matrix_name is NULL;
257
* 1 if matrix not found
258
* 2 if matrix found, but open, extend etc. values not supported.
260
Int2 Blast_KarlinkGapBlkFill(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, const char* matrix_name);
262
/** Prints a messages about the allowed matrices, BlastKarlinkGapBlkFill should return 1 before this is called.
263
* @param matrix the matrix to print a message about [in]
264
* @return the message
266
char* BLAST_PrintMatrixMessage(const char *matrix);
268
/** Prints a messages about the allowed open etc values for the given matrix,
269
* BlastKarlinkGapBlkFill should return 2 before this is called.
270
* @param matrix name of the matrix [in]
271
* @param gap_open gap existence cost [in]
272
* @param gap_extend cost to extend a gap by one [in]
273
* @param decline_align cost of declining to align [in]
276
char* BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend, Int4 decline_align);
278
/** Calculates the parameter Lambda given an initial guess for its value */
280
Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess);
282
/** Calculates the Expect value based upon the search space and some Karlin-Altschul
283
* parameters. It is "simple" as it does not use sum-statistics.
284
* @param S the score of the alignment. [in]
285
* @param kbp the Karlin-Altschul parameters. [in]
286
* @param searchsp total search space to be used [in]
287
* @return the expect value
289
double BLAST_KarlinStoE_simple (Int4 S, Blast_KarlinBlk* kbp, Int8 searchsp);
291
/** Compute a divisor used to weight the evalue of a collection of
292
* "nsegs" distinct alignments. These divisors are used to compensate
293
* for the effect of choosing the best among multiple collections of
296
* Stephen F. Altschul. Evaluating the statitical significance of
297
* multiple distinct local alignments. In Suhai, editior, Theoretical
298
* and Computational Methods in Genome Research, pages 1-14. Plenum
299
* Press, New York, 1997.
301
* The "decayrate" parameter of this routine is a value in the
302
* interval (0,1). Typical values of decayrate are .1 and .5.
303
* @param decayrate adjusts for (multiple) tests of number of HSP sum groups [in]
304
* @param nsegs the number of HSPs in the sum group [in]
305
* @return divisor used to compensate for multiple tests
307
double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs );
309
/** Calculate the cutoff score from the expected number of HSPs or vice versa.
310
* @param S The calculated score [in] [out]
311
* @param E The calculated e-value [in] [out]
312
* @param kbp The Karlin-Altschul statistical parameters [in]
313
* @param searchsp The effective search space [in]
314
* @param dodecay Use gap decay feature? [in]
315
* @param gap_decay_rate Gap decay rate to use, if dodecay is set [in]
317
Int2 BLAST_Cutoffs (Int4 *S, double* E, Blast_KarlinBlk* kbp,
318
Int8 searchsp, Boolean dodecay, double gap_decay_rate);
320
/** Calculates the e-value for alignments with "small" gaps (typically
321
* under fifty residues/basepairs) following ideas of Stephen Altschul's.
322
* @param start_points the number of starting points permitted between
323
* adjacent alignments; max_overlap + max_gap + 1 [in]
324
* @param num the number of distinct alignments in this collection [in]
325
* @param xsum the sum of the scores of these alignments each individually
326
* normalized using an appropriate value of Lambda and logK [in]
327
* @param query_length effective len of the query seq [in]
328
* @param subject_length effective len of the subject seq [in]
329
* @param searchsp_eff effective size of the search space [in]
330
* @param weight_divisor a divisor used to weight the e-value
331
* when multiple collections of alignments are being considered by
332
* the calling routine [in]
333
* @return the expect value
335
double BLAST_SmallGapSumE (Int4 start_points, Int2 num, double xsum,
336
Int4 query_length, Int4 subject_length,
337
Int8 searchsp_eff, double weight_divisor);
339
/** Calculates the e-value of a collection multiple distinct
340
* alignments with asymmetric gaps between the alignments. The gaps
341
* in one (protein) sequence are typically small (like in
342
* BLAST_SmallGapSumE) gap an the gaps in the other (translated DNA)
343
* sequence are possibly large (up to 4000 bp.) This routine is used
344
* for linking HSPs representing exons in the DNA sequence that are
345
* separated by introns.
346
* @param query_start_points the number of starting points in
347
* the query sequence permitted between adjacent alignments;
348
* max_overlap + max_gap + 1. [in]
349
* @param subject_start_points the number of starting points in
350
* the subject sequence permitted between adjacent alignments [in]
351
* @param num number of distinct alignments in one collection [in]
352
* @param xsum the sum of the scores of these alignments each individually
353
* normalized using an appropriate value of Lambda and logK [in]
354
* @param query_length effective length of query [in]
355
* @param subject_length effective length of subject [in]
356
* @param searchsp_eff effective size of the search space [in]
357
* @param weight_divisor a divisor used to weight the e-value
358
* when multiple collections of alignments are being considered by
359
* the calling routine [in]
360
* @return sum expect value.
362
double BLAST_UnevenGapSumE (Int4 query_start_points, Int4 subject_start_points,
363
Int2 num, double xsum,
364
Int4 query_length, Int4 subject_length,
365
Int8 searchsp_eff, double weight_divisor);
367
/** Calculates the e-value if a collection of distinct alignments with
368
* arbitrarily large gaps between the alignments
369
* @param kbp statistical parameters [in]
370
* @param num number of distinct alignments in the collection [in]
371
* @param xsum the sum of the scores of these alignments each individually
372
* normalized using an appropriate value of Lambda and logK [in]
373
* @param query_length effective length of query sequence [in]
374
* @param subject_length effective length of subject sequence [in]
375
* @param searchsp_eff effective size of the search space [in]
376
* @param weight_divisor a divisor used to weight the e-value
377
* when multiple collections of alignments are being considered by the
378
* calling routine [in]
379
* @return sum expect value.
381
double BLAST_LargeGapSumE (Int2 num, double xsum,
382
Int4 query_length, Int4 subject_length,
383
Int8 searchsp_eff, double weight_divisor );
385
/** Extract the alpha and beta settings for this matrixName, and these
386
* gap open and gap extension costs
387
* @param matrixName name of the matrix used [in]
388
* @param alpha Karlin-Altschul parameter to be set [out]
389
* @param beta Karlin-Altschul parameter to be set [out]
390
* @param gapped TRUE if a gapped search [in]
391
* @param gap_open existence cost of a gap [in]
392
* @param gap_extend extension cost of a gap [in]
394
void BLAST_GetAlphaBeta (const char* matrixName, double *alpha,
395
double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend);
398
/** Calculate a new PSSM, using composition-based statistics, for use
399
* with RPS BLAST. This function produces a PSSM for a single RPS DB
400
* sequence (of size db_seq_length) and incorporates information from
401
* the RPS blast query. Each individual database sequence must call this
402
* function to retrieve its own PSSM. The matrix is returned (and must
403
* be freed elsewhere). posMatrix is the portion of the complete
404
* concatenated PSSM that is specific to this DB sequence
405
* @param scalingFactor used to rescale Lambda [in]
406
* @param rps_query_length length of query sequence [in]
407
* @param rps_query_seq the query sequence [in]
408
* @param db_seq_length Length of the database sequence [in]
409
* @param posMatrix matrix (actual) values to be used [in]
410
* @param matrix_name Name of the score matrix underlying the RPS search [in]
413
Int4 ** RPSCalculatePSSM(double scalingFactor, Int4 rps_query_length,
414
const Uint1 * rps_query_seq, Int4 db_seq_length,
415
Int4 **posMatrix, const char *matrix_name);
419
* Computes the adjustment to the lengths of the query and database sequences
420
* that is used to compensate for edge effects when computing evalues.
422
* The length adjustment is an integer-valued approximation to the fixed
423
* point of the function
426
* (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
428
* where m is the query length n is the length of the database and N is the
429
* number of sequences in the database. The values beta, alpha, lambda and
430
* K are statistical, Karlin-Altschul parameters.
432
* The value of the length adjustment computed by this routine, A,
433
* will always be an integer smaller than the fixed point of
434
* f(ell). Usually, it will be the largest such integer. However, the
435
* computed length adjustment, A, will also be so small that
437
* K * (m - A) * (n - N * A) > MAX(m,n).
439
* Moreover, an iterative method is used to compute A, and under
440
* unusual circumstances the iterative method may not converge.
442
* @param K the statistical parameter K [in]
443
* @param logK the natural logarithm of K [in]
444
* @param alpha_d_lambda the ratio of the statistical parameters
445
* alpha and lambda (for ungapped alignments, the
446
* value 1/H should be used) [in]
447
* @param beta the statistical parameter beta (for ungapped
448
* alignments, beta == 0) [in]
449
* @param query_length the length of the query sequence [in]
450
* @param db_length the length of the database [in]
451
* @param db_num_seqs the number of sequences in the database [in]
452
* @param length_adjustment the computed value of the length adjustment [out]
454
* @return 0 if length_adjustment is known to be the largest integer less
455
* than the fixed point of f(ell); 1 otherwise.
458
BLAST_ComputeLengthAdjustment(double K,
460
double alpha_d_lambda,
465
Int4 * length_adjustment);
468
/** Allocates a new Blast_ResFreq structure and fills in the prob element
469
based upon the contents of sbp.
470
* @param sbp The BlastScoreBlk* used to init prob [in]
472
Blast_ResFreq* Blast_ResFreqNew(const BlastScoreBlk* sbp);
474
/** Deallocates Blast_ResFreq and prob0 element.
475
* @param rfp the Blast_ResFreq to be deallocated.
477
Blast_ResFreq* Blast_ResFreqDestruct(Blast_ResFreq* rfp);
480
/** Calculates residues frequencies given a standard distribution.
481
* @param sbp the BlastScoreBlk provides information on alphabet.
482
* @param rfp the prob element on this Blast_ResFreq is used.
483
* @return zero on success
485
Int2 Blast_ResFreqStdComp(const BlastScoreBlk* sbp, Blast_ResFreq* rfp);
487
/** Creates a new structure to keep track of score frequencies for a scoring
489
* @param score_min Minimum score [in]
490
* @param score_max Maximum score [in]
491
* @return allocated and initialized pointer to Blast_ScoreFreq
494
Blast_ScoreFreqNew(Int4 score_min, Int4 score_max);
496
/** Deallocates the score frequencies structure
497
* @param sfp the structure to deallocate [in]
501
Blast_ScoreFreqDestruct(Blast_ScoreFreq* sfp);
503
/** Fills a buffer with the 'standard' alphabet
504
* (given by STD_AMINO_ACID_FREQS[index].ch).
506
* @param alphabet_code specifies alphabet [in]
507
* @param residues buffer to be filled in [in|out]
508
* @param residue_size size of "residues" buffer [in]
509
* @return Number of residues in alphabet or negative returns upon error.
512
Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues,
515
/** Computes the parameters lambda, H K for use in calculating the
516
* statistical significance of high-scoring segments or subalignments (see
517
* comment on blast_stat.c for more details).
518
* @param kbp object containing Lambda, H, and K as well as scoring information [in|out]
519
* @param sfp array of probabilities for all scores [in]
520
* @return zero on success, 1 on error.
523
Blast_KarlinBlkCalc(Blast_KarlinBlk* kbp, Blast_ScoreFreq* sfp);
525
/** Given a sequence of 'length' amino acid residues, compute the
526
* probability of each residue and put that in the array resProb
527
* Excludes ambiguity characters.
529
* @param sequence the sequence to be computed upon [in]
530
* @param length the length of the sequence [in]
531
* @param resProb the object to be filled in [in|out]
534
Blast_FillResidueProbability(const Uint1* sequence, Int4 length, double * resProb);
536
/** Fill in the matrix for blastn using the penaly and rewards
537
* The query sequence alphabet is blastna, the subject sequence
538
* is ncbi2na. The alphabet blastna is defined in blast_stat.h
539
* and the first four elements of blastna are identical to ncbi2na.
540
* if sbp->matrix==NULL, it is allocated.
541
* @param sbp the BlastScoreBlk on which reward, penalty, and matrix will be set
543
* @return zero on success.
545
Int2 BlastScoreBlkNuclMatrixCreate(BlastScoreBlk* sbp);
550
#endif /* !__BLAST_STAT__ */