1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
1 |
/* $Id: blast_stat.h,v 1.57 2004/10/05 21:33:23 camacho Exp $
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
2 |
* ===========================================================================
|
3 |
*
|
|
4 |
* PUBLIC DOMAIN NOTICE
|
|
5 |
* National Center for Biotechnology Information
|
|
6 |
*
|
|
7 |
* This software/database is a "United States Government Work" under the
|
|
8 |
* terms of the United States Copyright Act. It was written as part of
|
|
9 |
* the author's official duties as a United States Government employee and
|
|
10 |
* thus cannot be copyrighted. This software/database is freely available
|
|
11 |
* to the public for use. The National Library of Medicine and the U.S.
|
|
12 |
* Government have not placed any restriction on its use or reproduction.
|
|
13 |
*
|
|
14 |
* Although all reasonable efforts have been taken to ensure the accuracy
|
|
15 |
* and reliability of the software and data, the NLM and the U.S.
|
|
16 |
* Government do not and cannot warrant the performance or results that
|
|
17 |
* may be obtained by using this software or data. The NLM and the U.S.
|
|
18 |
* Government disclaim all warranties, express or implied, including
|
|
19 |
* warranties of performance, merchantability or fitness for any particular
|
|
20 |
* purpose.
|
|
21 |
*
|
|
22 |
* Please cite the author in any work or product based on this material.
|
|
23 |
*
|
|
24 |
* ===========================================================================
|
|
25 |
*
|
|
26 |
* Author: Tom Madden
|
|
27 |
*
|
|
28 |
*/
|
|
29 |
||
30 |
/** @file blast_stat.h
|
|
31 |
* Definitions and prototypes used by blast_stat.c to calculate BLAST
|
|
32 |
* statistics. @todo FIXME: needs doxygen comments
|
|
33 |
*/
|
|
34 |
||
35 |
#ifndef __BLAST_STAT__
|
|
36 |
#define __BLAST_STAT__
|
|
37 |
||
38 |
#include <algo/blast/core/blast_def.h> |
|
39 |
#include <algo/blast/core/blast_message.h> |
|
40 |
||
41 |
#ifdef __cplusplus
|
|
42 |
extern "C" { |
|
43 |
#endif
|
|
44 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
45 |
/**
|
46 |
* Defines for the matrix 'preferences' (as specified by S. Altschul).
|
|
47 |
* Used in arrays such as blosum45_prefs in blast_stat.c
|
|
48 |
*/
|
|
49 |
#define BLAST_MATRIX_NOMINAL 0 /**< acceptable values, not recommended. */ |
|
50 |
#define BLAST_MATRIX_PREFERRED 1 /**< These value are preferred over others. */ |
|
51 |
#define BLAST_MATRIX_BEST 2 /**< This is the best value, only one per matrix. */ |
|
52 |
||
53 |
||
54 |
#define BLASTMAT_DIR "/usr/ncbi/blast/matrix" /**< Default location for blast databases. */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
55 |
|
56 |
/**
|
|
57 |
Structure to hold the Karlin-Altschul parameters.
|
|
58 |
*/
|
|
59 |
typedef struct Blast_KarlinBlk { |
|
60 |
double Lambda; /**< Lambda value used in statistics */ |
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
61 |
double K; /**< K value used in statistics */ |
62 |
double logK; /**< natural log of K value used in statistics */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
63 |
double H; /**< H value used in statistics */ |
64 |
double paramC; /**< for use in seed. */ |
|
65 |
} Blast_KarlinBlk; |
|
66 |
||
67 |
||
68 |
||
69 |
||
70 |
/********************************************************************
|
|
71 |
||
72 |
Structures relating to scoring or the BlastScoreBlk
|
|
73 |
||
74 |
********************************************************************/
|
|
75 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
76 |
#define BLAST_SCORE_MIN INT2_MIN /**< minimum allowed score (for one letter comparison). */ |
77 |
#define BLAST_SCORE_MAX INT2_MAX /**< maximum allowed score (for one letter comparison). */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
78 |
|
79 |
||
80 |
/** Holds score frequencies used in calculation
|
|
81 |
of Karlin-Altschul parameters for an ungapped search.
|
|
82 |
*/
|
|
83 |
typedef struct Blast_ScoreFreq { |
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
84 |
Int4 score_min; /**< lowest allowed scores */ |
85 |
Int4 score_max; /**< highest allowed scores */ |
|
86 |
Int4 obs_min; /**< lowest observed (actual) scores */ |
|
87 |
Int4 obs_max; /**< highest observed (actual) scores */ |
|
88 |
double score_avg; /**< average score, must be negative for local alignment. */ |
|
89 |
double* sprob0; /**< arrays for frequency of given score */ |
|
90 |
double* sprob; /**< arrays for frequency of given score, shifted down by score_min. */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
91 |
} Blast_ScoreFreq; |
92 |
||
93 |
#define BLAST_MATRIX_SIZE 32
|
|
94 |
||
95 |
/* Remove me */
|
|
96 |
typedef struct SBLASTMatrixStructure { |
|
97 |
Int4 *matrix[BLAST_MATRIX_SIZE]; |
|
98 |
Int4 long_matrix[BLAST_MATRIX_SIZE*BLAST_MATRIX_SIZE]; /* not used */ |
|
99 |
} SBLASTMatrixStructure; |
|
100 |
||
101 |
/** Structure used for scoring calculations.
|
|
102 |
*/
|
|
103 |
typedef struct BlastScoreBlk { |
|
104 |
Boolean protein_alphabet; /**< TRUE if alphabet_code is for a |
|
105 |
protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */
|
|
106 |
Uint1 alphabet_code; /**< NCBI alphabet code. */ |
|
107 |
Int2 alphabet_size; /**< size of alphabet. */ |
|
108 |
Int2 alphabet_start; /**< numerical value of 1st letter. */ |
|
109 |
SBLASTMatrixStructure* matrix_struct; /**< Holds info about matrix. */ |
|
110 |
Int4 **matrix; /**< Substitution matrix */ |
|
111 |
Int4 **posMatrix; /**< Sub matrix for position depend BLAST. */ |
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
112 |
Int2 mat_dim1; /**< dimensions of matrix. */ |
113 |
Int2 mat_dim2; /**< dimensions of matrix. */ |
|
114 |
Int4 loscore; /**< Min. substitution scores */ |
|
115 |
Int4 hiscore; /**< Max. substitution scores */ |
|
116 |
Int4 penalty; /**< penalty for mismatch in blastn. */ |
|
117 |
Int4 reward; /**< reward for match in blastn. */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
118 |
double scale_factor; /**< multiplier for all cutoff and dropoff scores */ |
119 |
Boolean read_in_matrix; /**< If TRUE, matrix is read in, otherwise |
|
120 |
produce one from penalty and reward above. */
|
|
121 |
Blast_ScoreFreq** sfp; /**< score frequencies. */ |
|
122 |
double **posFreqs; /**<matrix of position specific frequencies*/ |
|
123 |
/* kbp & kbp_gap are ptrs that should be set to kbp_std, kbp_psi, etc. */
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
124 |
Blast_KarlinBlk** kbp; /**< Karlin-Altschul parameters. Actually just a placeholder. */ |
125 |
Blast_KarlinBlk** kbp_gap; /**< K-A parameters for gapped alignments. Actually just a placeholder. */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
126 |
/* Below are the Karlin-Altschul parameters for non-position based ('std')
|
127 |
and position based ('psi') searches. */
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
128 |
Blast_KarlinBlk **kbp_std, /**< K-A parameters for ungapped alignments */ |
129 |
**kbp_psi, /**< K-A parameters for position-based alignments. */ |
|
130 |
**kbp_gap_std, /**< K-A parameters for std (not position-based) alignments */ |
|
131 |
**kbp_gap_psi; /**< K-A parameters for psi alignments. */ |
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
132 |
Blast_KarlinBlk* kbp_ideal; /**< Ideal values (for query with average database composition). */ |
133 |
Int4 number_of_contexts; /**< Used by sfp and kbp, how large are these*/ |
|
134 |
char* name; /**< name of matrix. */ |
|
135 |
Uint1* ambiguous_res; /**< Array of ambiguous res. (e.g, 'X', 'N')*/ |
|
136 |
Int2 ambig_size, /**< size of array above. FIXME: not needed here? */ |
|
137 |
ambig_occupy; /**< How many occupied? */ |
|
138 |
ListNode* comments; /**< Comments about matrix. */ |
|
139 |
Int4 query_length; /**< the length of the query. */ |
|
140 |
Int8 effective_search_sp; /**< product of above two */ |
|
141 |
} BlastScoreBlk; |
|
142 |
||
143 |
/**
|
|
144 |
Stores the letter frequency of a sequence or database.
|
|
145 |
*/
|
|
146 |
typedef struct Blast_ResFreq { |
|
147 |
Uint1 alphabet_code; /**< indicates alphabet. */ |
|
148 |
double* prob; /**< letter probs, (possible) non-zero offset. */ |
|
149 |
double* prob0; /**< probs, zero offset. */ |
|
150 |
} Blast_ResFreq; |
|
151 |
||
152 |
/**
|
|
153 |
* Allocates and initializes BlastScoreBlk
|
|
154 |
* @param alphabet either BLASTAA_SEQ_CODE or BLASTNA_SEQ_CODE [in]
|
|
155 |
* @param number_of_contexts how many strands or sequences [in]
|
|
156 |
* @return BlastScoreBlk*
|
|
157 |
*/
|
|
158 |
BlastScoreBlk* BlastScoreBlkNew (Uint1 alphabet, Int4 number_of_contexts); |
|
159 |
||
160 |
/** Deallocates BlastScoreBlk as well as all associated structures.
|
|
161 |
* @param sbp BlastScoreBlk to be deallocated [in]
|
|
162 |
* @return NULL pointer.
|
|
163 |
*/
|
|
164 |
BlastScoreBlk* BlastScoreBlkFree (BlastScoreBlk* sbp); |
|
165 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
166 |
/** Sets sbp->matrix field using sbp->name field using
|
167 |
* the matrices in the toolkit (util/tables/raw_scoremat.h).
|
|
168 |
* @param sbp the object containing matrix and name [in|out]
|
|
169 |
* @return 0 on success, 1 if matrix could not be loaded
|
|
170 |
*/
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
171 |
Int2 BlastScoreBlkMatrixLoad(BlastScoreBlk* sbp); |
172 |
||
173 |
/** Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
|
|
174 |
* Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
|
|
175 |
*
|
|
176 |
* @param sbp the object to be modified [in|out]
|
|
177 |
* @param ambiguous_res the residue to be set on the BlastScoreBlk
|
|
178 |
* @return zero on success, others on error
|
|
179 |
*/
|
|
180 |
Int2 BLAST_ScoreSetAmbigRes (BlastScoreBlk* sbp, char ambiguous_res); |
|
181 |
||
182 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
183 |
/** Calculate the ungapped Karlin parameters. This function should be called
|
184 |
* once for each context, or frame translated.
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
185 |
* @param sbp the object to be modified [in|out]
|
186 |
* @param string the query sequence [in]
|
|
187 |
* @param length length of above sequence [in]
|
|
188 |
* @param context_number which element in various arrays [in]
|
|
189 |
* @return zero on success.
|
|
190 |
*/
|
|
191 |
Int2 BLAST_ScoreBlkFill (BlastScoreBlk* sbp, char* string, Int4 length, Int4 context_number); |
|
192 |
||
193 |
/** This function fills in the BlastScoreBlk structure.
|
|
194 |
* Tasks are:
|
|
195 |
* -read in the matrix
|
|
196 |
* -set maxscore
|
|
197 |
* @param sbp Scoring block [in] [out]
|
|
198 |
* @param matrix Full path to the matrix in the directory structure [in]
|
|
199 |
*/
|
|
200 |
Int2 BLAST_ScoreBlkMatFill (BlastScoreBlk* sbp, char* matrix); |
|
201 |
||
202 |
/** Callocs a Blast_KarlinBlk
|
|
203 |
* @return pointer to the Blast_KarlinBlk
|
|
204 |
*/
|
|
205 |
Blast_KarlinBlk* Blast_KarlinBlkCreate (void); |
|
206 |
||
207 |
/** Deallocates the KarlinBlk
|
|
208 |
* @param kbp KarlinBlk to be deallocated [in]
|
|
209 |
* @return NULL
|
|
210 |
*/
|
|
211 |
Blast_KarlinBlk* Blast_KarlinBlkDestruct(Blast_KarlinBlk* kbp); |
|
212 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
213 |
/** Fills in lambda, H, and K values, as calculated by Stephen Altschul
|
214 |
* in Methods in Enzy. (vol 266, page 474).
|
|
215 |
* @param kbp object to be filled in [in|out]
|
|
216 |
* @param gap_open cost of gap existence [in]
|
|
217 |
* @param gap_extend cost to extend a gap one letter [in]
|
|
218 |
* @param decline_align cost of declining to align a letter [in]
|
|
219 |
* @param matrix_name name of the matrix to be used [in]
|
|
220 |
* @param error_return filled in with error message if needed [out]
|
|
221 |
* @return zero on success
|
|
222 |
*/
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
223 |
Int2 Blast_KarlinBlkGappedCalc (Blast_KarlinBlk* kbp, Int4 gap_open, |
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
224 |
Int4 gap_extend, Int4 decline_align, const char* matrix_name, |
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
225 |
Blast_Message** error_return); |
226 |
||
227 |
||
228 |
/** Calculates the standard Karlin parameters. This is used
|
|
229 |
* if the query is translated and the calculated (real) Karlin
|
|
230 |
* parameters are bad, as they're calculated for non-coding regions.
|
|
231 |
* @param sbp ScoreBlk used to calculate "ideal" values. [in]
|
|
232 |
*/
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
233 |
Blast_KarlinBlk* Blast_KarlinBlkIdealCalc(const BlastScoreBlk* sbp); |
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
234 |
|
235 |
||
236 |
/** Fills KarlinBlk pointers in BlastScoreBlk with "ideal" values if the
|
|
237 |
* ideal Lambda is less than the actual Lambda. This happens if
|
|
238 |
* if the query is translated and the calculated (real) Karlin
|
|
239 |
*
|
|
240 |
* parameters are bad, as they're calculated for non-coding regions.
|
|
241 |
* @param sbp the object to be modified [in|out]
|
|
242 |
* @param context_start first context to start with [in]
|
|
243 |
* @param context_end last context to work on [in]
|
|
244 |
* @return zero on success
|
|
245 |
*/
|
|
246 |
Int2 Blast_KarlinBlkStandardCalc(BlastScoreBlk* sbp, Int4 context_start, |
|
247 |
Int4 context_end); |
|
248 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
249 |
/** Attempts to fill KarlinBlk for given gap opening, extensions etc.
|
250 |
*
|
|
251 |
* @param kbp object to be filled in [in|out]
|
|
252 |
* @param gap_open gap existence cost [in]
|
|
253 |
* @param gap_extend gap extension cost [in]
|
|
254 |
* @param decline_align cost to not align part of a sequence [in]
|
|
255 |
* @param matrix_name name of the matrix used [in]
|
|
256 |
* @return -1 if matrix_name is NULL;
|
|
257 |
* 1 if matrix not found
|
|
258 |
* 2 if matrix found, but open, extend etc. values not supported.
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
259 |
*/
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
260 |
Int2 Blast_KarlinkGapBlkFill(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, const char* matrix_name); |
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
261 |
|
262 |
/** Prints a messages about the allowed matrices, BlastKarlinkGapBlkFill should return 1 before this is called.
|
|
263 |
* @param matrix the matrix to print a message about [in]
|
|
264 |
* @return the message
|
|
265 |
*/
|
|
266 |
char* BLAST_PrintMatrixMessage(const char *matrix); |
|
267 |
||
268 |
/** Prints a messages about the allowed open etc values for the given matrix,
|
|
269 |
* BlastKarlinkGapBlkFill should return 2 before this is called.
|
|
270 |
* @param matrix name of the matrix [in]
|
|
271 |
* @param gap_open gap existence cost [in]
|
|
272 |
* @param gap_extend cost to extend a gap by one [in]
|
|
273 |
* @param decline_align cost of declining to align [in]
|
|
274 |
* @return message
|
|
275 |
*/
|
|
276 |
char* BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend, Int4 decline_align); |
|
277 |
||
278 |
/** Calculates the parameter Lambda given an initial guess for its value */
|
|
279 |
double
|
|
280 |
Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess); |
|
281 |
||
282 |
/** Calculates the Expect value based upon the search space and some Karlin-Altschul
|
|
283 |
* parameters. It is "simple" as it does not use sum-statistics.
|
|
284 |
* @param S the score of the alignment. [in]
|
|
285 |
* @param kbp the Karlin-Altschul parameters. [in]
|
|
286 |
* @param searchsp total search space to be used [in]
|
|
287 |
* @return the expect value
|
|
288 |
*/
|
|
289 |
double BLAST_KarlinStoE_simple (Int4 S, Blast_KarlinBlk* kbp, Int8 searchsp); |
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
290 |
|
291 |
/** Compute a divisor used to weight the evalue of a collection of
|
|
292 |
* "nsegs" distinct alignments. These divisors are used to compensate
|
|
293 |
* for the effect of choosing the best among multiple collections of
|
|
294 |
* alignments. See
|
|
295 |
*
|
|
296 |
* Stephen F. Altschul. Evaluating the statitical significance of
|
|
297 |
* multiple distinct local alignments. In Suhai, editior, Theoretical
|
|
298 |
* and Computational Methods in Genome Research, pages 1-14. Plenum
|
|
299 |
* Press, New York, 1997.
|
|
300 |
*
|
|
301 |
* The "decayrate" parameter of this routine is a value in the
|
|
302 |
* interval (0,1). Typical values of decayrate are .1 and .5.
|
|
303 |
* @param decayrate adjusts for (multiple) tests of number of HSP sum groups [in]
|
|
304 |
* @param nsegs the number of HSPs in the sum group [in]
|
|
305 |
* @return divisor used to compensate for multiple tests
|
|
306 |
*/
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
307 |
double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs ); |
308 |
||
309 |
/** Calculate the cutoff score from the expected number of HSPs or vice versa.
|
|
310 |
* @param S The calculated score [in] [out]
|
|
311 |
* @param E The calculated e-value [in] [out]
|
|
312 |
* @param kbp The Karlin-Altschul statistical parameters [in]
|
|
313 |
* @param searchsp The effective search space [in]
|
|
314 |
* @param dodecay Use gap decay feature? [in]
|
|
315 |
* @param gap_decay_rate Gap decay rate to use, if dodecay is set [in]
|
|
316 |
*/
|
|
317 |
Int2 BLAST_Cutoffs (Int4 *S, double* E, Blast_KarlinBlk* kbp, |
|
318 |
Int8 searchsp, Boolean dodecay, double gap_decay_rate); |
|
319 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
320 |
/** Calculates the e-value for alignments with "small" gaps (typically
|
321 |
* under fifty residues/basepairs) following ideas of Stephen Altschul's.
|
|
322 |
* @param start_points the number of starting points permitted between
|
|
323 |
* adjacent alignments; max_overlap + max_gap + 1 [in]
|
|
324 |
* @param num the number of distinct alignments in this collection [in]
|
|
325 |
* @param xsum the sum of the scores of these alignments each individually
|
|
326 |
* normalized using an appropriate value of Lambda and logK [in]
|
|
327 |
* @param query_length effective len of the query seq [in]
|
|
328 |
* @param subject_length effective len of the subject seq [in]
|
|
329 |
* @param searchsp_eff effective size of the search space [in]
|
|
330 |
* @param weight_divisor a divisor used to weight the e-value
|
|
331 |
* when multiple collections of alignments are being considered by
|
|
332 |
* the calling routine [in]
|
|
333 |
* @return the expect value
|
|
334 |
*/
|
|
335 |
double BLAST_SmallGapSumE (Int4 start_points, Int2 num, double xsum, |
|
336 |
Int4 query_length, Int4 subject_length, |
|
337 |
Int8 searchsp_eff, double weight_divisor); |
|
338 |
||
339 |
/** Calculates the e-value of a collection multiple distinct
|
|
340 |
* alignments with asymmetric gaps between the alignments. The gaps
|
|
341 |
* in one (protein) sequence are typically small (like in
|
|
342 |
* BLAST_SmallGapSumE) gap an the gaps in the other (translated DNA)
|
|
343 |
* sequence are possibly large (up to 4000 bp.) This routine is used
|
|
344 |
* for linking HSPs representing exons in the DNA sequence that are
|
|
345 |
* separated by introns.
|
|
346 |
* @param query_start_points the number of starting points in
|
|
347 |
* the query sequence permitted between adjacent alignments;
|
|
348 |
* max_overlap + max_gap + 1. [in]
|
|
349 |
* @param subject_start_points the number of starting points in
|
|
350 |
* the subject sequence permitted between adjacent alignments [in]
|
|
351 |
* @param num number of distinct alignments in one collection [in]
|
|
352 |
* @param xsum the sum of the scores of these alignments each individually
|
|
353 |
* normalized using an appropriate value of Lambda and logK [in]
|
|
354 |
* @param query_length effective length of query [in]
|
|
355 |
* @param subject_length effective length of subject [in]
|
|
356 |
* @param searchsp_eff effective size of the search space [in]
|
|
357 |
* @param weight_divisor a divisor used to weight the e-value
|
|
358 |
* when multiple collections of alignments are being considered by
|
|
359 |
* the calling routine [in]
|
|
360 |
* @return sum expect value.
|
|
361 |
*/
|
|
362 |
double BLAST_UnevenGapSumE (Int4 query_start_points, Int4 subject_start_points, |
|
363 |
Int2 num, double xsum, |
|
364 |
Int4 query_length, Int4 subject_length, |
|
365 |
Int8 searchsp_eff, double weight_divisor); |
|
366 |
||
367 |
/** Calculates the e-value if a collection of distinct alignments with
|
|
368 |
* arbitrarily large gaps between the alignments
|
|
369 |
* @param kbp statistical parameters [in]
|
|
370 |
* @param num number of distinct alignments in the collection [in]
|
|
371 |
* @param xsum the sum of the scores of these alignments each individually
|
|
372 |
* normalized using an appropriate value of Lambda and logK [in]
|
|
373 |
* @param query_length effective length of query sequence [in]
|
|
374 |
* @param subject_length effective length of subject sequence [in]
|
|
375 |
* @param searchsp_eff effective size of the search space [in]
|
|
376 |
* @param weight_divisor a divisor used to weight the e-value
|
|
377 |
* when multiple collections of alignments are being considered by the
|
|
378 |
* calling routine [in]
|
|
379 |
* @return sum expect value.
|
|
380 |
*/
|
|
381 |
double BLAST_LargeGapSumE (Int2 num, double xsum, |
|
382 |
Int4 query_length, Int4 subject_length, |
|
383 |
Int8 searchsp_eff, double weight_divisor ); |
|
384 |
||
385 |
/** Extract the alpha and beta settings for this matrixName, and these
|
|
386 |
* gap open and gap extension costs
|
|
387 |
* @param matrixName name of the matrix used [in]
|
|
388 |
* @param alpha Karlin-Altschul parameter to be set [out]
|
|
389 |
* @param beta Karlin-Altschul parameter to be set [out]
|
|
390 |
* @param gapped TRUE if a gapped search [in]
|
|
391 |
* @param gap_open existence cost of a gap [in]
|
|
392 |
* @param gap_extend extension cost of a gap [in]
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
393 |
*/
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
394 |
void BLAST_GetAlphaBeta (const char* matrixName, double *alpha, |
395 |
double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend); |
|
396 |
||
397 |
||
398 |
/** Calculate a new PSSM, using composition-based statistics, for use
|
|
399 |
* with RPS BLAST. This function produces a PSSM for a single RPS DB
|
|
400 |
* sequence (of size db_seq_length) and incorporates information from
|
|
401 |
* the RPS blast query. Each individual database sequence must call this
|
|
402 |
* function to retrieve its own PSSM. The matrix is returned (and must
|
|
403 |
* be freed elsewhere). posMatrix is the portion of the complete
|
|
404 |
* concatenated PSSM that is specific to this DB sequence
|
|
405 |
* @param scalingFactor used to rescale Lambda [in]
|
|
406 |
* @param rps_query_length length of query sequence [in]
|
|
407 |
* @param rps_query_seq the query sequence [in]
|
|
408 |
* @param db_seq_length Length of the database sequence [in]
|
|
409 |
* @param posMatrix matrix (actual) values to be used [in]
|
|
410 |
* @param matrix_name Name of the score matrix underlying the RPS search [in]
|
|
411 |
* @return new pssm
|
|
412 |
*/
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
413 |
Int4 ** RPSCalculatePSSM(double scalingFactor, Int4 rps_query_length, |
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
414 |
const Uint1 * rps_query_seq, Int4 db_seq_length, |
415 |
Int4 **posMatrix, const char *matrix_name); |
|
416 |
||
417 |
||
418 |
/**
|
|
419 |
* Computes the adjustment to the lengths of the query and database sequences
|
|
420 |
* that is used to compensate for edge effects when computing evalues.
|
|
421 |
*
|
|
422 |
* The length adjustment is an integer-valued approximation to the fixed
|
|
423 |
* point of the function
|
|
424 |
*
|
|
425 |
* f(ell) = beta +
|
|
426 |
* (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
|
|
427 |
*
|
|
428 |
* where m is the query length n is the length of the database and N is the
|
|
429 |
* number of sequences in the database. The values beta, alpha, lambda and
|
|
430 |
* K are statistical, Karlin-Altschul parameters.
|
|
431 |
*
|
|
432 |
* The value of the length adjustment computed by this routine, A,
|
|
433 |
* will always be an integer smaller than the fixed point of
|
|
434 |
* f(ell). Usually, it will be the largest such integer. However, the
|
|
435 |
* computed length adjustment, A, will also be so small that
|
|
436 |
*
|
|
437 |
* K * (m - A) * (n - N * A) > MAX(m,n).
|
|
438 |
*
|
|
439 |
* Moreover, an iterative method is used to compute A, and under
|
|
440 |
* unusual circumstances the iterative method may not converge.
|
|
441 |
*
|
|
442 |
* @param K the statistical parameter K [in]
|
|
443 |
* @param logK the natural logarithm of K [in]
|
|
444 |
* @param alpha_d_lambda the ratio of the statistical parameters
|
|
445 |
* alpha and lambda (for ungapped alignments, the
|
|
446 |
* value 1/H should be used) [in]
|
|
447 |
* @param beta the statistical parameter beta (for ungapped
|
|
448 |
* alignments, beta == 0) [in]
|
|
449 |
* @param query_length the length of the query sequence [in]
|
|
450 |
* @param db_length the length of the database [in]
|
|
451 |
* @param db_num_seqs the number of sequences in the database [in]
|
|
452 |
* @param length_adjustment the computed value of the length adjustment [out]
|
|
453 |
*
|
|
454 |
* @return 0 if length_adjustment is known to be the largest integer less
|
|
455 |
* than the fixed point of f(ell); 1 otherwise.
|
|
456 |
*/
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
457 |
Int4
|
458 |
BLAST_ComputeLengthAdjustment(double K, |
|
459 |
double logK, |
|
460 |
double alpha_d_lambda, |
|
461 |
double beta, |
|
462 |
Int4 query_length, |
|
463 |
Int8 db_length, |
|
464 |
Int4 db_num_seqs, |
|
465 |
Int4 * length_adjustment); |
|
466 |
||
467 |
||
468 |
/** Allocates a new Blast_ResFreq structure and fills in the prob element
|
|
469 |
based upon the contents of sbp.
|
|
470 |
* @param sbp The BlastScoreBlk* used to init prob [in]
|
|
471 |
*/
|
|
472 |
Blast_ResFreq* Blast_ResFreqNew(const BlastScoreBlk* sbp); |
|
473 |
||
474 |
/** Deallocates Blast_ResFreq and prob0 element.
|
|
475 |
* @param rfp the Blast_ResFreq to be deallocated.
|
|
476 |
*/
|
|
477 |
Blast_ResFreq* Blast_ResFreqDestruct(Blast_ResFreq* rfp); |
|
478 |
||
479 |
||
480 |
/** Calculates residues frequencies given a standard distribution.
|
|
481 |
* @param sbp the BlastScoreBlk provides information on alphabet.
|
|
482 |
* @param rfp the prob element on this Blast_ResFreq is used.
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
483 |
* @return zero on success
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
484 |
*/
|
485 |
Int2 Blast_ResFreqStdComp(const BlastScoreBlk* sbp, Blast_ResFreq* rfp); |
|
486 |
||
487 |
/** Creates a new structure to keep track of score frequencies for a scoring
|
|
488 |
* system.
|
|
489 |
* @param score_min Minimum score [in]
|
|
490 |
* @param score_max Maximum score [in]
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
491 |
* @return allocated and initialized pointer to Blast_ScoreFreq
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
492 |
*/
|
493 |
Blast_ScoreFreq* |
|
494 |
Blast_ScoreFreqNew(Int4 score_min, Int4 score_max); |
|
495 |
||
496 |
/** Deallocates the score frequencies structure
|
|
497 |
* @param sfp the structure to deallocate [in]
|
|
498 |
* @return NULL
|
|
499 |
*/
|
|
500 |
Blast_ScoreFreq* |
|
501 |
Blast_ScoreFreqDestruct(Blast_ScoreFreq* sfp); |
|
502 |
||
503 |
/** Fills a buffer with the 'standard' alphabet
|
|
504 |
* (given by STD_AMINO_ACID_FREQS[index].ch).
|
|
505 |
*
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
506 |
* @param alphabet_code specifies alphabet [in]
|
507 |
* @param residues buffer to be filled in [in|out]
|
|
508 |
* @param residue_size size of "residues" buffer [in]
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
509 |
* @return Number of residues in alphabet or negative returns upon error.
|
510 |
*/
|
|
511 |
Int2
|
|
512 |
Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues, |
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
513 |
Uint4 residue_size); |
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
514 |
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
515 |
/** Computes the parameters lambda, H K for use in calculating the
|
516 |
* statistical significance of high-scoring segments or subalignments (see
|
|
517 |
* comment on blast_stat.c for more details).
|
|
518 |
* @param kbp object containing Lambda, H, and K as well as scoring information [in|out]
|
|
519 |
* @param sfp array of probabilities for all scores [in]
|
|
520 |
* @return zero on success, 1 on error.
|
|
521 |
*/
|
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
522 |
Int2
|
523 |
Blast_KarlinBlkCalc(Blast_KarlinBlk* kbp, Blast_ScoreFreq* sfp); |
|
524 |
||
525 |
/** Given a sequence of 'length' amino acid residues, compute the
|
|
526 |
* probability of each residue and put that in the array resProb
|
|
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
527 |
* Excludes ambiguity characters.
|
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
528 |
*
|
529 |
* @param sequence the sequence to be computed upon [in]
|
|
530 |
* @param length the length of the sequence [in]
|
|
531 |
* @param resProb the object to be filled in [in|out]
|
|
532 |
*/
|
|
533 |
void
|
|
534 |
Blast_FillResidueProbability(const Uint1* sequence, Int4 length, double * resProb); |
|
535 |
||
1.1.2
by Aaron M. Ucko
Import upstream version 6.1.20041020 |
536 |
/** Fill in the matrix for blastn using the penaly and rewards
|
537 |
* The query sequence alphabet is blastna, the subject sequence
|
|
538 |
* is ncbi2na. The alphabet blastna is defined in blast_stat.h
|
|
539 |
* and the first four elements of blastna are identical to ncbi2na.
|
|
540 |
* if sbp->matrix==NULL, it is allocated.
|
|
541 |
* @param sbp the BlastScoreBlk on which reward, penalty, and matrix will be set
|
|
542 |
[in|out]
|
|
543 |
* @return zero on success.
|
|
544 |
*/
|
|
545 |
Int2 BlastScoreBlkNuclMatrixCreate(BlastScoreBlk* sbp); |
|
546 |
||
1.1.1
by Aaron M. Ucko
Import upstream version 6.1.20040616 |
547 |
#ifdef __cplusplus
|
548 |
}
|
|
549 |
#endif
|
|
550 |
#endif /* !__BLAST_STAT__ */ |