47
49
- calculate pseuod-scores from p-values.
49
51
******************************************************************************
51
53
* $Log: blastkar.c,v $
54
* Revision 6.102 2004/09/28 16:04:19 papadopo
56
* 1. Pass the effective database size into BlastSmallGapSumE,
57
* BlastLargeGapSumE and BlastUnevenGapSumE. The routines use this
58
* value in a simplified formula to compute the e-value of singleton sets.
59
* 2. Caused all routines for calculating the significance of multiple
60
* distinct alignments (BlastSmallGapSumE, BlastLargeGapSumE and
61
* BlastUnevenGapSumE) to use
63
* sum_{i in linked_set} (\lambda_i s_i - \ln K_i)
65
* as the weighted sum score, where (\lambda_i, K_i) are taken from
66
* the appropriate query context.
68
* Revision 6.101 2004/06/07 20:03:23 coulouri
69
* use floating point constants for comparisons with floating point variables
71
* Revision 6.100 2004/04/28 14:36:00 madden
72
* Changes from Mike Gertz:
73
* - I created the new routine BlastGapDecayDivisor that computes a
74
* divisor used to weight the evalue of a collection of distinct
76
* - I removed BlastGapDecay and BlastGapDecayInverse which had become
78
* - I modified the BlastCutoffs routine so that it uses the value
79
* returned by BlastGapDecayDivisor to weight evalues.
80
* - I modified BlastSmallGapSumE, BlastLargeGapSumE and
81
* BlastUnevenGapSumE no longer refer to the gap_prob parameter.
82
* Replaced the gap_decay_rate parameter of each of these routines with
83
* a weight_divisor parameter. Added documentation.
85
* Revision 6.99 2004/04/23 13:49:43 madden
86
* Cleaned up ifndef in BlastKarlinLHtoK
88
* Revision 6.98 2004/04/23 13:19:53 madden
89
* Rewrote BlastKarlinLHtoK to do the following and more:
90
* 1. fix a bug whereby the wrong formula was used when high score == 1
91
* and low score == -1;
92
* 2. fix a methodological error of truncating the first sum
93
* and trying to make it converge quickly by adding terms
94
* of a geometric progression, even though the geometric progression
95
* estimate is not correct in all cases;
96
* the old adjustment code is left in for historical purposes but
98
* 3. Eliminate the Boolean bi_modal_score variable. The old test that
99
* set the value of bi_modal_score would frequently fail to choose the
100
* correct value due to rounding error.
101
* 4. changed numerous local variable names to make them more meaningful;
102
* 5. added substantial comments to explain what the procedure
103
* is doing and what each variable represents
105
* Revision 6.97 2004/04/01 13:43:08 lavr
106
* Spell "occurred", "occurrence", and "occurring"
108
* Revision 6.96 2004/03/31 17:58:51 papadopo
109
* Mike Gertz' changes for length adjustment calculations
111
* Revision 6.95 2003/12/12 16:00:34 madden
112
* Add gap_decay_rate to BlastCutoffs, remove BlastCutoffs_simple, protection against overflow, removal of defunct _real variables (all from Mike Gertz)
114
* Revision 6.94 2003/11/30 03:36:38 camacho
115
* Fix compilation error
117
* Revision 6.93 2003/11/28 22:39:40 camacho
118
* +static keyword to BlastKarlinLtoH
120
* Revision 6.92 2003/11/28 15:16:38 camacho
121
* Combine newkar.c's contents with blastkar.c
123
* Revision 6.91 2003/11/26 19:08:10 madden
124
* simplified BlastKarlinLtoH and BlastKarlinLHtoK and provided better protection against overflow, new function NlmKarlinLambdaNR (all from Mike Gertz)
127
* Revision 6.90 2003/06/30 20:01:32 dondosha
128
* Correction in logic of finding matrices by BLASTMAT environment variable
130
* Revision 6.89 2003/05/30 17:25:36 coulouri
133
* Revision 6.88 2003/05/06 18:54:02 dondosha
134
* Set all gap/residue scores in blastn matrix to INT4_MIN/2
136
* Revision 6.87 2003/03/07 22:33:25 bealer
137
* - Fix UMR when alphabet_type is not set.
139
* Revision 6.86 2003/02/27 19:07:56 madden
140
* Add functions PrintMatrixMessage and PrintAllowedValuesMessage
142
* Revision 6.85 2003/02/26 18:23:49 madden
143
* Add functions BlastKarlinkGapBlkFill and BlastKarlinReportAllowedValues, call from BlastKarlinBlkGappedCalcEx
145
* Revision 6.84 2002/10/24 22:52:14 dondosha
146
* When checking config file for matrices path, allow aa or nt subdirectories too
148
* Revision 6.83 2002/07/22 20:10:12 dondosha
149
* Correction: previous change did not work for proteins
151
* Revision 6.82 2002/07/19 18:34:58 dondosha
152
* Ignore bits higher than 4 when computing frequencies - needed for megablast
154
* Revision 6.81 2002/05/17 20:30:37 madden
155
* Add comments on adding new matrix values
157
* Revision 6.80 2002/04/09 18:44:19 madden
158
* Do not return if status of BlastScoreBlkMatCreate is non-zero
160
* Revision 6.79 2002/02/26 22:25:21 dondosha
161
* Return error as soon as it is found that matrix name is not supported
52
163
* Revision 6.78 2001/12/13 14:30:49 madden
53
164
* Add BLASTKAR_SMALL_FLOAT to prevent floating point exception for very small floats
581
692
/**************************************************************************************
583
NOTE!!!!!!!!!!!!!!!!!!!!!!!!!!!
585
The arrays below list all the matrices allowed for gapped BLAST.
586
If new ones are added, remember to edit the function BlastLoadMatrixValues!!!
694
How the statistical parameters for the matrices are stored:
695
-----------------------------------------------------------
696
They parameters are stored in a two-dimensional array FloatHi (i.e.,
697
doubles, which has as it's first dimensions the number of different
698
gap existence and extension combinations and as it's second dimension 8.
699
The eight different columns specify:
701
1.) gap existence penalty (INT2_MAX denotes infinite).
702
2.) gap extension penalty (INT2_MAX denotes infinite).
703
3.) decline to align penalty (INT2_MAX denotes infinite).
710
(Items 4-8 are explained in:
711
Altschul SF, Bundschuh R, Olsen R, Hwa T.
712
The estimation of statistical parameters for local alignment score distributions.
713
Nucleic Acids Res. 2001 Jan 15;29(2):351-61.).
715
Take BLOSUM45 (below) as an example. Currently (5/17/02) there are
716
14 different allowed combinations (specified by "#define BLOSUM45_VALUES_MAX 14").
717
The first row in the array "blosum45_values" has INT2_MAX (i.e., 32767) for gap
718
existence, extension, and decline-to-align penalties. For all practical purposes
719
this value is large enough to be infinite, so the alignments will be ungapped.
720
BLAST may also use this value (INT2_MAX) as a signal to skip gapping, so a
721
different value should not be used if the intent is to have gapless extensions.
722
The next row is for the gap existence penalty 13 and the extension penalty 3.
723
The decline-to-align penalty is only supported in a few cases, so it is normally
727
How to add a new matrix to blastkar.c:
728
--------------------------------------
729
To add a new matrix to blastkar.c it is necessary to complete
730
four steps. As an example consider adding the matrix
733
1.) add a define specifying how many different existence and extensions
734
penalties are allowed, so it would be necessary to add the line:
736
#define TESTMATRIX_VALUES_MAX 14
738
if 14 values were to be allowed.
740
2.) add a two-dimensional array to contain the statistical parameters:
742
static Nlm_FloatHi testmatrix_values[TESTMATRIX_VALUES_MAX][8] ={ ...
744
3.) add a "prefs" array that should hint about the "optimal"
745
gap existence and extension penalties:
747
static Int4 testmatrix_prefs[TESTMATRIX_VALUES_MAX] = {
748
BLAST_MATRIX_NOMINAL,
752
4.) Go to the function BlastLoadMatrixValues (in this file) and
753
add two lines before the return at the end of the function:
755
matrix_info = MatrixInfoNew("TESTMATRIX", testmatrix_values, testmatrix_prefs, TESTMATRIX_VALUES_MAX);
756
ValNodeAddPointer(&retval, 0, matrix_info);
588
760
***************************************************************************************/
1366
1542
BlastScoreBlkMatFill(BLAST_ScoreBlkPtr sbp, CharPtr matrix)
1369
Char string[PATH_MAX], alphabet_type[3];
1545
Char string[PATH_MAX] = "", alphabet_type[3] = "";
1546
CharPtr matrix_dir = NULL;
1377
1550
if (sbp->read_in_matrix) {
1378
1551
/* Convert matrix name to upper case. */
1379
1552
matrix = Nlm_StrUpper(matrix);
1381
1554
sbp->name = StringSave(matrix); /* Save the name of the matrix. */
1556
/* 1. Try local directory */
1383
1557
if(FileLength(matrix) > 0)
1384
1558
fp = FileOpen(matrix, "r");
1560
/* 2. Try configuration file */
1386
1561
if (fp == NULL) {
1387
if(FindPath("ncbi", "ncbi", "data", string, PATH_MAX)) {
1388
StringCat(string, matrix);
1389
if(FileLength(string) > 0)
1390
fp = FileOpen(string, "r");
1562
if (sbp->protein_alphabet)
1563
Nlm_StringNCpy(alphabet_type, "aa", 2);
1565
Nlm_StringNCpy(alphabet_type, "nt", 2);
1566
alphabet_type[2] = NULLB;
1568
if(FindPath("ncbi", "ncbi", "data", string, PATH_MAX)) {
1569
matrix_dir = StringSave(string);
1570
sprintf(string, "%s%s", matrix_dir, matrix);
1571
if(FileLength(string) > 0) {
1572
fp = FileOpen(string, "r");
1574
sprintf(string, "%s%s%s%s", matrix_dir,
1575
alphabet_type, DIRDELIMSTR, matrix);
1576
if(FileLength(string) > 0)
1577
fp = FileOpen(string, "r");
1579
matrix_dir = MemFree(matrix_dir);
1393
1582
/* Trying to use local "data" directory */
1395
1584
if(fp == NULL) {
1396
sprintf(string, "data/%s", matrix);
1585
sprintf(string, "data%s%s", DIRDELIMSTR, matrix);
1397
1586
if(FileLength(string) > 0)
1398
1587
fp = FileOpen(string, "r");
2770
2960
BlastKarlinBlkGappedCalcEx(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, CharPtr matrix_name, ValNodePtr PNTR error_return)
2773
Boolean found_matrix, found_values;
2964
Int2 status = BlastKarlinkGapBlkFill(kbp, gap_open, gap_extend, decline_align, matrix_name);
2966
if (status && error_return)
2970
MatrixInfoPtr matrix_info;
2971
ValNodePtr vnp, head;
2973
vnp = head = BlastLoadMatrixValues();
2975
sprintf(buffer, "%s is not a supported matrix", matrix_name);
2976
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2980
matrix_info = vnp->data.ptrvalue;
2981
sprintf(buffer, "%s is a supported matrix", matrix_info->name);
2982
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2986
BlastMatrixValuesDestruct(head);
2988
else if (status == 2)
2990
if (decline_align == INT2_MAX)
2991
sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
2993
sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
2994
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2995
BlastKarlinReportAllowedValues(matrix_name, error_return);
3003
Attempts to fill KarlinBlk for given gap opening, extensions etc.
3004
Will return non-zero status if that fails.
3006
return values: -1 if matrix_name is NULL;
3007
1 if matrix not found
3008
2 if matrix found, but open, extend etc. values not supported.
3011
BlastKarlinkGapBlkFill(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, CharPtr matrix_name)
3013
Boolean found_matrix=FALSE, found_values=FALSE;
2774
3014
array_of_8 *values;
2775
3015
Char buffer[256];
2776
3017
Int4 index, max_number_values=0;
2777
3018
MatrixInfoPtr matrix_info;
2778
3019
ValNodePtr vnp, head;
2780
3021
if (matrix_name == NULL)
2784
found_matrix = FALSE;
2785
found_values = FALSE;
2787
3026
vnp = head = BlastLoadMatrixValues();
2822
3061
if (found_values == TRUE)
2824
BlastMatrixValuesDestruct(head);
2830
sprintf(buffer, "%s is not a supported matrix", matrix_name);
2831
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2835
matrix_info = vnp->data.ptrvalue;
2836
sprintf(buffer, "%s is a supported matrix", matrix_info->name);
3075
BlastMatrixValuesDestruct(head);
3081
PrintMatrixMessage(const Char *matrix_name)
3083
CharPtr buffer=MemNew(1024*sizeof(Char));
3085
MatrixInfoPtr matrix_info;
3086
ValNodePtr vnp, head;
3089
sprintf(ptr, "%s is not a supported matrix, supported matrices are:\n", matrix_name);
3091
ptr += StringLen(ptr);
3093
vnp = head = BlastLoadMatrixValues();
3097
matrix_info = vnp->data.ptrvalue;
3098
sprintf(ptr, "%s \n", matrix_info->name);
3099
ptr += StringLen(ptr);
3102
BlastMatrixValuesDestruct(head);
3108
PrintAllowedValuesMessage(const Char *matrix_name, Int4 gap_open, Int4 gap_extend, Int4 decline_align)
3111
Boolean found_matrix=FALSE;
3112
CharPtr buffer, ptr;
3114
Int4 index, max_number_values=0;
3115
MatrixInfoPtr matrix_info;
3116
ValNodePtr vnp, head;
3118
ptr = buffer = MemNew(2048*sizeof(Char));
3120
if (decline_align == INT2_MAX)
3121
sprintf(ptr, "Gap existence and extension values of %ld and %ld not supported for %s\nsupported values are:\n",
3122
(long) gap_open, (long) gap_extend, matrix_name);
3124
sprintf(ptr, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s\n supported values are:\n",
3125
(long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
3127
ptr += StringLen(ptr);
3129
vnp = head = BlastLoadMatrixValues();
3132
matrix_info = vnp->data.ptrvalue;
3133
if (StringICmp(matrix_info->name, matrix_name) == 0)
3135
values = matrix_info->values;
3136
max_number_values = matrix_info->max_number_values;
3137
found_matrix = TRUE;
3145
for (index=0; index<max_number_values; index++)
3147
if (Nint(values[index][2]) == INT2_MAX)
3148
sprintf(ptr, "%ld, %ld\n", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
3150
sprintf(ptr, "%ld, %ld, %ld\n", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
3151
ptr += StringLen(ptr);
3155
BlastMatrixValuesDestruct(head);
3162
BlastKarlinReportAllowedValues(const Char *matrix_name, ValNodePtr PNTR error_return)
3165
Boolean found_matrix=FALSE;
3168
Int4 index, max_number_values=0;
3169
MatrixInfoPtr matrix_info;
3170
ValNodePtr vnp, head;
3172
vnp = head = BlastLoadMatrixValues();
3175
matrix_info = vnp->data.ptrvalue;
3176
if (StringICmp(matrix_info->name, matrix_name) == 0)
3178
values = matrix_info->values;
3179
max_number_values = matrix_info->max_number_values;
3180
found_matrix = TRUE;
3188
for (index=0; index<max_number_values; index++)
3190
if (Nint(values[index][2]) == INT2_MAX)
3191
sprintf(buffer, "Gap existence and extension values of %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
3193
sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
2837
3194
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2840
BlastMatrixValuesDestruct(head);
2844
if (decline_align == INT2_MAX)
2845
sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
2847
sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
2848
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2849
for (index=0; index<max_number_values; index++)
2851
if (Nint(values[index][2]) == INT2_MAX)
2852
sprintf(buffer, "Gap existence and extension values of %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
2854
sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
2855
BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
2858
3198
BlastMatrixValuesDestruct(head);
3206
Calculate H, the relative entropy of the p's and q's
3208
static Nlm_FloatHi LIBCALL
3209
BlastKarlinLtoH(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi lambda)
3212
Nlm_FloatHi H, etonlam, sum, scale;
3214
Nlm_FloatHi PNTR probs = sfp->sprob;
3215
BLAST_Score low = sfp->obs_min, high = sfp->obs_max;
3220
if (BlastScoreChk(low, high) != 0) return -1.;
3222
etonlam = exp( - lambda );
3223
sum = low * probs[low];
3224
for( score = low + 1; score <= high; score++ ) {
3225
sum = score * probs[score] + etonlam * sum;
3228
scale = Nlm_Powi( etonlam, high );
3230
H = lambda * sum/scale;
3231
} else { /* Underflow of exp( -lambda * high ) */
3232
H = lambda * exp( lambda * high + log(sum) );
2940
3314
/* Calculate the parameter Lambda */
2942
kbp->Lambda_real = kbp->Lambda = BlastKarlinLambdaNR(sfp);
3316
kbp->Lambda = BlastKarlinLambdaNR(sfp);
2943
3317
if (kbp->Lambda < 0.)
2947
3321
/* Calculate H */
2949
kbp->H_real = kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda);
3323
kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda);
2950
3324
if (kbp->H < 0.)
2954
3328
/* Calculate K and log(K) */
2956
kbp->K_real = kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H);
3330
kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H);
2957
3331
if (kbp->K < 0.)
2959
kbp->logK_real = kbp->logK = log(kbp->K);
3333
kbp->logK = log(kbp->K);
2961
3335
/* Normal return */
2965
kbp->Lambda = kbp->H = kbp->K
2966
= kbp->Lambda_real = kbp->H_real = kbp->K_real = -1.;
3339
kbp->Lambda = kbp->H = kbp->K = -1.;
2967
3340
#ifdef BLASTKAR_HUGE_VAL
2968
kbp->logK_real = kbp->logK = BLASTKAR_HUGE_VAL;
3341
kbp->logK = BLASTKAR_HUGE_VAL;
2970
kbp->logK_real = kbp->logK = 1.e30;
2974
#define DIMOFP0 (iter*range + 1)
3347
#define DIMOFP0 (iterlimit*range + 1)
2975
3348
#define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
2978
BlastKarlinLHtoK(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi lambda, Nlm_FloatHi H)
2980
#ifndef BLAST_KARLIN_STACKP
2981
Nlm_FloatHi PNTR P0 = NULL;
2983
Nlm_FloatHi P0 [DIMOFP0_MAX];
2985
BLAST_Score low; /* Lowest score (must be negative) */
2986
BLAST_Score high; /* Highest score (must be positive) */
2987
Nlm_FloatHi K; /* local copy of K */
2990
BLAST_Score range, lo, hi, first, last, d;
2991
register Nlm_FloatHi sum;
2992
Nlm_FloatHi Sum, av, oldsum, oldsum2, score_avg;
2994
Nlm_FloatHi sumlimit;
2995
Nlm_FloatHi PNTR p, PNTR ptrP, PNTR ptr1, PNTR ptr2, PNTR ptr1e;
2996
Nlm_FloatHi etolami, etolam;
2997
Boolean bi_modal_score = FALSE;
2999
if (lambda <= 0. || H <= 0.) {
3003
if (sfp->score_avg >= 0.0) {
3008
high = sfp->obs_max;
3010
p = &sfp->sprob[low];
3012
/* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
3013
Karlin&Altschul (1990) */
3014
for (i = 1, d = -low; i <= range && d > 1; ++i)
3025
etolam = exp((Nlm_FloatHi)lambda);
3027
if (low == -1 || high == 1) {
3031
score_avg = sfp->score_avg / d;
3032
K = (score_avg * score_avg) / av;
3034
return K * (1.0 - 1./etolam);
3037
sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
3039
iter = BLAST_KARLIN_K_ITER_MAX;
3041
if (DIMOFP0 > DIMOFP0_MAX) {
3044
#ifndef BLAST_KARLIN_STACKP
3045
P0 = (Nlm_FloatHi PNTR)MemNew(DIMOFP0 * sizeof(*P0));
3049
Nlm_MemSet((CharPtr)P0, 0, DIMOFP0*sizeof(P0[0]));
3054
P0[0] = sum = oldsum = oldsum2 = 1.;
3056
if (p[0] + p[range*d] == 1.) {
3057
/* There are only two scores (e.g. DNA comparison matrix */
3058
bi_modal_score = TRUE;
3062
for (j = 0; j < iter && sum > sumlimit; Sum += sum /= ++j) {
3063
first = last = range;
3066
for (ptrP = P0+(hi-lo); ptrP >= P0; *ptrP-- =sum) {
3067
ptr1 = ptrP - first;
3068
ptr1e = ptrP - last;
3070
for (sum = 0.; ptr1 >= ptr1e; )
3071
sum += *ptr1-- * *ptr2++;
3074
if (ptrP - P0 <= range)
3077
etolami = Nlm_Powi((Nlm_FloatHi)etolam, lo - 1);
3078
for (sum = 0., i = lo; i != 0; ++i) {
3080
sum += *++ptrP * etolami;
3082
for (; i <= hi; ++i)
3088
if (!bi_modal_score) {
3089
/* Terms of geometric progression added for correction */
3090
ratio = oldsum / oldsum2;
3091
if (ratio >= (1.0 - sumlimit*0.001)) {
3096
while (sum > sumlimit) {
3098
Sum += sum = oldsum / ++j;
3104
etolami = 1 / etolam;
3105
K = exp((Nlm_FloatHi)-2.0*Sum) / (av*(1.0 - etolami));
3108
K = -exp((Nlm_FloatHi)-2.0*Sum) / (av*Nlm_Expm1(-(Nlm_FloatHi)lambda));
3111
#ifndef BLAST_KARLIN_K_STACKP
3119
BlastKarlinLambdaBis
3121
Calculate Lambda using the bisection method (slow).
3124
BlastKarlinLambdaBis(BLAST_ScoreFreqPtr sfp)
3126
register Nlm_FloatHi PNTR sprob;
3127
Nlm_FloatHi lambda, up, newval;
3128
BLAST_Score i, low, high, d;
3130
register Nlm_FloatHi sum, x0, x1;
3132
if (sfp->score_avg >= 0.) {
3136
high = sfp->obs_max;
3137
if (BlastScoreChk(low, high) != 0)
3142
/* Find greatest common divisor of all scores */
3143
for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
3144
if (sprob[i+low] != 0)
3151
up = BLAST_KARLIN_LAMBDA0_DEFAULT;
3152
for (lambda=0.; ; ) {
3154
x0 = exp((Nlm_FloatHi)up);
3155
x1 = Nlm_Powi((Nlm_FloatHi)x0, low - 1);
3157
for (sum=0., i=low; i<=high; ++i)
3158
sum += sprob[i*d] * (x1 *= x0);
3161
for (sum=0., i=low; i<=high; ++i)
3162
sum += sprob[i*d] * exp(up * i);
3169
for (j=0; j<BLAST_KARLIN_LAMBDA_ITER_DEFAULT; ++j) {
3170
newval = (lambda + up) / 2.;
3171
x0 = exp((Nlm_FloatHi)newval);
3172
x1 = Nlm_Powi((Nlm_FloatHi)x0, low - 1);
3174
for (sum=0., i=low; i<=high; ++i)
3175
sum += sprob[i*d] * (x1 *= x0);
3178
for (sum=0., i=low; i<=high; ++i)
3179
sum += sprob[i*d] * exp(newval * i);
3186
return (lambda + up) / (2. * d);
3351
#define smallLambdaThreshold 20 /*defines special case in K computation*/
3352
/*threshold is on exp(-Lambda)*/
3354
/*The following procedure computes K. The input includes Lambda, H,
3355
* and an array of probabilities for each score.
3356
* There are distinct closed form for three cases:
3357
* 1. high score is 1 low score is -1
3358
* 2. high score is 1 low score is not -1
3359
* 3. low score is -1, high score is not 1
3361
* Otherwise, in most cases the value is computed as:
3362
* -exp(-2.0*outerSum) / ((H/lambda)*(exp(-lambda) - 1)
3363
* The last term (exp(-lambda) - 1) can be computed in two different
3364
* ways depending on whether lambda is small or not.
3365
* outerSum is a sum of the terms
3366
* innerSum/j, where j is denoted by iterCounter in the code.
3367
* The sum is truncated when the new term innersum/j i sufficiently small.
3368
* innerSum is a weighted sum of the probabilities of
3369
* of achieving a total score i in a gapless alignment,
3370
* which we denote by P(i,j).
3371
* of exactly j characters. innerSum(j) has two parts
3372
* Sum over i < 0 P(i,j)exp(-i * lambda) +
3373
* Sum over i >=0 P(i,j)
3374
* The terms P(i,j) are computed by dynamic programming.
3375
* An earlier version was flawed in that ignored the special case 1
3376
* and tried to replace the tail of the computation of outerSum
3377
* by a geometric series, but the base of the geometric series
3378
* was not accurately estimated in some cases.
3382
BlastKarlinLHtoK(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi lambda, Nlm_FloatHi H)
3384
/*The next array stores the probabilities of getting each possible
3385
score in an alignment of fixed length; the array is shifted
3386
during part of the computation, so that
3387
entry 0 is for score 0. */
3388
Nlm_FloatHi PNTR alignmentScoreProbabilities = NULL;
3389
BLAST_Score low; /* Lowest score (must be negative) */
3390
BLAST_Score high; /* Highest score (must be positive) */
3391
BLAST_Score range; /* range of scores, computed as high - low*/
3392
Nlm_FloatHi K; /* local copy of K to return*/
3393
Int4 i; /*loop index*/
3394
Int4 iterCounter; /*counter on iterations*/
3395
BLAST_Score divisor; /*candidate divisor of all scores with
3396
non-zero probabilities*/
3397
/*highest and lowest possible alignment scores for current length*/
3398
BLAST_Score lowAlignmentScore, highAlignmentScore;
3399
BLAST_Score first, last; /*loop indices for dynamic program*/
3400
register Nlm_FloatHi innerSum;
3401
Nlm_FloatHi oldsum, oldsum2; /* values of innerSum on previous
3403
Nlm_FloatHi outerSum; /* holds sum over j of (innerSum
3404
for iteration j/j)*/
3406
Nlm_FloatHi score_avg; /*average score*/
3407
/*first term to use in the closed form for the case where
3408
high == 1 or low == -1, but not both*/
3409
Nlm_FloatHi firstTermClosedForm; /*usually store H/lambda*/
3410
Int4 iterlimit; /*upper limit on iterations*/
3411
Nlm_FloatHi sumlimit; /*lower limit on contributions
3412
to sum over scores*/
3414
/*array of score probabilities reindexed so that low is at index 0*/
3415
Nlm_FloatHi PNTR probArrayStartLow;
3417
/*pointers used in dynamic program*/
3418
Nlm_FloatHi PNTR ptrP, PNTR ptr1, PNTR ptr2, PNTR ptr1e;
3419
Nlm_FloatHi expMinusLambda; /*e^^(-Lambda) */
3421
if (lambda <= 0. || H <= 0.) {
3422
/* Theory dictates that H and lambda must be positive, so
3423
* return -1 to indicate an error */
3427
/*Karlin-Altschul theory works only if the expected score
3429
if (sfp->score_avg >= 0.0) {
3434
high = sfp->obs_max;
3437
probArrayStartLow = &sfp->sprob[low];
3438
/* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
3439
Karlin&Altschul (1990) */
3440
for (i = 1, divisor = -low; i <= range && divisor > 1; ++i) {
3441
if (probArrayStartLow[i] != 0.0)
3442
divisor = Nlm_Gcd(divisor, i);
3451
firstTermClosedForm = H/lambda;
3452
expMinusLambda = exp((Nlm_FloatHi) -lambda);
3454
if (low == -1 && high == 1) {
3455
K = (sfp->sprob[low] - sfp->sprob[high]) *
3456
(sfp->sprob[low] - sfp->sprob[high]) / sfp->sprob[low];
3460
if (low == -1 || high == 1) {
3464
score_avg = sfp->score_avg / divisor;
3466
= (score_avg * score_avg) / firstTermClosedForm;
3468
return firstTermClosedForm * (1.0 - expMinusLambda);
3471
sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
3472
iterlimit = BLAST_KARLIN_K_ITER_MAX;
3474
if (DIMOFP0 > DIMOFP0_MAX) {
3477
alignmentScoreProbabilities =
3478
(Nlm_FloatHi PNTR)MemNew(DIMOFP0 *
3479
sizeof(*alignmentScoreProbabilities));
3480
if (alignmentScoreProbabilities == NULL)
3484
lowAlignmentScore = highAlignmentScore = 0;
3485
alignmentScoreProbabilities[0] = innerSum = oldsum = oldsum2 = 1.;
3487
for (iterCounter = 0;
3488
((iterCounter < iterlimit) && (innerSum > sumlimit));
3489
outerSum += innerSum /= ++iterCounter) {
3490
first = last = range;
3491
lowAlignmentScore += low;
3492
highAlignmentScore += high;
3493
/*dynamic program to compute P(i,j)*/
3494
for (ptrP = alignmentScoreProbabilities +
3495
(highAlignmentScore-lowAlignmentScore);
3496
ptrP >= alignmentScoreProbabilities;
3497
*ptrP-- =innerSum) {
3498
ptr1 = ptrP - first;
3499
ptr1e = ptrP - last;
3500
ptr2 = probArrayStartLow + first;
3501
for (innerSum = 0.; ptr1 >= ptr1e; )
3502
innerSum += *ptr1-- * *ptr2++;
3505
if (ptrP - alignmentScoreProbabilities <= range)
3510
for( i = lowAlignmentScore + 1; i < 0; i++ ) {
3511
innerSum = *++ptrP + innerSum * expMinusLambda;
3513
innerSum *= expMinusLambda;
3515
for (; i <= highAlignmentScore; ++i)
3516
innerSum += *++ptrP;
3521
#ifdef ADD_GEOMETRIC_TERMS_TO_K
3522
/*old code assumed that the later terms in sum were
3523
asymptotically comparable to those of a geometric
3524
progression, and tried to speed up convergence by
3525
guessing the estimated ratio between sucessive terms
3526
and using the explicit terms of a geometric progression
3527
to speed up convergence. However, the assumption does not
3528
always hold, and convergenece of the above code is fast
3529
enough in practice*/
3530
/* Terms of geometric progression added for correction */
3532
Nlm_FloatHi ratio; /* fraction used to generate the
3533
geometric progression */
3535
ratio = oldsum / oldsum2;
3536
if (ratio >= (1.0 - sumlimit*0.001)) {
3538
if (alignmentScoreProbabilities != NULL)
3539
MemFree(alignmentScoreProbabilities);
3543
while (innerSum > sumlimit) {
3545
outerSum += innerSum = oldsum / ++iterCounter;
3550
if (expMinusLambda < smallLambdaThreshold ) {
3551
K = -exp((double)-2.0*outerSum) /
3552
(firstTermClosedForm*(expMinusLambda - 1.0));
3554
K = -exp((double)-2.0*outerSum) /
3555
(firstTermClosedForm*Expm1(-(double)lambda));
3558
if (alignmentScoreProbabilities != NULL)
3559
MemFree(alignmentScoreProbabilities);
3189
3566
/******************* Fast Lambda Calculation Subroutine ************************
3190
3567
Version 1.0 May 16, 1991
3194
3571
guess (lambda0) obtained perhaps by the bisection method.
3195
3572
*******************************************************************************/
3575
* Find positive solution to sum_{i=low}^{high} exp(i lambda) = 1.
3577
* @param probs probabilities of a score occurring
3578
* @param d the gcd of the possible scores. This equals 1 if the scores
3580
* @param low the lowest possible score
3581
* @param high the highest possible score
3582
* @param lambda0 an initial value for lambda
3583
* @param tolx the tolerance to which lambda must be computed
3584
* @param itmax the maximum number of times the function may be
3586
* @param maxNewton the maximum permissible number of Newton
3587
* iteration. After that the computation will proceed by bisection.
3588
* @param itn a pointer to an integer that will receive the actually
3589
* number of iterations performed.
3591
* Let phi(lambda) = sum_{i=low}^{high} exp(i lambda) - 1. Then phi(lambda)
3594
* phi(lamdba) = exp(u lambda) p( exp(-lambda) )
3596
* where p(x) is a polynomial that has exactly two zeros, one at x = 1
3597
* and one at y = exp(-lamdba). It is simpler, more numerically
3598
* efficient and stable to apply Newton's method to p(x) than to
3601
* We define a safeguarded Newton iteration as follows. Let the
3602
* initial interval of uncertainty be [0,1]. If p'(x) >= 0, we bisect
3603
* the interval. Otherwise we try a Newton step. If the Newton iterate
3604
* lies in the current interval of uncertainty and it reduces the
3605
* value of | p(x) | by at least 10%, we accept the new
3606
* point. Otherwise, we bisect the current interval of uncertainty.
3607
* It is clear that this method converges to a zero of p(x). Since
3608
* p'(x) > 0 in an interval containing x = 1, the method cannot
3609
* converge to x = 1 and therefore converges to the only other zero,
3614
NlmKarlinLambdaNR( Nlm_FloatHi PNTR probs, BLAST_Score d,
3615
BLAST_Score low, BLAST_Score high,
3616
Nlm_FloatHi lambda0, Nlm_FloatHi tolx,
3617
int itmax, int maxNewton, int * itn )
3620
Nlm_FloatHi x0, x, a = 0, b = 1;
3621
Nlm_FloatHi f = 4; /* Larger than any possible value of the poly in [0,1] */
3622
int isNewton = 0; /* we haven't yet taken a Newton step. */
3626
x0 = exp( -lambda0 );
3627
x = ( 0 < x0 && x0 < 1 ) ? x0 : .5;
3629
for( k = 0; k < itmax; k++ ) { /* all iteration indices k */
3631
Nlm_FloatHi g, fold = f;
3632
int wasNewton = isNewton; /* If true, then the previous step was a */
3634
isNewton = 0; /* Assume that this step is not */
3636
/* Horner's rule for evaluating a polynomial and its derivative */
3639
for( i = low + d; i < 0; i += d ) {
3641
f = f * x + probs[i];
3644
f = f * x + probs[0] - 1;
3645
for( i = d; i <= high; i += d ) {
3647
f = f * x + probs[i];
3649
/* End Horner's rule */
3652
a = x; /* move the left endpoint */
3653
} else if( f < 0 ) {
3654
b = x; /* move the right endpoint */
3655
} else { /* f == 0 */
3656
break; /* x is an exact solution */
3658
if( b - a < 2 * a * ( 1 - b ) * tolx ) {
3659
/* The midpoint of the interval converged */
3660
x = (a + b) / 2; break;
3663
if( k >= maxNewton ||
3664
/* If convergence of Newton's method appears to be failing; or */
3665
( wasNewton && fabs( f ) > .9 * fabs(fold) ) ||
3666
/* if the previous iteration was a Newton step but didn't decrease
3667
* f sufficiently; or */
3669
/* if a Newton step will move us away from the desired solution */
3674
/* try a Newton step */
3677
if( y <= a || y >= b ) { /* The proposed iterate is not in (a,b) */
3679
} else { /* The proposed iterate is in (a,b). Accept it. */
3682
if( fabs( p ) < tolx * x * (1-x) ) break; /* Converged */
3683
} /* else the proposed iterate is in (a,b) */
3684
} /* else try a Newton step. */
3685
} /* end for all iteration indices k */
3198
3692
BlastKarlinLambdaNR(BLAST_ScoreFreqPtr sfp)
3200
3694
BLAST_Score low; /* Lowest score (must be negative) */
3201
3695
BLAST_Score high; /* Highest score (must be positive) */
3203
3697
BLAST_Score i, d;
3204
3698
Nlm_FloatHi PNTR sprob;
3205
Nlm_FloatHi lambda0, sum, slope, temp, x0, x1, amt;
3699
Nlm_FloatHi returnValue;
3207
3701
low = sfp->obs_min;
3208
3702
high = sfp->obs_max;
3209
3703
if (sfp->score_avg >= 0.) { /* Expected score must be negative */
3212
if (BlastScoreChk(low, high) != 0)
3215
lambda0 = BLAST_KARLIN_LAMBDA0_DEFAULT;
3706
if (BlastScoreChk(low, high) != 0) return -1.;
3217
3708
sprob = sfp->sprob;
3218
/* Find greatest common divisor of all scores */
3219
for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
3220
if (sprob[i+low] != 0)
3226
/* Calculate lambda */
3228
for (j=0; j<20; ++j) { /* limit of 20 should never be close-approached */
3233
x0 = exp((Nlm_FloatHi)lambda0);
3234
x1 = Nlm_Powi((Nlm_FloatHi)x0, low - 1);
3237
for (i=low; i<=high; i++) {
3238
sum += (temp = sprob[i*d] * (x1 *= x0));
3241
lambda0 -= (amt = sum/slope);
3242
if (ABS(amt/lambda0) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
3244
Does it appear that we may be on the verge of converging
3245
to the ever-present, zero-valued solution?
3247
if (lambda0 > BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT)
3252
return BlastKarlinLambdaBis(sfp);
3258
Calculate H, the relative entropy of the p's and q's
3261
BlastKarlinLtoH(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi lambda)
3264
Nlm_FloatHi av, etolam, etolami;
3269
if (BlastScoreChk(sfp->obs_min, sfp->obs_max) != 0)
3272
etolam = exp((Nlm_FloatHi)lambda);
3273
etolami = Nlm_Powi((Nlm_FloatHi)etolam, sfp->obs_min - 1);
3277
for (score=sfp->obs_min; score<=sfp->obs_max; score++)
3278
av += sfp->sprob[score] * score * (etolami *= etolam);
3283
for (score=sfp->obs_min; score<=sfp->obs_max; score++)
3284
av += sfp->sprob[score] * score * exp(lambda * score);
3292
BlastGapDecayInverse(Nlm_FloatHi pvalue, unsigned nsegs, Nlm_FloatHi decayrate)
3294
if (decayrate <= 0. || decayrate >= 1. || nsegs == 0)
3297
return pvalue * (1. - decayrate) * Nlm_Powi(decayrate, nsegs - 1);
3301
BlastGapDecay(Nlm_FloatHi pvalue, unsigned nsegs, Nlm_FloatHi decayrate)
3303
if (decayrate <= 0. || decayrate >= 1. || nsegs == 0)
3306
return pvalue / ((1. - decayrate) * Nlm_Powi(decayrate, nsegs - 1));
3709
/* Find greatest common divisor of all scores */
3710
for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
3711
if (sprob[i+low] != 0.0) {
3716
NlmKarlinLambdaNR( sprob, d, low, high,
3717
BLAST_KARLIN_LAMBDA0_DEFAULT,
3718
BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
3719
20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
3726
impalaKarlinLambdaNR(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi initialLambda)
3728
Nlm_FloatHi returnValue;
3730
Nlm_FloatHi PNTR sprob = sfp->sprob;
3732
if (sfp->score_avg >= 0.) { /* Expected score must be negative */
3736
Boolean foundPositive = FALSE;
3738
for(j = 1; j <=sfp->obs_max; j++) {
3739
if (sprob[j] > 0.0) {
3740
foundPositive = TRUE;
3744
if (!foundPositive) return(-1);
3748
NlmKarlinLambdaNR( sprob, 1, sfp->obs_min, sfp->obs_max,
3749
initialLambda, BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
3750
20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
3757
/* Compute a divisor used to weight the evalue of a collection of
3758
* "nsegs" distinct alignments. These divisors are used to compensate
3759
* for the effect of choosing the best among multiple collections of
3762
* Stephen F. Altschul. Evaluating the statitical significance of
3763
* multiple distinct local alignments. In Suhai, editior, Theoretical
3764
* and Computational Methods in Genome Research, pages 1-14. Plenum
3765
* Press, New York, 1997.
3767
* The "decayrate" parameter of this routine is a value in the
3768
* interval (0,1). Typical values of decayrate are .1 and .5. */
3771
BlastGapDecayDivisor(Nlm_FloatHi decayrate, unsigned nsegs )
3773
return (1. - decayrate) * Nlm_Powi(decayrate, nsegs - 1);
3722
Calculates the p-value for alignments with "small" gaps (typically
3723
under fifty residues/basepairs) following ideas of Stephen Altschul's.
3724
"gap" gives the size of this gap, "gap_prob" is the probability
3725
of this model of gapping being correct (it's thought that gap_prob
3726
will generally be 0.5). "num" is the number of HSP's involved, "sum"
3727
is the "raw" sum-score of these HSP's. "subject_len" is the (effective)
3728
length of the database sequence, "query_length" is the (effective)
3729
length of the query sequence. min_length_one specifies whether one
3730
or 1/K will be used as the minimum expected length.
3735
BlastSmallGapSumE(BLAST_KarlinBlkPtr kbp, Int4 gap, Nlm_FloatHi gap_prob, Nlm_FloatHi gap_decay_rate, Int2 num, Int4 sum, Nlm_FloatHi score_prime, Int4 query_length, Int4 subject_length, Boolean min_length_one)
3739
Nlm_FloatHi sum_p, sum_e;
3741
score_prime -= kbp->logK + log((Nlm_FloatHi)subject_length*(Nlm_FloatHi)query_length) + (num-1)*(kbp->logK + 2*log((Nlm_FloatHi)gap));
3742
score_prime -= LnFactorial((Nlm_FloatHi) num);
3744
sum_p = BlastSumP(num, score_prime);
3746
sum_e = BlastKarlinPtoE(sum_p);
3748
sum_e = sum_e/((1.0-gap_decay_rate)*Nlm_Powi(gap_decay_rate, (num-1)));
3752
if (gap_prob == 0.0)
3755
sum_e = sum_e/gap_prob;
3762
Calculates the p-value for alignments with asymmetric gaps, typically
3763
a small (like in BlastSmallGapSumE) gap for one (protein) sequence and
3764
a possibly large (up to 4000 bp) gap in the other (translated DNA)
3765
sequence. Used for linking HSPs representing exons in the DNA sequence
3766
that are separated by introns.
3770
BlastUnevenGapSumE(BLAST_KarlinBlkPtr kbp, Int4 p_gap, Int4 n_gap, Nlm_FloatHi gap_prob, Nlm_FloatHi gap_decay_rate, Int2 num, Int4 sum, Nlm_FloatHi score_prime, Int4 query_length, Int4 subject_length, Boolean min_length_one)
3774
Nlm_FloatHi sum_p, sum_e;
3776
score_prime -= kbp->logK + log((Nlm_FloatHi)subject_length*(Nlm_FloatHi)query_length) + (num-1)*(kbp->logK + log((Nlm_FloatHi)p_gap) + log((Nlm_FloatHi)n_gap));
3777
score_prime -= LnFactorial((Nlm_FloatHi) num);
3779
sum_p = BlastSumP(num, score_prime);
3781
sum_e = BlastKarlinPtoE(sum_p);
3783
sum_e = sum_e/((1.0-gap_decay_rate)*Nlm_Powi(gap_decay_rate, (num-1)));
3787
if (gap_prob == 0.0)
3790
sum_e = sum_e/gap_prob;
3797
Calculates the p-values for alignments with "large" gaps (i.e.,
3798
infinite) followings an idea of Stephen Altschul's.
3802
BlastLargeGapSumE(BLAST_KarlinBlkPtr kbp, Nlm_FloatHi gap_prob, Nlm_FloatHi gap_decay_rate, Int2 num, Int4 sum, Nlm_FloatHi score_prime, Int4 query_length, Int4 subject_length, Boolean old_stats)
3806
Nlm_FloatHi sum_p, sum_e;
4191
Calculates the e-value for alignments with "small" gaps (typically
4192
under fifty residues/basepairs) following ideas of Stephen Altschul's.
4197
Int4 starting_points, /* the number of starting points
4198
* permitted between adjacent
4200
* max_overlap + max_gap + 1 */
4201
Int2 num, /* the number of distinct alignments in this
4203
Nlm_FloatHi xsum, /* the sum of the scores of these alignments
4204
* each weighted by an appropriate value of
4205
* Lambda and logK */
4206
Int4 query_length, /* the effective len of the query seq */
4207
Int4 subject_length, /* the effective len of the database seq */
4208
Int8 dblen_eff, /* the effective len of the database */
4209
Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4210
* when multiple collections of alignments
4211
* are being considered by the calling
4214
Nlm_FloatHi search_space; /* The effective size of the search space */
4215
Nlm_FloatHi sum_e; /* The e-value of this set of alignments */
4218
search_space = (Nlm_FloatHi) query_length * (Nlm_FloatHi)dblen_eff;
4220
sum_e = search_space * exp(-xsum);
4222
Nlm_FloatHi sum_p; /* The p-value of this set of alignments */
4224
search_space = (Nlm_FloatHi)subject_length * (Nlm_FloatHi)query_length;
4227
log(search_space) + 2 * (num-1)*log((Nlm_FloatHi)starting_points);
4228
xsum -= LnFactorial((Nlm_FloatHi) num);
4230
sum_p = BlastSumP(num, xsum);
4231
sum_e = BlastKarlinPtoE(sum_p) *
4232
((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4234
if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4243
Calculates the e-value of a collection multiple distinct
4244
alignments with asymmetric gaps between the alignments. The gaps
4245
in one (protein) sequence are typically small (like in
4246
BlastSmallGapSumE) gap an the gaps in the other (translated DNA)
4247
sequence are possibly large (up to 4000 bp.) This routine is used
4248
for linking HSPs representing exons in the DNA sequence that are
4249
separated by introns.
4254
Int4 query_start_points, /* the number of starting points in
4255
* the query sequence permitted
4256
* between adjacent alignments;
4257
* max_overlap + max_gap + 1 */
4258
Int4 subject_start_points, /* the number of starting points in
4259
* the subject sequence permitted
4260
* between adjacent alignments */
4261
Int2 num, /* the number of distinct alignments in this
4263
Nlm_FloatHi xsum, /* the sum of the scores of these alignments
4264
* each weighted by an appropriate value of
4265
* Lambda and logK */
4266
Int4 query_length, /* the effective len of the query seq */
4267
Int4 subject_length, /* the effective len of the database seq */
4268
Int8 dblen_eff, /* the effective len of the database */
4269
Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4270
* when multiple collections of alignments
4271
* are being considered by the calling
4274
Nlm_FloatHi search_space; /* The effective size of the search space */
4275
Nlm_FloatHi sum_e; /* The e-value of this set of alignments */
4278
search_space = (Nlm_FloatHi)query_length * (Nlm_FloatHi)dblen_eff;
4280
sum_e = search_space * exp(-xsum);
4282
Nlm_FloatHi sum_p; /* The p-value of this set of alignments */
4284
search_space = (Nlm_FloatHi)subject_length * (Nlm_FloatHi)query_length;
4286
xsum -= log(search_space) +
4287
(num-1)*(log((Nlm_FloatHi) query_start_points) +
4288
log((Nlm_FloatHi) subject_start_points));
4289
xsum -= LnFactorial((Nlm_FloatHi) num);
4291
sum_p = BlastSumP(num, xsum);
4293
sum_e = BlastKarlinPtoE(sum_p) *
4294
((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4296
if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4304
Calculates the e-value if a collection of distinct alignments with
4305
arbitrarily large gaps between the alignments
4310
Int2 num, /* the number of distinct alignments in this
4312
Nlm_FloatHi xsum, /* the sum of the scores of these alignments
4313
* each weighted by an appropriate value of
4314
* Lambda and logK */
4315
Int4 query_length, /* the effective len of the query seq */
4316
Int4 subject_length, /* the effective len of the database seq */
4317
Int8 dblen_eff, /* the effective len of the database */
4318
Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4319
* when multiple collections of alignments
4320
* are being considered by the calling
4323
Nlm_FloatHi sum_p; /* The p-value of this set of alignments */
4324
Nlm_FloatHi sum_e; /* The e-value of this set of alignments */
3807
4326
/* The next two variables are for compatability with Warren's code. */
3808
Nlm_FloatHi lcl_subject_length, lcl_query_length;
3810
lcl_query_length = (Nlm_FloatHi) query_length;
3811
lcl_subject_length = (Nlm_FloatHi) subject_length;
3813
score_prime -= num*(kbp->logK + log(lcl_subject_length*lcl_query_length))
3814
- LnFactorial((Nlm_FloatHi) num);
3816
sum_p = BlastSumP(num, score_prime);
3818
sum_e = BlastKarlinPtoE(sum_p);
3820
sum_e = sum_e/((1.0-gap_decay_rate)*Nlm_Powi(gap_decay_rate, (num-1)));
3824
if (gap_prob == 1.0)
3827
sum_e = sum_e/(1.0 - gap_prob);
4327
Nlm_FloatHi lcl_subject_length; /* query_length as a float */
4328
Nlm_FloatHi lcl_query_length; /* subject_length as a float */
4330
lcl_query_length = (Nlm_FloatHi) query_length;
4331
lcl_subject_length = (Nlm_FloatHi) subject_length;
4334
sum_e = lcl_query_length * (Nlm_FloatHi) dblen_eff * exp(-xsum);
4336
xsum -= num*log(lcl_subject_length*lcl_query_length)
4337
- LnFactorial((Nlm_FloatHi) num);
4339
sum_p = BlastSumP(num, xsum);
4341
sum_e = BlastKarlinPtoE(sum_p) *
4342
((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4344
if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
3833
4352
/********************************************************************
3835
4354
* The following function, from Stephen Altschul, calculates a
4001
4520
MemFree(txmatrix);
4526
* Computes the adjustment to the lengths of the query and database sequences
4527
* that is used to compensate for edge effects when computing evalues.
4529
* The length adjustment is an integer-valued approximation to the fixed
4530
* point of the function
4533
* (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
4535
* where m is the query length n is the length of the database and N is the
4536
* number of sequences in the database. The values beta, alpha, lambda and
4537
* K are statistical, Karlin-Altschul parameters.
4539
* The value of the length adjustment computed by this routine, A,
4540
* will always be an integer smaller than the fixed point of
4541
* f(ell). Usually, it will be the largest such integer. However, the
4542
* computed length adjustment, A, will also be so small that
4544
* K * (m - A) * (n - N * A) > min(m,n).
4546
* Moreover, an iterative method is used to compute A, and under
4547
* unusual circumstances the iterative method may not converge.
4549
* @param K the statistical parameter K
4550
* @param logK the natural logarithm of K
4551
* @param alpha_d_lambda the ratio of the statistical parameters
4552
* alpha and lambda (for ungapped alignments, the
4553
* value 1/H should be used)
4554
* @param beta the statistical parameter beta (for ungapped
4555
* alignments, beta == 0)
4556
* @param query_length the length of the query sequence
4557
* @param db_length the length of the database
4558
* @param db_num_seq the number of sequences in the database
4559
* @param length_adjustment the computed value of the length adjustment [out]
4561
* @return 0 if length_adjustment is known to be the largest integer less
4562
* than the fixed point of f(ell); 1 otherwise.
4565
BlastComputeLengthAdjustment(Nlm_FloatHi K,
4567
Nlm_FloatHi alpha_d_lambda,
4572
Int4 * length_adjustment)
4574
Int4 i; /* iteration index */
4575
const Int4 maxits = 20; /* maximum allowed iterations */
4576
Nlm_FloatHi m = query_length, n = db_length, N = db_num_seqs;
4578
Nlm_FloatHi ell; /* A float value of the length adjustment */
4579
Nlm_FloatHi ss; /* effective size of the search space */
4580
Nlm_FloatHi ell_min = 0, ell_max; /* At each iteration i,
4581
* ell_min <= ell <= ell_max. */
4582
Boolean converged = FALSE; /* True if the iteration converged */
4583
Nlm_FloatHi ell_next = 0; /* Value the variable ell takes at iteration
4585
/* Choose ell_max to be the largest nonnegative value that satisfies
4587
* K * (m - ell) * (n - N * ell) > max(m,n)
4589
* Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
4590
{ /* scope of a, mb, and c, the coefficients in the quadratic formula
4591
* (the variable mb is -b) */
4593
Nlm_FloatHi mb = m * N + n;
4594
Nlm_FloatHi c = n * m - MAX(m, n) / K;
4597
*length_adjustment = 0;
4600
ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
4602
} /* end scope of a, mb and c */
4604
for(i = 1; i <= maxits; i++) { /* for all iteration indices */
4605
Nlm_FloatHi ell_bar; /* proposed next value of ell */
4607
ss = (m - ell) * (n - N * ell);
4608
ell_bar = alpha_d_lambda * (logK + log(ss)) + beta;
4609
if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
4611
if(ell_bar - ell_min <= 1.0) {
4615
if(ell_min == ell_max) { /* There are no more points to check */
4618
} else { /* else ell is greater than the true fixed point */
4621
if(ell_min <= ell_bar && ell_bar <= ell_max) {
4622
/* ell_bar is in range. Accept it */
4624
} else { /* else ell_bar is not in range. Reject it */
4625
ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
4627
} /* end for all iteration indices */
4628
if(converged) { /* the iteration converged */
4629
/* If ell_fixed is the (unknown) true fixed point, then we
4630
* wish to set (*length_adjustment) to floor(ell_fixed). We
4631
* assume that floor(ell_min) = floor(ell_fixed) */
4632
*length_adjustment = (Int4) ell_min;
4633
/* But verify that ceil(ell_min) != floor(ell_fixed) */
4634
ell = ceil(ell_min);
4635
if( ell <= ell_max ) {
4636
ss = (m - ell) * (n - N * ell);
4637
if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
4638
/* ceil(ell_min) == floor(ell_fixed) */
4639
*length_adjustment = (Int4) ell;
4642
} else { /* else the iteration did not converge. */
4643
/* Use the best value seen so far */
4644
*length_adjustment = (Int4) ell_min;
4647
return converged ? 0 : 1;