1
static char const rcsid[] =
2
"$Id: blast_psi_priv.c,v 1.30 2004/10/19 14:28:38 camacho Exp $";
3
/* ===========================================================================
6
* National Center for Biotechnology Information
8
* This software/database is a "United States Government Work" under the
9
* terms of the United States Copyright Act. It was written as part of
10
* the author's official duties as a United States Government employee and
11
* thus cannot be copyrighted. This software/database is freely available
12
* to the public for use. The National Library of Medicine and the U.S.
13
* Government have not placed any restriction on its use or reproduction.
15
* Although all reasonable efforts have been taken to ensure the accuracy
16
* and reliability of the software and data, the NLM and the U.S.
17
* Government do not and cannot warrant the performance or results that
18
* may be obtained by using this software or data. The NLM and the U.S.
19
* Government disclaim all warranties, express or implied, including
20
* warranties of performance, merchantability or fitness for any particular
23
* Please cite the author in any work or product based on this material.
25
* ===========================================================================
27
* Author: Alejandro Schaffer, ported by Christiam Camacho
31
/** @file blast_psi_priv.c
32
* Defintions for functions in private interface for Position Iterated BLAST
36
#include "blast_psi_priv.h"
37
#include "matrix_freq_ratios.h"
38
#include <algo/blast/core/blast_util.h> /* needed for IS_residue */
40
/****************************************************************************/
41
/* Use the following #define's to enable/disable functionality */
43
/* Taking gaps into account when constructing a PSSM was introduced in the
44
* 2001 paper "Improving the accuracy of PSI-BLAST protein database searches
45
* with composition based-statistics and other refinements". This feature
46
* can be disabled by defining the PSI_IGNORE_GAPS_IN_COLUMNS symbol below */
47
/* #define PSI_IGNORE_GAPS_IN_COLUMNS */
48
/****************************************************************************/
50
/****************************************************************************/
52
const double kPSINearIdentical = 0.94;
53
const double kPSIIdentical = 1.0;
54
const unsigned int kQueryIndex = 0;
55
const double kEpsilon = 0.0001;
56
const int kPSIScaleFactor = 200;
58
/****************************************************************************/
61
_PSIAllocateMatrix(unsigned int ncols, unsigned int nrows,
62
unsigned int data_type_sz)
67
retval = (void**) malloc(sizeof(void*) * ncols);
72
for (i = 0; i < ncols; i++) {
73
retval[i] = (void*) calloc(nrows, data_type_sz);
75
retval = _PSIDeallocateMatrix(retval, i);
83
_PSIDeallocateMatrix(void** matrix, unsigned int ncols)
90
for (i = 0; i < ncols; i++) {
98
_PSICopyIntMatrix(int** dest, int** src,
99
unsigned int ncols, unsigned int nrows)
107
for (i = 0; i < ncols; i++) {
108
for (j = 0; j < nrows; j++) {
109
dest[i][j] = src[i][j];
114
/****************************************************************************/
117
_PSIMsaNew(const PSIMsa* msa, Uint4 alphabet_size)
119
_PSIMsa* retval = NULL; /* the return value */
120
Uint4 s = 0; /* index on sequences */
121
Uint4 p = 0; /* index on positions */
123
if ( !msa || !msa->dimensions || !msa->data ) {
127
retval = (_PSIMsa*) calloc(1, sizeof(_PSIMsa));
129
return _PSIMsaFree(retval);
132
retval->alphabet_size = alphabet_size;
133
retval->dimensions = (PSIMsaDimensions*) malloc(sizeof(PSIMsaDimensions));
134
if ( !retval->dimensions ) {
135
return _PSIMsaFree(retval);
137
memcpy((void*) retval->dimensions,
138
(void*) msa->dimensions,
139
sizeof(PSIMsaDimensions));
141
retval->cell = (_PSIMsaCell**)
142
_PSIAllocateMatrix(msa->dimensions->num_seqs + 1,
143
msa->dimensions->query_length,
144
sizeof(_PSIMsaCell));
145
if ( !retval->cell ) {
146
return _PSIMsaFree(retval);
148
/* Copy the multiple sequence alignment data */
149
for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
150
for (p = 0; p < msa->dimensions->query_length; p++) {
151
retval->cell[s][p].letter = msa->data[s][p].letter;
152
retval->cell[s][p].is_aligned = msa->data[s][p].is_aligned;
153
retval->cell[s][p].extents.left = -1;
154
retval->cell[s][p].extents.right = msa->dimensions->query_length;
158
retval->use_sequence = (Boolean*) malloc((msa->dimensions->num_seqs + 1) *
160
if ( !retval->use_sequence ) {
161
return _PSIMsaFree(retval);
163
/* All sequences are valid candidates for taking part in PSSM construction
165
for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
166
retval->use_sequence[s] = TRUE;
169
retval->query = (Uint1*) malloc(msa->dimensions->query_length *
171
if ( !retval->query ) {
172
return _PSIMsaFree(retval);
174
/* Initialize the query according to convention that first sequence in msa
175
* data structure corresponds to the query */
176
for (p = 0; p < msa->dimensions->query_length; p++) {
177
ASSERT(IS_residue(msa->data[kQueryIndex][p].letter));
178
retval->query[p] = msa->data[kQueryIndex][p].letter;
181
retval->residue_counts = (Uint4**)
182
_PSIAllocateMatrix(msa->dimensions->query_length,
185
if ( !retval->residue_counts ) {
186
return _PSIMsaFree(retval);
189
retval->num_matching_seqs = (Uint4*) calloc(msa->dimensions->query_length,
191
if ( !retval->num_matching_seqs ) {
192
return _PSIMsaFree(retval);
199
_PSIMsaFree(_PSIMsa* msa)
205
if ( msa->cell && msa->dimensions ) {
206
_PSIDeallocateMatrix((void**) msa->cell,
207
msa->dimensions->num_seqs + 1);
211
if ( msa->use_sequence ) {
212
sfree(msa->use_sequence);
219
if ( msa->residue_counts && msa->dimensions ) {
220
_PSIDeallocateMatrix((void**) msa->residue_counts,
221
msa->dimensions->query_length);
222
msa->residue_counts = NULL;
225
if ( msa->num_matching_seqs ) {
226
sfree(msa->num_matching_seqs);
229
if ( msa->dimensions ) {
230
sfree(msa->dimensions);
238
_PSIInternalPssmData*
239
_PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
241
_PSIInternalPssmData* retval = NULL;
243
retval = (_PSIInternalPssmData*) calloc(1, sizeof(_PSIInternalPssmData));
248
retval->ncols = query_length;
249
retval->nrows = alphabet_size;
251
retval->pssm = (int**) _PSIAllocateMatrix(retval->ncols,
254
if ( !retval->pssm ) {
255
return _PSIInternalPssmDataFree(retval);
258
retval->scaled_pssm = (int**) _PSIAllocateMatrix(retval->ncols,
261
if ( !retval->scaled_pssm ) {
262
return _PSIInternalPssmDataFree(retval);
265
retval->res_freqs = (double**) _PSIAllocateMatrix(retval->ncols,
268
if ( !retval->res_freqs ) {
269
return _PSIInternalPssmDataFree(retval);
275
_PSIInternalPssmData*
276
_PSIInternalPssmDataFree(_PSIInternalPssmData* pssm_data)
282
if (pssm_data->pssm) {
283
pssm_data->pssm = (int**)
284
_PSIDeallocateMatrix((void**) pssm_data->pssm, pssm_data->ncols);
287
if (pssm_data->scaled_pssm) {
288
pssm_data->scaled_pssm = (int**)
289
_PSIDeallocateMatrix((void**) pssm_data->scaled_pssm,
293
if (pssm_data->res_freqs) {
294
pssm_data->res_freqs = (double**)
295
_PSIDeallocateMatrix((void**) pssm_data->res_freqs,
305
_PSIAlignedBlockNew(Uint4 num_positions)
307
_PSIAlignedBlock* retval = NULL;
310
retval = (_PSIAlignedBlock*) calloc(1, sizeof(_PSIAlignedBlock));
315
retval->size = (Uint4*) calloc(num_positions, sizeof(Uint4));
316
if ( !retval->size ) {
317
return _PSIAlignedBlockFree(retval);
320
retval->pos_extnt = (SSeqRange*) malloc(num_positions * sizeof(SSeqRange));
321
if ( !retval->pos_extnt ) {
322
return _PSIAlignedBlockFree(retval);
325
/* N.B.: these initial values are deliberate so that the retval->size[i]
326
* field is initialized with a value that exceeds num_positions in
327
* _PSIComputeAlignedRegionLengths and can be used as a sanity check */
328
for (i = 0; i < num_positions; i++) {
329
retval->pos_extnt[i].left = -1;
330
retval->pos_extnt[i].right = num_positions;
336
_PSIAlignedBlockFree(_PSIAlignedBlock* aligned_blocks)
338
if ( !aligned_blocks ) {
342
if (aligned_blocks->size) {
343
sfree(aligned_blocks->size);
346
if (aligned_blocks->pos_extnt) {
347
sfree(aligned_blocks->pos_extnt);
350
sfree(aligned_blocks);
355
_PSISequenceWeightsNew(const PSIMsaDimensions* dimensions,
356
const BlastScoreBlk* sbp)
358
_PSISequenceWeights* retval = NULL;
363
retval = (_PSISequenceWeights*) calloc(1, sizeof(_PSISequenceWeights));
368
retval->row_sigma = (double*) calloc(dimensions->num_seqs + 1,
370
if ( !retval->row_sigma ) {
371
return _PSISequenceWeightsFree(retval);
374
retval->norm_seq_weights = (double*) calloc(dimensions->num_seqs + 1,
376
if ( !retval->norm_seq_weights ) {
377
return _PSISequenceWeightsFree(retval);
380
retval->sigma = (double*) calloc(dimensions->query_length, sizeof(double));
381
if ( !retval->sigma ) {
382
return _PSISequenceWeightsFree(retval);
385
retval->match_weights = (double**)
386
_PSIAllocateMatrix(dimensions->query_length,
389
retval->match_weights_size = dimensions->query_length;
390
if ( !retval->match_weights ) {
391
return _PSISequenceWeightsFree(retval);
394
retval->std_prob = _PSIGetStandardProbabilities(sbp);
395
if ( !retval->std_prob ) {
396
return _PSISequenceWeightsFree(retval);
399
retval->gapless_column_weights = (double*)
400
calloc(dimensions->query_length, sizeof(double));
401
if ( !retval->gapless_column_weights ) {
402
return _PSISequenceWeightsFree(retval);
409
_PSISequenceWeightsFree(_PSISequenceWeights* seq_weights)
411
if ( !seq_weights ) {
415
if (seq_weights->row_sigma) {
416
sfree(seq_weights->row_sigma);
419
if (seq_weights->norm_seq_weights) {
420
sfree(seq_weights->norm_seq_weights);
423
if (seq_weights->sigma) {
424
sfree(seq_weights->sigma);
427
if (seq_weights->match_weights) {
428
_PSIDeallocateMatrix((void**) seq_weights->match_weights,
429
seq_weights->match_weights_size);
432
if (seq_weights->std_prob) {
433
sfree(seq_weights->std_prob);
436
if (seq_weights->gapless_column_weights) {
437
sfree(seq_weights->gapless_column_weights);
446
static char getRes(char input)
449
case 0: return ('-');
450
case 1: return ('A');
451
case 2: return ('B');
452
case 3: return ('C');
453
case 4: return ('D');
454
case 5: return ('E');
455
case 6: return ('F');
456
case 7: return ('G');
457
case 8: return ('H');
458
case 9: return ('I');
459
case 10: return ('K');
460
case 11: return ('L');
461
case 12: return ('M');
462
case 13: return ('N');
463
case 14: return ('P');
464
case 15: return ('Q');
465
case 16: return ('R');
466
case 17: return ('S');
467
case 18: return ('T');
468
case 19: return ('V');
469
case 20: return ('W');
470
case 21: return ('X');
471
case 22: return ('Y');
472
case 23: return ('Z');
473
case 24: return ('U');
474
case 25: return ('*');
475
default: return ('?');
480
_DEBUG_printMsa(const char* filename, const _PSIMsa* msa)
488
fp = fopen(filename, "w");
490
for (i = 0; i < msa->dimensions->num_seqs + 1; i++) {
491
/*fprintf(fp, "%3d\t", i);*/
492
for (j = 0; j < msa->dimensions->query_length; j++) {
493
if (msa->cell[i][j].is_aligned) {
494
fprintf(fp, "%c", getRes(msa->cell[i][j].letter));
505
/************** Validation routines *****************************************/
507
/** Validate that there are no gaps in the query sequence
508
* @param msa multiple sequence alignment data structure [in]
509
* @param ignore_consensus TRUE if query sequence should be ignored [in]
510
* @return PSIERR_GAPINQUERY if validation fails, else PSI_SUCCESS
513
_PSIValidateNoGapsInQuery(const _PSIMsa* msa, Boolean ignore_consensus)
515
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
516
Uint4 p = 0; /* index on positions sequences */
519
if (ignore_consensus) {
523
for (p = 0; p < msa->dimensions->query_length; p++) {
524
if (msa->cell[kQueryIndex][p].letter == GAP || msa->query[p] == GAP) {
525
return PSIERR_GAPINQUERY;
531
/** Validate that there are no flanking gaps in the multiple sequence alignment
532
* @param msa multiple sequence alignment data structure [in]
533
* @return PSIERR_STARTINGGAP or PSIERR_ENDINGGAP if validation fails,
537
_PSIValidateNoFlankingGaps(const _PSIMsa* msa)
539
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
540
Uint4 s = 0; /* index on sequences */
541
Uint4 p = 0; /* index on positions sequences */
544
/* Look for starting gaps in alignments */
545
for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
547
if ( !msa->use_sequence[s] ) {
551
/* find the first aligned residue */
552
for (p = 0; p < msa->dimensions->query_length; p++) {
553
if (msa->cell[s][p].is_aligned) {
554
if (msa->cell[s][p].letter == GAP) {
555
return PSIERR_STARTINGGAP;
563
/* Look for ending gaps in alignments */
564
for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
566
if ( !msa->use_sequence[s] ) {
570
/* find the first aligned residue */
571
for (p = msa->dimensions->query_length - 1; p >= 0; p--) {
572
if (msa->cell[s][p].is_aligned) {
573
if (msa->cell[s][p].letter == GAP) {
574
return PSIERR_ENDINGGAP;
585
/** Validate that there are no unaligned columns or columns which only contain
586
* gaps in the multiple sequence alignment.
587
* @param msa multiple sequence alignment data structure [in]
588
* @param ignore_consensus TRUE if query sequence should be ignored [in]
589
* @return PSIERR_UNALIGNEDCOLUMN or PSIERR_COLUMNOFGAPS if validation fails,
593
_PSIValidateAlignedColumns(const _PSIMsa* msa, Boolean ignore_consensus)
595
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
596
Uint4 s = 0; /* index on sequences */
597
Uint4 p = 0; /* index on positions sequences */
600
for (p = 0; p < msa->dimensions->query_length; p++) {
602
Boolean found_aligned_sequence = FALSE;
603
Boolean found_non_gap_residue = FALSE;
604
s = ignore_consensus ? 1 : 0;
606
for ( ; s < msa->dimensions->num_seqs + 1; s++) {
608
if (msa->cell[s][p].is_aligned) {
609
found_aligned_sequence = TRUE;
610
if (msa->cell[s][p].letter != GAP) {
611
found_non_gap_residue = TRUE;
616
if ( !found_aligned_sequence ) {
617
return PSIERR_UNALIGNEDCOLUMN;
619
if ( !found_non_gap_residue ) {
620
return PSIERR_COLUMNOFGAPS;
627
/** Verify that after purging biased sequences in multiple sequence alignment
628
* there are still sequences participating in the multiple sequences alignment
629
* @todo merge with validation performed at the PSSM engine level. Will need to
630
* pass Blast_Message to provide informative error messages.
631
* @param msa multiple sequence alignment structure [in]
632
* @return PSIERR_NOALIGNEDSEQS if validation fails, else PSI_SUCCESS
635
_PSIValidateParticipatingSequences(const _PSIMsa* msa)
637
Uint4 s = 0; /* index on aligned sequences */
640
for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
641
if (msa->use_sequence[s]) {
646
if (s == msa->dimensions->num_seqs + 1) { /* break was never executed */
647
return PSIERR_NOALIGNEDSEQS;
654
_PSIValidateMSA(const _PSIMsa* msa, Boolean ignore_consensus)
656
int retval = PSI_SUCCESS;
659
return PSIERR_BADPARAM;
662
retval = _PSIValidateNoFlankingGaps(msa);
663
if (retval != PSI_SUCCESS) {
667
retval = _PSIValidateAlignedColumns(msa, ignore_consensus);
668
if (retval != PSI_SUCCESS) {
672
retval = _PSIValidateNoGapsInQuery(msa, ignore_consensus);
673
if (retval != PSI_SUCCESS) {
677
retval = _PSIValidateParticipatingSequences(msa);
678
if (retval != PSI_SUCCESS) {
685
/****************************************************************************/
686
/* Function prototypes */
688
/* Purges any aligned segments which are identical to the query sequence */
690
_PSIPurgeSelfHits(_PSIMsa* msa);
692
/* Keeps only one copy of any aligned sequences which are >kPSINearIdentical%
693
* identical to one another */
695
_PSIPurgeNearIdenticalAlignments(_PSIMsa* msa);
697
/* FIXME: needs more descriptive name */
699
_PSIPurgeSimilarAlignments(_PSIMsa* msa,
702
double max_percent_identity);
703
/****************************************************************************/
705
/**************** PurgeMatches stage of PSSM creation ***********************/
707
_PSIPurgeBiasedSegments(_PSIMsa* msa)
710
return PSIERR_BADPARAM;
713
_PSIPurgeSelfHits(msa);
714
_PSIPurgeNearIdenticalAlignments(msa);
715
_PSIUpdatePositionCounts(msa);
720
/** Remove those sequences which are identical to the query sequence
723
_PSIPurgeSelfHits(_PSIMsa* msa)
725
Uint4 s = 0; /* index on sequences */
729
for (s = kQueryIndex + 1; s < msa->dimensions->num_seqs + 1; s++) {
730
_PSIPurgeSimilarAlignments(msa, kQueryIndex, s, kPSIIdentical);
735
_PSIPurgeNearIdenticalAlignments(_PSIMsa* msa)
742
for (i = 1; i < msa->dimensions->num_seqs + 1; i++) {
743
for (j = 1; (i + j) < msa->dimensions->num_seqs + 1; j++) {
744
/* N.B.: The order of comparison of sequence pairs is deliberate,
745
* tests on real data indicated that this approach allowed more
746
* sequences to be purged */
747
_PSIPurgeSimilarAlignments(msa, j, (i + j),
753
/** Counts the number of sequences matching the query per query position
754
* (columns of the multiple alignment) as well as the number of residues
755
* present in each position of the query.
756
* Should be called after multiple alignment data has been purged from biased
758
* @param msa multiple sequence alignment structure [in]
761
_PSIUpdatePositionCounts(_PSIMsa* msa)
763
Uint4 s = 0; /* index on aligned sequences */
764
Uint4 p = 0; /* index on positions */
768
for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
770
if ( !msa->use_sequence[s] ) {
774
for (p = 0; p < msa->dimensions->query_length; p++) {
775
if (msa->cell[s][p].is_aligned) {
776
const Uint1 res = msa->cell[s][p].letter;
777
if (res >= msa->alphabet_size) {
780
msa->residue_counts[p][res]++;
781
msa->num_matching_seqs[p]++;
787
/** Defines the states of the finite state machine used in
788
* _PSIPurgeSimilarAlignments. Successor to posit.c's POS_COUNTING and
790
typedef enum _EPSIPurgeFsmState {
793
} _EPSIPurgeFsmState;
795
/** Auxiliary structure to maintain information about two aligned regions
796
* between the query and a subject sequence. It is used to store the data
797
* manipulated by the finite state machine used in _PSIPurgeSimilarAlignments.
799
typedef struct _PSIAlignmentTraits {
800
Uint4 start; /**< starting offset of alignment w.r.t. query */
801
Uint4 effective_length; /**< length of alignment not including Xs */
802
Uint4 n_x_residues; /**< number of X residues in alignment */
803
Uint4 n_identical; /**< number of identical residues in alignment */
804
} _PSIAlignmentTraits;
808
void DEBUG_printTraits(_PSIAlignmentTraits* traits,
809
_EPSIPurgeFsmState state, Uint4 position)
811
fprintf(stderr, "Position: %d - State: %s\n", position,
812
state == eCounting ? "eCounting" : "eResting");
813
fprintf(stderr, "\tstart: %d\n", traits->start);
814
fprintf(stderr, "\teffective_length: %d\n", traits->effective_length);
815
fprintf(stderr, "\tn_x_residues: %d\n", traits->n_x_residues);
816
fprintf(stderr, "\tn_identical: %d\n", traits->n_identical);
821
_PSIResetAlignmentTraits(_PSIAlignmentTraits* traits, Uint4 position)
824
memset((void*) traits, 0, sizeof(_PSIAlignmentTraits));
825
traits->start = position;
828
/** Handles neither is aligned event */
830
_handleNeitherAligned(_PSIAlignmentTraits* traits, _EPSIPurgeFsmState* state,
831
_PSIMsa* msa, Uint4 seq_index,
832
double max_percent_identity)
839
/* Purge aligned region if max_percent_identity is exceeded */
841
if (traits->effective_length > 0) {
842
double percent_identity =
843
((double)traits->n_identical) / traits->effective_length;
844
if (percent_identity >= max_percent_identity) {
845
const unsigned int align_stop =
846
traits->start + traits->effective_length +
847
traits->n_x_residues;
848
int rv = _PSIPurgeAlignedRegion(msa, seq_index,
849
traits->start, align_stop);
850
ASSERT(rv == PSI_SUCCESS);
867
/** Handle event when both positions are aligned, using the same residue, but
868
* this residue is not X */
870
_handleBothAlignedSameResidueNoX(_PSIAlignmentTraits* traits,
871
_EPSIPurgeFsmState* state)
878
traits->n_identical++;
890
/** Handle the event when either position is aligned and either is X */
892
_handleEitherAlignedEitherX(_PSIAlignmentTraits* traits,
893
_EPSIPurgeFsmState* state)
900
traits->n_x_residues++;
912
/** Handle the event when either position is aligned and neither is X */
914
_handleEitherAlignedNeitherX(_PSIAlignmentTraits* traits,
915
_EPSIPurgeFsmState* state,
923
traits->effective_length++;
927
_PSIResetAlignmentTraits(traits, position);
928
traits->effective_length = 1; /* count this residue */
937
/** This function compares the sequences in the msa->cell
938
* structure indexed by sequence_index1 and seq_index2. If it finds aligned
939
* regions that have a greater percent identity than max_percent_identity,
940
* it removes the sequence identified by seq_index2.
943
_PSIPurgeSimilarAlignments(_PSIMsa* msa,
946
double max_percent_identity)
949
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
950
_EPSIPurgeFsmState state = eCounting; /* initial state of the fsm */
951
_PSIAlignmentTraits traits;
952
Uint4 p = 0; /* position on alignment */
954
/* Nothing to do if sequences are the same or not selected for further
956
if ( seq_index1 == seq_index2 ||
957
!msa->use_sequence[seq_index1] ||
958
!msa->use_sequence[seq_index2] ) {
962
_PSIResetAlignmentTraits(&traits, p);
964
/* Examine each position of the aligned sequences and use the fsm to
965
* determine if a region of the alignment should be purged */
966
for (p = 0; p < msa->dimensions->query_length; p++) {
968
const _PSIMsaCell* seq1 = msa->cell[seq_index1];
969
const _PSIMsaCell* seq2 = msa->cell[seq_index2];
971
/* Indicates if the position in seq_index1 currently being examined is
972
* aligned. In the special case for seq_index1 == kQueryIndex, this
973
* variable is set to FALSE to force the other sequence's position to
974
* be used to proceed with processing. */
975
const Boolean pos1_aligned = (seq_index1 == kQueryIndex ?
976
FALSE : seq1[p].is_aligned);
977
/* Indicates if the position in seq_index2 currently being examined is
979
const Boolean pos2_aligned = seq2[p].is_aligned;
981
Boolean neither_is_aligned = !pos1_aligned && !pos2_aligned;
982
/* FIXME: kludgy solution, need to document fsm */
983
Boolean both_are_aligned = seq1[p].is_aligned && pos2_aligned;
984
Boolean either_is_aligned = pos1_aligned || pos2_aligned;
986
Boolean neither_is_X = seq1[p].letter != X && seq2[p].letter != X;
987
Boolean either_is_X = seq1[p].letter == X || seq2[p].letter == X;
988
Boolean same_residue = seq1[p].letter == seq2[p].letter;
990
/* Look for events interesting to the finite state machine */
991
if (neither_is_aligned) {
992
_handleNeitherAligned(&traits, &state, msa, seq_index2,
993
max_percent_identity);
996
if (either_is_aligned && either_is_X) {
997
_handleEitherAlignedEitherX(&traits, &state);
999
if (either_is_aligned && neither_is_X) {
1000
_handleEitherAlignedNeitherX(&traits, &state, p);
1002
if (both_are_aligned && same_residue && neither_is_X) {
1003
_handleBothAlignedSameResidueNoX(&traits, &state);
1007
_handleNeitherAligned(&traits, &state, msa, seq_index2,
1008
max_percent_identity);
1011
/****************************************************************************/
1012
/* Function prototypes */
1014
_PSIGetLeftExtents(const _PSIMsa* msa, Uint4 seq_index);
1016
_PSIGetRightExtents(const _PSIMsa* msa, Uint4 seq_index);
1019
_PSIComputePositionExtents(const _PSIMsa* msa,
1021
_PSIAlignedBlock* aligned_blocks);
1023
_PSIComputeAlignedRegionLengths(const _PSIMsa* msa,
1024
_PSIAlignedBlock* aligned_blocks);
1026
/****************************************************************************/
1027
/******* Compute alignment extents stage of PSSM creation *******************/
1028
/* posComputeExtents in posit.c */
1030
_PSIComputeAlignmentBlocks(const _PSIMsa* msa, /* [in] */
1031
_PSIAlignedBlock* aligned_blocks) /* [out] */
1033
Uint4 s = 0; /* index on aligned sequences */
1035
if ( !msa || !aligned_blocks ) {
1036
return PSIERR_BADPARAM;
1039
/* no need to compute extents for query sequence */
1040
for (s = kQueryIndex + 1; s < msa->dimensions->num_seqs + 1; s++) {
1041
if ( !msa->use_sequence[s] )
1044
_PSIGetLeftExtents(msa, s);
1045
_PSIGetRightExtents(msa, s);
1046
_PSIComputePositionExtents(msa, s, aligned_blocks);
1049
_PSIComputeAlignedRegionLengths(msa, aligned_blocks);
1055
_PSIGetLeftExtents(const _PSIMsa* msa, Uint4 seq_index)
1057
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
1058
_PSIMsaCell* sequence_position = NULL;
1059
Uint4 prev = 0; /* index for the first and previous position */
1060
Uint4 curr = 0; /* index for the current position */
1063
ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1064
ASSERT(msa->use_sequence[seq_index]);
1066
sequence_position = msa->cell[seq_index];
1068
if (sequence_position[prev].is_aligned &&
1069
sequence_position[prev].letter != GAP) {
1070
sequence_position[prev].extents.left = prev;
1073
for (curr = prev + 1; curr < msa->dimensions->query_length;
1076
if ( !sequence_position[curr].is_aligned ) {
1080
if (sequence_position[prev].is_aligned) {
1081
sequence_position[curr].extents.left =
1082
sequence_position[prev].extents.left;
1084
sequence_position[curr].extents.left = curr;
1090
_PSIGetRightExtents(const _PSIMsa* msa, Uint4 seq_index)
1092
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
1093
_PSIMsaCell* sequence_position = NULL;
1094
Uint4 last = 0; /* index for the last position */
1095
Int4 curr = 0; /* index for the current position */
1098
ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1099
ASSERT(msa->use_sequence[seq_index]);
1101
sequence_position = msa->cell[seq_index];
1102
last = msa->dimensions->query_length - 1;
1104
if (sequence_position[last].is_aligned &&
1105
sequence_position[last].letter != GAP) {
1106
sequence_position[last].extents.right = last;
1109
for (curr = last - 1; curr >= 0; curr--, last--) {
1111
if ( !sequence_position[curr].is_aligned ) {
1115
if (sequence_position[last].is_aligned) {
1116
sequence_position[curr].extents.right =
1117
sequence_position[last].extents.right;
1119
sequence_position[curr].extents.right = curr;
1125
_PSIComputePositionExtents(const _PSIMsa* msa,
1127
_PSIAlignedBlock* aligned_blocks)
1129
#ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1130
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
1132
_PSIMsaCell* sequence_position = NULL;
1135
ASSERT(aligned_blocks);
1137
ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1138
ASSERT(msa->use_sequence[seq_index]);
1140
sequence_position = msa->cell[seq_index];
1142
for (i = 0; i < msa->dimensions->query_length; i++) {
1143
#ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1144
if (sequence_position[i].is_aligned &&
1145
sequence_position[i].letter != GAP) {
1147
if (sequence_position[i].is_aligned) {
1149
aligned_blocks->pos_extnt[i].left =
1150
MAX(aligned_blocks->pos_extnt[i].left,
1151
sequence_position[i].extents.left);
1152
aligned_blocks->pos_extnt[i].right =
1153
MIN(aligned_blocks->pos_extnt[i].right,
1154
sequence_position[i].extents.right);
1160
_PSIComputeAlignedRegionLengths(const _PSIMsa* msa,
1161
_PSIAlignedBlock* aligned_blocks)
1163
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
1167
ASSERT(aligned_blocks);
1169
for (i = 0; i < msa->dimensions->query_length; i++) {
1170
aligned_blocks->size[i] = aligned_blocks->pos_extnt[i].right -
1171
aligned_blocks->pos_extnt[i].left + 1;
1173
/* Sanity check: if aligned_blocks->pos_extnt[i].{right,left} was not
1174
* modified after initialization, this assertion will fail */
1175
ASSERT(aligned_blocks->size[i] <= msa->dimensions->query_length);
1178
/* Do not include X's in aligned region lengths */
1179
for (i = 0; i < msa->dimensions->query_length; i++) {
1181
if (msa->query[i] == X) {
1184
for (idx = 0; idx < i; idx++) {
1185
if ((Uint4)aligned_blocks->pos_extnt[idx].right >= i &&
1186
msa->query[idx] != X) {
1187
aligned_blocks->size[idx]--;
1190
for (idx = msa->dimensions->query_length - 1; idx > i; idx--) {
1191
if ((Uint4)aligned_blocks->pos_extnt[idx].left <= i &&
1192
msa->query[idx] != X) {
1193
aligned_blocks->size[idx]--;
1201
/****************************************************************************/
1203
_PSIGetAlignedSequencesForPosition(
1206
Uint4* aligned_sequences);
1209
_PSICalculateNormalizedSequenceWeights(
1210
const _PSIMsa* msa, /* [in] */
1211
const _PSIAlignedBlock* aligned_blocks, /* [in] */
1212
Uint4 position, /* [in] */
1213
const Uint4* aligned_seqs, /* [in] */
1214
Uint4 num_aligned_seqs, /* [in] */
1215
_PSISequenceWeights* seq_weights); /* [out] */
1218
_PSICalculateMatchWeights(
1219
const _PSIMsa* msa, /* [in] */
1220
Uint4 position, /* [in] */
1221
const Uint4* aligned_seqs, /* [in] */
1222
Uint4 num_aligned_seqs, /* [in] */
1223
_PSISequenceWeights* seq_weights); /* [out] */
1226
_PSISpreadGapWeights(const _PSIMsa* msa,
1227
_PSISequenceWeights* seq_weights,
1228
Boolean ignore_consensus);
1231
_PSICheckSequenceWeights(
1232
const _PSIMsa* msa, /* [in] */
1233
const _PSISequenceWeights* seq_weights,
1234
Boolean ignore_consensus); /* [in] */
1236
/****************************************************************************/
1237
/******* Calculate sequence weights stage of PSSM creation ******************/
1238
/* Needs the _PSIAlignedBlock structure calculated in previous stage as well
1239
* as PSIAlignmentData structure */
1242
_PSIComputeSequenceWeights(const _PSIMsa* msa, /* [in] */
1243
const _PSIAlignedBlock* aligned_blocks, /* [in] */
1244
Boolean ignore_consensus, /* [in] */
1245
_PSISequenceWeights* seq_weights) /* [out] */
1247
Uint4* aligned_seqs = NULL; /* list of indices of sequences
1248
which participate in an
1250
Uint4 pos = 0; /* position index */
1251
int retval = PSI_SUCCESS; /* return value */
1252
const Uint4 kExpectedNumMatchingSeqs = ignore_consensus ? 0 : 1;
1255
ASSERT(aligned_blocks);
1256
ASSERT(seq_weights);
1258
aligned_seqs = (Uint4*) malloc((msa->dimensions->num_seqs + 1) *
1260
if ( !aligned_seqs ) {
1261
return PSIERR_OUTOFMEM;
1264
for (pos = 0; pos < msa->dimensions->query_length; pos++) {
1266
Uint4 num_aligned_seqs = 0;
1268
/* ignore positions of no interest */
1269
if (aligned_blocks->size[pos] == 0 ||
1270
msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs) {
1274
memset((void*)aligned_seqs, 0,
1275
sizeof(Uint4) * (msa->dimensions->num_seqs + 1));
1277
num_aligned_seqs = _PSIGetAlignedSequencesForPosition(msa, pos,
1279
ASSERT(msa->num_matching_seqs[pos] == num_aligned_seqs);
1280
if (num_aligned_seqs <= kExpectedNumMatchingSeqs) {
1284
memset((void*)seq_weights->norm_seq_weights, 0,
1285
sizeof(double)*(msa->dimensions->num_seqs+1));
1286
memset((void*)seq_weights->row_sigma, 0,
1287
sizeof(double)*(msa->dimensions->num_seqs+1));
1289
_PSICalculateNormalizedSequenceWeights(msa, aligned_blocks, pos,
1290
aligned_seqs, num_aligned_seqs,
1294
/* Uses seq_weights->norm_seq_weights to populate match_weights */
1295
_PSICalculateMatchWeights(msa, pos, aligned_seqs,
1296
num_aligned_seqs, seq_weights);
1299
sfree(aligned_seqs);
1301
/* Check that the sequence weights add up to 1 in each column */
1302
retval = _PSICheckSequenceWeights(msa, seq_weights, ignore_consensus);
1303
if (retval != PSI_SUCCESS) {
1307
#ifndef PSI_IGNORE_GAPS_IN_COLUMNS
1308
_PSISpreadGapWeights(msa, seq_weights, ignore_consensus);
1309
retval = _PSICheckSequenceWeights(msa, seq_weights, ignore_consensus);
1312
/* Return seq_weights->match_weigths, should free others? FIXME: need to
1313
* keep sequence weights for diagnostics for structure group */
1317
/** Calculates the position based weights using a modified version of the
1318
* Henikoff's algorithm presented in "Position-based sequence weights".
1319
* Skipped optimization about identical previous sets.
1322
_PSICalculateNormalizedSequenceWeights(
1323
const _PSIMsa* msa, /* [in] */
1324
const _PSIAlignedBlock* aligned_blocks, /* [in] */
1325
Uint4 position, /* [in] */
1326
const Uint4* aligned_seqs, /* [in] */
1327
Uint4 num_aligned_seqs, /* [in] */
1328
_PSISequenceWeights* seq_weights) /* [out] norm_seq_weights, sigma */
1330
/* Index into aligned block for requested position */
1333
/* This flag will be true if more than one different type of residue is
1334
* found in a column in the extent of the alignment that corresponds to
1335
* the position being examined. (replaces Sigma in old code) */
1336
Boolean distinct_residues_found = FALSE;
1338
/* Number of different characters occurring in matches within a
1339
* multi-alignment block including identical columns (replaces
1340
* intervalSigma in old code)
1341
* FIXME: alternate description
1342
* number of distinct residues in all columns in the extent of the
1343
* alignment corresponding to a position
1347
/* Index into array of aligned sequences */
1351
ASSERT(aligned_blocks);
1352
ASSERT(seq_weights);
1353
ASSERT(aligned_seqs);
1354
ASSERT(position >= 0 && position < msa->dimensions->query_length);
1356
for (i = (Uint4) aligned_blocks->pos_extnt[position].left;
1357
i <= (Uint4) aligned_blocks->pos_extnt[position].right; i++) {
1359
/* Keeps track of how many occurrences of each residue are found in a
1360
* column of the alignment extent corresponding to a query position */
1361
Uint4 residue_counts_for_column[BLASTAA_SIZE];
1363
/* number of distinct residues found in a column of the alignment
1364
* extent correspoding to a query position */
1365
Uint4 num_distinct_residues_for_column = 0;
1367
memset((void*) residue_counts_for_column, 0,
1368
sizeof(Uint4)*BLASTAA_SIZE);
1370
/* Assert that the alignment extents have sane values */
1371
ASSERT(i >= 0 && i < msa->dimensions->query_length);
1373
/* Count number of residues in column i of the alignment extent
1374
* corresponding to position */
1375
for (asi = 0; asi < num_aligned_seqs; asi++) {
1376
const Uint4 seq_idx = aligned_seqs[asi];
1377
const Uint1 residue =
1378
msa->cell[seq_idx][i].letter;
1380
if (residue_counts_for_column[residue] == 0) {
1381
num_distinct_residues_for_column++;
1383
residue_counts_for_column[residue]++;
1386
sigma += num_distinct_residues_for_column;
1387
if (num_distinct_residues_for_column > 1) {
1388
/* num_distinct_residues_for_column == 1 means that all residues in
1389
* that column of the alignment extent are the same residue */
1390
distinct_residues_found = TRUE;
1393
/* Calculate row_sigma, an intermediate value to calculate the
1394
* normalized sequence weights */
1395
for (asi = 0; asi < num_aligned_seqs; asi++) {
1396
const Uint4 seq_idx = aligned_seqs[asi];
1397
const Uint1 residue = msa->cell[seq_idx][i].letter;
1399
/* This is a modified version of the Henikoff's idea in
1400
* "Position-based sequence weights" paper. The modification
1401
* consists in using the alignment extents. */
1402
seq_weights->row_sigma[seq_idx] +=
1404
(residue_counts_for_column[residue] *
1405
num_distinct_residues_for_column) );
1409
/* Save sigma for this position */
1410
seq_weights->sigma[position] = sigma;
1412
if (distinct_residues_found) {
1413
double weight_sum = 0.0;
1415
for (asi = 0; asi < num_aligned_seqs; asi++) {
1416
const Uint4 seq_idx = aligned_seqs[asi];
1417
seq_weights->norm_seq_weights[seq_idx] =
1418
seq_weights->row_sigma[seq_idx] /
1419
(aligned_blocks->pos_extnt[position].right -
1420
aligned_blocks->pos_extnt[position].left + 1);
1421
weight_sum += seq_weights->norm_seq_weights[seq_idx];
1425
for (asi = 0; asi < num_aligned_seqs; asi++) {
1426
const Uint4 seq_idx = aligned_seqs[asi];
1427
seq_weights->norm_seq_weights[seq_idx] /= weight_sum;
1431
/* All residues in the extent of this position's alignment are the same
1432
* residue, therefore we distribute the sequence weight equally among
1433
* all participating sequences */
1434
for (asi = 0; asi < num_aligned_seqs; asi++) {
1435
const Uint4 seq_idx = aligned_seqs[asi];
1436
seq_weights->norm_seq_weights[seq_idx] =
1437
(1.0/(double) num_aligned_seqs);
1445
_PSICalculateMatchWeights(
1446
const _PSIMsa* msa, /* [in] */
1447
Uint4 position, /* [in] */
1448
const Uint4* aligned_seqs, /* [in] */
1449
Uint4 num_aligned_seqs, /* [in] */
1450
_PSISequenceWeights* seq_weights) /* [out] */
1452
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
1453
Uint4 asi = 0; /* index into array of aligned sequences */
1456
ASSERT(aligned_seqs);
1457
ASSERT(seq_weights);
1459
for (asi = 0; asi < num_aligned_seqs; asi++) {
1460
const Uint4 seq_idx = aligned_seqs[asi];
1461
const Uint1 residue = msa->cell[seq_idx][position].letter;
1463
seq_weights->match_weights[position][residue] +=
1464
seq_weights->norm_seq_weights[seq_idx];
1466
/* Collected for diagnostics information, not used elsewhere */
1467
if (residue != GAP) {
1468
seq_weights->gapless_column_weights[position] +=
1469
seq_weights->norm_seq_weights[seq_idx];
1474
/** Finds the sequences aligned in a given position.
1475
* @param alignment Multiple-alignment data [in]
1476
* @param position position of interest [in]
1477
* @param aligned_sequences array which will contain the indices of the
1478
* sequences aligned at the requested position. This array must have size
1479
* greater than or equal to the number of sequences + 1 in multiple alignment
1480
* data structure (alignment->dimensions->num_seqs + 1) [out]
1481
* @return number of sequences aligned at the requested position
1484
_PSIGetAlignedSequencesForPosition(const _PSIMsa* msa,
1486
Uint4* aligned_sequences)
1488
#ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1489
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
1495
ASSERT(position < msa->dimensions->query_length);
1496
ASSERT(aligned_sequences);
1498
for (i = 0; i < msa->dimensions->num_seqs + 1; i++) {
1500
if ( !msa->use_sequence[i] ) {
1504
#ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1505
if (msa->cell[i][position].is_aligned &&
1506
msa->cell[i][position].letter != GAP) {
1508
if (msa->cell[i][position].is_aligned) {
1510
aligned_sequences[retval++] = i;
1517
/* The second parameter is not really const, it's updated! */
1519
_PSISpreadGapWeights(const _PSIMsa* msa,
1520
_PSISequenceWeights* seq_weights,
1521
Boolean ignore_consensus)
1523
const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
1524
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
1525
Uint4 pos = 0; /* residue position (ie: column number) */
1526
Uint4 res = 0; /* residue */
1527
const Uint4 kExpectedNumMatchingSeqs = ignore_consensus ? 0 : 1;
1530
ASSERT(seq_weights);
1532
for (pos = 0; pos < msa->dimensions->query_length; pos++) {
1534
if (msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs ||
1535
msa->cell[kQueryIndex][pos].letter == X) {
1539
/* Disperse method of spreading the gap weights */
1540
for (res = 0; res < msa->alphabet_size; res++) {
1541
if (seq_weights->std_prob[res] > kEpsilon) {
1542
seq_weights->match_weights[pos][res] +=
1543
(seq_weights->match_weights[pos][GAP] *
1544
seq_weights->std_prob[res]);
1547
seq_weights->match_weights[pos][GAP] = 0.0;
1552
/* Verifies that each column of the match_weights field in the seq_weights
1553
* structure adds up to 1. */
1555
_PSICheckSequenceWeights(const _PSIMsa* msa,
1556
const _PSISequenceWeights* seq_weights,
1557
Boolean ignore_consensus)
1559
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
1560
Uint4 pos = 0; /* residue position (ie: column number) */
1561
Boolean check_performed = FALSE; /* were there any sequences checked? */
1562
const Uint4 kExpectedNumMatchingSeqs = ignore_consensus ? 0 : 1;
1565
ASSERT(seq_weights);
1567
for (pos = 0; pos < msa->dimensions->query_length; pos++) {
1569
double running_total = 0.0;
1572
if (msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs ||
1573
msa->cell[kQueryIndex][pos].letter == X) {
1577
for (residue = 0; residue < msa->alphabet_size; residue++) {
1578
running_total += seq_weights->match_weights[pos][residue];
1581
if (running_total < 0.99 || running_total > 1.01) {
1582
return PSIERR_BADSEQWEIGHTS;
1584
check_performed = TRUE;
1587
/* This condition should never happen because it means that no sequences
1588
* were selected to calculate the sequence weights! */
1589
if ( !check_performed ) {
1590
assert(!"Did not perform sequence weights check");
1596
/****************************************************************************/
1597
/******* Compute residue frequencies stage of PSSM creation *****************/
1599
/* Implements formula 2 in Nucleic Acids Research, 2001, Vol 29, No 14.
1600
Subscripts are indicated as follows: N_i, where i is a subscript of N.
1603
_PSIComputeResidueFrequencies(const _PSIMsa* msa, /* [in] */
1604
const _PSISequenceWeights* seq_weights, /* [in] */
1605
const BlastScoreBlk* sbp, /* [in] */
1606
const _PSIAlignedBlock* aligned_blocks, /* [in] */
1607
Int4 pseudo_count, /* [in] */
1608
_PSIInternalPssmData* internal_pssm) /* [out] */
1610
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
1611
SFreqRatios* freq_ratios = NULL;/* matrix-specific frequency ratios */
1612
Uint4 p = 0; /* index on positions */
1613
Uint4 r = 0; /* index on residues */
1615
if ( !msa || !seq_weights || !sbp || !aligned_blocks || !internal_pssm ) {
1616
return PSIERR_BADPARAM;
1619
freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
1621
for (p = 0; p < msa->dimensions->query_length; p++) {
1623
for (r = 0; r < (Uint4) sbp->alphabet_size; r++) {
1625
/* If there is an 'X' in the query sequence at position p
1626
or the standard probability of residue r is close to 0 */
1627
if (msa->cell[kQueryIndex][p].letter == X ||
1628
seq_weights->std_prob[r] <= kEpsilon) {
1629
internal_pssm->res_freqs[p][r] = 0.0;
1632
Uint4 i = 0; /* loop index */
1634
/* beta( Sum_j(f_j r_ij) ) in formula 2 */
1635
double pseudo = 0.0;
1636
/* Effective number of independent observations for column p */
1638
/* Renamed to match the formula in the paper */
1639
const double beta = pseudo_count;
1640
double numerator = 0.0; /* intermediate term */
1641
double denominator = 0.0; /* intermediate term */
1642
double qOverPEstimate = 0.0; /* intermediate term */
1644
/* As specified in 2001 paper, underlying matrix frequency
1645
ratios are used here */
1646
for (i = 0; i < (Uint4) sbp->alphabet_size; i++) {
1647
if (sbp->matrix[r][i] != BLAST_SCORE_MIN) {
1648
pseudo += (seq_weights->match_weights[p][i] *
1649
freq_ratios->data[r][i]);
1654
alpha = seq_weights->sigma[p]/aligned_blocks->size[p]-1;
1657
(alpha * seq_weights->match_weights[p][r] /
1658
seq_weights->std_prob[r])
1661
denominator = alpha + beta;
1663
ASSERT(denominator != 0.0);
1664
qOverPEstimate = numerator/denominator;
1666
/* Note artificial multiplication by standard probability
1668
internal_pssm->res_freqs[p][r] = qOverPEstimate *
1669
seq_weights->std_prob[r];
1675
freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
1680
/* port of posInitializeInformation */
1682
_PSICalculateInformationContentFromScoreMatrix(
1684
const double* std_prob,
1690
double* retval = NULL; /* the return value */
1691
Uint4 p = 0; /* index on positions */
1692
Uint4 r = 0; /* index on residues */
1694
if ( !std_prob || !score_mat ) {
1698
retval = (double*) calloc(query_length, sizeof(double));
1703
for (p = 0; p < query_length; p++) {
1705
double info_sum = 0.0;
1707
for (r = 0; r < alphabet_sz; r++) {
1709
if (std_prob[r] > kEpsilon) {
1710
Int4 score = score_mat[query[p]][r];
1711
double exponent = exp(score * lambda);
1712
double tmp = std_prob[r] * exponent;
1713
info_sum += tmp * log(tmp/std_prob[r])/ NCBIMATH_LN2;
1717
retval[p] = info_sum;
1724
_PSICalculateInformationContentFromResidueFreqs(
1726
const double* std_prob,
1730
double* retval = NULL; /* the return value */
1731
Uint4 p = 0; /* index on positions */
1732
Uint4 r = 0; /* index on residues */
1734
if ( !std_prob || !res_freqs ) {
1738
retval = (double*) calloc(query_length, sizeof(double));
1743
for (p = 0; p < query_length; p++) {
1745
double info_sum = 0.0;
1747
for (r = 0; r < alphabet_sz; r++) {
1749
if (std_prob[r] > kEpsilon) {
1750
double qOverPEstimate = res_freqs[p][r] / std_prob[r];
1751
if (qOverPEstimate > kEpsilon) {
1753
res_freqs[p][r] * log(qOverPEstimate) / NCBIMATH_LN2;
1758
retval[p] = info_sum;
1765
/****************************************************************************/
1766
/**************** Convert residue frequencies to PSSM stage *****************/
1768
/* FIXME: Answer questions
1769
FIXME: need ideal_labmda, regular scoring matrix, length of query
1770
port of posFreqsToMatrix
1773
_PSIConvertResidueFreqsToPSSM(_PSIInternalPssmData* internal_pssm, /* [in|out] */
1774
const Uint1* query, /* [in] */
1775
const BlastScoreBlk* sbp, /* [in] */
1776
const double* std_probs) /* [in] */
1778
const Uint4 X = AMINOACID_TO_NCBISTDAA['X'];
1779
const Uint4 Star = AMINOACID_TO_NCBISTDAA['*'];
1782
SFreqRatios* freq_ratios = NULL; /* only needed when there are not
1783
residue frequencies for a given
1785
Blast_KarlinBlk* kbp_ideal = Blast_KarlinBlkIdealCalc(sbp);
1786
double ideal_lambda = kbp_ideal->Lambda;
1787
kbp_ideal = Blast_KarlinBlkDestruct(kbp_ideal);
1789
if ( !internal_pssm || !sbp || !std_probs )
1790
return PSIERR_BADPARAM;
1792
freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
1794
/* Each column is a position in the query */
1795
for (i = 0; i < internal_pssm->ncols; i++) {
1797
/* True if all frequencies in column i are zero */
1798
Boolean is_unaligned_column = TRUE;
1799
const Uint4 query_res = query[i];
1801
for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
1803
double qOverPEstimate = 0.0;
1805
/* Division compensates for multiplication in previous function */
1806
if (std_probs[j] > kEpsilon) {
1808
internal_pssm->res_freqs[i][j] / std_probs[j];
1811
if (is_unaligned_column && qOverPEstimate != 0.0) {
1812
is_unaligned_column = FALSE;
1815
/* Populate scaled matrix */
1816
if (qOverPEstimate == 0.0 || std_probs[j] < kEpsilon) {
1817
internal_pssm->scaled_pssm[i][j] = BLAST_SCORE_MIN;
1819
double tmp = log(qOverPEstimate)/ideal_lambda;
1820
internal_pssm->scaled_pssm[i][j] = (int)
1821
BLAST_Nint(kPSIScaleFactor * tmp);
1824
if ( (j == X || j == Star) &&
1825
(sbp->matrix[query_res][X] != BLAST_SCORE_MIN) ) {
1826
internal_pssm->scaled_pssm[i][j] =
1827
sbp->matrix[query_res][j] * kPSIScaleFactor;
1831
if (is_unaligned_column) {
1832
for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
1834
internal_pssm->pssm[i][j] = sbp->matrix[query_res][j];
1836
if (sbp->matrix[query_res][j] != BLAST_SCORE_MIN) {
1838
kPSIScaleFactor * freq_ratios->bit_scale_factor *
1839
log(freq_ratios->data[query_res][j])/NCBIMATH_LN2;
1841
internal_pssm->scaled_pssm[i][j] = BLAST_Nint(tmp);
1843
internal_pssm->scaled_pssm[i][j] = BLAST_SCORE_MIN;
1849
freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
1851
/* Set the last column of the matrix to BLAST_SCORE_MIN (why?) *
1852
for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
1853
internal_pssm->scaled_pssm[internal_pssm->ncols-1][j] = BLAST_SCORE_MIN;
1860
/****************************************************************************/
1861
/************************* Scaling of PSSM stage ****************************/
1864
* @param initial_lambda_guess value to be used when calculating lambda if this
1866
* @param sbp Score block structure where the calculated lambda and K will be
1870
_PSIUpdateLambdaK(const int** pssm, /* [in] */
1871
const Uint1* query, /* [in] */
1872
Uint4 query_length, /* [in] */
1873
const double* std_probs, /* [in] */
1874
double* initial_lambda_guess, /* [in] */
1875
BlastScoreBlk* sbp); /* [in|out] */
1877
/* FIXME: change so that only lambda is calculated inside the loop that scales
1878
the matrix and kappa is calculated before returning from this function.
1879
Scaling factor should be optional argument to accomodate kappa.c's needs?
1882
_PSIScaleMatrix(const Uint1* query, /* [in] */
1883
Uint4 query_length, /* [in] */
1884
const double* std_probs, /* [in] */
1885
double* scaling_factor, /* [in] */
1886
_PSIInternalPssmData* internal_pssm, /* [in|out] */
1887
BlastScoreBlk* sbp) /* [in|out] */
1889
Boolean first_time = TRUE;
1890
Uint4 index = 0; /* loop index */
1891
int** scaled_pssm = NULL;
1894
double factor_low = 0.0;
1895
double factor_high = 0.0;
1896
double ideal_lambda = 0.0; /* ideal value of ungapped lambda for
1897
underlying scoring matrix */
1898
double new_lambda = 0.0; /* Karlin-Altschul parameter calculated
1899
from scaled matrix*/
1901
const double kPositPercent = 0.05;
1902
const Uint4 kPositNumIterations = 10;
1903
Boolean too_high = TRUE;
1905
if ( !internal_pssm || !sbp || !query || !std_probs )
1906
return PSIERR_BADPARAM;
1908
ASSERT(sbp->kbp_psi[0]);
1909
ASSERT(sbp->kbp_ideal);
1911
scaled_pssm = internal_pssm->scaled_pssm;
1912
pssm = internal_pssm->pssm;
1913
ideal_lambda = sbp->kbp_ideal->Lambda;
1915
/* FIXME: need to take scaling_factor into account */
1922
for (i = 0; i < internal_pssm->ncols; i++) {
1923
for (j = 0; j < internal_pssm->nrows; j++) {
1924
if (scaled_pssm[i][j] != BLAST_SCORE_MIN) {
1926
BLAST_Nint(factor*scaled_pssm[i][j]/kPSIScaleFactor);
1928
pssm[i][j] = BLAST_SCORE_MIN;
1933
if (scaling_factor) {
1934
double init_lambda_guess =
1935
sbp->kbp_psi[0]->Lambda / *scaling_factor;
1936
_PSIUpdateLambdaK((const int**)pssm, query, query_length,
1937
std_probs, &init_lambda_guess, sbp);
1939
_PSIUpdateLambdaK((const int**)pssm, query, query_length,
1940
std_probs, NULL, sbp);
1943
new_lambda = sbp->kbp_psi[0]->Lambda;
1945
if (new_lambda > ideal_lambda) {
1947
factor_high = 1.0 + kPositPercent;
1948
factor = factor_high;
1952
if (too_high == FALSE) {
1955
factor_high += (factor_high - 1.0);
1956
factor = factor_high;
1958
} else if (new_lambda > 0) {
1961
factor_low = 1.0 - kPositPercent;
1962
factor = factor_low;
1966
if (too_high == TRUE) {
1969
factor_low += (factor_low - 1.0);
1970
factor = factor_low;
1973
return PSIERR_POSITIVEAVGSCORE;
1977
/* Binary search for kPositNumIterations times */
1978
for (index = 0; index < kPositNumIterations; index++) {
1982
factor = (factor_high + factor_low)/2;
1984
for (i = 0; i < internal_pssm->ncols; i++) {
1985
for (j = 0; j < internal_pssm->nrows; j++) {
1986
if (scaled_pssm[i][j] != BLAST_SCORE_MIN) {
1988
BLAST_Nint(factor*scaled_pssm[i][j]/kPSIScaleFactor);
1990
pssm[i][j] = BLAST_SCORE_MIN;
1995
if (scaling_factor) {
1996
double init_lambda_guess =
1997
sbp->kbp_psi[0]->Lambda / *scaling_factor;
1998
_PSIUpdateLambdaK((const int**)pssm, query, query_length,
1999
std_probs, &init_lambda_guess, sbp);
2001
_PSIUpdateLambdaK((const int**)pssm, query, query_length,
2002
std_probs, NULL, sbp);
2005
new_lambda = sbp->kbp_psi[0]->Lambda;
2007
if (new_lambda > ideal_lambda) {
2008
factor_low = factor;
2010
factor_high = factor;
2018
_PSISequenceLengthWithoutX(const Uint1* seq, Uint4 length)
2020
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
2021
Uint4 retval = 0; /* the return value */
2022
Uint4 i = 0; /* loop index */
2026
for(i = 0; i < length; i++) {
2036
_PSIComputeScoreProbabilities(const int** pssm, /* [in] */
2037
const Uint1* query, /* [in] */
2038
Uint4 query_length, /* [in] */
2039
const double* std_probs, /* [in] */
2040
const BlastScoreBlk* sbp) /* [in] */
2042
const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
2043
Uint1 aa_alphabet[BLASTAA_SIZE]; /* ncbistdaa alphabet */
2044
Uint4 alphabet_size = 0; /* number of elements populated
2046
Uint4 effective_length = 0; /* length of query w/o Xs */
2047
Uint4 p = 0; /* index on positions */
2048
Uint4 r = 0; /* index on residues */
2049
int s = 0; /* index on scores */
2050
int min_score = 0; /* minimum score in pssm */
2051
int max_score = 0; /* maximum score in pssm */
2052
Blast_ScoreFreq* score_freqs = NULL; /* score frequencies */
2058
ASSERT(sbp->alphabet_code == BLASTAA_SEQ_CODE);
2060
alphabet_size = (Uint4) Blast_GetStdAlphabet(sbp->alphabet_code,
2061
aa_alphabet, BLASTAA_SIZE);
2062
if (alphabet_size <= 0) {
2065
ASSERT(alphabet_size < BLASTAA_SIZE);
2067
effective_length = _PSISequenceLengthWithoutX(query, query_length);
2069
/* Get the minimum and maximum scores */
2070
for (p = 0; p < query_length; p++) {
2071
for (r = 0; r < alphabet_size; r++) {
2072
const int kScore = pssm[p][aa_alphabet[r]];
2074
if (kScore <= BLAST_SCORE_MIN || kScore >= BLAST_SCORE_MAX) {
2077
max_score = MAX(kScore, max_score);
2078
min_score = MIN(kScore, min_score);
2082
score_freqs = Blast_ScoreFreqNew(min_score, max_score);
2083
if ( !score_freqs ) {
2087
score_freqs->obs_min = min_score;
2088
score_freqs->obs_max = max_score;
2089
for (p = 0; p < query_length; p++) {
2090
if (query[p] == X) {
2094
for (r = 0; r < alphabet_size; r++) {
2095
const int kScore = pssm[p][aa_alphabet[r]];
2097
if (kScore <= BLAST_SCORE_MIN || kScore >= BLAST_SCORE_MAX) {
2101
/* Increment the weight for the score in position p, residue r */
2102
score_freqs->sprob[kScore] +=
2103
(std_probs[aa_alphabet[r]]/effective_length);
2107
ASSERT(score_freqs->score_avg == 0.0);
2108
for (s = min_score; s < max_score; s++) {
2109
score_freqs->score_avg += (s * score_freqs->sprob[s]);
2115
/* Port of blastool.c's updateLambdaK */
2117
_PSIUpdateLambdaK(const int** pssm, /* [in] */
2118
const Uint1* query, /* [in] */
2119
Uint4 query_length, /* [in] */
2120
const double* std_probs, /* [in] */
2121
double* initial_lambda_guess, /* [in] */
2122
BlastScoreBlk* sbp) /* [in|out] */
2124
Blast_ScoreFreq* score_freqs =
2125
_PSIComputeScoreProbabilities(pssm, query, query_length,
2128
if (initial_lambda_guess) {
2129
sbp->kbp_psi[0]->Lambda = Blast_KarlinLambdaNR(score_freqs,
2130
*initial_lambda_guess);
2134
/* Calculate lambda and K */
2135
Blast_KarlinBlkCalc(sbp->kbp_psi[0], score_freqs);
2137
/* Shouldn't this be in a function? */
2138
ASSERT(sbp->kbp_ideal);
2139
ASSERT(sbp->kbp_psi[0]);
2140
ASSERT(sbp->kbp_gap_std[0]);
2141
ASSERT(sbp->kbp_gap_psi[0]);
2143
sbp->kbp_gap_psi[0]->K =
2144
sbp->kbp_psi[0]->K * sbp->kbp_gap_std[0]->K / sbp->kbp_ideal->K;
2145
sbp->kbp_gap_psi[0]->logK = log(sbp->kbp_gap_psi[0]->K);
2147
/* what about Lambda? */
2150
score_freqs = Blast_ScoreFreqDestruct(score_freqs);
2154
/****************************************************************************/
2155
/* Function definitions for auxiliary functions for the stages above */
2157
_PSIPurgeAlignedRegion(_PSIMsa* msa,
2158
unsigned int seq_index,
2162
_PSIMsaCell* sequence_position = NULL;
2166
return PSIERR_BADPARAM;
2168
if (seq_index > msa->dimensions->num_seqs + 1 ||
2169
stop > msa->dimensions->query_length)
2170
return PSIERR_BADPARAM;
2173
sequence_position = msa->cell[seq_index];
2174
for (i = start; i < stop; i++) {
2176
@todo This function is the successor to posit.c's posCancel and it
2177
has been implemented to be consistent with it. However, its choice
2178
of sentinel values to flag positions as unused is inconsistent with
2179
the state of newly allocated positions (which would be preferred).
2180
This behavior should be fixed once the algo/blast implementation the
2181
PSSM engine replaces posit.c
2182
sequence_position[i].letter = (unsigned char) -1;
2183
sequence_position[i].e_value = kDefaultEvalueForPosition;
2184
sequence_position[i].extents.left = (unsigned int) -1;
2186
/* posCancel initializes positions differently than when they are
2188
sequence_position[i].letter = 0;
2189
sequence_position[i].is_aligned = FALSE;
2190
/*sequence_position[i].e_value = PSI_INCLUSION_ETHRESH;*/
2191
sequence_position[i].extents.left = 0;
2192
sequence_position[i].extents.right =
2193
msa->dimensions->query_length;
2196
_PSIDiscardIfUnused(msa, seq_index);
2201
/* Check if we still need this sequence */
2203
_PSIDiscardIfUnused(_PSIMsa* msa, unsigned int seq_index)
2205
Boolean contains_aligned_regions = FALSE;
2208
for (i = 0; i < msa->dimensions->query_length; i++) {
2209
if (msa->cell[seq_index][i].is_aligned) {
2210
contains_aligned_regions = TRUE;
2215
if ( !contains_aligned_regions ) {
2216
msa->use_sequence[seq_index] = FALSE;
2220
/****************************************************************************/
2222
_PSIGetStandardProbabilities(const BlastScoreBlk* sbp)
2224
Blast_ResFreq* standard_probabilities = NULL;
2226
double* retval = NULL;
2228
retval = (double*) malloc(sbp->alphabet_size * sizeof(double));
2233
standard_probabilities = Blast_ResFreqNew(sbp);
2234
Blast_ResFreqStdComp(sbp, standard_probabilities);
2236
for (i = 0; i < (Uint4) sbp->alphabet_size; i++) {
2237
retval[i] = standard_probabilities->prob[i];
2240
Blast_ResFreqDestruct(standard_probabilities);
2245
_PSISaveDiagnostics(const _PSIMsa* msa,
2246
const _PSIAlignedBlock* aligned_block,
2247
const _PSISequenceWeights* seq_weights,
2248
const _PSIInternalPssmData* internal_pssm,
2249
PSIDiagnosticsResponse* diagnostics)
2251
Uint4 p = 0; /* index on positions */
2252
Uint4 r = 0; /* index on residues */
2254
if ( !diagnostics || !msa || !aligned_block || !seq_weights ||
2256
return PSIERR_BADPARAM;
2259
ASSERT(msa->dimensions->query_length == diagnostics->query_length);
2261
if (diagnostics->information_content) {
2262
double* info = _PSICalculateInformationContentFromResidueFreqs(
2263
internal_pssm->res_freqs, seq_weights->std_prob,
2264
diagnostics->query_length,
2265
diagnostics->alphabet_size);
2267
return PSIERR_OUTOFMEM;
2269
for (p = 0; p < diagnostics->query_length; p++) {
2270
diagnostics->information_content[p] = info[p];
2275
if (diagnostics->residue_freqs) {
2276
for (p = 0; p < diagnostics->query_length; p++) {
2277
for (r = 0; r < diagnostics->alphabet_size; r++) {
2278
diagnostics->residue_freqs[p][r] = msa->residue_counts[p][r];
2283
if (diagnostics->weighted_residue_freqs) {
2284
for (p = 0; p < diagnostics->query_length; p++) {
2285
for (r = 0; r < diagnostics->alphabet_size; r++) {
2286
diagnostics->weighted_residue_freqs[p][r] =
2287
seq_weights->match_weights[p][r];
2292
if (diagnostics->frequency_ratios) {
2293
for (p = 0; p < diagnostics->query_length; p++) {
2294
for (r = 0; r < diagnostics->alphabet_size; r++) {
2295
diagnostics->frequency_ratios[p][r] =
2296
internal_pssm->res_freqs[p][r];
2301
if (diagnostics->gapless_column_weights) {
2302
for (p = 0; p < diagnostics->query_length; p++) {
2303
diagnostics->gapless_column_weights[p] =
2304
seq_weights->gapless_column_weights[p];
2312
* ===========================================================================
2313
* $Log: blast_psi_priv.c,v $
2314
* Revision 1.30 2004/10/19 14:28:38 camacho
2315
* Fix memory access error
2317
* Revision 1.29 2004/10/18 14:33:14 camacho
2318
* 1. Added validation routines
2319
* 2. Fixed bug in calculating sequence weights
2320
* 3. Expanded error codes
2322
* Revision 1.28 2004/10/13 16:03:00 camacho
2323
* Add assertions as sanity checks
2325
* Revision 1.27 2004/10/01 13:59:22 camacho
2326
* Use calloc where needed
2328
* Revision 1.26 2004/09/23 19:51:44 camacho
2329
* Added sanity checks
2331
* Revision 1.25 2004/09/17 02:06:34 camacho
2332
* Renaming of diagnostics structure fields
2334
* Revision 1.24 2004/09/14 21:17:02 camacho
2335
* Add structure group customization to ignore query
2337
* Revision 1.23 2004/08/31 16:13:28 camacho
2338
* Documentation changes
2340
* Revision 1.22 2004/08/13 22:32:16 camacho
2341
* Refactoring of _PSIPurgeSimilarAlignments to use finite state machine
2343
* Revision 1.21 2004/08/04 20:18:26 camacho
2344
* 1. Renaming of structures and functions that pertain to the internals of PSSM
2346
* 2. Updated documentation (in progress)
2348
* Revision 1.20 2004/08/02 13:25:49 camacho
2349
* 1. Various renaming of structures, in progress
2350
* 2. Addition of PSIDiagnostics structures, in progress
2352
* Revision 1.19 2004/07/29 19:16:02 camacho
2353
* Moved PSIExtractQuerySequenceInfo
2355
* Revision 1.18 2004/07/22 19:06:18 camacho
2356
* 1. Fix in _PSICheckSequenceWeights.
2357
* 2. Added functions to calculate information content.
2358
* 3. Cleaned up PSIComputeResidueFrequencies.
2359
* 4. Removed unneeded code to set populate extra column in PSSMs.
2360
* 5. Added collection of information content and gapless column weights to
2361
* diagnostics structure
2363
* Revision 1.17 2004/07/06 15:23:54 camacho
2364
* Fix memory acccess error
2366
* Revision 1.16 2004/07/02 19:40:48 camacho
2367
* Fixes for handling out-of-memory conditions
2369
* Revision 1.15 2004/07/02 18:00:25 camacho
2370
* 1. Document rationale for order in which sequences are compared in
2371
* _PSIPurgeNearIdenticalAlignments.
2372
* 2. Fix in _PSIPurgeSimilarAlignments to take into account the X residues
2373
* 3. Refactorings in sequence weight calculation functions:
2374
* - Simplification of _PSICheckSequenceWeights
2375
* - Addition of _PSISpreadGapWeights
2376
* - Reorganization of _PSICalculateNormalizedSequenceWeights
2378
* Revision 1.14 2004/06/25 20:31:11 camacho
2379
* Remove C++ comments
2381
* Revision 1.13 2004/06/25 20:16:50 camacho
2382
* 1. Minor fixes to sequence weights calculation
2385
* Revision 1.12 2004/06/21 12:52:44 camacho
2386
* Replace PSI_ALPHABET_SIZE for BLASTAA_SIZE
2388
* Revision 1.11 2004/06/17 20:47:21 camacho
2389
* Minor fix to extent sizes
2391
* Revision 1.10 2004/06/16 15:22:47 camacho
2392
* Fixes to add new unit tests
2394
* Revision 1.9 2004/06/09 14:21:03 camacho
2395
* Removed msvc compiler warnings
2397
* Revision 1.8 2004/06/08 17:30:06 dondosha
2398
* Compiler warnings fixes
2400
* Revision 1.7 2004/06/07 14:18:24 dondosha
2401
* Added some variables initialization, to remove compiler warnings
2403
* Revision 1.6 2004/05/28 17:35:03 camacho
2404
* Fix msvc6 warnings
2406
* Revision 1.5 2004/05/28 16:00:09 camacho
2407
* + first port of PSSM generation engine
2409
* Revision 1.4 2004/05/19 14:52:02 camacho
2410
* 1. Added doxygen tags to enable doxygen processing of algo/blast/core
2411
* 2. Standardized copyright, CVS $Id string, $Log and rcsid formatting and i
2413
* 3. Added use of @todo doxygen keyword
2415
* Revision 1.3 2004/05/06 14:01:40 camacho
2416
* + _PSICopyDoubleMatrix
2418
* Revision 1.2 2004/04/07 22:08:37 kans
2419
* needed to include blast_def.h for sfree prototype
2421
* Revision 1.1 2004/04/07 19:11:17 camacho
2424
* ===========================================================================