1
/* $Id: blast_hits.h,v 1.56 2004/10/04 17:10:45 dondosha Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* Author: Ilya Dondoshansky
30
/** @file blast_hits.h
31
* Structures and API used for saving BLAST hits
34
#ifndef __BLAST_HITS__
35
#define __BLAST_HITS__
37
#include <algo/blast/core/blast_options.h>
38
#include <algo/blast/core/gapinfo.h>
39
#include <algo/blast/core/blast_seqsrc.h>
45
/** One sequence segment within an HSP */
46
typedef struct BlastSeg {
47
Int2 frame; /**< Translation frame */
48
Int4 offset; /**< Start of hsp */
49
Int4 length; /**< Length of hsp */
50
Int4 end; /**< End of HSP */
51
Int4 gapped_start;/**< Where the gapped extension started. */
54
/** Structure holding all information about an HSP */
55
typedef struct BlastHSP {
56
Int4 score; /**< This HSP's raw score */
57
Int4 num_ident; /**< Number of identical base pairs in this HSP */
58
double bit_score; /**< Bit score, calculated from score */
59
double evalue; /**< This HSP's e-value */
60
BlastSeg query; /**< Query sequence info. */
61
BlastSeg subject; /**< Subject sequence info. */
62
Int4 context; /**< Context number of query */
63
GapEditBlock* gap_info;/**< ALL gapped alignment is here */
64
Int4 num; /**< How many HSP's make up this (sum) segment? */
65
Uint4 pattern_length; /**< Length of pattern occurrence in this HSP, in
69
/** The structure to hold all HSPs for a given sequence after the gapped
72
typedef struct BlastHSPList {
73
Int4 oid;/**< The ordinal id of the subject sequence this HSP list is for */
74
Int4 query_index; /**< Index of the query which this HSPList corresponds to.
75
Set to 0 if not applicable */
76
BlastHSP** hsp_array; /**< Array of pointers to individual HSPs */
77
Int4 hspcnt; /**< Number of HSPs saved */
78
Int4 allocated; /**< The allocated size of the hsp_array */
79
Int4 hsp_max; /**< The maximal number of HSPs allowed to be saved */
80
Boolean do_not_reallocate; /**< Is reallocation of the hsp_array allowed? */
83
/** The structure to contain all BLAST results for one query sequence */
84
typedef struct BlastHitList {
85
Int4 hsplist_count; /**< Filled size of the HSP lists array */
86
Int4 hsplist_max; /**< Maximal allowed size of the HSP lists array */
87
double worst_evalue; /**< Highest of the best e-values among the HSP
89
Int4 low_score; /**< The lowest of the best scores among the HSP lists */
90
Boolean heapified; /**< Is this hit list already heapified? */
91
BlastHSPList** hsplist_array; /**< Array of HSP lists for individual
95
/** The structure to contain all BLAST results, for multiple queries */
96
typedef struct BlastHSPResults {
97
Int4 num_queries; /**< Number of query sequences */
98
BlastHitList** hitlist_array; /**< Array of results for individual
103
/** By how much should the chunks of a subject sequence overlap if it is
104
too long and has to be split */
105
#define DBSEQ_CHUNK_OVERLAP 100
107
/********************************************************************************
109
The following section has four sets of functions (or "APIs"), manipulating with
110
the following structures:
111
1. BlastHSP, which is the basic unit to record one alignment.
112
2. BlastHSPList, which is a list of BlastHSP's for one database sequence.
113
3. BlastHitList, which contains all HSPList's for a given query.
114
4. BlastHSPResults, which is a list of BlastHitList's for multiple queries.
116
The naming conventions for the functions are the following:
118
1.) All routines start with "Blast_"
120
2.) After "Blast_" comes the structure being manipulated, that should be either
121
HSP (all capitals all the time!), HSPList (exactly this capitalization),
122
HitList (capital H and L, all others lower-case), or HSPResults.
124
3.) finally the task being done, e.g., "Free", "New", "Init".
126
********************************************************************************/
127
/********************************************************************************
129
********************************************************************************/
131
/** Deallocate memory for an HSP structure */
133
BlastHSP* Blast_HSPFree(BlastHSP* hsp);
135
/** Allocate and zeros out memory for an HSP structure */
137
BlastHSP* Blast_HSPNew(void);
139
/** Allocates BlastHSP and inits with information from input.
141
* @param query_start Start of query alignment [in]
142
* @param query_end End of query alignment [in]
143
* @param subject_start Start of subject alignment [in]
144
* @param subject_end End of subject alignment [in]
145
* @param query_gapped_start Where gapped alignment started on query [in]
146
* @param subject_gapped_start Where gapped alignment started on subject [in]
147
* @param query_context The index of the query containing this HSP [in]
148
* @param subject_frame Subject frame: -3..3 for translated sequence,
149
* 1 for blastn, 0 for blastp [in]
150
* @param score score of alignment [in]
151
* @param gap_edit Will be transferred to HSP and nulled out
152
* if a traceback was not calculated may be NULL [in] [out]
153
* @param ret_hsp allocated and filled in BlastHSP [out]
157
Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start,
159
Int4 query_gapped_start, Int4 subject_gapped_start,
160
Int4 query_context, Int2 subject_frame, Int4 score,
161
GapEditBlock* *gap_edit, BlastHSP** ret_hsp);
164
/** Modifies the HSP data after the final gapped alignment.
165
* Input includes only data that likely needs modification.
166
* @param query_start start of alignment on query [in]
167
* @param query_end end of alignment on query [in]
168
* @param subject_start start of alignment on subject [in]
169
* @param subject_end end of alignment on subject [in]
170
* @param score New score for this HSP [in]
171
* @param gap_edit traceback from final gapped alignment [in] [out]
172
* @param hsp Original HSP from the preliminary stage [in] [out]
176
Blast_HSPReset(Int4 query_start, Int4 query_end, Int4 subject_start,
177
Int4 subject_end, Int4 score, GapEditBlock* *gap_edit,
180
/** Calculate e-value for an HSP found by PHI BLAST.
181
* @param hsp An HSP found by PHI BLAST [in]
182
* @param sbp Scoring block with statistical parameters [in]
185
void Blast_HSPPHIGetEvalue(BlastHSP* hsp, BlastScoreBlk* sbp);
187
/** Reevaluate the HSP's score, e-value and percent identity after taking
188
* into account the ambiguity information. Needed for blastn only, either
189
* after a greedy gapped extension, or for ungapped search.
190
* @param hsp The HSP structure [in] [out]
191
* @param query_start Pointer to the start of the query sequence [in]
192
* @param subject_start Pointer to the start of the subject sequence [in]
193
* @param hit_options Hit saving options with e-value cut-off [in]
194
* @param score_params Scoring parameters [in]
195
* @param query_info Query information structure, containing effective search
197
* @param sbp Score block with Karlin-Altschul parameters [in]
198
* @return Should this HSP be deleted after the score reevaluation?
201
Boolean Blast_HSPReevaluateWithAmbiguities(BlastHSP* hsp,
202
Uint1* query_start, Uint1* subject_start,
203
const BlastHitSavingOptions* hit_options,
204
const BlastScoringParameters* score_params,
205
BlastQueryInfo* query_info, BlastScoreBlk* sbp);
207
/** Calculate number of identities in an HSP.
208
* @param query The query sequence [in]
209
* @param subject The uncompressed subject sequence [in]
210
* @param hsp All information about the HSP [in]
211
* @param is_gapped Is this a gapped search? [in]
212
* @param num_ident_ptr Number of identities [out]
213
* @param align_length_ptr The alignment length, including gaps [out]
217
Blast_HSPGetNumIdentities(Uint1* query, Uint1* subject, BlastHSP* hsp,
218
Boolean is_gapped, Int4* num_ident_ptr,
219
Int4* align_length_ptr);
221
/** Calculate number of identities in an HSP for an out-of-frame alignment.
222
* @param query The query sequence [in]
223
* @param subject The uncompressed subject sequence [in]
224
* @param hsp All information about the HSP [in]
225
* @param program BLAST program (blastx or tblastn) [in]
226
* @param num_ident_ptr Number of identities [out]
227
* @param align_length_ptr The alignment length, including gaps [out]
231
Blast_HSPGetOOFNumIdentities(Uint1* query, Uint1* subject, BlastHSP* hsp,
232
EBlastProgramType program, Int4* num_ident_ptr,
233
Int4* align_length_ptr);
235
/** Calculate length of an HSP as length in query plus length of gaps in
236
* query. If gap information is unavailable, return maximum between length in
237
* query and in subject.
238
* @param hsp An HSP structure [in]
239
* @param length Length of this HSP [out]
240
* @param gaps Total number of gaps in this HSP [out]
241
* @param gap_opens Number of gap openings in this HSP [out]
244
void Blast_HSPCalcLengthAndGaps(BlastHSP* hsp, Int4* length,
245
Int4* gaps, Int4* gap_opens);
247
/** Adjust HSP endpoint offsets according to strand/frame; return values in
248
* 1-offset coordinates instead of internal 0-offset.
249
* @param hsp An HSP structure [in]
250
* @param q_start Start of alignment in query [out]
251
* @param q_end End of alignment in query [out]
252
* @param s_start Start of alignment in subject [out]
253
* @param s_end End of alignment in subject [out]
257
Blast_HSPGetAdjustedOffsets(BlastHSP* hsp, Int4* q_start, Int4* q_end,
258
Int4* s_start, Int4* s_end);
260
/********************************************************************************
262
********************************************************************************/
264
/** Deallocate memory for an HSP list structure
265
* as well as all it's components.
266
* @param hsp_list the BlastHSPList to be freed [in].
269
BlastHSPList* Blast_HSPListFree(BlastHSPList* hsp_list);
271
/** Creates HSP list structure with a default size HSP array
272
* @param hsp_max the maximum number of HSP's that can ever be
273
* saved at once [in].
276
BlastHSPList* Blast_HSPListNew(Int4 hsp_max);
278
/** Saves HSP information into a BlastHSPList structure
279
* @param hsp_list Structure holding all HSPs with full gapped alignment
280
* information [in] [out]
281
* @param hsp The new HSP to be inserted into the HSPList [in]
285
Blast_HSPListSaveHSP(BlastHSPList* hsp_list, BlastHSP* hsp);
287
/** Assign frames in all HSPs in the HSP list.
288
* @param program_number Type of BLAST program [in]
289
* @param hsp_list List of HSPs for one subject sequence [in] [out]
290
* @param is_ooframe Is out-of-frame gapping used? [in]
294
Blast_HSPListSetFrames(EBlastProgramType program_number, BlastHSPList* hsp_list,
297
/** Calculate the expected values for all HSPs in a hit list, without using
298
* the sum statistics. In case of multiple queries, the offsets are assumed
299
* to be already adjusted to individual query coordinates, and the contexts
300
* are set for each HSP.
301
* @param query_info Auxiliary query information - needed only for effective
302
* search space calculation if it is not provided [in]
303
* @param hsp_list List of HSPs for one subject sequence [in] [out]
304
* @param gapped_calculation Is this for a gapped or ungapped search? [in]
305
* @param sbp Structure containing statistical information [in]
306
* @param gap_decay_rate Adjustment parameter to compensate for the effects of
307
* performing multiple tests when linking HSPs. No adjustment is made if 0. [in]
310
Int2 Blast_HSPListGetEvalues(BlastQueryInfo* query_info,
311
BlastHSPList* hsp_list, Boolean gapped_calculation,
312
BlastScoreBlk* sbp, double gap_decay_rate);
314
/** Calculate e-values for a PHI BLAST HSP list.
315
* @param hsp_list HSP list found by PHI BLAST [in] [out]
316
* @param sbp Scoring block with statistical parameters [in]
319
void Blast_HSPListPHIGetEvalues(BlastHSPList* hsp_list, BlastScoreBlk* sbp);
321
/** Calculate bit scores from raw scores in an HSP list.
322
* @param hsp_list List of HSPs [in] [out]
323
* @param gapped_calculation Is this a gapped search? [in]
324
* @param sbp Scoring block with statistical parameters [in]
327
Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list,
328
Boolean gapped_calculation, BlastScoreBlk* sbp);
330
/** Discard the HSPs above the e-value threshold from the HSP list
331
* @param hsp_list List of HSPs for one subject sequence [in] [out]
332
* @param hit_options Options block containing the e-value cut-off [in]
335
Int2 Blast_HSPListReapByEvalue(BlastHSPList* hsp_list,
336
BlastHitSavingOptions* hit_options);
338
/** Cleans out the NULLed out HSP's from the HSP array that
339
* is part of the BlastHSPList.
340
* @param hsp_list Contains array of pointers to HSP structures [in]
341
* @return status of function call.
345
Blast_HSPListPurgeNullHSPs(BlastHSPList* hsp_list);
347
/** Reevaluate all HSPs in an HSP list, using ambiguity information.
348
* This is/can only done either for an ungapped search, or if traceback is
350
* Subject sequence is uncompressed and saved here. Number of identities is
351
* calculated for each HSP along the way.
352
* @param hsp_list The list of HSPs for one subject sequence [in] [out]
353
* @param query_blk The query sequence [in]
354
* @param subject_blk The subject sequence [in] [out]
355
* @param hit_options The options related to saving hits [in]
356
* @param query_info Auxiliary query information [in]
357
* @param sbp The statistical information [in]
358
* @param score_params Parameters related to scoring [in]
359
* @param seq_src The BLAST database structure (for retrieving uncompressed
364
Blast_HSPListReevaluateWithAmbiguities(BlastHSPList* hsp_list,
365
BLAST_SequenceBlk* query_blk, BLAST_SequenceBlk* subject_blk,
366
const BlastHitSavingOptions* hit_options, BlastQueryInfo* query_info,
367
BlastScoreBlk* sbp, const BlastScoringParameters* score_params,
368
const BlastSeqSrc* seq_src);
370
/** Append one HSP list to the other. Discard lower scoring HSPs if there is
371
* not enough space to keep all.
372
* @param hsp_list New list of HSPs [in]
373
* @param combined_hsp_list_ptr Pointer to the combined list of HSPs, possibly
374
* containing previously saved HSPs [in] [out]
375
* @param hsp_num_max Maximal allowed number of HSPs to save (unlimited if INT4_MAX) [in]
376
* @return Status: 0 on success, -1 on failure.
379
Int2 Blast_HSPListAppend(BlastHSPList* hsp_list,
380
BlastHSPList** combined_hsp_list_ptr,
383
/** Merge an HSP list from a chunk of the subject sequence into a previously
385
* @param hsp_list Contains HSPs from the new chunk [in]
386
* @param combined_hsp_list_ptr Contains HSPs from previous chunks [in] [out]
387
* @param hsp_num_max Maximal allowed number of HSPs to save (unlimited if INT4_MAX) [in]
388
* @param start Offset where the current subject chunk starts [in]
389
* @param merge_hsps Should the overlapping HSPs be merged into one? [in]
390
* @return 0 if HSP lists have been merged successfully, -1 otherwise.
393
Int2 Blast_HSPListsMerge(BlastHSPList* hsp_list,
394
BlastHSPList** combined_hsp_list_ptr,
395
Int4 hsp_num_max, Int4 start, Boolean merge_hsps);
397
/** Adjust subject offsets in an HSP list if only part of the subject sequence
398
* was searched. Used when long subject sequence is split into more manageable
400
* @param hsp_list List of HSPs from a chunk of a subject sequence [in]
401
* @param offset Offset where the chunk starts [in]
404
void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset);
406
/** Sort the HSPs in an HSP list by diagonal and remove redundant HSPs. */
409
Blast_HSPListUniqSort(BlastHSPList* hsp_list);
411
/** Sort the HSPs in an HSP list by score. This type of sorting is done
412
* at the beginning of the traceback stage, and is needed to eliminate the effects
413
* of wrong score order because of application of sum statistics.
414
* Checks if the HSP array is already sorted before proceeding with quicksort.
415
* @param hsp_list Structure containing array of HSPs to be sorted. [in] [out]
418
void Blast_HSPListSortByScore(BlastHSPList* hsp_list);
420
/** Sort the HSPs in an HSP list by e-value, with scores and other criteria
421
* used to resolve ties. Checks if the HSP array is already sorted before
422
* proceeding with quicksort.
423
* @param hsp_list Structure containing array of HSPs to be sorted. [in] [out]
426
void Blast_HSPListSortByEvalue(BlastHSPList* hsp_list);
429
/********************************************************************************
431
********************************************************************************/
433
/** Allocate memory for a hit list of a given size.
434
* @param hitlist_size Size of the hit list (number of HSP lists) [in]
437
BlastHitList* Blast_HitListNew(Int4 hitlist_size);
439
/** Deallocate memory for the hit list */
441
BlastHitList* Blast_HitListFree(BlastHitList* hitlist);
443
/** Deallocate memory for every HSP list on BlastHitList,
444
* as well as all their components.
445
* @param hitlist contains the BlastHSPList array to be freed [in/out].
448
Int2 Blast_HitListHSPListsFree(BlastHitList* hitlist);
450
/** Insert a new HSP list into the hit list.
451
* Before capacity of the hit list is reached, just add to the end;
452
* After that, store in a heap, to ensure efficient insertion and deletion.
453
* The heap order is reverse, with worst e-value on top, for convenience
455
* @param hit_list Contains all HSP lists saved so far [in] [out]
456
* @param hsp_list A new HSP list to be inserted into the hit list [in]
459
Int2 Blast_HitListUpdate(BlastHitList* hit_list, BlastHSPList* hsp_list);
461
/********************************************************************************
463
********************************************************************************/
465
/** Initialize the results structure.
466
* @param num_queries Number of query sequences to allocate results structure
468
* @param results_ptr The allocated structure [out]
471
Int2 Blast_HSPResultsInit(Int4 num_queries, BlastHSPResults** results_ptr);
473
/** Deallocate memory for BLAST results */
475
BlastHSPResults* Blast_HSPResultsFree(BlastHSPResults* results);
477
/** Sort each hit list in the BLAST results by best e-value */
479
Int2 Blast_HSPResultsSortByEvalue(BlastHSPResults* results);
480
/** Sort each hit list in the BLAST results by best e-value, in reverse
483
Int2 Blast_HSPResultsReverseSort(BlastHSPResults* results);
485
/** Reverse order of HSP lists in each hit list in the BLAST results.
486
* This allows to return HSP lists from the end of the arrays when reading
487
* from a collector HSP stream.
490
Int2 Blast_HSPResultsReverseOrder(BlastHSPResults* results);
492
/** Blast_HSPResultsSaveRPSHSPList
493
* Save the HSPs from an HSPList obtained on the preliminary stage of
494
* RPS BLAST to appropriate places in the results structure. Input HSPList
495
* contains HSPs from a single query, but from all RPS BLAST database
496
* sequences. The HSPs in the list are not assumed to be sorted, because
497
* e-values are not yet calculated.
498
* @param program The type of BLAST search [in]
499
* @param results The structure holding results for all queries [in] [out]
500
* @param hsp_list The results for the current subject sequence; in case of
501
* multiple queries, offsets are still in the concatenated
502
* sequence coordinates [in]
503
* @param hit_options The options related to saving hits [in]
506
Int2 Blast_HSPResultsSaveRPSHSPList(EBlastProgramType program, BlastHSPResults* results,
507
BlastHSPList* hsp_list, const BlastHitSavingOptions* hit_options);
509
/** Blast_HSPResultsSaveHSPList
510
* Save the current HSP list to appropriate places in the results structure.
511
* The input HSPList contains HSPs from a single BLAST database sequence, but
512
* possibly from multiple queries. The HSPs in the list are assumed to be sorted
513
* by e-value and score.
514
* @param program The type of BLAST search [in]
515
* @param results The structure holding results for all queries [in] [out]
516
* @param hsp_list The results for the current subject sequence; in case of
517
* multiple queries, offsets are still in the concatenated
518
* sequence coordinates [in]
519
* @param hit_options The options related to saving hits [in]
522
Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* results,
523
BlastHSPList* hsp_list, const BlastHitSavingOptions* hit_options);
525
/** Blast_HSPResultsSaveHSPList
526
* Insert an HSP list to the appropriate place in the results structure.
527
* All HSPs in this list must be from the same query, and the query index
528
* must be set in the BlastHSPList input structure.
529
* @param results The structure holding results for all queries [in] [out]
530
* @param hsp_list The results for one query-subject sequence pair. [in]
531
* @param hitlist_size Maximal allowed hit list size. [in]
534
Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results,
535
BlastHSPList* hsp_list, Int4 hitlist_size);
540
#endif /* !__BLAST_HITS__ */