1
/* $Id: blast_aalookup.h,v 1.5 2007/04/11 15:55:30 kazimird Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
27
/** @file blast_aalookup.h
28
* Routines for creating protein BLAST lookup tables.
29
* Contains definitions and prototypes for the lookup
30
* table construction phase of blastp and RPS blast.
33
#ifndef ALGO_BLAST_CORE__BLAST_AALOOKUP__H
34
#define ALGO_BLAST_CORE__BLAST_AALOOKUP__H
36
#include <algo/blast/core/ncbi_std.h>
37
#include <algo/blast/core/blast_def.h>
38
#include <algo/blast/core/blast_lookup.h>
39
#include <algo/blast/core/blast_options.h>
40
#include <algo/blast/core/blast_rps.h>
41
#include <algo/blast/core/blast_stat.h>
47
/* --------------- protein blast defines -----------------------------*/
49
#define AA_HITS_PER_CELL 3 /**< maximum number of hits in one lookup
52
/** structure defining one cell of the compacted lookup table */
53
typedef struct AaLookupBackboneCell {
54
Int4 num_used; /**< number of hits stored for this cell */
57
Int4 overflow_cursor; /**< integer offset into the overflow array
58
where the list of hits for this cell begins */
59
Int4 entries[AA_HITS_PER_CELL]; /**< if the number of hits for this
60
cell is AA_HITS_PER_CELL or less,
61
the hits are all stored directly in
63
} payload; /**< union that specifies either entries stored right on
64
the backbone if fewer than AA_HITS_PER_CELL are
65
present or a pointer to where the hits are stored
67
} AaLookupBackboneCell;
69
/** The basic lookup table structure for blastp searches
71
typedef struct BlastAaLookupTable {
72
Int4 threshold; /**< the score threshold for neighboring words */
73
Int4 mask; /**< part of index to mask off, that is, top
74
(wordsize*charsize) bits should be discarded. */
75
Int4 charsize; /**< number of bits for a base/residue */
76
Int4 word_length; /**< Length in letters of the full word match
77
required to trigger extension */
78
Int4 lut_word_length; /**< Length in bases of a word indexed by the
80
Int4 alphabet_size; /**< number of letters in the alphabet */
81
Int4 backbone_size; /**< number of cells in the backbone */
82
Int4 longest_chain; /**< length of the longest chain on the backbone */
83
Int4 ** thin_backbone; /**< the "thin" backbone. for each index cell,
84
maintain a pointer to a dynamically-allocated
86
AaLookupBackboneCell * thick_backbone; /**< the "thick" backbone. after
87
queries are indexed, compact the
88
backbone to put at most
89
AA_HITS_PER_CELL hits on the
90
backbone, otherwise point to
91
some overflow storage */
92
Int4 * overflow; /**< the overflow array for the compacted
94
Int4 overflow_size; /**< Number of elements in the overflow array */
95
PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
96
are set indicate that the corresponding thick
97
backbone cell contains hits */
98
Boolean use_pssm; /**< if TRUE, lookup table construction will assume
99
that the underlying score matrix is position-
101
void *scansub_callback;/**< function for scanning subject sequences */
103
Int4 neighbor_matches; /**< the number of neighboring words found while
104
indexing the queries, used for informational/
105
debugging purposes */
106
Int4 exact_matches; /**< the number of exact matches found while
107
indexing the queries, used for informational/
108
debugging purposes */
109
} BlastAaLookupTable;
111
/** Pack the data structures comprising a protein lookup table
112
* into their final form
114
* @param lookup the lookup table [in]
117
Int4 BlastAaLookupFinalize(BlastAaLookupTable* lookup);
119
/** Create a new protein lookup table.
120
* @param opt pointer to lookup table options structure [in]
121
* @param lut handle to lookup table structure [in/modified]
122
* @return 0 if successful, nonzero on failure
125
Int4 BlastAaLookupTableNew(const LookupTableOptions* opt,
126
BlastAaLookupTable* * lut);
129
/** Free the lookup table.
130
* @param lookup The lookup table structure to be freed
133
BlastAaLookupTable* BlastAaLookupTableDestruct(BlastAaLookupTable* lookup);
135
/** Index a protein query.
137
* @param lookup the lookup table [in/modified]
138
* @param matrix the substitution matrix [in]
139
* @param query the array of queries to index
140
* @param unmasked_regions a BlastSeqLoc* which points to a (list of)
141
* integer pair(s) which specify the unmasked region(s)
143
* @param query_bias number added to each offset put into lookup table
144
* (only used for RPS blast database creation, otherwise 0) [in]
146
void BlastAaLookupIndexQuery(BlastAaLookupTable* lookup,
148
BLAST_SequenceBlk* query,
149
BlastSeqLoc* unmasked_regions,
152
/* ------------ compressed alphabet protein blast defines ---------------*/
154
/** number of query offsets to store in a backbone cell */
155
#define COMPRESSED_HITS_PER_BACKBONE_CELL 3
157
/** number of query offsets to store in an overflow cell */
158
#define COMPRESSED_HITS_PER_OVERFLOW_CELL 4
160
/** number of cells in one bank of cells */
161
#define COMPRESSED_OVERFLOW_CELLS_IN_BANK 209710
163
/** The maximum number of banks (usually less than 10 are
164
needed; memory will run out before this is insufficient) */
165
#define COMPRESSED_OVERFLOW_MAX_BANKS 1024
167
/** cell in list for holding query offsets */
168
typedef struct CompressedOverflowCell {
169
struct CompressedOverflowCell* next; /**< pointer to next cell */
171
/** the query offsets stored in the cell */
172
Int4 query_offsets[COMPRESSED_HITS_PER_OVERFLOW_CELL];
173
} CompressedOverflowCell;
175
/** "alternative" structure of CompressedLookupBackboneCell storage */
176
typedef struct CompressedMixedOffsets{
177
/** the query offsets stored locally */
178
Int4 query_offsets[COMPRESSED_HITS_PER_BACKBONE_CELL-1];
180
/** head of linked list of cells of query offsets
181
stored off the backbone */
182
CompressedOverflowCell* head;
183
} CompressedMixedOffsets;
185
/** structure for hashtable of indexed query offsets */
186
typedef struct CompressedLookupBackboneCell {
187
Int4 num_used; /**< number of hits stored for this cell */
189
/** structure for holding the list of query offsets */
191
/** storage for query offsets local to the backbone cell */
192
Int4 query_offsets[COMPRESSED_HITS_PER_BACKBONE_CELL];
193
/** storage for remote query offsets */
194
CompressedMixedOffsets overflow_list;
196
} CompressedLookupBackboneCell;
198
/** The lookup table structure for protein searches
199
* using a compressed alphabet
201
typedef struct BlastCompressedAaLookupTable {
202
Int4 threshold; /**< the score threshold for neighboring words */
203
Int4 word_length; /**< Length in letters of the full word match
204
required to trigger extension */
205
Int4 alphabet_size; /**< number of letters in the alphabet */
206
Int4 compressed_alphabet_size; /**< letters in the compressed alphabet */
207
Int4 reciprocal_alphabet_size; /**< 2^32 / compressed_alphabet_size */
208
Int4 longest_chain; /**< length of the longest chain on the backbone */
209
Int4 backbone_size; /**< number of cells in the backbone */
210
CompressedLookupBackboneCell * backbone; /**< hashtable for storing
211
indexed query offsets */
212
CompressedOverflowCell ** overflow_banks; /**< array of batches of query
213
offsets that are too numerous to
214
fit in backbone cells */
215
Int4 curr_overflow_cell; /**< occupied cells in the current bank */
216
Int4 curr_overflow_bank; /**< current bank to fill-up */
217
PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
218
are set indicate that the corresponding thick
219
backbone cell contains hits */
220
Int4 pv_array_bts; /* bit-to-shift value for PV array indicies */
221
Uint1* compress_table; /**< translation table (protein->compressed) */
222
Int4* scaled_compress_table; /**< scaled version of compress_table */
223
void *scansub_callback;/**< function for scanning subject sequences */
226
Int4 neighbor_matches; /**< the number of neighboring words found while
227
indexing the queries, used for informational/
228
debugging purposes */
229
Int4 exact_matches; /**< the number of exact matches found while
230
indexing the queries, used for informational/
231
debugging purposes */
232
} BlastCompressedAaLookupTable;
234
/** Create a new compressed protein lookup table.
235
* @param query The query sequence block (if concatenated sequence, the
236
* individual strands/sequences must be separated by a sentinel byte)[in]
237
* @param locations The locations to be included in the lookup table,
238
* e.g. [0,length-1] for full sequence. NULL means no sequence. [in]
239
* @param lut Pointer to the lookup table to be created [out]
240
* @param opt Options for lookup table creation [in]
241
* @param sbp pointer to score matrix information [in]
242
* @return 0 if successful, nonzero on failure
244
Int4 BlastCompressedAaLookupTableNew(BLAST_SequenceBlk* query,
245
BlastSeqLoc* locations,
246
BlastCompressedAaLookupTable * *lut,
247
const LookupTableOptions * opt,
250
/** Free the compressed lookup table.
251
* @param lookup The lookup table structure to be freed
254
BlastCompressedAaLookupTable* BlastCompressedAaLookupTableDestruct(
255
BlastCompressedAaLookupTable* lookup);
257
/** Compute "high" index for a word
258
* @param wordsize Number of consecutive letters in a word [in]
259
* @param word Sequence in "regular" AA alphabet [in]
260
* @param skip If a letter is encountered that cannot be
261
* compressed, the offset from word[] where
262
* index computation can begin again [out]
263
* @param lookup Translation tables etc [in]
264
* @return Index calculated from scratch [out]
266
static NCBI_INLINE Int4 s_ComputeCompressedIndex(Int4 wordsize,
268
Int4 compressed_alphabet_size,
270
BlastCompressedAaLookupTable* lookup)
274
Int4 *scaled_compress_table = lookup->scaled_compress_table;
277
for(i = 0; i < wordsize; i++) {
278
Int4 ch = scaled_compress_table[word[i]];
284
index = index / compressed_alphabet_size + ch;
290
/* ----------------------- rpsblast defines -----------------------------*/
292
#define RPS_HITS_PER_CELL 3 /**< maximum number of hits in an RPS backbone
293
cell; this may be redundant (have the same
294
value as AA_HITS_PER_CELL) but must be
295
separate to guarantee binary compatibility
296
with existing RPS blast databases */
298
/** structure defining one cell of the RPS lookup table */
299
typedef struct RPSBackboneCell {
300
Int4 num_used; /**< number of hits in this cell */
301
Int4 entries[RPS_HITS_PER_CELL]; /**< if the number of hits in this cell
302
is RPS_HITS_PER_CELL or less, all
303
hits go into this array. Otherwise,
304
the first hit in the list goes into
305
element 0 of the array, and element 1
306
contains the byte offset into the
307
overflow array where the list of the
308
remaining hits begins */
311
/** The number of regions into which the concatenated RPS blast
312
database is split via bucket sorting */
313
#define RPS_BUCKET_SIZE 2048
316
/** structure used for bucket sorting offsets retrieved
317
from the RPS blast lookup table. */
318
typedef struct RPSBucket {
319
Int4 num_filled; /**< number of offset pairs currently in bucket */
320
Int4 num_alloc; /**< max number of offset pairs bucket can hold */
321
BlastOffsetPair *offset_pairs; /**< list of offset pairs */
325
* The basic lookup table structure for RPS blast searches
327
typedef struct BlastRPSLookupTable {
328
Int4 wordsize; /**< number of full bytes in a full word */
329
Int4 mask; /**< part of index to mask off, that is,
330
top (wordsize*charsize) bits should be
332
Int4 alphabet_size; /**< number of letters in the alphabet */
333
Int4 charsize; /**< number of bits for a base/residue */
334
Int4 backbone_size; /**< number of cells in the backbone */
335
RPSBackboneCell * rps_backbone; /**< the lookup table used for RPS blast */
336
Int4 ** rps_pssm; /**< Pointer to memory-mapped RPS Blast profile file */
337
Int4 * rps_seq_offsets; /**< array of start offsets for each RPS DB seq. */
338
Int4 num_profiles; /**< Number of profiles in RPS database. */
339
Int4 * overflow; /**< the overflow array for the compacted
341
Int4 overflow_size;/**< Number of elements in the overflow array */
342
PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
343
are set indicate that the corresponding thick
344
backbone cell contains hits */
346
Int4 num_buckets; /**< number of buckets used to sort offsets
347
retrieved from the lookup table */
348
RPSBucket *bucket_array; /**< list of buckets */
349
} BlastRPSLookupTable;
351
/** Create a new RPS blast lookup table.
352
* @param rps_info pointer to structure with RPS setup information [in]
353
* @param lut handle to lookup table [in/modified]
354
* @return 0 if successful, nonzero on failure
357
Int2 RPSLookupTableNew(const BlastRPSInfo *rps_info,
358
BlastRPSLookupTable* * lut);
360
/** Free the lookup table.
361
* @param lookup The lookup table structure to free; note that
362
* the rps_backbone and rps_seq_offsets fields are not freed
363
* by this call, since they may refer to memory-mapped arrays
366
BlastRPSLookupTable* RPSLookupTableDestruct(BlastRPSLookupTable* lookup);
372
#endif /* !ALGO_BLAST_CORE__BLAST_AALOOKUP__H */