63
46
/** Set the bit at position 'index' in the PV
64
47
* array bitfield within 'lookup'
66
#define PV_SET(lookup, index) ( (lookup)->pv[(index)>>PV_ARRAY_BTS] |= (PV_ARRAY_TYPE)1 << ((index) & PV_ARRAY_MASK) )
49
#define PV_SET(lookup, index, shift) \
50
lookup[(index) >> (shift)] |= (PV_ARRAY_TYPE)1 << ((index) & PV_ARRAY_MASK)
67
52
/** Test the bit at position 'index' in the PV
68
53
* array bitfield within 'lookup'
70
#define PV_TEST(lookup, index) ( (lookup)->pv[(index)>>PV_ARRAY_BTS] & (PV_ARRAY_TYPE)1 << ((index) & PV_ARRAY_MASK) )
72
#define FULL_BYTE_SHIFT 8 /**< Number of bits to shift in lookup
73
index calculation when scanning
74
compressed nucleotide sequence */
76
#define HITS_ON_BACKBONE 3 /**< maximum number of hits in one lookup
79
/** structure defining one cell of the compacted lookup table */
80
typedef struct LookupBackboneCell {
81
Int4 num_used; /**< number of hits stored for this cell */
84
Int4 overflow_cursor; /**< integer offset into the overflow array
85
where the list of hits for this cell begins */
86
Int4 entries[HITS_ON_BACKBONE]; /**< if the number of hits for this
87
cell is HITS_ON_BACKBONE or less,
88
the hits are all stored directly in
90
} payload; /**< UNION that specifies either entries stored right on the backbone
91
if fewer than HITS_ON_BACKBONE are present or a pointer to
92
where the hits are stored (off-backbone). */
95
/** The basic lookup table structure for blastn
98
typedef struct BlastLookupTable {
99
Int4 threshold; /**< the score threshold for neighboring words */
100
Int4 neighbor_matches; /**< the number of neighboring words found while
101
indexing the queries, used for informational/
102
debugging purposes */
103
Int4 exact_matches; /**< the number of exact matches found while
104
indexing the queries, used for informational/
105
debugging purposes */
106
Int4 mask; /**< part of index to mask off, that is, top
107
(wordsize*charsize) bits should be discarded. */
108
Int4 word_length; /**< Length in bases of the full word match
109
required to trigger extension */
110
Int4 lut_word_length; /**< Length in bases of a word indexed by the
112
Int4 charsize; /**< number of bits for a base/residue */
113
Int4 scan_step; /**< number of bases between successive words */
114
Int4 alphabet_size; /**< number of letters in the alphabet */
115
Int4 backbone_size; /**< number of cells in the backbone */
116
Int4 longest_chain; /**< length of the longest chain on the backbone */
117
Int4 ** thin_backbone; /**< the "thin" backbone. for each index cell,
118
maintain a pointer to a dynamically-allocated
120
LookupBackboneCell * thick_backbone; /**< the "thick" backbone. after
121
queries are indexed, compact the
122
backbone to put at most
123
HITS_ON_BACKBONE hits on the
124
backbone, otherwise point to
125
some overflow storage */
126
Int4 * overflow; /**< the overflow array for the compacted
128
Int4 overflow_size; /**< Number of elements in the overflow array */
129
PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
130
are set indicate that the corresponding thick
131
backbone cell contains hits */
132
Boolean use_pssm; /**< if TRUE, lookup table construction will assume
133
that the underlying score matrix is position-
135
Boolean ag_scanning_mode; /**< Using AG scanning mode (or stride) if TRUE, so that
136
not every base is checked. */
139
/** Create a mapping from word w to the supplied query offset
55
#define PV_TEST(lookup, index, shift) \
56
( lookup[(index) >> (shift)] & \
57
((PV_ARRAY_TYPE)1 << ((index) & PV_ARRAY_MASK)) )
59
/** Add a single query offset to a generic lookup table
141
* @param lookup the lookup table [in]
142
* @param w pointer to the beginning of the word [in]
61
* @param backbone The current list of hashtable cells [in][out]
62
* @param wordsize Number of letters in a word [in]
63
* @param charsize Number of bits in one letter [in]
64
* @param seq pointer to the beginning of the word [in]
143
65
* @param query_offset the offset in the query where the word occurs [in]
147
Int4 BlastAaLookupAddWordHit(BlastLookupTable* lookup,
151
/** Convert the chained lookup table to the pv_array and thick_backbone.
153
* @param lookup the lookup table [in]
156
Int4 _BlastAaLookupFinalize(BlastLookupTable* lookup);
159
* Scans the subject sequence from "offset" to the end of the sequence.
160
* Copies at most array_size hits.
161
* Returns the number of hits found.
162
* If there isn't enough room to copy all the hits, return early, and update
165
* @param lookup_wrap the lookup table [in]
166
* @param subject the subject sequence [in]
167
* @param offset the offset in the subject at which to begin scanning [in/out]
168
* @param offset_pairs Array to which hits will be copied [out]
169
* @param array_size length of the offset arrays [in]
170
* @return The number of hits found.
172
Int4 BlastAaScanSubject(const LookupTableWrap* lookup_wrap,
173
const BLAST_SequenceBlk *subject,
175
BlastOffsetPair* NCBI_RESTRICT offset_pairs,
179
* Scans the RPS query sequence from "offset" to the end of the sequence.
180
* Copies at most array_size hits.
181
* Returns the number of hits found.
182
* If there isn't enough room to copy all the hits, return early, and update
185
* @param lookup_wrap the lookup table [in]
186
* @param sequence the subject sequence [in]
187
* @param offset the offset in the subject at which to begin scanning [in/out]
188
* @return The number of hits found.
190
Int4 BlastRPSScanSubject(const LookupTableWrap* lookup_wrap,
191
const BLAST_SequenceBlk *sequence,
194
/** Create a new protein lookup table.
195
* @param opt pointer to lookup table options structure [in]
196
* @param lut handle to lookup table structure [in/modified]
197
* @return 0 if successful, nonzero on failure
200
Int4 BlastAaLookupNew(const LookupTableOptions* opt, BlastLookupTable* * lut);
203
/** Create a new lookup table.
204
* @param opt pointer to lookup table options structure [in]
205
* @param lut handle to lookup table [in/modified]
206
* @param approx_num_entries an estimate of the number of words
207
* to be added to the lookup table. Only used for nucleotide
209
* @param is_protein boolean indicating protein or nucleotide [in]
210
* @return 0 if successful, nonzero on failure
213
Int4 LookupTableNew(const LookupTableOptions* opt,
214
BlastLookupTable* * lut,
215
Int4 approx_num_entries,
218
/** Free the lookup table.
219
* @param lookup The lookup table structure to be frees
222
BlastLookupTable* LookupTableDestruct(BlastLookupTable* lookup);
224
/** Index a protein query.
226
* @param lookup the lookup table [in/modified]
227
* @param matrix the substitution matrix [in]
228
* @param query the array of queries to index
229
* @param unmasked_regions an array of BlastSeqLoc*s, each of which points to a (list of) integer pair(s) which specify the unmasked region(s) of the query [in]
232
Int4 BlastAaLookupIndexQuery(BlastLookupTable* lookup,
234
BLAST_SequenceBlk* query,
235
BlastSeqLoc* unmasked_regions);
237
/** Index a single query.
239
* @param lookup the lookup table [in/modified]
240
* @param matrix the substitution matrix [in]
241
* @param query the array of queries to index
242
* @param unmasked_regions a BlastSeqLoc* which points to a (list of) integer pair(s) which specify the unmasked region(s) of the query [in]
243
* @param query_bias number added to each offset put into lookup table (only used for RPS blast database creation, otherwise 0) [in]
247
Int4 _BlastAaLookupIndexQuery(BlastLookupTable* lookup,
249
BLAST_SequenceBlk* query,
250
BlastSeqLoc* unmasked_regions,
254
* Index a query sequence; i.e. fill a lookup table with the offsets
255
* of query words score exceeds a threshold.
257
* @param lookup the lookup table [in/modified]
258
* @param matrix the substitution matrix [in]
259
* @param query the query sequence [in]
260
* @param query_bias number added to each offset put into lookup table
261
* (ordinarily 0; a nonzero value allows a succession of
262
* query sequences to update the same lookup table)
263
* @param locations the list of ranges of query offsets to examine for indexing [in]
266
Int4 AddNeighboringWords(BlastLookupTable* lookup,
268
BLAST_SequenceBlk* query,
270
BlastSeqLoc* locations);
273
* A position-specific version of AddNeighboringWords. Note that
274
* only the score matrix matters for indexing purposes, so an
275
* actual query sequence is unneccessary
277
* @param lookup the lookup table [in/modified]
278
* @param matrix the substitution matrix [in]
279
* @param query_bias number added to each offset put into lookup table
280
* (ordinarily 0; a nonzero value allows a succession of
281
* query sequences to update the same lookup table)
282
* @param locations the list of ranges of query offsets to examine for indexing
285
Int4 AddPSSMNeighboringWords(BlastLookupTable* lookup,
288
BlastSeqLoc* locations);
290
/* RPS blast structures and functions */
292
#define RPS_HITS_PER_CELL 3 /**< maximum number of hits in an RPS backbone
293
cell; this may be redundant (have the same
294
value as HITS_ON_BACKBONE) but must be
295
separate to guarantee binary compatibility
296
with existing RPS blast databases */
298
/** structure defining one cell of the RPS lookup table */
299
typedef struct RPSBackboneCell {
300
Int4 num_used; /**< number of hits in this cell */
301
Int4 entries[RPS_HITS_PER_CELL]; /**< if the number of hits in this cell
302
is RPS_HITS_PER_CELL or less, all
303
hits go into this array. Otherwise,
304
the first hit in the list goes into
305
element 0 of the array, and element 1
306
contains the byte offset into the
307
overflow array where the list of the
308
remaining hits begins */
311
/** structure used for bucket sorting offsets retrieved
312
from the RPS blast lookup table. */
313
typedef struct RPSBucket {
314
Int4 num_filled; /**< number of offset pairs currently in bucket */
315
Int4 num_alloc; /**< max number of offset pairs bucket can hold */
316
BlastOffsetPair *offset_pairs; /**< list of offset pairs */
320
* The basic lookup table structure for RPS blast searches
322
typedef struct BlastRPSLookupTable {
323
Int4 wordsize; /**< number of full bytes in a full word */
324
Int4 mask; /**< part of index to mask off, that is,
325
top (wordsize*charsize) bits should be
327
Int4 alphabet_size; /**< number of letters in the alphabet */
328
Int4 charsize; /**< number of bits for a base/residue */
329
Int4 backbone_size; /**< number of cells in the backbone */
330
RPSBackboneCell * rps_backbone; /**< the lookup table used for RPS blast */
331
Int4 ** rps_pssm; /**< Pointer to memory-mapped RPS Blast profile file */
332
Int4 * rps_seq_offsets; /**< array of start offsets for each RPS DB seq. */
333
Int4 num_profiles; /**< Number of profiles in RPS database. */
334
Int4 * overflow; /**< the overflow array for the compacted
336
Int4 overflow_size;/**< Number of elements in the overflow array */
337
PV_ARRAY_TYPE *pv; /**< Presence vector bitfield; bit positions that
338
are set indicate that the corresponding thick
339
backbone cell contains hits */
341
Int4 num_buckets; /**< number of buckets used to sort offsets
342
retrieved from the lookup table */
343
RPSBucket *bucket_array; /**< list of buckets */
344
} BlastRPSLookupTable;
346
/** Create a new RPS blast lookup table.
347
* @param rps_info pointer to structure with RPS setup information [in]
348
* @param lut handle to lookup table [in/modified]
349
* @return 0 if successful, nonzero on failure
352
Int2 RPSLookupTableNew(const BlastRPSInfo *rps_info,
353
BlastRPSLookupTable* * lut);
355
/** Free the lookup table.
356
* @param lookup The lookup table structure to free; note that
357
* the rps_backbone and rps_seq_offsets fields are not freed
358
* by this call, since they may refer to memory-mapped arrays
361
BlastRPSLookupTable* RPSLookupTableDestruct(BlastRPSLookupTable* lookup);
363
/********************* Nucleotide functions *******************/
365
/** Macro to test the presence vector array value for a lookup table index */
366
#define NA_PV_TEST(pv_array, index, pv_array_bts) (pv_array[(index)>>pv_array_bts]&(((PV_ARRAY_TYPE) 1)<<((index)&PV_ARRAY_MASK)))
368
/** Scan the compressed subject sequence, returning all word hits, using the
369
* old BLASTn approach - looking up words at every byte (4 bases) of the
370
* sequence. Lookup table is presumed to have a traditional BLASTn structure.
371
* @param lookup_wrap Pointer to the (wrapper to) lookup table [in]
372
* @param subject The (compressed) sequence to be scanned for words [in]
373
* @param start_offset The offset into the sequence in actual coordinates [in]
374
* @param offset_pairs Array of query and subject positions where words are
376
* @param max_hits The allocated size of the above array - how many offsets
377
* can be returned [in]
378
* @param end_offset Where the scanning should stop [in], has stopped [out]
380
Int4 BlastNaScanSubject(const LookupTableWrap* lookup_wrap,
381
const BLAST_SequenceBlk* subject,
383
BlastOffsetPair* NCBI_RESTRICT offset_pairs,
387
/** Scan the compressed subject sequence, returning all word hits, using the
388
* arbitrary stride. Lookup table is presumed to have a traditional BLASTn
390
* @param lookup_wrap Pointer to the (wrapper to) lookup table [in]
391
* @param subject The (compressed) sequence to be scanned for words [in]
392
* @param start_offset The offset into the sequence in actual coordinates [in]
393
* @param offset_pairs Array of query and subject positions where words are
395
* @param max_hits The allocated size of the above array - how many offsets
396
* can be returned [in]
397
* @param end_offset Where the scanning should stop [in], has stopped [out]
398
* @return The number of hits found from the lookup table
400
Int4 BlastNaScanSubject_AG(const LookupTableWrap* lookup_wrap,
401
const BLAST_SequenceBlk* subject,
403
BlastOffsetPair* NCBI_RESTRICT offset_pairs,
407
/** Fill the lookup table for a given query sequence or partial sequence.
408
* @param lookup Pointer to the lookup table structure [in] [out]
67
void BlastLookupAddWordHit(Int4 **backbone, Int4 wordsize,
68
Int4 charsize, Uint1* seq,
71
/** Add all applicable query offsets to a generic lookup table
73
* @param backbone The current list of hashtable cells [in][out]
74
* @param word_length Number of letters in a word [in]
75
* @param charsize Number of bits in one letter [in]
76
* @param lut_word_length Width of the lookup table in letters
77
* (must be <= word_length) [in]
409
78
* @param query The query sequence [in]
410
* @param location What locations on the query sequence to index? [in]
413
Int4 BlastNaLookupIndexQuery(BlastLookupTable* lookup,
414
BLAST_SequenceBlk* query,
415
BlastSeqLoc* location);
79
* @param locations What locations on the query sequence to index? [in]
81
void BlastLookupIndexQueryExactMatches(Int4 **backbone,
85
BLAST_SequenceBlk * query,
86
BlastSeqLoc * locations);
88
/** Given a word, compute its index value from scratch.
90
* @param wordsize length of the word, in residues [in]
91
* @param charsize length of one residue, in bits [in]
92
* @param word pointer to the beginning of the word [in]
93
* @return the computed index value
96
static NCBI_INLINE Int4 ComputeTableIndex(Int4 wordsize,
103
for(i = 0; i < wordsize; i++) {
104
index = (index << charsize) | word[i];
110
/** Given a word, compute its index value, reusing a previously
111
* computed index value.
113
* @param wordsize length of the word - 1, in residues [in]
114
* @param charsize length of one residue, in bits [in]
115
* @param mask value used to mask the index so that only the bottom wordsize * charsize bits remain [in]
116
* @param word pointer to the beginning of the word [in]
117
* @param index the current value of the index [in]
118
* @return the computed index value
121
static NCBI_INLINE Int4 ComputeTableIndexIncremental(Int4 wordsize,
127
return ((index << charsize) | word[wordsize - 1]) & mask;
417
130
#ifdef __cplusplus
421
#endif /* BLAST_LOOKUP__H */
134
#endif /* !ALGO_BLAST_CORE__BLAST_LOOKUP__H */