2
Copyright (c) 2007,2008 Thomas Jahns <Thomas.Jahns@gmx.net>
4
Permission to use, copy, modify, and distribute this software for any
5
purpose with or without fee is hereby granted, provided that the above
6
copyright notice and this permission notice appear in all copies.
8
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
* Interface definitions for querying an indexed representation of the
23
* BWT of a sequence as presented by Manzini and Ferragina (Compressed
24
* Representations of Sequences and Full-Text Indexes, 2006)
27
#include "core/error.h"
29
#include "core/logger.h"
30
#include "match/eis-encidxseq.h"
31
#include "match/eis-mrangealphabet.h"
32
#include "match/eis-bwtseq-param.h"
35
* - implement other index types
39
* Defines mode by which to derive sort of symbols in particular ranges
43
SORTMODE_UNDEFINED = 1,
48
* Stores column indices of the (virtual) matrix of rotations of the
49
* input string used to construct the BWT, note that upper will
50
* typically contain the lower value since rows are numbered from 0 at
51
* the top to n-1 at the bottom.
55
unsigned long start, /**< index of first boundary row */
56
end; /**< index of second boundary row */
59
/** Object holding a BWT sequence index */
61
typedef struct BWTSeq BWTSeq;
63
/** Iterator of Matches produced by a search query from a BWT
65
typedef struct BWTSeqExactMatchesIterator BWTSeqExactMatchesIterator;
68
* \brief Creates or loads an encoded indexed sequence object of the
70
* @param params a struct holding parameter information for index construction
71
* @param err genometools error object reference
72
* @return reference to new BWT sequence object
75
gt_availBWTSeq(const struct bwtParam *params, GtLogger *verbosity, GtError*);
78
* \brief Creates an encoded indexed sequence object of the BWT
79
* transform by translating the information in a pre-existing suffix
81
* @param params a struct holding parameter information for index construction
82
* @param err genometools error object reference
83
* @return reference to new BWT sequence object
86
gt_trSuftab2BWTSeq(const struct bwtParam *params, GtLogger *verbosity,
90
* \brief Loads an encoded indexed sequence object of the
93
* @param BWTOptFlags Selects in-memory features of sequence index to
94
* use, see enum BWTOptionDefaultsOptimizationFlags for possible settings.
95
* @param err genometools error object reference
96
* @return reference to new BWT sequence object
99
gt_loadBWTSeq(const char *projectName, int BWTOptFlags, GtLogger *verbosity,
103
* \brief Deallocate a previously loaded/created BWT sequence object.
104
* @param bwtseq reference of object to delete
107
gt_deleteBWTSeq(BWTSeq *bwtseq);
110
* \brief Query BWT sequence object for availability of added
111
* information to locate matches.
112
* @param bwtSeq reference of object to query
113
* @return 0 if no locate information is present, non-zero otherwise
116
BWTSeqHasLocateInformation(const BWTSeq *bwtSeq);
119
* \brief Retrieve alphabet transformation from BWT sequence object
120
* @param bwtSeq reference of object to query for alphabet
121
* @return read-only reference of alphabet associated with sequence
123
static inline const MRAEnc *
124
BWTSeqGetAlphabet(const BWTSeq *bwtSeq);
127
* \brief Retrieve sequence index in which the BWT is stored
128
* @param bwtSeq reference of object to query for index
129
* @return read-only reference of index containing the sequence
131
static inline const EISeq *
132
BWTSeqGetEncIdxSeq(const BWTSeq *bwtSeq);
135
* \brief Retrieve Symbol at given position in BWT
136
* @param bwtSeq reference of object to query for index
137
* @return read-only reference of index containing the sequence
140
BWTSeqGetSym(const BWTSeq *bwtSeq, unsigned long pos);
143
* \brief Query length of stored sequence.
144
* @param bwtSeq reference of object to query
145
* @return length of sequence
147
static inline unsigned long
148
BWTSeqLength(const BWTSeq *bwtSeq);
151
* @brief Query position of rotation 0 in suffix array (aka
152
* longest) which also is the position of the terminator symbol in the
154
* @param bwtSeq reference of object to query
155
* @return position of terminator
157
static inline unsigned long
158
BWTSeqTerminatorPos(const BWTSeq *bwtSeq);
161
* \brief Query BWT sequence for the number of occurences of a symbol in a
163
* @param bwtSeq reference of object to query
164
* @param tSym transformed symbol (as obtained by
165
* MRAEncMapSymbol(BWTSeqGetAlphabet(bwtSeq), origSym)
166
* @param pos right bound of BWT prefix queried
167
* @return number of occurrences of symbol up to but not including pos
169
static inline unsigned long
170
BWTSeqTransformedOcc(const BWTSeq *bwtSeq, Symbol tSym, unsigned long pos);
173
* \brief Query BWT sequence for the number of occurences of a symbol in a
175
* @param bwtSeq reference of object to query
176
* @param sym symbol (from original alphabet)
177
* @param pos right bound of BWT prefix queried
178
* @return number of occurrences of symbol up to but not including pos
180
static inline unsigned long
181
BWTSeqOcc(const BWTSeq *bwtSeq, Symbol sym, unsigned long pos);
184
* \brief Query BWT sequence for the number of occurences of a symbol
185
* in two given prefixes.
186
* @param bwtSeq reference of object to query
187
* @param tSym transformed symbol (as obtained by
188
* MRAEncMapSymbol(BWTSeqGetAlphabet(bwtSeq), origSym)
189
* @param posA right bound of first BWT prefix queried
190
* @param posA right bound of second BWT prefix queried
191
* @return number of occurrences of symbol up to but not including
192
* posA and posB respectively in fields a and b of returned struct
194
static inline GtUlongPair
195
BWTSeqTransformedPosPairOcc(const BWTSeq *bwtSeq, Symbol tSym,
196
unsigned long posA, unsigned long posB);
199
* \brief Query BWT sequence for the number of occurences of a symbol
200
* in two given prefixes.
201
* @param bwtSeq reference of object to query
202
* @param sym symbol (from original alphabet)
203
* @param posA right bound of first BWT prefix queried
204
* @param posB right bound of second BWT prefix queried
205
* @return number of occurrences of symbol up to but not including
206
* posA and posB respectively in fields a and b of returned struct
208
static inline GtUlongPair
209
BWTSeqPosPairOcc(const BWTSeq *bwtSeq, Symbol sym,
210
unsigned long posA, unsigned long posB);
213
* \brief Query BWT sequence for the number of occurences of all symbols in a
214
* given alphabet range and BWT sequence prefix.
215
* @param bwtSeq reference of object to query
216
* @param range range of symbols in alphabet to query
217
* @param pos right bound of BWT prefix queried
218
* @param rangeOccs occurrence counts for all symbols in range are written to
219
* this array. The referenced memory must be sized appropriately to
220
* accomodate as many symbols as are in range (MRAEncGetRangeSize if
221
* in doubt) and rangeOccs[i] will hold the occurrence count of symbol
222
* MRAEncRevMapSymbol(alphabet, i + MRAEncGetRangeBase(alphabet, range))
225
BWTSeqRangeOcc(const BWTSeq *bwtSeq, AlphabetRangeID range, unsigned long pos,
226
unsigned long *rangeOccs);
228
/* XXX: range 0 for range = 0 for regular symbols;
229
unsigned long rangeOcc[4]; */
232
* \brief Query BWT sequence for the number of occurences of all symbols in a
233
* given alphabet range and two BWT sequence prefixes.
234
* @param bwtSeq reference of object to query
235
* @param range range of symbols in alphabet to query
236
* @param posA right bound of first BWT prefix queried
237
* @param posB right bound of second BWT prefix queried
238
* @param rangeOccs occurrence counts for all symbols in range are written to
239
* this array. The referenced memory must be sized appropriately to
240
* accomodate two-times as many positions symbols as are in range
241
* (use MRAEncGetRangeSize if in doubt) and rangeOccs[i] will hold the
242
* occurrence count of symbol
243
* MRAEncRevMapSymbol(alphabet, i + MRAEncGetRangeBase(alphabet, range))
244
* up to position posA while the corresponding is true for
245
* rangeOccs[rangeSize + i] concerning posB
248
BWTSeqPosPairRangeOcc(const BWTSeq *bwtSeq, AlphabetRangeID range,
249
unsigned long posA, unsigned long posB,
250
unsigned long *rangeOccs);
253
* \brief Given a position in the L-column of the matrix of rotations,
254
* find the corresponding row in the F-column.
255
* @param bwtSeq reference of object to query
256
* @param pos row index for L-column
257
* @param extBits modified in intermediate queries
258
* @return index of corresponding row F-column
260
static inline unsigned long
261
BWTSeqLFMap(const BWTSeq *bwtSeq, unsigned long pos,
262
struct extBitsRetrieval *extBits);
265
* \brief Given a symbol, query the aggregate count of symbols with
266
* lower index, this corresponds to the first row in the C-column of
267
* standard literature on the BWT on which the given symbol is found.
268
* @param bwtSeq reference of object to query
269
* @param sym symbol to query counts sum for
270
* @return aggregate count
272
static inline unsigned long
273
BWTSeqAggCount(const BWTSeq *bwtSeq, Symbol sym);
276
* \brief Given a symbol, query the aggregate count of symbols with
279
* This corresponds to the first row in the C-column of
280
* standard literature on the BWT on which the given symbol is found,
281
* this function takes a symbol already transformed into the stored
282
* alphabet as argument.
284
* @param bwtSeq reference of object to query
285
* @param tSym symbol to query counts sum for
286
* reference for core functions @return aggregate count
288
static inline unsigned long
289
BWTSeqAggTransformedCount(const BWTSeq *bwtSeq, Symbol tSym);
292
* \brief Given a query string find number of matches in original
293
* sequence (of which the sequence object is a BWT).
294
* @param bwtSeq reference of object to query
295
* @param query symbol string to search matches for
296
* @param queryLen length of query string
297
* @param forward direction of processing the query
298
* @return number of matches
301
gt_BWTSeqMatchCount(const BWTSeq *bwtSeq, const Symbol *query, size_t queryLen,
305
* \brief Given a pair of limiting positions in the suffix array and a
306
* symbol, compute the interval reached by matching one symbol further.
307
* @param bwtSeq reference of sequence index to query
308
* @param nextSym symbol by which to further restrict match
309
* @param limits current restriction of match interval
310
* @return limits, with bounds adjusted
312
static inline struct matchBound *
313
BWTSeqIncrMatch(const BWTSeq *bwtSeq, struct matchBound *limits,
317
* GtError conditions encountered upon integrity check.
319
enum verifyBWTSeqErrCode
321
VERIFY_BWTSEQ_NO_ERROR = 0, /**< every check completed okay */
322
VERIFY_BWTSEQ_REFLOAD_ERROR = -1, /**< failed to load suffix array for
323
* reference comparisons */
324
VERIFY_BWTSEQ_LENCOMPARE_ERROR = -2, /* lengths of bwt sequence
325
* index and loaded suffix arry
327
VERIFY_BWTSEQ_SUFVAL_ERROR = -3, /**< a marked suffix array value
328
* stored in the bwt sequence index
329
* does not match the value read directly
330
* from the suffix array table */
331
VERIFY_BWTSEQ_LFMAPWALK_ERROR = -4, /**< while traversing the bwt
332
* sequence in reverse original sequence
333
* order, the symbol retrieved
335
* corresponding symbol in the
336
* encoded sequence */
337
VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR = -5, /**< original sequence
340
* but is impossible */
341
VERIFY_BWTSEQ_TERMPOS_ERROR = -6, /**< the position of the
342
* 0-rotation does not match */
343
VERIFY_BWTSEQ_CONTEXT_SYMFAIL = -7, /**< context regeneration
344
* delivered an incorrect symbol */
345
VERIFY_BWTSEQ_CONTEXT_LOADFAIL = -8, /**< context regeneration
346
* is impossible because the
347
* context failed to load */
350
enum verifyBWTSeqFlags
352
VERIFY_BWTSEQ_SUFVAL = 1 << 0, /**< check stored suffix arrays */
353
VERIFY_BWTSEQ_LFMAPWALK = 1 << 1, /**< performs full backwards
354
* regeneration of original
355
* sequence (if possible)
357
VERIFY_BWTSEQ_CONTEXT = 1 << 2, /**< try some random context
363
* \brief Perform various checks on the burrows wheeler transform
365
* - inspect all sampled suffix array values for equality with
366
* corresponding value of mapped reference suffix array
367
* - check wether the last-to-first traversal of the BWT sequence
368
* index delivers the reversed encoded sequence
370
* @param bwtSeq index to check
371
* @param projectName suffix array to load as reference
372
* @param tickPrint print a dot every time tickPrint many symbols have
374
* @param fp dots printed to this file
376
enum verifyBWTSeqErrCode
377
gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName,
379
unsigned long tickPrint, FILE *fp,
380
GtLogger *verbosity, GtError *err);
383
* \brief Given a query string produce iterator for all matches in
384
* original sequence (of which the sequence object is a BWT).
386
* Warning: the iterator object will become invalid once the
387
* corresponding bwt sequence object has been deleted.
389
* Warning: user must manage storage of iter manually
391
* @param iter points to storage for iterator
392
* @param bwtSeq reference of bwt sequence object to use for matching
393
* @param query symbol string to search matches for
394
* @param queryLen length of query string
395
* @param forward direction of processing the query
396
* @return true if successfully initialized, false on error
399
gt_initEMIterator(BWTSeqExactMatchesIterator *iter, const BWTSeq *bwtSeq,
400
const Symbol *query, size_t queryLen, bool forward);
403
* \brief Only initializes empty iterator for given
406
* Warning: the iterator object will become invalid once the
407
* corresponding bwt sequence object has been deleted.
409
* Warning: user must manage storage of iter manually
410
* @param iter points to storage for iterator
411
* @param bwtSeq reference of bwt sequence object to use for matching
412
* @return true if successfully initialized, false on error
415
gt_initEmptyEMIterator(BWTSeqExactMatchesIterator *iter, const BWTSeq *bwt);
418
* \brief Set up iterator for new query, iter must have been
419
* initialized previously. Everything else is identical to
422
* Warning: the iterator object will become invalid once the
423
* corresponding bwt sequence object has been deleted.
424
* @param iter points to storage for iterator
425
* @param bwtSeq reference of bwt sequence object to use for matching
426
* @param query symbol string to search matches for
427
* @param queryLen length of query string
428
* @param forward direction of processing the query
429
* @return true if successfully initialized, false on error
432
gt_reinitEMIterator(BWTSeqExactMatchesIterator *iter, const BWTSeq *bwtSeq,
433
const Symbol *query, size_t queryLen, bool forward);
435
* \brief Destruct resources of matches iterator. Does not free the
436
* storage of iterator itself.
437
* Warning: user must manage storage of iter manually
438
* @param iter reference to matches iterator
441
gt_destructEMIterator(struct BWTSeqExactMatchesIterator *iter);
444
* \brief Given a query string produce iterator for all matches in
445
* original sequence (of which the sequence object is a BWT).
447
* Warning: the iterator object will become invalid once the
448
* corresponding bwt sequence object has been deleted.
449
* @param bwtSeq reference of bwt sequence object to use for matching
450
* @param query symbol string to search matches for
451
* @param queryLen length of query string
452
* @param forward direction of processing the query
453
* @return reference of iterator object, NULL on error
455
BWTSeqExactMatchesIterator *
456
gt_newEMIterator(const BWTSeq *bwtSeq, const Symbol *query, size_t queryLen,
460
* \brief Deallocate an iterator object.
461
* @param iter reference of iterator object
464
gt_deleteEMIterator(BWTSeqExactMatchesIterator *iter);
467
* \brief Get position of next match from an iterator.
468
* @param iter reference of iterator object
469
* @param bwtSeq reference of bwt sequence object to use for matching
470
* @param pos location of match will be stored here if a further match
471
* is available, not modified otherwise
472
* @return true if another match could be found, false otherwise
475
EMIGetNextMatch(BWTSeqExactMatchesIterator *iter, unsigned long *pos,
476
const BWTSeq *bwtSeq);
479
* \brief Query an iterator for the total number of matches.
480
* @param iter reference of iterator object
481
* @return total number of matches
484
gt_EMINumMatchesTotal(const BWTSeqExactMatchesIterator *iter);
487
* \brief Query an iterator for the number of matches not yet
488
* inspected via EMIGetNextMatch.
489
* @param iter reference of iterator object
490
* @return number of matches left
493
gt_EMINumMatchesLeft(const BWTSeqExactMatchesIterator *iter);
496
* @brief for packed index (given as void pointer), compute the longest
497
* prefix of string in range between qstart and qend that occurs exactly
499
* @param packed index (given as void pointer)
500
* @param qstart points to memory area where query is found
501
* @param qend points to memory area immediately after the query
503
* @return 0 if not unique, otherwise length of minmum unique prefix.
505
unsigned long gt_packedindexuniqueforward(const BWTSeq *bwtseq,
506
const GtUchar *qstart,
507
const GtUchar *qend);
509
unsigned long gt_packedindexmstatsforward(const BWTSeq *bwtseq,
510
unsigned long *witnessposition,
511
const GtUchar *qstart,
512
const GtUchar *qend);
514
#include "match/eis-bwtseq-siop.h"