153
155
return pos < m_occ.size() ? pos : pos - m_occ.size();
156
/** Decompress the index. */
157
template <typename It>
158
void decompress(It out)
160
// Construct the original string.
162
for (size_t i = 0;;) {
163
assert(i < m_occ.size());
168
i = m_cf[c] + m_occ.rank(c, i);
172
// Translate the character set and output the result.
173
for (std::vector<T>::const_reverse_iterator it = s.rbegin();
174
it != s.rend(); ++it) {
176
assert(c < m_alphabet.size());
177
*out++ = m_alphabet[c];
181
/** Search for an exact match. */
182
template <typename It>
183
std::pair<size_t, size_t> findExact(It first, It last) const
185
assert(first < last);
186
size_t l = 1, u = m_occ.size();
188
for (it = last - 1; it >= first && l < u; --it) {
191
return std::make_pair(0, 0);
192
l = m_cf[c] + m_occ.rank(c, l);
193
u = m_cf[c] + m_occ.rank(c, u);
195
return std::make_pair(l, u);
198
/** Search for an exact match. */
199
template <typename String>
200
std::pair<size_t, size_t> findExact(const String& q) const
203
std::transform(q.begin(), q.end(), s.begin(),
205
return findExact(s.begin(), s.end());
208
/** Search for a matching suffix of the query. */
209
template <typename It>
210
FMInterval findSuffix(It first, It last) const
212
assert(first < last);
213
size_t l = 1, u = m_occ.size();
215
for (it = last - 1; it >= first && l < u; --it) {
219
size_t l1 = m_cf[c] + m_occ.rank(c, l);
220
size_t u1 = m_cf[c] + m_occ.rank(c, u);
226
return FMInterval(l, u, it - first + 1, last - first);
229
/** Search for a matching substring of the query at least k long.
230
* @return the longest match
232
template <typename It>
233
FMInterval findSubstring(It first, It last, unsigned k) const
235
assert(first < last);
236
FMInterval best(0, 0, 0, k > 0 ? k - 1 : 0);
237
for (It it = last; it > first; --it) {
238
if (unsigned(it - first) < best.qspan())
240
FMInterval interval = findSuffix(first, it);
241
if (interval.qspan() > best.qspan())
247
/** Translate from ASCII to the indexed alphabet. */
249
Translate(const FMIndex& fmIndex) : m_fmIndex(fmIndex) { }
250
T operator()(unsigned char c) const
252
return c < m_fmIndex.m_mapping.size()
253
? m_fmIndex.m_mapping[c] : SENTINEL();
256
const FMIndex& m_fmIndex;
259
/** Search for a matching substring of the query at least k long.
260
* @return the longest match and the number of matches
262
Match find(const std::string& q, unsigned k) const
265
std::transform(s.begin(), s.end(), s.begin(),
268
FMInterval interval = findSubstring(s.begin(), s.end(), k);
269
assert(interval.l <= interval.u);
270
size_t count = interval.u - interval.l;
272
return Match(0, 0, 0, 0);
273
return Match(interval.qstart, interval.qend,
274
locate(interval.l), count);
277
/** Set the alphabet to [first, last). */
278
template <typename It>
279
void setAlphabet(It first, It last)
281
std::vector<bool> mask(std::numeric_limits<T>::max());
282
for (It it = first; it < last; ++it) {
283
assert((size_t)*it < mask.size());
289
for (unsigned c = 0; c < mask.size(); ++c) {
292
m_mapping.resize(c + 1, std::numeric_limits<T>::max());
293
m_mapping[c] = m_alphabet.size();
294
m_alphabet.push_back(c);
296
assert(!m_alphabet.empty());
299
/** Set the alphabet to the characters of s. */
300
void setAlphabet(const std::string& s)
302
setAlphabet(s.begin(), s.end());
158
/** Return the specified element of the suffix array. */
159
size_t operator[](size_t i) const
164
/** Return the symbol at the specifed position of the BWT. */
165
T bwtAt(size_t i) const
167
assert(i < m_occ.size());
169
assert(c != SENTINEL());
170
assert(c < m_alphabet.size());
171
return m_alphabet[c];
174
/** Return the first symbol of the specified suffix. */
175
T symbolAt(size_t i) const
177
assert(i < m_occ.size());
178
std::vector<size_type>::const_iterator it
179
= std::upper_bound(m_cf.begin(), m_cf.end(), i);
180
assert(it != m_cf.begin());
181
T c = it - m_cf.begin() - 1;
182
assert(c < m_alphabet.size());
183
return m_alphabet[c];
186
/** Decompress the index. */
187
template <typename It>
188
void decompress(It out)
190
// Construct the original string.
192
for (size_t i = 0;;) {
193
assert(i < m_occ.size());
198
i = m_cf[c] + m_occ.rank(c, i);
202
// Translate the character set and output the result.
203
for (std::vector<T>::reverse_iterator it = s.rbegin();
204
it != s.rend(); ++it) {
206
assert(c < m_alphabet.size());
207
*out++ = m_alphabet[c];
211
/** Extend a suffix array coordinate by one character to the left. */
212
size_type update(size_type i, T c) const
214
assert(c < m_cf.size());
215
return m_cf[c] + m_occ.rank(c, i);
218
/** Extend a suffix array interval by one character to the left. */
219
SAInterval update(SAInterval sai, T c) const
221
return SAInterval(update(sai.l, c), update(sai.u, c));
224
/** Search for an exact match. */
225
template <typename It>
226
SAInterval findExact(It first, It last, SAInterval sai) const
228
assert(first < last);
229
for (It it = last - 1; it >= first && !sai.empty(); --it) {
232
return SAInterval(0, 0);
233
sai = update(sai, c);
238
/** Search for an exact match. */
239
template <typename String>
240
SAInterval findExact(const String& q) const
243
std::transform(q.begin(), q.end(), s.begin(), Translate(*this));
244
return findExact(s.begin(), s.end());
247
/** Search for a suffix of the query that matches a prefix of the
250
template <typename It, typename OutIt>
251
OutIt findOverlapSuffix(It first, It last, OutIt out,
252
unsigned minOverlap) const
254
assert(first < last);
256
SAInterval sai(*this);
257
for (It it = last - 1; it >= first && !sai.empty(); --it) {
261
sai = update(sai, c);
265
if (unsigned(last - it) < minOverlap)
268
SAInterval sai1 = update(sai, 0);
270
*out++ = Match(sai1.l, sai1.u,
271
it - first, last - first);
276
/** Search for a suffix of the query that matches a prefix of the
279
template <typename OutIt>
280
OutIt findOverlapSuffix(const std::string& q, OutIt out,
281
unsigned minOverlap) const
284
std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
285
return findOverlapSuffix(s.begin(), s.end(), out, minOverlap);
288
/** Search for a prefix of the query that matches a suffix of the
291
template <typename OutIt>
292
OutIt findOverlapPrefix(const std::string& q, OutIt out,
293
unsigned minOverlap) const
296
std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
297
typedef std::string::const_iterator It;
298
It first = s.begin();
299
for (It it = first + minOverlap; it <= s.end(); ++it) {
300
SAInterval sai = findExact(first, it,
301
update(SAInterval(*this), 0));
303
*out++ = Match(sai.l, sai.u, 0, it - first);
308
/** Search for a matching suffix of the query. */
309
template <typename It, typename MemoIt>
310
Match findSuffix(It first, It last, MemoIt memoIt) const
312
assert(first < last);
314
SAInterval sai(*this);
316
for (it = last - 1; it >= first && !sai.empty(); --it) {
320
SAInterval sai1 = update(sai, c);
325
if (*memoIt == sai) {
326
// This vertex of the prefix DAWG has been visited.
331
return Match(sai.l, sai.u, it - first + 1, last - first);
334
/** Search for a matching substring of the query at least k long.
335
* @return the longest match
337
template <typename It>
338
Match findSubstring(It first, It last, unsigned k) const
340
assert(first < last);
341
Match best(0, 0, 0, k > 0 ? k - 1 : 0);
343
// Record which vertices of the prefix DAWG have been visited.
344
std::vector<SAInterval> memo(last - first, SAInterval(0, 0));
345
SAInterval* memoIt = &memo[0];
346
for (It it = last; it > first; --it) {
347
if (unsigned(it - first) < best.qspan())
349
Match interval = findSuffix(first, it, memoIt++);
350
if (interval.qspan() > best.qspan())
356
/** Translate from ASCII to the indexed alphabet. */
358
Translate(const FMIndex& fmIndex) : m_fmIndex(fmIndex) { }
359
T operator()(unsigned char c) const
361
return c < m_fmIndex.m_mapping.size()
362
? m_fmIndex.m_mapping[c] : SENTINEL();
365
const FMIndex& m_fmIndex;
368
/** Search for a matching substring of the query at least k long.
369
* @return the longest match and the number of matches
371
Match find(const std::string& q, unsigned k) const
374
std::transform(s.begin(), s.end(), s.begin(), Translate(*this));
376
Match m = findSubstring(s.begin(), s.end(), k);
380
/** Set the alphabet to [first, last). */
381
template <typename It>
382
void setAlphabet(It first, It last)
384
std::vector<bool> mask(std::numeric_limits<T>::max());
385
for (It it = first; it < last; ++it) {
386
assert((size_t)*it < mask.size());
392
for (unsigned c = 0; c < mask.size(); ++c) {
395
m_mapping.resize(c + 1, std::numeric_limits<T>::max());
396
m_mapping[c] = m_alphabet.size();
397
m_alphabet.push_back(c);
399
assert(!m_alphabet.empty());
402
/** Set the alphabet to the characters of s. */
403
void setAlphabet(const std::string& s)
405
setAlphabet(s.begin(), s.end());
305
408
#define STRINGIFY(X) #X
306
409
#define FM_VERSION_BITS(BITS) "FM " STRINGIFY(BITS) " 1"