8
#include <boost/integer.hpp>
11
#include <cstdlib> // for exit
13
#include <limits> // for numeric_limits
18
/** A match of a substring of a query sequence to an FM index. */
21
FMInterval(size_t l, size_t u, unsigned qstart, unsigned qend)
22
: l(l), u(u), qstart(qstart), qend(qend) { }
24
unsigned qstart, qend;
26
unsigned qspan() const
28
assert(qstart <= qend);
33
/** A match of a substring of a query sequence to a target sequence.
37
unsigned qstart, qend;
40
Match(unsigned qstart, unsigned qend,
41
size_t tstart, unsigned count)
42
: qstart(qstart), qend(qend), tstart(tstart), count(count) { }
44
unsigned qspan() const
46
assert(qstart <= qend);
57
/** The sentinel symbol. */
58
static T SENTINEL() { return std::numeric_limits<T>::max(); }
62
typedef boost::uint_t<FMBITS>::exact size_type;
64
/** An index for SAIS, which must be signed. */
65
typedef boost::int_t<FMBITS>::exact sais_size_type;
67
/** The type of a symbol. */
70
void read(const char *path, std::vector<T> &s);
72
FMIndex() : m_sampleSA(1) { }
74
/** Return the size of the string not counting the sentinel. */
75
size_t size() const { return m_occ.size() - 1; }
77
/** Build an FM-index of the specified file. */
79
void assign(It first, It last)
82
assert(size_t(last - first)
83
< std::numeric_limits<size_type>::max());
87
// Translate the alphabet.
88
if (m_alphabet.empty())
89
setAlphabet(first, last);
90
std::transform(first, last, first, Translate(*this));
91
std::replace(first, last, SENTINEL(), T(0));
93
std::cerr << "The alphabet has "
94
<< m_alphabet.size() << " symbols.\n";
96
// Construct the suffix array.
97
std::cerr << "Building the suffix array...\n";
98
size_t n = last - first;
102
assert(sizeof (size_type) == sizeof (sais_size_type));
103
int status = saisxx(first,
104
reinterpret_cast<sais_size_type*>(&m_sa[1]),
106
(sais_size_type)m_alphabet.size());
111
// Construct the Burrows-Wheeler transform.
113
std::cerr << "Building the Burrows-Wheeler transform...\n";
114
bwt.resize(m_sa.size());
115
for (size_t i = 0; i < m_sa.size(); i++)
116
bwt[i] = m_sa[i] == 0 ? SENTINEL() : first[m_sa[i] - 1];
118
std::cerr << "Building the character occurence table...\n";
123
/** Sample the suffix array. */
124
void sampleSA(unsigned period)
127
if (period == m_sampleSA)
129
assert(m_sampleSA == 1);
133
std::vector<size_type>::iterator out = m_sa.begin();
134
for (size_t i = 0; i < m_sa.size(); i += m_sampleSA)
136
m_sa.erase(out, m_sa.end());
137
assert(!m_sa.empty());
140
/** Return the position of the specified suffix in the original
143
size_t locate(size_t i) const
146
while (i % m_sampleSA != 0) {
148
i = c == SENTINEL() ? 0 : m_cf[c] + m_occ.rank(c, i + 1) - 1;
151
assert(i / m_sampleSA < m_sa.size());
152
size_t pos = m_sa[i / m_sampleSA] + n;
153
return pos < m_occ.size() ? pos : pos - m_occ.size();
156
/** Decompress the index. */
157
template <typename It>
158
void decompress(It out)
160
// Construct the original string.
162
for (size_t i = 0;;) {
163
assert(i < m_occ.size());
168
i = m_cf[c] + m_occ.rank(c, i);
172
// Translate the character set and output the result.
173
for (std::vector<T>::const_reverse_iterator it = s.rbegin();
174
it != s.rend(); ++it) {
176
assert(c < m_alphabet.size());
177
*out++ = m_alphabet[c];
181
/** Search for an exact match. */
182
template <typename It>
183
std::pair<size_t, size_t> findExact(It first, It last) const
185
assert(first < last);
186
size_t l = 1, u = m_occ.size();
188
for (it = last - 1; it >= first && l < u; --it) {
191
return std::make_pair(0, 0);
192
l = m_cf[c] + m_occ.rank(c, l);
193
u = m_cf[c] + m_occ.rank(c, u);
195
return std::make_pair(l, u);
198
/** Search for an exact match. */
199
template <typename String>
200
std::pair<size_t, size_t> findExact(const String& q) const
203
std::transform(q.begin(), q.end(), s.begin(),
205
return findExact(s.begin(), s.end());
208
/** Search for a matching suffix of the query. */
209
template <typename It>
210
FMInterval findSuffix(It first, It last) const
212
assert(first < last);
213
size_t l = 1, u = m_occ.size();
215
for (it = last - 1; it >= first && l < u; --it) {
219
size_t l1 = m_cf[c] + m_occ.rank(c, l);
220
size_t u1 = m_cf[c] + m_occ.rank(c, u);
226
return FMInterval(l, u, it - first + 1, last - first);
229
/** Search for a matching substring of the query at least k long.
230
* @return the longest match
232
template <typename It>
233
FMInterval findSubstring(It first, It last, unsigned k) const
235
assert(first < last);
236
FMInterval best(0, 0, 0, k > 0 ? k - 1 : 0);
237
for (It it = last; it > first; --it) {
238
if (unsigned(it - first) < best.qspan())
240
FMInterval interval = findSuffix(first, it);
241
if (interval.qspan() > best.qspan())
247
/** Translate from ASCII to the indexed alphabet. */
249
Translate(const FMIndex& fmIndex) : m_fmIndex(fmIndex) { }
250
T operator()(unsigned char c) const
252
return c < m_fmIndex.m_mapping.size()
253
? m_fmIndex.m_mapping[c] : SENTINEL();
256
const FMIndex& m_fmIndex;
259
/** Search for a matching substring of the query at least k long.
260
* @return the longest match and the number of matches
262
Match find(const std::string& q, unsigned k) const
265
std::transform(s.begin(), s.end(), s.begin(),
268
FMInterval interval = findSubstring(s.begin(), s.end(), k);
269
assert(interval.l <= interval.u);
270
size_t count = interval.u - interval.l;
272
return Match(0, 0, 0, 0);
273
return Match(interval.qstart, interval.qend,
274
locate(interval.l), count);
277
/** Set the alphabet to [first, last). */
278
template <typename It>
279
void setAlphabet(It first, It last)
281
std::vector<bool> mask(std::numeric_limits<T>::max());
282
for (It it = first; it < last; ++it) {
283
assert((size_t)*it < mask.size());
289
for (unsigned c = 0; c < mask.size(); ++c) {
292
m_mapping.resize(c + 1, std::numeric_limits<T>::max());
293
m_mapping[c] = m_alphabet.size();
294
m_alphabet.push_back(c);
296
assert(!m_alphabet.empty());
299
/** Set the alphabet to the characters of s. */
300
void setAlphabet(const std::string& s)
302
setAlphabet(s.begin(), s.end());
305
#define STRINGIFY(X) #X
306
#define FM_VERSION_BITS(BITS) "FM " STRINGIFY(BITS) " 1"
307
#define FM_VERSION FM_VERSION_BITS(FMBITS)
309
/** Store an index. */
310
friend std::ostream& operator<<(std::ostream& out, const FMIndex& o)
312
out << FM_VERSION << '\n'
313
<< o.m_sampleSA << '\n';
315
out << o.m_alphabet.size() << '\n';
316
out.write((char*)o.m_alphabet.data(),
317
o.m_alphabet.size() * sizeof o.m_alphabet[0]);
319
out << o.m_sa.size() << '\n';
320
out.write((char*)o.m_sa.data(),
321
o.m_sa.size() * sizeof o.m_sa[0]);
323
return out << o.m_occ;
326
/** Load an index. */
327
friend std::istream& operator>>(std::istream& in, FMIndex& o)
330
std::getline(in, version);
332
if (version != FM_VERSION) {
333
std::cerr << "error: the version of this FM-index, `"
334
<< version << "', does not match the version required "
335
"by this program, `" FM_VERSION "'.\n";
349
assert(n < std::numeric_limits<size_type>::max());
350
o.m_alphabet.resize(n);
351
in.read((char*)o.m_alphabet.data(), n * sizeof o.m_alphabet[0]);
352
o.setAlphabet(o.m_alphabet.begin(), o.m_alphabet.end());
358
assert(n < std::numeric_limits<size_type>::max());
360
in.read((char*)o.m_sa.data(), n * sizeof o.m_sa[0]);
371
/** Build the cumulative frequency table m_cf from m_occ. */
372
void countOccurences()
374
assert(!m_alphabet.empty());
375
m_cf.resize(m_alphabet.size());
376
// The sentinel character occurs once.
378
for (unsigned i = 0; i < m_cf.size() - 1; ++i)
379
m_cf[i + 1] = m_cf[i] + m_occ.count(i);
383
std::vector<T> m_alphabet;
384
std::vector<T> m_mapping;
385
std::vector<size_type> m_cf;
386
std::vector<size_type> m_sa;