2
Copyright (c) 2007 Thomas Jahns <Thomas.Jahns@gmx.net>
4
Permission to use, copy, modify, and distribute this software for any
5
purpose with or without fee is hereby granted, provided that the above
6
copyright notice and this permission notice appear in all copies.
8
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19
#include "core/error.h"
20
#include "core/logger.h"
21
#include "core/minmax.h"
22
#include "core/option_api.h"
24
#include "core/versionfunc.h"
25
#include "match/eis-bwtseq.h"
26
#include "match/eis-bwtseq-param.h"
27
#include "core/encseq.h"
28
#include "match/enum-patt-def.h"
29
#include "match/esa-mmsearch.h"
30
#include "match/sarr-def.h"
31
#include "match/esa-map.h"
32
#include "tools/gt_packedindex_chk_search.h"
33
#include "match/sfx-apfxlen.h"
35
#define DEFAULT_PROGRESS_INTERVAL 100000UL
37
struct chkSearchOptions
39
struct bwtOptions idx;
40
long minPatLen, maxPatLen;
41
unsigned long numOfSamples, progressInterval;
47
parseChkBWTOptions(int *parsed_args, int argc, const char **argv,
48
struct chkSearchOptions *params, const GtStr *projectName,
52
gt_packedindex_chk_search(int argc, const char *argv[], GtError *err)
54
struct chkSearchOptions params;
55
Suffixarray suffixarray;
56
Enumpatterniterator *epi = NULL;
57
bool saIsLoaded = false;
58
BWTSeq *bwtSeq = NULL;
59
GtStr *inputProject = NULL;
62
BWTSeqExactMatchesIterator EMIter;
63
bool EMIterInitialized = false;
64
GtLogger *logger = NULL;
65
inputProject = gt_str_new();
71
switch (parseChkBWTOptions(&parsedArgs, argc, argv, ¶ms,
74
case GT_OPTION_PARSER_OK:
76
case GT_OPTION_PARSER_ERROR:
80
case GT_OPTION_PARSER_REQUESTS_EXIT:
87
gt_str_set(inputProject, argv[parsedArgs]);
89
logger = gt_logger_new(params.verboseOutput,
90
GT_LOGGER_DEFLT_PREFIX, stdout);
92
bwtSeq = gt_availBWTSeq(¶ms.idx.final, logger, err);
93
if ((had_err = bwtSeq == NULL))
97
enum verifyBWTSeqErrCode retval =
98
gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject), params.flags,
99
params.progressInterval, stderr, logger, err);
100
if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
102
fprintf(stderr, "index integrity check failed: %s\n",
104
gt_error_set(err, "aborted because of index integrity check fail");
108
if (BWTSeqHasLocateInformation(bwtSeq))
110
if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
112
gt_error_set(err, "Cannot create matches iterator for sequence index.");
115
EMIterInitialized = true;
118
unsigned long totalLen, dbstart;
119
unsigned long trial, patternLen;
122
gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
123
gt_str_get(inputProject), NULL, err) != 0))
125
gt_error_set(err, "Can't load suffix array project with"
126
" demand for encoded sequence and suffix table files\n");
129
totalLen = gt_encseq_total_length(suffixarray.encseq);
131
if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
132
&& params.minPatLen > params.maxPatLen)))
134
gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
135
" min <= max is required.", params.minPatLen,
139
if (params.minPatLen < 0 || params.maxPatLen < 0)
141
unsigned int numofchars
142
= gt_alphabet_num_of_chars(
143
gt_encseq_alphabet(suffixarray.encseq));
144
if (params.minPatLen < 0)
146
= gt_recommendedprefixlength(numofchars,
148
GT_RECOMMENDED_MULTIPLIER_DEFAULT,
150
if (params.maxPatLen < 0)
152
MAX(params.minPatLen,
153
125 * gt_recommendedprefixlength(numofchars,totalLen,
154
GT_RECOMMENDED_MULTIPLIER_DEFAULT,
157
params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
159
fprintf(stderr, "Using patterns of lengths %lu to %lu\n",
160
params.minPatLen, params.maxPatLen);
161
if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
163
gt_error_set(err, "base suffix array and index have diferrent lengths!"
164
"%lu vs. %lu", totalLen + 1,
165
BWTSeqLength(bwtSeq));
169
(epi = gt_newenumpatterniterator(params.minPatLen, params.maxPatLen,
173
fputs("Creation of pattern iterator failed!\n", stderr);
176
for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
178
const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
179
MMsearchiterator *mmsi =
180
gt_newmmsearchiteratorcomplete_plain(suffixarray.encseq,
183
totalLen, /* rightbound */
185
suffixarray.readmode,
188
if (BWTSeqHasLocateInformation(bwtSeq))
190
if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr, patternLen,
193
fputs("Internal error: failed to reinitialize pattern match"
194
" iterator", stderr);
197
gt_assert(gt_EMINumMatchesTotal(&EMIter) ==
198
gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen,
200
gt_assert(gt_EMINumMatchesTotal(&EMIter)
201
== gt_countmmsearchiterator(mmsi));
202
while (gt_nextmmsearchiterator(&dbstart,mmsi))
204
unsigned long matchPos = 0;
205
bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
206
if ((had_err = !match))
209
"matches of packedindex expired before mmsearch!");
212
if ((had_err = matchPos != dbstart))
214
gt_error_set(err, "packedindex match doesn't equal mmsearch "
215
"match result!\n%lu vs. %lu\n",
221
unsigned long matchPos;
222
bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
223
if ((had_err = trailingMatch))
225
gt_error_set(err, "matches of mmsearch expired before fmindex!");
232
unsigned long numFMIMatches = gt_BWTSeqMatchCount(bwtSeq, pptr,
235
numMMSearchMatches = gt_countmmsearchiterator(mmsi);
236
if ((had_err = numFMIMatches != numMMSearchMatches))
238
gt_error_set(err, "Number of matches not equal for suffix array ("
239
"%lu) and fmindex (%lu).\n",
240
numFMIMatches, numMMSearchMatches);
243
gt_freemmsearchiterator(&mmsi);
244
if (params.progressInterval && !((trial + 1) % params.progressInterval))
247
if (params.progressInterval)
249
fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
250
trial, params.numOfSamples);
253
if (EMIterInitialized) gt_destructEMIterator(&EMIter);
254
if (saIsLoaded) gt_freesuffixarray(&suffixarray);
255
if (epi) gt_freeEnumpatterniterator(&epi);
256
if (bwtSeq) gt_deleteBWTSeq(bwtSeq);
257
if (logger) gt_logger_delete(logger);
258
if (inputProject) gt_str_delete(inputProject);
263
parseChkBWTOptions(int *parsed_args, int argc, const char **argv,
264
struct chkSearchOptions *params, const GtStr *projectName,
269
GtOption *option, *optionProgress;
270
bool checkSuffixArrayValues, tryContextRetrieve, tryFullRegen;
273
op = gt_option_parser_new("indexname",
274
"Load (or build if necessary) BWT index for project"
275
" <indexname> and perform verification of search"
278
gt_registerPackedIndexOptions(op, ¶ms->idx, BWTDEFOPT_MULTI_QUERY,
281
option = gt_option_new_long("minpatlen",
282
"minimum length of patterns searched for, -1 "
283
"implies automatic choice based on index "
284
"properties", ¶ms->minPatLen, -1);
285
gt_option_parser_add_option(op, option);
287
option = gt_option_new_long("maxpatlen",
288
"maximum length of patterns searched for, -1 "
289
"implies automatic choice based on index "
290
"properties", ¶ms->maxPatLen, -1);
291
gt_option_parser_add_option(op, option);
293
option = gt_option_new_ulong("nsamples",
294
"number of sequences to search for",
295
¶ms->numOfSamples, 1000);
296
gt_option_parser_add_option(op, option);
298
option = gt_option_new_bool("chksfxarray",
299
"verify integrity of stored suffix array positions",
300
&checkSuffixArrayValues, false);
301
gt_option_parser_add_option(op, option);
303
option = gt_option_new_bool("full-lfmap",
304
"verify complete backwards regeneration of "
305
"original sequence", &tryFullRegen, false);
306
gt_option_parser_add_option(op, option);
308
option = gt_option_new_bool("chkcontext",
309
"verify integrity of regenerated sequence context",
310
&tryContextRetrieve, false);
311
gt_option_parser_add_option(op, option);
313
optionProgress = gt_option_new_ulong("ticks",
314
"print dot after this many symbols "
315
"tested okay", ¶ms->progressInterval,
316
DEFAULT_PROGRESS_INTERVAL);
317
gt_option_parser_add_option(op, optionProgress);
319
option = gt_option_new_bool("v",
320
"print verbose progress information",
321
¶ms->verboseOutput,
323
gt_option_parser_add_option(op, option);
325
gt_option_parser_set_min_max_args(op, 1, 1);
326
oprval = gt_option_parser_parse(op, parsed_args, argc, argv, gt_versionfunc,
329
/* condense boolean options to flags field */
330
params->flags = (checkSuffixArrayValues?VERIFY_BWTSEQ_SUFVAL:0)
331
| (tryFullRegen?VERIFY_BWTSEQ_LFMAPWALK:0)
332
| (tryContextRetrieve?VERIFY_BWTSEQ_CONTEXT:0);
333
/* compute parameters currently not set from command-line or
334
* determined indirectly */
335
gt_computePackedIndexDefaults(¶ms->idx, BWTBaseFeatures);
337
gt_option_parser_delete(op);