2
Copyright (c) 2011 Stefan Kurtz <kurtz@zbh.uni-hamburg.de>
3
Copyright (c) 2011 Giorgio Gonnella <gonnella@zbh.uni-hamburg.de>
4
Copyright (c) 2011 Center for Bioinformatics, University of Hamburg
6
Permission to use, copy, modify, and distribute this software for any
7
purpose with or without fee is hereby granted, provided that the above
8
copyright notice and this permission notice appear in all copies.
10
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
21
#include "core/unused_api.h"
22
#include "core/option_api.h"
23
#include "core/encseq.h"
24
#include "core/intbits.h"
25
#include "core/logger.h"
26
#include "core/minmax.h"
27
#ifdef GT_THREADS_ENABLED
28
#include "core/thread.h"
30
#include "tools/gt_readjoiner_overlap.h"
31
#include "match/rdj-spmfind.h"
32
#include "match/firstcodes.h"
40
unsigned int minmatchlength,
43
unsigned long maximumspace,
48
GtOption *refoptionmemlimit,
49
*refoptionphase2extra;
51
unsigned int radixparts;
52
} GtReadjoinerOverlapArguments;
54
static void* gt_readjoiner_overlap_arguments_new(void)
56
GtReadjoinerOverlapArguments *arguments = gt_calloc((size_t) 1,
58
arguments->encseqinput = gt_str_new();
59
arguments->numofparts = 0;
60
arguments->memlimitarg = gt_str_new();
61
arguments->phase2extraarg = gt_str_new();
62
arguments->phase2extra = 0UL; /* in bytes */
63
arguments->maximumspace = 0UL; /* in bytes */
67
static void gt_readjoiner_overlap_arguments_delete(void *tool_arguments)
69
GtReadjoinerOverlapArguments *arguments = tool_arguments;
70
if (!arguments) return;
71
gt_str_delete(arguments->encseqinput);
72
gt_option_delete(arguments->refoptionmemlimit);
73
gt_option_delete(arguments->refoptionphase2extra);
74
gt_str_delete(arguments->memlimitarg);
75
gt_str_delete(arguments->phase2extraarg);
79
static GtOptionParser* gt_readjoiner_overlap_option_parser_new(
82
GtReadjoinerOverlapArguments *arguments = tool_arguments;
84
GtOption *option, *optionparts, *optionmemlimit, *q_option, *v_option;
89
op = gt_option_parser_new("[option ...] [file]",
90
"Compute suffix prefix matches "
91
"from encoded sequence.");
94
option = gt_option_new_string("readset", "specify the readset name",
95
arguments->encseqinput, NULL);
96
gt_option_parser_add_option(op, option);
97
gt_option_is_mandatory(option);
100
option = gt_option_new_uint_min("l", "specify the minimum SPM length",
101
&arguments->minmatchlength, 0, 2U);
102
gt_option_parser_add_option(op, option);
103
gt_option_is_mandatory(option);
106
optionparts = gt_option_new_uint("parts", "specify the number of parts",
107
&arguments->numofparts, 0U);
108
gt_option_parser_add_option(op, optionparts);
111
optionmemlimit = gt_option_new_string("memlimit",
112
"specify maximal amount of memory to be used during "
113
"index construction (in bytes, the keywords 'MB' "
114
"and 'GB' are allowed)",
115
arguments->memlimitarg, NULL);
116
gt_option_parser_add_option(op, optionmemlimit);
117
gt_option_exclude(optionmemlimit, optionparts);
118
arguments->refoptionmemlimit = gt_option_ref(optionmemlimit);
121
option = gt_option_new_bool("singlestrand", "do not use reverse complements "
122
"of the reads", &arguments->singlestrand, false);
123
gt_option_is_development_option(option);
124
gt_option_parser_add_option(op, option);
127
option = gt_option_new_bool("elimtrans", "output only irreducible SPMs",
128
&arguments->elimtrans, true);
129
gt_option_is_development_option(option);
130
gt_option_parser_add_option(op, option);
133
option = gt_option_new_uint("wmax", "specify the maximum width of w set;\n"
134
"use 0 to disable w set partitioning",
135
&arguments->w_maxsize, 32U);
136
gt_option_is_development_option(option);
137
gt_option_parser_add_option(op, option);
140
option = gt_option_new_bool("showspm", "show textual SPMs list on stdout",
141
&arguments->showspm, false);
142
gt_option_is_development_option(option);
143
gt_option_parser_add_option(op, option);
146
option = gt_option_new_string("phase2extra",
147
"specify amount of additional space required for "
148
"the second phase of the computation involving the "
149
"processing of the intervals (in bytes, "
150
"the keywords 'MB' and 'GB' are allowed)",
151
arguments->phase2extraarg, NULL);
152
gt_option_parser_add_option(op, option);
153
arguments->refoptionphase2extra = gt_option_ref(option);
154
gt_option_is_development_option(option);
157
v_option = gt_option_new_verbose(&arguments->verbose);
158
gt_option_parser_add_option(op, v_option);
161
q_option = gt_option_new_bool("q", "suppress standard output messages",
162
&arguments->quiet, false);
163
gt_option_parser_add_option(op, q_option);
164
gt_option_exclude(q_option, v_option);
167
option = gt_option_new_uint("radixparts", "specify the radixpart parameter",
168
&arguments->radixparts, 2U);
169
gt_option_is_development_option(option);
170
gt_option_parser_add_option(op, option);
173
option = gt_option_new_bool("radixsmall", "specify the radixsmall parameter",
174
&arguments->radixsmall, false);
175
gt_option_is_development_option(option);
176
gt_option_parser_add_option(op, option);
181
static int gt_readjoiner_overlap_arguments_check(int rest_argc,
182
void *tool_arguments,
185
GtReadjoinerOverlapArguments *arguments = tool_arguments;
191
gt_error_set(err,"unnecessary arguments");
194
if (!haserr && gt_option_is_set(arguments->refoptionmemlimit))
196
if (gt_option_parse_spacespec(&arguments->maximumspace,
198
arguments->memlimitarg,
203
if (!haserr && !gt_ma_bookkeeping_enabled()) {
204
gt_error_set(err, "option '-memlimit' requires "
205
"GT_MEM_BOOKKEEPING=on");
209
if (!haserr && gt_option_is_set(arguments->refoptionphase2extra))
211
if (gt_option_parse_spacespec(&arguments->phase2extra,
213
arguments->phase2extraarg,
219
#ifdef GT_THREADS_ENABLED
221
if (gt_jobs > 1 && gt_ma_bookkeeping_enabled()) {
222
gt_error_set(err, "gt option '-j' and GT_MEM_BOOKKEEPING=on "
228
return haserr ? -1 : 0;
231
static int gt_readjoiner_overlap_runner(GT_UNUSED int argc,
232
GT_UNUSED const char **argv,
233
GT_UNUSED int parsed_args,
234
void *tool_arguments,
237
GtReadjoinerOverlapArguments *arguments = tool_arguments;
238
GtEncseqLoader *el = NULL;
239
GtEncseq *encseq = NULL;
240
GtLogger *default_logger, *verbose_logger;
241
unsigned int kmersize;
244
unsigned long total_nof_irr_spm = 0, total_nof_trans_spm = 0;
247
gt_assert(arguments);
249
default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX,
251
gt_logger_log(default_logger, "gt readjoiner overlap");
252
verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX,
254
gt_logger_log(verbose_logger, "verbose output activated");
255
kmersize = MIN((unsigned int) GT_UNITSIN2BITENC, arguments->minmatchlength);
257
el = gt_encseq_loader_new();
258
gt_encseq_loader_drop_description_support(el);
259
gt_encseq_loader_disable_autosupport(el);
260
encseq = gt_encseq_loader_load(el, gt_str_get(arguments->encseqinput),
262
eqlen = gt_encseq_accesstype_get(encseq) == GT_ACCESS_TYPE_EQUALLENGTH;
266
if (!haserr && !arguments->singlestrand)
268
if (gt_encseq_mirror(encseq, err) != 0)
273
unsigned int threadcount;
274
#ifdef GT_THREADS_ENABLED
275
const unsigned int threads = gt_jobs;
277
const unsigned int threads = 1U;
281
GtBUstate_spmeq **state_table
282
= gt_malloc(sizeof (*state_table) * threads);
284
for (threadcount = 0; threadcount < threads; threadcount++)
286
state_table[threadcount]
287
= gt_spmfind_eqlen_state_new(encseq,
288
(unsigned long)arguments->minmatchlength,
289
(unsigned long)arguments->w_maxsize, arguments->elimtrans,
290
arguments->showspm, gt_str_get(arguments->encseqinput),
291
threadcount, default_logger, verbose_logger, err);
293
if (storefirstcodes_getencseqkmers_twobitencoding(encseq, kmersize,
294
arguments->numofparts, arguments->maximumspace,
295
arguments->minmatchlength, false, false, false, 5U,
296
arguments->phase2extra, arguments->radixsmall,
297
arguments->radixparts, gt_spmfind_eqlen_process,
298
gt_spmfind_eqlen_process_end, state_table, verbose_logger, err)
303
for (threadcount = 0; threadcount < threads; threadcount++)
306
gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
307
total_nof_trans_spm +=
308
gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
309
gt_spmfind_eqlen_state_delete(state_table[threadcount]);
311
gt_free(state_table);
315
GtBUstate_spmvar **state_table
316
= gt_malloc(sizeof (*state_table) * threads);
318
for (threadcount = 0; threadcount < threads; threadcount++)
320
state_table[threadcount]
321
= gt_spmfind_varlen_state_new(encseq,
322
(unsigned long)arguments->minmatchlength,
323
(unsigned long)arguments->w_maxsize, arguments->elimtrans,
324
arguments->showspm, gt_str_get(arguments->encseqinput),
325
threadcount, default_logger, verbose_logger, err);
327
if (storefirstcodes_getencseqkmers_twobitencoding(encseq, kmersize,
328
arguments->numofparts, arguments->maximumspace,
329
arguments->minmatchlength, false, false, false, 5U,
330
arguments->phase2extra, arguments->radixsmall,
331
arguments->radixparts, gt_spmfind_varlen_process,
332
gt_spmfind_varlen_process_end, state_table, verbose_logger, err)
337
for (threadcount = 0; threadcount < threads; threadcount++)
339
gt_spmfind_varlen_state_delete(state_table[threadcount]);
341
gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
342
total_nof_trans_spm +=
343
gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
345
gt_free(state_table);
350
gt_logger_log(default_logger, "number of %ssuffix-prefix matches = %lu",
351
arguments->elimtrans ? "irreducible " : "", total_nof_irr_spm);
352
if (arguments->elimtrans)
353
gt_logger_log(default_logger, "number of transitive suffix-prefix "
354
"matches = %lu", total_nof_trans_spm);
356
gt_logger_delete(default_logger);
357
gt_logger_delete(verbose_logger);
358
gt_encseq_delete(encseq);
359
gt_encseq_loader_delete(el);
360
return haserr ? -1 : 0;
363
GtTool* gt_readjoiner_overlap(void)
365
return gt_tool_new(gt_readjoiner_overlap_arguments_new,
366
gt_readjoiner_overlap_arguments_delete,
367
gt_readjoiner_overlap_option_parser_new,
368
gt_readjoiner_overlap_arguments_check,
369
gt_readjoiner_overlap_runner);