~ubuntu-branches/ubuntu/quantal/genometools/quantal-backports

« back to all changes in this revision

Viewing changes to src/tools/gt_readjoiner_overlap.c

  • Committer: Package Import Robot
  • Author(s): Sascha Steinbiss
  • Date: 2012-07-09 14:10:23 UTC
  • Revision ID: package-import@ubuntu.com-20120709141023-juuu4spm6chqsf9o
Tags: upstream-1.4.1
ImportĀ upstreamĀ versionĀ 1.4.1

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
  Copyright (c) 2011 Stefan Kurtz <kurtz@zbh.uni-hamburg.de>
 
3
  Copyright (c) 2011 Giorgio Gonnella <gonnella@zbh.uni-hamburg.de>
 
4
  Copyright (c) 2011 Center for Bioinformatics, University of Hamburg
 
5
 
 
6
  Permission to use, copy, modify, and distribute this software for any
 
7
  purpose with or without fee is hereby granted, provided that the above
 
8
  copyright notice and this permission notice appear in all copies.
 
9
 
 
10
  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 
11
  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 
12
  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 
13
  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 
14
  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 
15
  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 
16
  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
17
*/
 
18
 
 
19
#include <string.h>
 
20
#include "core/ma.h"
 
21
#include "core/unused_api.h"
 
22
#include "core/option_api.h"
 
23
#include "core/encseq.h"
 
24
#include "core/intbits.h"
 
25
#include "core/logger.h"
 
26
#include "core/minmax.h"
 
27
#ifdef GT_THREADS_ENABLED
 
28
#include "core/thread.h"
 
29
#endif
 
30
#include "tools/gt_readjoiner_overlap.h"
 
31
#include "match/rdj-spmfind.h"
 
32
#include "match/firstcodes.h"
 
33
 
 
34
typedef struct {
 
35
  bool singlestrand,
 
36
       elimtrans,
 
37
       verbose,
 
38
       quiet,
 
39
       showspm;
 
40
  unsigned int minmatchlength,
 
41
               numofparts,
 
42
               w_maxsize;
 
43
  unsigned long maximumspace,
 
44
                phase2extra;
 
45
  GtStr *encseqinput,
 
46
        *memlimitarg,
 
47
        *phase2extraarg;
 
48
  GtOption *refoptionmemlimit,
 
49
           *refoptionphase2extra;
 
50
  bool radixsmall;
 
51
  unsigned int radixparts;
 
52
} GtReadjoinerOverlapArguments;
 
53
 
 
54
static void* gt_readjoiner_overlap_arguments_new(void)
 
55
{
 
56
  GtReadjoinerOverlapArguments *arguments = gt_calloc((size_t) 1,
 
57
      sizeof *arguments);
 
58
  arguments->encseqinput = gt_str_new();
 
59
  arguments->numofparts = 0;
 
60
  arguments->memlimitarg = gt_str_new();
 
61
  arguments->phase2extraarg = gt_str_new();
 
62
  arguments->phase2extra = 0UL; /* in bytes */
 
63
  arguments->maximumspace = 0UL; /* in bytes */
 
64
  return arguments;
 
65
}
 
66
 
 
67
static void gt_readjoiner_overlap_arguments_delete(void *tool_arguments)
 
68
{
 
69
  GtReadjoinerOverlapArguments *arguments = tool_arguments;
 
70
  if (!arguments) return;
 
71
  gt_str_delete(arguments->encseqinput);
 
72
  gt_option_delete(arguments->refoptionmemlimit);
 
73
  gt_option_delete(arguments->refoptionphase2extra);
 
74
  gt_str_delete(arguments->memlimitarg);
 
75
  gt_str_delete(arguments->phase2extraarg);
 
76
  gt_free(arguments);
 
77
}
 
78
 
 
79
static GtOptionParser* gt_readjoiner_overlap_option_parser_new(
 
80
    void *tool_arguments)
 
81
{
 
82
  GtReadjoinerOverlapArguments *arguments = tool_arguments;
 
83
  GtOptionParser *op;
 
84
  GtOption *option, *optionparts, *optionmemlimit, *q_option, *v_option;
 
85
 
 
86
  gt_assert(arguments);
 
87
 
 
88
  /* init */
 
89
  op = gt_option_parser_new("[option ...] [file]",
 
90
                            "Compute suffix prefix matches "
 
91
                            "from encoded sequence.");
 
92
 
 
93
  /* -readset */
 
94
  option = gt_option_new_string("readset", "specify the readset name",
 
95
                                arguments->encseqinput, NULL);
 
96
  gt_option_parser_add_option(op, option);
 
97
  gt_option_is_mandatory(option);
 
98
 
 
99
  /* -l */
 
100
  option = gt_option_new_uint_min("l", "specify the minimum SPM length",
 
101
                                  &arguments->minmatchlength, 0, 2U);
 
102
  gt_option_parser_add_option(op, option);
 
103
  gt_option_is_mandatory(option);
 
104
 
 
105
  /* -parts */
 
106
  optionparts = gt_option_new_uint("parts", "specify the number of parts",
 
107
                                  &arguments->numofparts, 0U);
 
108
  gt_option_parser_add_option(op, optionparts);
 
109
 
 
110
  /* -memlimit */
 
111
  optionmemlimit = gt_option_new_string("memlimit",
 
112
                       "specify maximal amount of memory to be used during "
 
113
                       "index construction (in bytes, the keywords 'MB' "
 
114
                       "and 'GB' are allowed)",
 
115
                       arguments->memlimitarg, NULL);
 
116
  gt_option_parser_add_option(op, optionmemlimit);
 
117
  gt_option_exclude(optionmemlimit, optionparts);
 
118
  arguments->refoptionmemlimit = gt_option_ref(optionmemlimit);
 
119
 
 
120
  /* -singlestrand */
 
121
  option = gt_option_new_bool("singlestrand", "do not use reverse complements "
 
122
      "of the reads", &arguments->singlestrand, false);
 
123
  gt_option_is_development_option(option);
 
124
  gt_option_parser_add_option(op, option);
 
125
 
 
126
  /* -elimtrans */
 
127
  option = gt_option_new_bool("elimtrans", "output only irreducible SPMs",
 
128
      &arguments->elimtrans, true);
 
129
  gt_option_is_development_option(option);
 
130
  gt_option_parser_add_option(op, option);
 
131
 
 
132
  /* -wmax */
 
133
  option = gt_option_new_uint("wmax", "specify the maximum width of w set;\n"
 
134
                              "use 0 to disable w set partitioning",
 
135
                              &arguments->w_maxsize, 32U);
 
136
  gt_option_is_development_option(option);
 
137
  gt_option_parser_add_option(op, option);
 
138
 
 
139
  /* -showspm */
 
140
  option = gt_option_new_bool("showspm", "show textual SPMs list on stdout",
 
141
      &arguments->showspm, false);
 
142
  gt_option_is_development_option(option);
 
143
  gt_option_parser_add_option(op, option);
 
144
 
 
145
  /* -phase2extra */
 
146
  option = gt_option_new_string("phase2extra",
 
147
                       "specify amount of additional space required for "
 
148
                       "the second phase of the computation involving the "
 
149
                       "processing of the intervals (in bytes, "
 
150
                       "the keywords 'MB' and 'GB' are allowed)",
 
151
                       arguments->phase2extraarg, NULL);
 
152
  gt_option_parser_add_option(op, option);
 
153
  arguments->refoptionphase2extra = gt_option_ref(option);
 
154
  gt_option_is_development_option(option);
 
155
 
 
156
  /* -v */
 
157
  v_option = gt_option_new_verbose(&arguments->verbose);
 
158
  gt_option_parser_add_option(op, v_option);
 
159
 
 
160
  /* -q */
 
161
  q_option = gt_option_new_bool("q", "suppress standard output messages",
 
162
                             &arguments->quiet, false);
 
163
  gt_option_parser_add_option(op, q_option);
 
164
  gt_option_exclude(q_option, v_option);
 
165
 
 
166
  /* -radixparts */
 
167
  option = gt_option_new_uint("radixparts", "specify the radixpart parameter",
 
168
      &arguments->radixparts, 2U);
 
169
  gt_option_is_development_option(option);
 
170
  gt_option_parser_add_option(op, option);
 
171
 
 
172
  /* -radixsmall */
 
173
  option = gt_option_new_bool("radixsmall", "specify the radixsmall parameter",
 
174
      &arguments->radixsmall, false);
 
175
  gt_option_is_development_option(option);
 
176
  gt_option_parser_add_option(op, option);
 
177
 
 
178
  return op;
 
179
}
 
180
 
 
181
static int gt_readjoiner_overlap_arguments_check(int rest_argc,
 
182
                                         void *tool_arguments,
 
183
                                         GtError *err)
 
184
{
 
185
  GtReadjoinerOverlapArguments *arguments = tool_arguments;
 
186
  bool haserr = false;
 
187
 
 
188
  gt_error_check(err);
 
189
  if (rest_argc != 0)
 
190
  {
 
191
    gt_error_set(err,"unnecessary arguments");
 
192
    haserr = true;
 
193
  }
 
194
  if (!haserr && gt_option_is_set(arguments->refoptionmemlimit))
 
195
  {
 
196
    if (gt_option_parse_spacespec(&arguments->maximumspace,
 
197
          "memlimit",
 
198
          arguments->memlimitarg,
 
199
          err) != 0)
 
200
    {
 
201
      haserr = true;
 
202
    }
 
203
    if (!haserr && !gt_ma_bookkeeping_enabled()) {
 
204
      gt_error_set(err, "option '-memlimit' requires "
 
205
                        "GT_MEM_BOOKKEEPING=on");
 
206
      haserr = true;
 
207
    }
 
208
  }
 
209
  if (!haserr && gt_option_is_set(arguments->refoptionphase2extra))
 
210
  {
 
211
    if (gt_option_parse_spacespec(&arguments->phase2extra,
 
212
          "phase2extra",
 
213
          arguments->phase2extraarg,
 
214
          err) != 0)
 
215
    {
 
216
      haserr = true;
 
217
    }
 
218
  }
 
219
#ifdef GT_THREADS_ENABLED
 
220
  if (!haserr) {
 
221
    if (gt_jobs > 1 && gt_ma_bookkeeping_enabled()) {
 
222
      gt_error_set(err, "gt option '-j' and GT_MEM_BOOKKEEPING=on "
 
223
                        "are incompatible");
 
224
      haserr = true;
 
225
    }
 
226
  }
 
227
#endif
 
228
  return haserr ? -1 : 0;
 
229
}
 
230
 
 
231
static int gt_readjoiner_overlap_runner(GT_UNUSED int argc,
 
232
                                GT_UNUSED const char **argv,
 
233
                                GT_UNUSED int parsed_args,
 
234
                                void *tool_arguments,
 
235
                                GtError *err)
 
236
{
 
237
  GtReadjoinerOverlapArguments *arguments = tool_arguments;
 
238
  GtEncseqLoader *el = NULL;
 
239
  GtEncseq *encseq = NULL;
 
240
  GtLogger *default_logger, *verbose_logger;
 
241
  unsigned int kmersize;
 
242
  bool haserr = false;
 
243
  bool eqlen;
 
244
  unsigned long total_nof_irr_spm = 0, total_nof_trans_spm = 0;
 
245
 
 
246
  gt_error_check(err);
 
247
  gt_assert(arguments);
 
248
 
 
249
  default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX,
 
250
      stdout);
 
251
  gt_logger_log(default_logger, "gt readjoiner overlap");
 
252
  verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX,
 
253
      stdout);
 
254
  gt_logger_log(verbose_logger, "verbose output activated");
 
255
  kmersize = MIN((unsigned int) GT_UNITSIN2BITENC, arguments->minmatchlength);
 
256
 
 
257
  el = gt_encseq_loader_new();
 
258
  gt_encseq_loader_drop_description_support(el);
 
259
  gt_encseq_loader_disable_autosupport(el);
 
260
  encseq = gt_encseq_loader_load(el, gt_str_get(arguments->encseqinput),
 
261
                                 err);
 
262
  eqlen = gt_encseq_accesstype_get(encseq) == GT_ACCESS_TYPE_EQUALLENGTH;
 
263
 
 
264
  if (encseq == NULL)
 
265
    haserr = true;
 
266
  if (!haserr && !arguments->singlestrand)
 
267
  {
 
268
    if (gt_encseq_mirror(encseq, err) != 0)
 
269
      haserr = true;
 
270
  }
 
271
  if (!haserr)
 
272
  {
 
273
    unsigned int threadcount;
 
274
#ifdef GT_THREADS_ENABLED
 
275
    const unsigned int threads = gt_jobs;
 
276
#else
 
277
    const unsigned int threads = 1U;
 
278
#endif
 
279
    if (eqlen)
 
280
    {
 
281
      GtBUstate_spmeq **state_table
 
282
        = gt_malloc(sizeof (*state_table) * threads);
 
283
 
 
284
      for (threadcount = 0; threadcount < threads; threadcount++)
 
285
      {
 
286
        state_table[threadcount]
 
287
          = gt_spmfind_eqlen_state_new(encseq,
 
288
                (unsigned long)arguments->minmatchlength,
 
289
                (unsigned long)arguments->w_maxsize, arguments->elimtrans,
 
290
                arguments->showspm, gt_str_get(arguments->encseqinput),
 
291
                threadcount, default_logger, verbose_logger, err);
 
292
      }
 
293
      if (storefirstcodes_getencseqkmers_twobitencoding(encseq, kmersize,
 
294
            arguments->numofparts, arguments->maximumspace,
 
295
            arguments->minmatchlength, false, false, false, 5U,
 
296
            arguments->phase2extra, arguments->radixsmall,
 
297
            arguments->radixparts, gt_spmfind_eqlen_process,
 
298
            gt_spmfind_eqlen_process_end, state_table, verbose_logger, err)
 
299
            != 0)
 
300
      {
 
301
        haserr = true;
 
302
      }
 
303
      for (threadcount = 0; threadcount < threads; threadcount++)
 
304
      {
 
305
        total_nof_irr_spm +=
 
306
          gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
 
307
        total_nof_trans_spm +=
 
308
          gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
 
309
        gt_spmfind_eqlen_state_delete(state_table[threadcount]);
 
310
      }
 
311
      gt_free(state_table);
 
312
    }
 
313
    else
 
314
    {
 
315
      GtBUstate_spmvar **state_table
 
316
        = gt_malloc(sizeof (*state_table) * threads);
 
317
 
 
318
      for (threadcount = 0; threadcount < threads; threadcount++)
 
319
      {
 
320
        state_table[threadcount]
 
321
           = gt_spmfind_varlen_state_new(encseq,
 
322
                  (unsigned long)arguments->minmatchlength,
 
323
                  (unsigned long)arguments->w_maxsize, arguments->elimtrans,
 
324
                  arguments->showspm, gt_str_get(arguments->encseqinput),
 
325
                  threadcount, default_logger, verbose_logger, err);
 
326
      }
 
327
      if (storefirstcodes_getencseqkmers_twobitencoding(encseq, kmersize,
 
328
            arguments->numofparts, arguments->maximumspace,
 
329
            arguments->minmatchlength, false, false, false, 5U,
 
330
            arguments->phase2extra, arguments->radixsmall,
 
331
            arguments->radixparts, gt_spmfind_varlen_process,
 
332
            gt_spmfind_varlen_process_end, state_table, verbose_logger, err)
 
333
          != 0)
 
334
      {
 
335
        haserr = true;
 
336
      }
 
337
      for (threadcount = 0; threadcount < threads; threadcount++)
 
338
      {
 
339
        gt_spmfind_varlen_state_delete(state_table[threadcount]);
 
340
        total_nof_irr_spm +=
 
341
          gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
 
342
        total_nof_trans_spm +=
 
343
          gt_spmfind_eqlen_nof_irr_spm(state_table[threadcount]);
 
344
      }
 
345
      gt_free(state_table);
 
346
    }
 
347
  }
 
348
  if (!haserr)
 
349
  {
 
350
    gt_logger_log(default_logger, "number of %ssuffix-prefix matches = %lu",
 
351
        arguments->elimtrans ? "irreducible " : "", total_nof_irr_spm);
 
352
    if (arguments->elimtrans)
 
353
      gt_logger_log(default_logger, "number of transitive suffix-prefix "
 
354
          "matches = %lu", total_nof_trans_spm);
 
355
  }
 
356
  gt_logger_delete(default_logger);
 
357
  gt_logger_delete(verbose_logger);
 
358
  gt_encseq_delete(encseq);
 
359
  gt_encseq_loader_delete(el);
 
360
  return haserr ? -1 : 0;
 
361
}
 
362
 
 
363
GtTool* gt_readjoiner_overlap(void)
 
364
{
 
365
  return gt_tool_new(gt_readjoiner_overlap_arguments_new,
 
366
                     gt_readjoiner_overlap_arguments_delete,
 
367
                     gt_readjoiner_overlap_option_parser_new,
 
368
                     gt_readjoiner_overlap_arguments_check,
 
369
                     gt_readjoiner_overlap_runner);
 
370
}