~ubuntu-branches/ubuntu/quantal/genometools/quantal-backports

« back to all changes in this revision

Viewing changes to src/tools/gt_patternmatch.c

  • Committer: Package Import Robot
  • Author(s): Sascha Steinbiss
  • Date: 2012-07-09 14:10:23 UTC
  • Revision ID: package-import@ubuntu.com-20120709141023-juuu4spm6chqsf9o
Tags: upstream-1.4.1
ImportĀ upstreamĀ versionĀ 1.4.1

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
  Copyright (c) 2007 Stefan Kurtz <kurtz@zbh.uni-hamburg.de>
 
3
  Copyright (c) 2007 Center for Bioinformatics, University of Hamburg
 
4
 
 
5
  Permission to use, copy, modify, and distribute this software for any
 
6
  purpose with or without fee is hereby granted, provided that the above
 
7
  copyright notice and this permission notice appear in all copies.
 
8
 
 
9
  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 
10
  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 
11
  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 
12
  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 
13
  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 
14
  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 
15
  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
16
*/
 
17
 
 
18
#include <inttypes.h>
 
19
#include "core/encseq.h"
 
20
#include "core/error.h"
 
21
#include "core/option_api.h"
 
22
#include "core/str.h"
 
23
#include "core/unused_api.h"
 
24
#include "core/versionfunc.h"
 
25
#include "match/cutendpfx.h"
 
26
#include "match/enum-patt-def.h"
 
27
#include "match/esa-map.h"
 
28
#include "match/esa-mmsearch.h"
 
29
#include "match/qgram2code.h"
 
30
#include "match/sarr-def.h"
 
31
#include "match/spacedef.h"
 
32
#include "match/stamp.h"
 
33
 
 
34
#include "tools/gt_patternmatch.h"
 
35
 
 
36
typedef struct
 
37
{
 
38
  unsigned long minpatternlen, maxpatternlen, numofsamples;
 
39
  bool showpatt, usebcktab, immediate;
 
40
  GtStr *indexname;
 
41
} Pmatchoptions;
 
42
 
 
43
static void comparemmsis(const MMsearchiterator *mmsi1,
 
44
                         const MMsearchiterator *mmsi2)
 
45
{
 
46
  if (gt_isemptymmsearchiterator(mmsi1))
 
47
  {
 
48
    if (!gt_isemptymmsearchiterator(mmsi2))
 
49
    {
 
50
      fprintf(stderr,"mmsi1 is empty but mmsi2 not\n");
 
51
      exit(GT_EXIT_PROGRAMMING_ERROR);
 
52
    }
 
53
  } else
 
54
  {
 
55
    if (gt_isemptymmsearchiterator(mmsi2))
 
56
    {
 
57
      fprintf(stderr,"mmsi2 is empty but mmsi1 not\n");
 
58
      exit(GT_EXIT_PROGRAMMING_ERROR);
 
59
    }
 
60
    if (!gt_identicalmmsearchiterators(mmsi1,mmsi2))
 
61
    {
 
62
      fprintf(stderr,"mmsi1 and mmsi2 are different\n");
 
63
      exit(GT_EXIT_PROGRAMMING_ERROR);
 
64
    }
 
65
  }
 
66
}
 
67
 
 
68
#define UNDEFREFSTART totallength
 
69
 
 
70
static int callpatternmatcher(const Pmatchoptions *pmopt, GtError *err)
 
71
{
 
72
  Suffixarray suffixarray;
 
73
  unsigned long totallength = 0;
 
74
  bool haserr = false;
 
75
  const GtUchar *pptr;
 
76
  unsigned long patternlen;
 
77
  unsigned int demand = SARR_SUFTAB | SARR_ESQTAB;
 
78
 
 
79
  if (pmopt->usebcktab)
 
80
  {
 
81
    demand |= SARR_BCKTAB;
 
82
  }
 
83
  if (gt_mapsuffixarray(&suffixarray,
 
84
                     demand,
 
85
                     gt_str_get(pmopt->indexname),
 
86
                     NULL,
 
87
                     err) != 0)
 
88
  {
 
89
    haserr = true;
 
90
  } else
 
91
  {
 
92
    totallength = gt_encseq_total_length(suffixarray.encseq);
 
93
  }
 
94
  if (!haserr)
 
95
  {
 
96
    unsigned long trial;
 
97
    unsigned long dbstart;
 
98
    Enumpatterniterator *epi;
 
99
    GT_UNUSED unsigned int firstspecial;
 
100
    MMsearchiterator *mmsibck, *mmsiimm;
 
101
    GtBucketspecification bucketspec;
 
102
    Bucketenumerator *bucketenumerator;
 
103
    Lcpinterval itv;
 
104
    unsigned long refstart;
 
105
    GtEncseqReader *esr1, *esr2;
 
106
    GT_UNUSED int retval;
 
107
    unsigned long idx, maxlcp;
 
108
    GtCodetype code = 0;
 
109
    const GtCodetype **multimappower;
 
110
    const GtAlphabet *alpha;
 
111
 
 
112
    if (pmopt->usebcktab)
 
113
    {
 
114
      multimappower = gt_bcktab_multimappower(suffixarray.bcktab);
 
115
    } else
 
116
    {
 
117
      multimappower = NULL;
 
118
    }
 
119
    epi = gt_newenumpatterniterator(pmopt->minpatternlen,
 
120
                                    pmopt->maxpatternlen,
 
121
                                    suffixarray.encseq,
 
122
                                    err);
 
123
    esr1 = gt_encseq_create_reader_with_readmode(suffixarray.encseq,
 
124
                                                 suffixarray.readmode, 0);
 
125
    esr2 = gt_encseq_create_reader_with_readmode(suffixarray.encseq,
 
126
                                                 suffixarray.readmode, 0);
 
127
    alpha = gt_encseq_alphabet(suffixarray.encseq);
 
128
    for (trial = 0; trial < pmopt->numofsamples; trial++)
 
129
    {
 
130
      pptr = gt_nextEnumpatterniterator(&patternlen,epi);
 
131
      if (pmopt->showpatt)
 
132
      {
 
133
        gt_alphabet_decode_seq_to_fp(alpha,stdout,pptr,patternlen);
 
134
        printf("\n");
 
135
      }
 
136
      if (pmopt->usebcktab)
 
137
      {
 
138
        if (patternlen < (unsigned long) suffixarray.prefixlength)
 
139
        {
 
140
          mmsibck = NULL;
 
141
          bucketenumerator
 
142
            = gt_newbucketenumerator(suffixarray.bcktab,
 
143
                                  suffixarray.prefixlength,
 
144
                                  pptr,
 
145
                                  (unsigned int) patternlen);
 
146
          refstart = UNDEFREFSTART;
 
147
          while (gt_nextbucketenumerator(&itv,bucketenumerator))
 
148
          {
 
149
            if (refstart == UNDEFREFSTART)
 
150
            {
 
151
              refstart = ESASUFFIXPTRGET(suffixarray.suftab,itv.left);
 
152
            } else
 
153
            {
 
154
              for (idx=itv.left; idx<=itv.right; idx++)
 
155
              {
 
156
                retval = gt_encseq_check_comparetwosuffixes(
 
157
                                        suffixarray.encseq,
 
158
                                        suffixarray.readmode,
 
159
                                        &maxlcp,
 
160
                                        false,
 
161
                                        false,
 
162
                                        patternlen,
 
163
                                        refstart,
 
164
                                        ESASUFFIXPTRGET(suffixarray.suftab,idx),
 
165
                                        esr1,
 
166
                                        esr2);
 
167
                gt_assert(retval == 0 && maxlcp == patternlen);
 
168
              }
 
169
            }
 
170
          }
 
171
          gt_freebucketenumerator(&bucketenumerator);
 
172
        } else
 
173
        {
 
174
          firstspecial = qgram2code(&code,
 
175
                                    multimappower,
 
176
                                    suffixarray.prefixlength,
 
177
                                    pptr);
 
178
          gt_assert(firstspecial == suffixarray.prefixlength);
 
179
          gt_bcktab_calcboundaries(&bucketspec,
 
180
                                   suffixarray.bcktab,
 
181
                                   code);
 
182
          if (bucketspec.nonspecialsinbucket == 0)
 
183
          {
 
184
            mmsibck = NULL;
 
185
          } else
 
186
          {
 
187
            mmsibck
 
188
              = gt_newmmsearchiteratorcomplete_plain(
 
189
                                       suffixarray.encseq,
 
190
                                       suffixarray.suftab,
 
191
                                       bucketspec.left,
 
192
                                       bucketspec.left +
 
193
                                         bucketspec.nonspecialsinbucket-1,
 
194
                                       (unsigned long) suffixarray.prefixlength,
 
195
                                       suffixarray.readmode,
 
196
                                       pptr,
 
197
                                       patternlen);
 
198
          }
 
199
        }
 
200
      }
 
201
      if (pmopt->immediate)
 
202
      {
 
203
        mmsiimm = gt_newmmsearchiteratorcomplete_plain(
 
204
                                            suffixarray.encseq,
 
205
                                            suffixarray.suftab,
 
206
                                            0,  /* leftbound */
 
207
                                            totallength, /* rightbound */
 
208
                                            0, /* offset */
 
209
                                            suffixarray.readmode,
 
210
                                            pptr,
 
211
                                            patternlen);
 
212
      }
 
213
      if (pmopt->immediate && pmopt->usebcktab)
 
214
      {
 
215
        comparemmsis(mmsibck,mmsiimm);
 
216
      }
 
217
      if (pmopt->usebcktab && mmsibck != NULL)
 
218
      {
 
219
        while (gt_nextmmsearchiterator(&dbstart,mmsibck))
 
220
        {
 
221
          /* Nothing */;
 
222
        }
 
223
        gt_freemmsearchiterator(&mmsibck);
 
224
      }
 
225
      if (pmopt->immediate)
 
226
      {
 
227
        while (gt_nextmmsearchiterator(&dbstart,mmsiimm))
 
228
        {
 
229
          /* Nothing */;
 
230
        }
 
231
        gt_freemmsearchiterator(&mmsiimm);
 
232
      }
 
233
    }
 
234
    gt_encseq_reader_delete(esr1);
 
235
    gt_encseq_reader_delete(esr2);
 
236
    if (pmopt->showpatt)
 
237
    {
 
238
      gt_showPatterndistribution(epi);
 
239
    }
 
240
    gt_freeEnumpatterniterator(&epi);
 
241
  }
 
242
  gt_freesuffixarray(&suffixarray);
 
243
  return haserr ? -1 : 0;
 
244
}
 
245
 
 
246
static GtOPrval parse_options(Pmatchoptions *pmopt,
 
247
                              int *parsed_args,
 
248
                              int argc, const char **argv, GtError *err)
 
249
{
 
250
  GtOptionParser *op;
 
251
  GtOption *option, *optionimm, *optionbck;
 
252
  GtOPrval oprval;
 
253
 
 
254
  gt_error_check(err);
 
255
  op = gt_option_parser_new("[options] -ii indexname",
 
256
                         "Perform pattern matches.");
 
257
  gt_option_parser_set_mail_address(op, "<kurtz@zbh.uni-hamburg.de>");
 
258
 
 
259
  option = gt_option_new_ulong("minpl","Specify minimum length of pattern",
 
260
                           &pmopt->minpatternlen,
 
261
                           (unsigned long) 20);
 
262
  gt_option_parser_add_option(op, option);
 
263
  option = gt_option_new_ulong("maxpl","Specify maximum length of pattern",
 
264
                            &pmopt->maxpatternlen,
 
265
                            (unsigned long) 30);
 
266
  gt_option_parser_add_option(op, option);
 
267
 
 
268
  option = gt_option_new_ulong("samples","Specify number of samples",
 
269
                            &pmopt->numofsamples,
 
270
                           (unsigned long) 100000);
 
271
  gt_option_parser_add_option(op, option);
 
272
 
 
273
  option = gt_option_new_bool("s","Show generated pattern",
 
274
                            &pmopt->showpatt,
 
275
                            false);
 
276
  gt_option_parser_add_option(op, option);
 
277
 
 
278
  optionbck = gt_option_new_bool("bck","Use the bucket boundaries",
 
279
                              &pmopt->usebcktab,
 
280
                              false);
 
281
  gt_option_parser_add_option(op, optionbck);
 
282
 
 
283
  optionimm = gt_option_new_bool("imm","Start with offset 0",
 
284
                              &pmopt->immediate,
 
285
                              false);
 
286
  gt_option_parser_add_option(op, optionimm);
 
287
 
 
288
  option = gt_option_new_string("ii",
 
289
                             "Specify input index",
 
290
                             pmopt->indexname, NULL);
 
291
  gt_option_parser_add_option(op, option);
 
292
  gt_option_is_mandatory(option);
 
293
 
 
294
  oprval = gt_option_parser_parse(op, parsed_args, argc, argv,
 
295
                               gt_versionfunc, err);
 
296
  gt_option_parser_delete(op);
 
297
 
 
298
  return oprval;
 
299
}
 
300
 
 
301
int gt_patternmatch(int argc, const char **argv, GtError *err)
 
302
{
 
303
  bool haserr = false;
 
304
  int parsed_args;
 
305
  Pmatchoptions pmopt;
 
306
  GtOPrval oprval;
 
307
 
 
308
  gt_error_check(err);
 
309
 
 
310
  pmopt.indexname = gt_str_new();
 
311
  oprval = parse_options(&pmopt,&parsed_args, argc, argv, err);
 
312
  if (oprval == GT_OPTION_PARSER_OK)
 
313
  {
 
314
    gt_assert(parsed_args == argc);
 
315
    if (callpatternmatcher(&pmopt,err) != 0)
 
316
    {
 
317
      haserr = true;
 
318
    }
 
319
  }
 
320
  gt_str_delete(pmopt.indexname);
 
321
  if (oprval == GT_OPTION_PARSER_REQUESTS_EXIT)
 
322
  {
 
323
    return 0;
 
324
  }
 
325
  if (oprval == GT_OPTION_PARSER_ERROR)
 
326
  {
 
327
    return -1;
 
328
  }
 
329
  return haserr ? -1 : 0;
 
330
}