2
Copyright (c) 2008 Stefan Kurtz <kurtz@zbh.uni-hamburg.de>
3
Copyright (c) 2008 Center for Bioinformatics, University of Hamburg
5
Permission to use, copy, modify, and distribute this software for any
6
purpose with or without fee is hereby granted, provided that the above
7
copyright notice and this permission notice appear in all copies.
9
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20
#include "core/defined-types.h"
21
#include "core/divmodmul.h"
23
#include "core/intbits.h"
24
#include "core/mathsupport.h"
25
#include "core/minmax.h"
26
#include "core/str_api.h"
27
#include "core/types_api.h"
28
#include "core/unused_api.h"
29
#include "core/xansi_api.h"
32
#include "tyr-mersplit.h"
34
#define BUCKETSUFFIX ".mbd"
35
#define MAXUCHARVALUEWITHBITS(BITNUM) ((1 << (BITNUM)) - 1)
36
#define ISBOUNDDEFINED(UDB,IDX) GT_ISIBITSET(UDB,IDX)
37
#define SETDEFINEDBOUND(UDB,IDX) GT_SETIBIT(UDB,IDX)
41
void *mappedmbdfileptr;
42
unsigned int prefixlength;
43
unsigned long numofcodes,
51
const GtUchar *leftmer,
55
static unsigned long extractprefixbytecode(unsigned long merbytes,
56
unsigned int prefixlength,
57
const GtUchar *bytecode)
59
unsigned long idx, code = 0;
61
for (idx=0; idx < MIN((unsigned long) sizeof (unsigned long),merbytes); idx++)
63
code = (code << 8) | bytecode[idx];
64
if (GT_MULT4(idx+1) == (unsigned long) prefixlength)
68
if (GT_MULT4(idx+1) > (unsigned long) prefixlength)
70
code >>= GT_MULT2(GT_MULT4(idx+1) - prefixlength);
77
static GtUchar extractremainingbytes(const GtUchar remainmask,
78
unsigned long byteoffset,
79
const GtUchar *bytecode)
81
return bytecode[byteoffset] & remainmask;
84
static const GtUchar *remainingleftmost(unsigned long merbytes,
85
unsigned long byteoffset,
88
const GtUchar *leftptr,
89
const GtUchar *rightptr)
93
const GtUchar *midptr;
95
while (leftptr + merbytes < rightptr)
97
len = (unsigned long) (rightptr-leftptr)/GT_MULT2(merbytes);
98
midptr = leftptr + merbytes * len;
99
midcode = extractremainingbytes(remainmask,byteoffset,midptr);
111
static const GtUchar *remainingrightmost(unsigned long merbytes,
112
unsigned long byteoffset,
115
const GtUchar *leftptr,
116
const GtUchar *rightptr)
120
const GtUchar *midptr;
122
while (leftptr + merbytes < rightptr)
124
len = (unsigned long) (rightptr-leftptr)/GT_MULT2(merbytes);
125
midptr = leftptr + merbytes * len;
126
midcode = extractremainingbytes(remainmask,byteoffset,midptr);
138
static bool remainadvance(Merbounds *merbounds,
139
unsigned long merbytes,
140
unsigned long byteoffset,
142
const GtUchar *searchbytecode,
143
const GtUchar *leftptr,
144
const GtUchar *rightptr)
146
GtUchar scode, scodeleft, scoderight;
148
scode = extractremainingbytes(remainmask,byteoffset,searchbytecode);
149
scodeleft = extractremainingbytes(remainmask,byteoffset,leftptr);
150
if (scode > scodeleft)
152
scoderight = extractremainingbytes(remainmask,byteoffset,rightptr);
153
if (scode > scoderight)
157
merbounds->leftmer = remainingleftmost(merbytes,
164
if (scode < scodeleft)
168
if (scode == scodeleft)
170
merbounds->leftmer = leftptr;
172
scoderight = extractremainingbytes(remainmask,byteoffset,rightptr);
173
if (scode >= scoderight)
175
merbounds->rightmer = rightptr;
178
merbounds->rightmer = remainingrightmost(merbytes,
188
const GtUchar *gt_searchinbuckets(const Tyrindex *tyrindex,
189
const Tyrbckinfo *tyrbckinfo,
190
const GtUchar *bytecode)
192
const GtUchar *result;
193
unsigned long prefixcode, leftbound, merbytes;
195
gt_assert(tyrbckinfo != NULL);
196
merbytes = gt_tyrindex_merbytes(tyrindex);
197
prefixcode = extractprefixbytecode(merbytes,
198
tyrbckinfo->prefixlength,
200
leftbound = tyrbckinfo->bounds[prefixcode];
201
if (ISBOUNDDEFINED(tyrbckinfo->boundisdefined,prefixcode))
203
const GtUchar *mertable;
204
unsigned long rightbound;
206
mertable = gt_tyrindex_mertable(tyrindex);
207
rightbound = tyrbckinfo->bounds[prefixcode+1] - merbytes;
208
if (GT_MOD4(tyrbckinfo->prefixlength) == 0)
210
result = gt_tyrindex_binmersearch(tyrindex,
212
GT_DIV4(tyrbckinfo->prefixlength),
214
mertable + leftbound,
215
mertable + rightbound);
220
merbounds.leftmer = merbounds.rightmer = NULL;
221
if (!remainadvance(&merbounds,
223
(unsigned long) GT_DIV4(tyrbckinfo->prefixlength),
224
tyrbckinfo->remainmask,
226
mertable + leftbound,
227
mertable + rightbound) ||
228
merbounds.leftmer == NULL ||
229
merbounds.leftmer > merbounds.rightmer)
234
result = gt_tyrindex_binmersearch(tyrindex,
235
1UL + (unsigned long)
236
GT_DIV4(tyrbckinfo->prefixlength),
249
static const GtUchar *findrightmostmer(unsigned long merbytes,
250
unsigned int prefixlength,
252
const GtUchar *leftptr,
253
const GtUchar *rightptr)
255
unsigned long len, midcode;
256
const GtUchar *midptr;
258
while (leftptr + merbytes < rightptr)
260
len = (unsigned long) (rightptr-leftptr)/GT_MULT2(merbytes);
261
midptr = leftptr + merbytes * len;
262
midcode = extractprefixbytecode(merbytes,prefixlength,midptr);
274
static void splitmerinterval(Tyrbckinfo *tyrbckinfo,
275
const Tyrindex *tyrindex)
278
const GtUchar *rightbound, *leftptr, *rightptr, *mertable, *lastmer;
279
unsigned long code, leftcode, rightcode, merbytes;
281
mertable = gt_tyrindex_mertable(tyrindex);
282
lastmer = gt_tyrindex_lastmer(tyrindex);
283
merbytes = gt_tyrindex_merbytes(tyrindex);
286
rightcode = extractprefixbytecode(merbytes,
287
tyrbckinfo->prefixlength,
291
leftcode = extractprefixbytecode(merbytes,
292
tyrbckinfo->prefixlength,
294
tyrbckinfo->bounds[leftcode] = (unsigned long) (leftptr - mertable);
295
SETDEFINEDBOUND(tyrbckinfo->boundisdefined,leftcode);
296
if (leftcode == rightcode)
300
rightbound = findrightmostmer(merbytes,
301
tyrbckinfo->prefixlength,
302
leftcode,leftptr,rightptr);
303
leftptr = rightbound + merbytes;
305
tyrbckinfo->bounds[tyrbckinfo->numofcodes]
306
= (unsigned long) (lastmer + merbytes - mertable);
307
SETDEFINEDBOUND(tyrbckinfo->boundisdefined,tyrbckinfo->numofcodes);
308
for (code = tyrbckinfo->numofcodes - 1; /* Nothing */ ; code--)
310
if (!ISBOUNDDEFINED(tyrbckinfo->boundisdefined,code))
312
tyrbckinfo->bounds[code] = tyrbckinfo->bounds[code+1];
321
int gt_constructmerbuckets(const char *inputindex,
322
const Definedunsignedint *callprefixlength,
326
Tyrbckinfo tyrbckinfo;
327
FILE *bucketfp = NULL;
331
tyrbckinfo.bounds = NULL;
332
tyrbckinfo.boundisdefined = NULL;
333
tyrindex = gt_tyrindex_new(inputindex,err);
334
if (tyrindex == NULL)
338
if (!haserr && tyrindex != NULL && !gt_tyrindex_isempty(tyrindex))
340
if (gt_determinetyrbckpfxlen(&tyrbckinfo.prefixlength,
348
if (!haserr && tyrindex != NULL && !gt_tyrindex_isempty(tyrindex))
350
printf("# construct mer buckets for prefixlength %u\n",
351
tyrbckinfo.prefixlength);
352
tyrbckinfo.numofcodes
353
= gt_power_for_small_exponents(gt_tyrindex_alphasize(tyrindex),
354
tyrbckinfo.prefixlength);
355
tyrbckinfo.mappedmbdfileptr = NULL;
356
printf("# numofcodes = %lu\n",tyrbckinfo.numofcodes);
357
gt_tyrindex_show(tyrindex);
358
ALLOCASSIGNSPACE(tyrbckinfo.bounds,NULL,unsigned long,
359
tyrbckinfo.numofcodes+1);
360
GT_INITBITTAB(tyrbckinfo.boundisdefined,tyrbckinfo.numofcodes+1);
361
splitmerinterval(&tyrbckinfo,tyrindex);
362
bucketfp = gt_fa_fopen_with_suffix(inputindex,BUCKETSUFFIX,"wb",err);
363
if (bucketfp == NULL)
368
if (!haserr && tyrindex != NULL && !gt_tyrindex_isempty(tyrindex))
370
unsigned long pl_long = (unsigned long) tyrbckinfo.prefixlength;
371
gt_assert(bucketfp != NULL);
372
gt_xfwrite(&pl_long, sizeof (pl_long), (size_t) 1, bucketfp);
374
if (!haserr && tyrindex != NULL && !gt_tyrindex_isempty(tyrindex))
376
gt_assert(bucketfp != NULL);
377
gt_xfwrite(tyrbckinfo.bounds, sizeof (*tyrbckinfo.bounds),
378
(size_t) (tyrbckinfo.numofcodes+1), bucketfp);
380
if (!haserr && tyrindex != NULL && !gt_tyrindex_isempty(tyrindex))
382
gt_assert(bucketfp != NULL);
383
gt_xfwrite(tyrbckinfo.boundisdefined, sizeof (*tyrbckinfo.boundisdefined),
384
GT_NUMOFINTSFORBITS(tyrbckinfo.numofcodes+1), bucketfp);
386
gt_fa_xfclose(bucketfp);
387
if (tyrindex != NULL)
389
gt_tyrindex_delete(&tyrindex);
391
FREESPACE(tyrbckinfo.bounds);
392
gt_free(tyrbckinfo.boundisdefined);
393
return haserr ? -1 : 0;
396
Tyrbckinfo *gt_tyrbckinfo_new(const char *tyrindexname,unsigned int alphasize,
400
Tyrbckinfo *tyrbckinfo;
403
ALLOCASSIGNSPACE(tyrbckinfo,NULL,Tyrbckinfo,1);
404
tyrbckinfo->mappedmbdfileptr = gt_fa_mmap_read_with_suffix(tyrindexname,
407
if (tyrbckinfo->mappedmbdfileptr == NULL)
413
unsigned long pl_long;
415
gt_assert(tyrbckinfo->mappedmbdfileptr != NULL);
416
pl_long = *((unsigned long *) tyrbckinfo->mappedmbdfileptr);
417
tyrbckinfo->prefixlength = (unsigned int) pl_long;
418
tyrbckinfo->numofcodes
419
= gt_power_for_small_exponents(alphasize,tyrbckinfo->prefixlength);
420
/*check if numofbytes == expected size*/
421
gt_assert(numofbytes == sizeof (unsigned long) *
423
(tyrbckinfo->numofcodes+1) +
424
GT_NUMOFINTSFORBITS(tyrbckinfo->numofcodes + 1)));
425
tyrbckinfo->bounds = ((unsigned long *) tyrbckinfo->mappedmbdfileptr) + 1;
426
tyrbckinfo->boundisdefined
427
= tyrbckinfo->bounds + tyrbckinfo->numofcodes + 1;
428
if (tyrbckinfo->prefixlength > 0 && GT_MOD4(tyrbckinfo->prefixlength) > 0)
430
tyrbckinfo->remainmask
431
= (GtUchar) MAXUCHARVALUEWITHBITS(GT_MULT2(
432
4U - GT_MOD4(tyrbckinfo->prefixlength)));
437
FREESPACE(tyrbckinfo);
443
void gt_tyrbckinfo_delete(Tyrbckinfo **tyrbckinfoptr)
445
Tyrbckinfo *tyrbckinfo = *tyrbckinfoptr;
447
gt_fa_xmunmap(tyrbckinfo->mappedmbdfileptr);
448
tyrbckinfo->mappedmbdfileptr = NULL;
449
FREESPACE(tyrbckinfo);
450
*tyrbckinfoptr = NULL;