1
/* $Id: blast_dust.c,v 1.37 2005/11/16 14:27:03 madden Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ==========================================================================
26
* Authors: Richa Agarwala (based upon versions variously worked upon by Roma
27
* Tatusov, John Kuzio, and Ilya Dondoshansky).
29
* ==========================================================================
32
/** @file blast_dust.c
33
* A utility to find low complexity NA regions. This parallels functionality
34
* of dust.c from the C toolkit, but without using the structures generated
38
#ifndef SKIP_DOXYGEN_PROCESSING
39
static char const rcsid[] =
40
"$Id: blast_dust.c,v 1.37 2005/11/16 14:27:03 madden Exp $";
41
#endif /* SKIP_DOXYGEN_PROCESSING */
43
#include <algo/blast/core/blast_dust.h>
44
#include <algo/blast/core/blast_util.h>
45
#include <algo/blast/core/blast_encoding.h>
46
#include <algo/blast/core/blast_filter.h>
49
/** Declared in blast_def.h as extern const. */
50
const int kDustLevel = 20;
51
const int kDustWindow = 64;
52
const int kDustLinker = 1;
55
/* local, file scope, structures and variables */
58
* @todo expand documentation
60
typedef struct DREGION {
66
* @todo expand documentation
68
typedef struct DCURLOC {
69
Int4 cursum, curstart, curend;
77
static void wo (Int4, Uint1*, Int4, DCURLOC*, Uint1*, Uint1, Int4);
78
static Boolean wo1 (Int4, Uint1*, Int4, DCURLOC*);
79
static Int4 dust_triplet_find (Uint1*, Int4, Int4, Uint1*);
81
/* entry point for dusting */
83
static Int4 dust_segs (Uint1* sequence, Int4 length, Int4 start,
85
Int4 level, Int4 windowsize, Int4 linker)
90
DREGION* regold = NULL;
94
/* defaults are more-or-less in keeping with original dust */
95
if (level < 2 || level > 64) level = kDustLevel;
96
if (windowsize < 8 || windowsize > 64) windowsize = kDustWindow;
97
if (linker < 1 || linker > 32) linker = kDustLinker;
100
seq = (Uint1*) calloc(1, windowsize); /* triplets */
104
SUM_THRESHOLD = (Uint1 *) calloc(windowsize, sizeof(Uint1));
105
if (!SUM_THRESHOLD) {
108
SUM_THRESHOLD[0] = 1;
109
for (i=1; i < windowsize; i++)
110
SUM_THRESHOLD[i] = (level * i)/10;
112
if (length < windowsize) windowsize = length;
114
/* Consider smaller windows in beginning of the sequence */
115
for (i = 2; i <= windowsize-1; i++) {
117
wo (len, sequence, 0, &cloc, seq, 1, level);
119
if (cloc.cursum*10 > level*cloc.curlength) {
121
regold->to + linker >= cloc.curstart+start &&
122
regold->from <= cloc.curend + start + linker) {
123
/* overlap windows nicely if needed */
124
if (regold->to < cloc.curend + start)
125
regold->to = cloc.curend + start;
126
if (regold->from > cloc.curstart + start)
127
regold->from = cloc.curstart + start;
129
/* new window or dusted regions do not overlap */
130
reg->from = cloc.curstart + start;
131
reg->to = cloc.curend + start;
133
reg = (DREGION*) calloc(1, sizeof(DREGION));
136
sfree(SUM_THRESHOLD);
143
} /* end 'if' high score */
146
for (i = 1; i < length-2; i++) {
147
len = (Int4) ((length > i+windowsize) ? windowsize : length-i);
149
if (length >= i+windowsize)
150
wo (len, sequence, i, &cloc, seq, 2, level);
151
else /* remaining portion of sequence is less than windowsize */
152
wo (len, sequence, i, &cloc, seq, 3, level);
154
if (cloc.cursum*10 > level*cloc.curlength) {
156
regold->to + linker >= cloc.curstart+i+start &&
157
regold->from <= cloc.curend + i + start + linker) {
158
/* overlap windows nicely if needed */
159
if (regold->to < cloc.curend + i + start)
160
regold->to = cloc.curend + i + start;
161
if (regold->from > cloc.curstart + i + start)
162
regold->from = cloc.curstart + i + start;
164
/* new window or dusted regions do not overlap */
165
reg->from = cloc.curstart + i + start;
166
reg->to = cloc.curend + i + start;
168
reg = (DREGION*) calloc(1, sizeof(DREGION));
171
sfree(SUM_THRESHOLD);
178
} /* end 'if' high score */
181
sfree(SUM_THRESHOLD);
185
static void wo (Int4 len, Uint1* seq_start, Int4 iseg, DCURLOC* cloc,
186
Uint1* seq, Uint1 FIND_TRIPLET, Int4 level)
188
Int4 smaller_window_start, mask_window_end;
189
Boolean SINGLE_TRIPLET;
199
/* get the chunk of sequence in triplets */
200
if (FIND_TRIPLET==1) /* Append one */
202
seq[len-1] = seq[len] = seq[len+1] = 0;
203
dust_triplet_find (seq_start, iseg+len-1, 1, seq+len-1);
205
if (FIND_TRIPLET==2) /* Copy suffix as prefix and find one */
207
memmove(seq,seq+1,(len-1)*sizeof(Uint1));
208
seq[len-1] = seq[len] = seq[len+1] = 0;
209
dust_triplet_find (seq_start, iseg+len-1, 1, seq+len-1);
211
if (FIND_TRIPLET==3) /* Copy suffix */
212
memmove(seq,seq+1,len*sizeof(Uint1));
215
SINGLE_TRIPLET = wo1 (len, seq, 0, cloc); /* dust at start of window */
217
/* consider smaller windows only if anything interesting
218
found for starting position and smaller windows have potential of
219
being at higher level */
220
if ((cloc->cursum*10 > level*cloc->curlength) && (!SINGLE_TRIPLET)) {
221
mask_window_end = cloc->curend-1;
222
smaller_window_start = 1;
223
while ((smaller_window_start < mask_window_end) &&
225
SINGLE_TRIPLET = wo1(mask_window_end-smaller_window_start,
226
seq+smaller_window_start, smaller_window_start, cloc);
227
smaller_window_start++;
231
cloc->curend += cloc->curstart;
234
/** returns TRUE if there is single triplet in the sequence considered */
235
static Boolean wo1 (Int4 len, Uint1* seq, Int4 iwo, DCURLOC* cloc)
242
Uint1 triplet_count = 0;
244
memset (counts, 0, sizeof (counts));
245
/* zero everything */
248
/* dust loop -- specific for triplets */
249
for (loop = 0; loop < len; loop++)
251
countsptr = &counts[*seq++];
254
sum += (Uint4)(*countsptr);
256
if (sum >= SUM_THRESHOLD[loop])
258
if ((Uint4)cloc->cursum*loop < sum*cloc->curlength)
261
cloc->curlength = loop;
262
cloc->curstart = iwo;
263
cloc->curend = loop + 2; /* triplets */
272
if (triplet_count > 1)
277
/** Fill an array with 2-bit encoded triplets.
278
* @param seq_start Pointer to the start of the sequence in blastna
280
* @param icur Offset at which to start extracting triplets [in]
281
* @param max Maximal length of the sequence segment to be processed [in]
282
* @param s1 Array of triplets [out]
283
* @return How far was the sequence processed?
286
dust_triplet_find (Uint1* seq_start, Int4 icur, Int4 max, Uint1* s1)
291
Uint1* seq = &seq_start[icur];
292
Uint1 end_byte = NCBI4NA_TO_BLASTNA[NULLB];
300
if ((c = *seq++) == end_byte)
307
if ((c = *seq++) == end_byte)
313
/* triplet fill loop */
314
while (n < max && (c = *seq++) != end_byte) {
330
/** Look for dustable locations */
332
GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg)
334
BlastSeqLoc* tail = NULL; /* pointer to tail of loc linked list */
341
/* point to dusted locations */
344
for (i = 0; reg && i < nreg; i++) {
345
/* Cache the tail of the list to avoid the overhead of traversing the
346
* list when appending to it */
347
tail = BlastSeqLocNew(tail ? &tail : loc, reg->from, reg->to);
353
* additional error checking added below results in unit test failures!
356
/* point to dusted locations */
359
for (i = 0; reg && i < nreg; i++) {
360
if (BlastSeqLocNew(loc, reg->from, reg->to) == NULL) {
365
/* either BlastSeqLocNew failed or nreg didn't match the number of
366
* elements in reg, so return error */
368
*loc = BlastSeqLocFree(*loc);
376
Int2 SeqBufferDust (Uint1* sequence, Int4 length, Int4 offset,
377
Int2 level, Int2 window, Int2 linker,
378
BlastSeqLoc** dust_loc)
380
DREGION* reg,* regold;
384
/* place for dusted regions */
385
regold = reg = (DREGION*) calloc(1, sizeof(DREGION));
389
nreg = dust_segs (sequence, length, offset, reg, (Int4)level,
390
(Int4)window, (Int4)linker);
392
status = GetDustLocations(dust_loc, reg, nreg);
394
/* clean up memory */