1
/* $Id: pattern.c,v 1.9 2004/05/19 14:52:03 camacho Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's offical duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* Author: Ilya Dondoshansky
31
* Functions for finding pattern matches in sequence.
32
* @todo FIXME needs doxygen comments and lines shorter than 80 characters
35
static char const rcsid[] =
36
"$Id: pattern.c,v 1.9 2004/05/19 14:52:03 camacho Exp $";
38
#include <algo/blast/core/blast_def.h>
39
#include <algo/blast/core/pattern.h>
42
/*Looks for 1 bits in the same position of s and mask
43
Let rightOne be the rightmost position where s and mask both have
45
Let rightMaskOnly < rightOne be the rightmost position where mask has a 1, if any
47
returns (rightOne - rightMaskOnly) */
49
static Int4 lenof(Int4 s, Int4 mask)
51
Int4 checkingMatches = s & mask; /*look for 1 bits in same position*/
52
Int4 rightOne; /*loop index looking for 1 in checkingMatches*/
53
Int4 rightMaskOnly; /*rightnost bit that is 1 in the mask only*/
55
/*AAS Changed upper bound on loop here*/
56
for (rightOne = 0; rightOne < BITS_PACKED_PER_WORD; rightOne++) {
57
if ((checkingMatches >> rightOne) % 2 == 1)
58
return rightOne - rightMaskOnly;
59
if ((mask >> rightOne) %2 == 1)
60
rightMaskOnly = rightOne;
62
/*ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");*/
66
/* routine to find hits of pattern to sequence when sequence is proteins
67
hitArray is an array of matches to pass back
68
seq is the input sequence
69
len1 is the length of the input sequence
70
patternSearch carries variables that keep track of
72
returns the number of matches*/
73
static Int4 find_hitsS(Int4 *hitArray, const Uint1* seq, Int4 len1,
74
patternSearchItems *patternSearch)
76
Int4 i; /*loop index on sequence*/
77
Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns
78
with seq; e.g., if value is 9 = 0101 then
79
last 3 chars of seq match first 3 positions in pattern
80
and last 1 char of seq matches 1 position of pattern*/
81
Int4 numMatches = 0; /*number of matches found*/
82
Int4 mask; /*mask of input pattern positions after which
83
a match can be declared*/
84
Int4 maskShiftPlus1; /*mask shifted left 1 plus 1 */
86
mask = patternSearch->match_mask;
87
maskShiftPlus1 = (mask << 1) + 1;
88
for (i = 0; i < len1; i++) {
89
/*shift the positions matched by 1 and try to match up against
90
the next character, also allow next character to match the
92
prefixMatchedBitPattern =
93
((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
94
patternSearch->whichPositionPtr[seq[i]];
95
if (prefixMatchedBitPattern & mask) {
96
/*first part of pair is index of place in seq where match
97
ends; second part is where match starts*/
98
hitArray[numMatches++] = i;
99
hitArray[numMatches++] = i - lenof(prefixMatchedBitPattern, mask)+1;
100
if (numMatches == MAX_HIT)
102
/*ErrPostEx(SEV_WARNING, 0, 0,
103
"%ld matches saved, discarding others", numMatches);*/
111
/*find hits when sequence is DNA and pattern is short
112
returns twice the number of hits
113
pos indicates the starting position
114
len is length of sequence seq
115
hitArray stores the results*/
116
static Int4 find_hitsS_DNA(Int4* hitArray, const Uint1* seq, Int4 pos, Int4 len,
117
patternSearchItems *patternSearch)
119
/*Some variables and the algorithm are similar to what is
120
used in find_hits() above; see more detailed comments there*/
121
Uint4 prefixMatchedBitPattern; /*indicates where pattern aligns
123
Uint4 mask2; /*mask to match agaist*/
124
Int4 maskShiftPlus1; /*mask2 shifted plus 1*/
125
Uint4 tmp; /*intermediate result of masked comparisons*/
126
Int4 i; /*index on seq*/
127
Int4 end; /*count of number of 4-mer iterations needed*/
128
Int4 remain; /*0,1,2,3 DNA letters left over*/
129
Int4 j; /*index on suffixRemnant*/
130
Int4 twiceNumHits = 0; /*twice the number of hits*/
132
mask2 = patternSearch->match_mask*BITS_PACKED_PER_WORD+15;
133
maskShiftPlus1 = (patternSearch->match_mask << 1)+1;
137
prefixMatchedBitPattern = ((patternSearch->match_mask * ((1 << (pos+1))-1)*2) +
138
(1 << (pos+1))-1)& patternSearch->DNAwhichSuffixPosPtr[seq[0]];
141
remain = (len-pos) % 4;
144
prefixMatchedBitPattern = maskShiftPlus1;
148
for (i = 0; i < end; i++) {
149
if ( (tmp = (prefixMatchedBitPattern &
150
patternSearch->DNAwhichPrefixPosPtr[seq[i]]))) {
151
for (j = 0; j < 4; j++) {
152
if (tmp & patternSearch->match_mask) {
153
hitArray[twiceNumHits++] = i*4+j + pos;
154
hitArray[twiceNumHits++] = i*4+j + pos -lenof(tmp & patternSearch->match_mask,
155
patternSearch->match_mask)+1;
160
prefixMatchedBitPattern = (((prefixMatchedBitPattern << 4) | mask2) & patternSearch->DNAwhichSuffixPosPtr[seq[i]]);
162
/* In the last byte check bits only up to 'remain' */
163
if ( (tmp = (prefixMatchedBitPattern &
164
patternSearch->DNAwhichPrefixPosPtr[seq[i]]))) {
165
for (j = 0; j < remain; j++) {
166
if (tmp & patternSearch->match_mask) {
167
hitArray[twiceNumHits++] = i*4+j + pos;
168
hitArray[twiceNumHits++] = i*4+j + pos - lenof(tmp & patternSearch->match_mask, patternSearch->match_mask)+1;
176
/*Top level routine when pattern has a short description
177
hitArray is to return the hits
178
seq is the input sequence
179
start is what position to start at in seq
180
len is the length of seq
181
is_dna is 1 if and only if seq is a DNA sequence
182
the return value from the appropriate lower level routine is passed
184
static Int4 find_hitsS_head(Int4* hitArray, const Uint1* seq, Int4 start, Int4 len,
185
Uint1 is_dna, patternSearchItems *patternSearch)
188
return find_hitsS_DNA(hitArray, &seq[start/4], start % 4, len, patternSearch);
189
return find_hitsS(hitArray, &seq[start], len, patternSearch);
192
/*Shift each word in the array left by 1 bit and add bit b,
193
if the new values is bigger that OVERFLOW1, then subtract OVERFLOW1 */
194
static void lmove(Int4 *a, Uint1 b, patternSearchItems *patternSearch)
197
Int4 i; /*index on words*/
198
for (i = 0; i < patternSearch->numWords; i++) {
200
if (x >= OVERFLOW1) {
201
a[i] = x - OVERFLOW1;
211
/*Do a word-by-word bit-wise or of a and b and put the result back in a*/
212
static void or(Int4 *a, Int4 *b, patternSearchItems *patternSearch)
214
Int4 i; /*index over words*/
215
for (i = 0; i < patternSearch->numWords; i++)
216
a[i] = (a[i] | b[i]);
219
/*Do a word-by-word bit-wise or of a and b and put the result in
220
result; return 1 if there are any non-zero words*/
221
static Int4 and(Int4 *result, Int4 *a, Int4 *b, patternSearchItems *patternSearch)
223
Int4 i; /*index over words*/
224
Int4 returnValue = 0;
226
for (i = 0; i < patternSearch->numWords; i++)
227
if ( (result[i] = (a[i] & b[i]) ) )
232
/*Let F be the number of the first bit in s that is 1
233
Let G be the first bit in mask that is one such that G < F;
236
static Int4 lenofL(Int4 *s, Int4 *mask, patternSearchItems *patternSearch)
238
Int4 bitIndex; /*loop index over bits in a word*/
239
Int4 wordIndex; /*loop index over words*/
243
for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
244
for (bitIndex = 0; bitIndex < BITS_PACKED_PER_WORD; bitIndex++) {
245
if ((s[wordIndex] >> bitIndex) % 2 == 1)
246
return wordIndex*BITS_PACKED_PER_WORD+bitIndex-firstOneInMask;
247
if ((mask[wordIndex] >> bitIndex) %2 == 1)
248
firstOneInMask = wordIndex*BITS_PACKED_PER_WORD+bitIndex;
251
/*ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");*/
255
/* Finds places where pattern matches seq and returns them as
256
pairs of positions in consecutive entries of hitArray;
257
similar to find_hitsS
258
hitArray is array of hits to return
259
seq is the input sequence and len1 is its length
260
patternSearch carries all the pattern variables
261
return twice the number of hits*/
262
static Int4 find_hitsL(Int4 *hitArray, const Uint1* seq, Int4 len1,
263
patternSearchItems *patternSearch)
265
Int4 wordIndex; /*index on words in mask*/
266
Int4 i; /*loop index on seq */
267
Int4 *prefixMatchedBitPattern; /*see similar variable in
269
Int4 twiceNumHits = 0; /*counter for hitArray*/
270
Int4 *mask; /*local copy of match_maskL version of pattern
271
indicates after which positions a match can be declared*/
272
Int4 *matchResult; /*Array of words to hold the result of the
273
final test for a match*/
275
matchResult = (Int4 *) calloc(patternSearch->numWords, sizeof(Int4));
276
mask = (Int4 *) calloc(patternSearch->numWords, sizeof(Int4));
277
prefixMatchedBitPattern = (Int4 *) calloc(patternSearch->numWords, sizeof(Int4));
278
for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
279
mask[wordIndex] = patternSearch->match_maskL[wordIndex];
280
prefixMatchedBitPattern[wordIndex] = 0;
282
/*This is a multiword version of the algorithm in find_hitsS*/
283
lmove(mask, 1, patternSearch);
284
for (i = 0; i < len1; i++) {
285
lmove(prefixMatchedBitPattern, 0, patternSearch);
286
or(prefixMatchedBitPattern, mask, patternSearch);
287
and(prefixMatchedBitPattern, prefixMatchedBitPattern, patternSearch->bitPatternByLetter[seq[i]], patternSearch);
288
if (and(matchResult, prefixMatchedBitPattern, patternSearch->match_maskL, patternSearch)) {
289
hitArray[twiceNumHits++] = i;
290
hitArray[twiceNumHits++] = i-lenofL(matchResult, patternSearch->match_maskL, patternSearch)+1;
293
sfree(prefixMatchedBitPattern);
299
/*find matches when pattern is very long,
300
hitArray is used to pass back pairs of end position. start position for hits
301
seq is the sequence; len is its length
302
is_dna indicates if the sequence is DNA or protein*/
303
static Int4 find_hitsLL(Int4 *hitArray, const Uint1* seq, Int4 len, Boolean is_dna,
304
patternSearchItems *patternSearch)
306
Int4 twiceNumHits; /*twice the number of matches*/
307
Int4 twiceHitsOneCall; /*twice the number of hits in one call to
309
Int4 wordIndex; /*index over words in pattern*/
310
Int4 start; /*start position in sequence for calls to find_hitsS */
311
Int4 hitArray1[MAX_HIT]; /*used to get hits against different words*/
312
Int4 nextPosInHitArray; /*next available position in hitArray1 */
313
Int4 hitIndex1, hitIndex2; /*indices over hitArray1*/
315
patternSearch->whichPositionPtr =
316
patternSearch->SLL[patternSearch->whichMostSpecific];
317
patternSearch->match_mask =
318
patternSearch->match_maskL[patternSearch->whichMostSpecific];
320
patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[patternSearch->whichMostSpecific];
321
patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[patternSearch->whichMostSpecific];
323
/*find matches to most specific word of pattern*/
324
twiceNumHits = find_hitsS_head(hitArray, seq, 0, len, is_dna, patternSearch);
325
if (twiceNumHits < 2)
327
/*extend matches word by word*/
328
for (wordIndex = patternSearch->whichMostSpecific+1;
329
wordIndex < patternSearch->numWords; wordIndex++) {
330
patternSearch->whichPositionPtr =
331
patternSearch->SLL[wordIndex];
332
patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
334
patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[wordIndex];
335
patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[wordIndex];
337
nextPosInHitArray = 0;
338
for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
339
twiceHitsOneCall = find_hitsS_head(&hitArray1[nextPosInHitArray], seq,
340
hitArray[hitIndex2]+1, MIN(len-hitArray[hitIndex2]-1,
341
patternSearch->spacing[wordIndex-1] + patternSearch->numPlacesInWord[wordIndex]), is_dna, patternSearch);
342
for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
343
hitArray1[nextPosInHitArray+hitIndex1] =
344
hitArray[hitIndex2]+hitArray1[nextPosInHitArray+hitIndex1]+1;
345
hitArray1[nextPosInHitArray+hitIndex1+1] = hitArray[hitIndex2+1];
347
nextPosInHitArray += twiceHitsOneCall;
349
twiceNumHits = nextPosInHitArray;
350
if (twiceNumHits < 2)
352
/*copy back matches that extend */
353
for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
354
hitArray[hitIndex2] = hitArray1[hitIndex2];
356
/*extend each match back one word at a time*/
357
for (wordIndex = patternSearch->whichMostSpecific-1; wordIndex >=0;
359
patternSearch->whichPositionPtr =
360
patternSearch->SLL[wordIndex];
361
patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
363
patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[wordIndex];
364
patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[wordIndex];
366
nextPosInHitArray = 0;
367
for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
368
start = hitArray[hitIndex2+1]-patternSearch->spacing[wordIndex]-patternSearch->numPlacesInWord[wordIndex];
371
twiceHitsOneCall = find_hitsS_head(&hitArray1[nextPosInHitArray], seq, start,
372
hitArray[hitIndex2+1]-start, is_dna, patternSearch);
373
for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
374
hitArray1[nextPosInHitArray+hitIndex1] = hitArray[hitIndex2];
375
hitArray1[nextPosInHitArray+hitIndex1+1] = start +
376
hitArray1[nextPosInHitArray+hitIndex1+1];
378
nextPosInHitArray += twiceHitsOneCall;
380
twiceNumHits = nextPosInHitArray;
381
if (twiceNumHits < 2)
383
/*copy back matches that extend*/
384
for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
385
hitArray[hitIndex2] = hitArray1[hitIndex2];
390
Int4 FindPatternHits(Int4 *hitArray, const Uint1* seq, Int4 len,
391
Boolean is_dna, patternSearchItems * patternSearch)
393
if (patternSearch->flagPatternLength == ONE_WORD_PATTERN)
394
return find_hitsS_head(hitArray, seq, 0, len, is_dna, patternSearch);
395
if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN)
396
return find_hitsL(hitArray, seq, len, patternSearch);
397
return find_hitsLL(hitArray, seq, len, is_dna, patternSearch);