1
/* Copyright (c) 1991 Sun Wu and Udi Manber. All Rights Reserved. */
6
#define CHARTYPE unsigned char
11
#define BLOCKSIZE 8192
12
#define MAX_SHIFT_2 4096
16
#define MAXMEMBER_1 65536
21
extern COUNT, FNAME, SILENT, FILENAMEONLY, num_of_matched;
22
extern DNA ; /* DNA flag is set in checksg when pattern is DNA pattern and
24
extern WORDBOUND, WHOLELINE, NOUPPER;
25
extern unsigned char CurrentFileName[], Progname[];
26
extern unsigned Mask[];
27
extern unsigned endposition;
29
unsigned char BSize; /* log_c m */
30
unsigned char char_map[MAXSYM];
35
CHARTYPE SHIFT[MAXSYM];
36
CHARTYPE MEMBER[MAXMEMBER];
37
CHARTYPE pat[MAXPATT];
39
char MEMBER_1[MAXMEMBER_1];
47
unsigned char temp[MAXPATT];
48
for(i=0; i<MAXSYM; i++) TR[i] = i;
50
for(i='A'; i<= 'Z'; i++) TR[i] = i + 'a' - 'A';
52
if(WORDBOUND) { /* SUN: To be added to be more complete */
53
for(i=0; i<128; i++) {
54
if(!isalnum(i)) TR[i] = W_DELIM;
58
memcpy(temp, pat, *m);
60
memcpy(pat+1, temp, *m);
68
CHARTYPE *pat; int fd, m, D;
70
CHARTYPE text[BLOCKSIZE+2*MAXLINE+MAXPATT]; /* input text stream */
71
int offset = 2*MAXLINE;
72
int buf_end, num_read, i, start, end, residue = 0;
73
if(pat[0] == '^' || pat[0] == '$') pat[0] = '\n';
74
if(pat[m-1] == '^' || pat[m-1] == '$') pat[m-1] = '\n';
75
char_tr(pat, &m); /* will change pat, and m if WHOLELINE is ON */
76
text[offset-1] = '\n'; /* initial case */
77
for(i=0; i < MAXLINE; i++) text[i] = 0; /* security zone */
79
if(WHOLELINE) start--;
81
fprintf(stderr, "%s: pattern too long\n", Progname);
85
if(m > LONG_EXAC) m_preprocess(pat);
88
else if (DNA) prep4(pat, m);
89
else if(m >= LONG_APPX) am_preprocess(pat);
92
initmask(pat, Mask, m, 0, &endposition);
94
for(i=1; i<=m; i++) text[BLOCKSIZE+offset+i] = pat[m-1];
95
/* to make sure the skip loop in bm() won't go out of bound */
96
while( (num_read = read(fd, text+offset, BLOCKSIZE)) > 0)
98
buf_end = end = offset + num_read -1 ;
99
while(text[end] != '\n' && end > offset) end--;
100
residue = buf_end - end + 1 ;
101
text[start-1] = '\n';
103
if(m > LONG_EXAC) monkey(pat, m, text+start, text+end);
104
else bm(pat, m, text+start, text+end);
107
if(DNA) monkey4( pat, m, text+start, text+end, D );
109
if(m >= LONG_APPX) a_monkey(pat, m, text+start, text+end, D);
110
else agrep(pat, m, text+start, text+end, D);
113
if(FILENAMEONLY && num_of_matched) {
114
printf("%s\n", CurrentFileName);
116
start = offset - residue ;
117
if(start < MAXLINE) {
120
strncpy(text+start, text+end, residue);
122
} /* end of while(num_read = ... */
126
/* SUN: bm assumes that the content of text[n]...text[n+m-1] is
127
pat[m-1] such that the skip loop is guaranteed to terminated */
129
bm(pat, m, text, textend)
130
CHARTYPE *text, *textend, *pat; int m;
133
register int m1, j, d1;
136
printf("%d\t", textend - text);
137
printf("%c, %c", *text, *textend);
140
d1 = shift_1; /* at least 1 */
143
while (text <= textend) {
144
shift = SHIFT[*(text += shift)];
146
shift = SHIFT[*(text += shift)];
147
shift = SHIFT[*(text += shift)];
148
shift = SHIFT[*(text += shift)];
151
while(TR[pat[m1 - j]] == TR[*(text - j)]) {
152
if(++j == m) break; /* if statement can be
153
saved, but for safty ... */
156
if(text > textend) return;
158
if(TR[*(text+1)] != W_DELIM) goto CONT;
159
if(TR[*(text-m)] != W_DELIM) goto CONT;
162
if(FILENAMEONLY) return;
164
if(FNAME) printf("%s: ", CurrentFileName);
165
while(*(--text) != '\n');
166
while(*(++text) != '\n') putchar(*(text));
169
else { while(*text != '\n') text++; }
179
/* initmask() initializes the mask table for the pattern */
180
/* endposition is a mask for the endposition of the pattern */
181
/* endposition will contain k mask bits if the pattern contains k fragments */
182
initmask(pattern, Mask, m, D, endposition)
183
CHARTYPE *pattern; unsigned *Mask; register int m, D; unsigned *endposition;
185
register unsigned Bit1, c;
186
register int i, j, frag_num;
188
Bit1 = 1 << 31; /* the first bit of Bit1 is 1, others 0. */
189
frag_num = D+1; *endposition = 0;
190
for (i = 0; i < frag_num; i++) *endposition = *endposition | (Bit1 >> i);
191
*endposition = *endposition >> (m - frag_num);
192
for(i = 0; i < m; i++)
193
if (pattern[i] == '^' || pattern[i] == '$') {
196
for(i = 0; i < MAXSYM; i++) Mask[i] = ~0;
197
for(i = 0; i < m; i++) /* initialize the mask table */
199
for ( j = 0; j < m; j++)
200
if( c == pattern[j] )
201
Mask[c] = Mask[c] & ~( Bit1 >> j ) ;
205
prep(Pattern, M, D) /* preprocessing for partitioning_bm */
206
CHARTYPE *Pattern; /* can be fine-tuned to choose a better partition */
209
register int i, j, k, p, shift;
211
unsigned hash, b_size = 3;
214
for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;
215
for (i = M-1; i>=p ; i--) {
218
if(SHIFT[hash] > shift) SHIFT[hash] = shift;
221
for(i=0; i<M; i++) printf(" %d,", SHIFT[Pattern[i]]);
225
for(i=0; i<D+1; i++) {
229
if(Pattern[j-k] == Pattern[M-1-m*p])
230
if(k < shift_1) shift_1 = k;
234
printf("\nshift_1 = %d", shift_1);
236
if(shift_1 == 0) shift_1 = 1;
237
for(i=0; i<MAXMEMBER; i++) MEMBER[i] = 0;
238
if (m < 3) b_size = m;
239
for(i=0; i<D+1; i++) {
242
for(k=0; k<b_size; k++) {
243
hash = (hash << 2) + Pattern[j-k];
246
printf(" hash = %d,", hash);
253
agrep( pat, M, text, textend, D )
254
int M, D ; register CHARTYPE *text, *textend, *pat;
257
register int m = M/(D+1);
258
register CHARTYPE *textstart;
259
register int shift, HASH;
262
int Candidate[MaxCan][2], round, lastend=0;
263
unsigned R1[MaxError+1], R2[MaxError+1];
264
register unsigned int r1, endpos, c;
269
Candidate[0][0] = Candidate[0][1] = 0;
276
while (text < textend) {
277
shift = SHIFT[*(text += shift)];
279
shift = SHIFT[*(text += shift)];
280
shift = SHIFT[*(text += shift)];
283
while(j < r1) { HASH = (HASH << 2) + *(text-j);
286
i = text - textstart;
287
if((i - M - D - 10) > Candidate[cdx][1]) {
288
Candidate[++cdx][0] = i-M-D-2;
289
Candidate[cdx][1] = i+M+D; }
290
else Candidate[cdx][1] = i+M+D;
298
n = textend - textstart;
300
/* for those candidate areas, find the D-error matches */
301
if(Candidate[1][0] < 0) Candidate[1][0] = 0;
302
endpos = endposition; /* the mask table and the endposition */
304
for(round = 0; round <= cdx; round++)
305
{ i = Candidate[round][0] ;
306
if(Candidate[round][1] > n) Candidate[round][1] = n;
309
printf("round: %d, start=%d, end=%d, ", round, i, Candidate[round][1]);
312
R1[1] = R2[1] = ~Bit1;
313
for(k = 1; k <= D; k++) R1[k] = R2[k] = (R1[k-1] >> 1) & R1[k-1];
314
while (i < Candidate[round][1])
318
for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 );
321
R1[0] = (R2[0] >> 1) | r1;
323
R1[k] = ((R2[k] >> 1) | r1) & R2[k-1] & ((R1[k-1] & R2[k-1]) >> 1);
324
if((R1[D] & endpos) == 0) {
326
if(FILENAMEONLY) { return; }
328
if(i <= lastend) i = lastend;
330
s_output(text, ¤tpos);
334
for(k=0; k<=D; k++) R1[k] = R2[k] = ~0;
338
for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 );
341
R2[0] = (R1[0] >> 1) | r1;
342
for(k = 1; k <= D; k++)
343
R2[k] = ((R1[k] >> 1) | r1) & R1[k-1] & ((R1[k-1] & R2[k-1]) >> 1);
344
if((R2[D] & endpos) == 0) { currentpos = i;
346
if(FILENAMEONLY) { return; }
347
if(i <= lastend) i = lastend;
349
s_output(text, ¤tpos);
353
for(k=0; k<=D; k++) R1[k] = R2[k] = ~0;
361
int *i; CHARTYPE *text;
366
while(text[*i] != '\n') *i = *i + 1;
369
if(FNAME == ON) printf("%s: ", CurrentFileName);
371
while(text[--bp] != '\n');
372
while(text[++bp] != '\n') putchar(text[bp]);
379
unsigned char *Pattern;
385
for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;
386
for (i = m-1; i>=0; i--) {
387
hash = TR[Pattern[i]];
388
if(SHIFT[hash] >= m - 1) SHIFT[hash] = m-1-i;
391
lastc = TR[Pattern[m-1]];
392
for (i= m-2; i>=0; i--) {
393
if(TR[Pattern[i]] == lastc )
394
{ shift_1 = m-1 - i; i = -1; }
396
if(shift_1 == 0) shift_1 = 1;
397
if(NOUPPER) for(i='A'; i<='Z'; i++) SHIFT[i] = SHIFT[i + 'a' - 'A'];
399
for(i='a'; i<='z'; i++) printf("%c: %d", i, SHIFT[i]); printf("\n");
400
for(i='A'; i<='Z'; i++) printf("%c: %d", i, SHIFT[i]); printf("\n");
405
/* a_monkey() the approximate monkey move */
407
a_monkey( pat, m, text, textend, D )
408
register int m, D ; register CHARTYPE *text, *textend, *pat;
410
register CHARTYPE *oldtext;
411
register unsigned hash, i, hashmask, suffix_error;
412
register int m1 = m-1-D, j, pos;
416
while (text < textend) {
419
while(suffix_error <= D) {
421
while(MEMBER_1[hash]) {
422
hash = ((hash << LOG_ASCII) + *(text--)) & hashmask;
426
if(text <= oldtext) {
427
if((pos = verify(m, 2*m+D, D, pat, oldtext)) > 0) {
429
if(text > textend) return;
431
if(FILENAMEONLY) return;
433
if(FNAME) printf("%s: ", CurrentFileName);
434
while(*(--text) != '\n');
435
while(*(++text) != '\n') putchar(*text);
439
while(*text != '\n') text++;
450
/* monkey uses two characters for delta_1 shifting */
452
CHARTYPE SHIFT_2[MAX_SHIFT_2];
454
monkey( pat, m, text, textend )
455
register int m ; register CHARTYPE *text, *textend, *pat;
457
register unsigned hash, i;
458
register CHARTYPE shift;
460
register unsigned r_newline;
466
while (text < textend) {
468
hash = (hash << 3) + *(text-1);
469
shift = SHIFT_2[hash];
472
hash = (*text << 3) + *(text-1);
473
shift = SHIFT_2[hash];
476
while(TR[pat[m1 - j]] == TR[*(text - j)]) { if(++j == m) break; }
478
if(text >= textend) return;
480
if(FILENAMEONLY) return;
482
while (*text != r_newline) text++;
486
if(FNAME) printf("%s: ", CurrentFileName);
487
while(*(--text) != r_newline);
488
while(*(++text) != r_newline) putchar(*text);
497
am_preprocess(Pattern)
503
for (i = 1, Hashmask = 1 ; i<16 ; i++) Hashmask = (Hashmask << 1) + 1 ;
504
for (i = 0; i < MAXMEMBER_1; i++) MEMBER_1[i] = 0;
505
for (i = m-1; i>=0; i--) {
506
MEMBER_1[Pattern[i]] = 1;
508
for (i = m-1; i > 0; i--) {
509
MEMBER_1[(Pattern[i] << LOG_ASCII) + Pattern[i-1]] = 1;
514
verify(m, n, D, pat, text)
515
register int m, n, D;
516
CHARTYPE *pat, *text;
518
int A[MAXPATT], B[MAXPATT];
519
register int last = D;
520
register int cost = 0;
521
register int k, i, c;
522
register int m1 = m+1;
523
CHARTYPE *textend = text+n;
524
CHARTYPE *textbegin = text;
526
for (i = 0; i <= m1; i++) A[i] = B[i] = i;
527
while (text < textend)
529
for (k = 1; k <= last; k++)
532
if (pat[k-1] != *text)
533
{ if (B[k]+1 < cost) cost = B[k]+1;
534
if (A[k-1]+1 < cost) cost = A[k-1]+1; }
538
if(pat[last] == *text++) { A[last+1] = B[last]; last++; }
539
if(A[last] < D) A[last+1] = A[last++]+1;
540
while (A[last] > D) last = last - 1;
541
if(last >= m) return(text - textbegin - 1);
544
for(c = 0; c<=m1; c++) A[c] = B[c] = c;
546
for (k = 1; k <= last; k++)
549
if (pat[k-1] != *text)
550
{ if (A[k]+1 < cost) cost = A[k]+1;
551
if (B[k-1]+1 < cost) cost = B[k-1]+1; }
555
if(pat[last] == *text++) { B[last+1] = A[last]; last++; }
556
if(B[last] < D) B[last+1] = B[last++]+1;
557
while (B[last] > D) last = last -1;
558
if(last >= m) return(text - textbegin - 1);
561
for(c = 0; c<=m1; c++) A[c] = B[c] = c;
567
/* preprocessing for monkey() */
569
m_preprocess(Pattern)
575
for (i = 0; i < MAX_SHIFT_2; i++) SHIFT_2[i] = m;
576
for (i = m-1; i>=1; i--) {
579
for (j = 0; j< MAXSYM; j++) {
580
if(SHIFT_2[hash+j] == m) SHIFT_2[hash+j] = m-1;
582
hash = hash + Pattern[i-1];
583
if(SHIFT_2[hash] >= m - 1) SHIFT_2[hash] = m-1-i;
586
for (i= m-2; i>=0; i--) {
587
if(Pattern[i] == Pattern[m-1] )
588
{ shift_1 = m-1 - i; i = -1; }
590
if(shift_1 == 0) shift_1 = 1;
594
/* monkey4() the approximate monkey move */
598
monkey4( pat, m, text, textend, D )
599
register int m, D ; register unsigned char *text, *pat, *textend;
601
register unsigned char *oldtext;
602
register unsigned hash, i, hashmask, suffix_error;
603
register int m1=m-1-D, j, pos;
607
while (text < textend) {
610
while(suffix_error <= D) {
611
hash = char_map[*text--];
612
hash = ((hash << LOG_DNA) + char_map[*(text--)]) & hashmask;
613
while(MEMBER_D[hash]) {
614
hash = ((hash << LOG_DNA) + char_map[*(text--)]) & hashmask;
618
if(text <= oldtext) {
619
if((pos = verify(m, 2*m+D, D, pat, oldtext)) > 0) {
621
if(text > textend) return;
623
if(FILENAMEONLY) return;
625
if(FNAME) printf("%s:", CurrentFileName);
626
while(*(--text) != '\n');
627
while(*(++text) != '\n') putchar(*text);
632
while(*text != '\n') text++;
636
else text = oldtext + m;
643
char *Pattern; int m;
648
for(i=0; i< MAXSYM; i++) char_map[i] = 0;
649
char_map['a'] = char_map['A'] = 4;
650
char_map['g'] = char_map['g'] = 1;
651
char_map['t'] = char_map['t'] = 2;
652
char_map['c'] = char_map['c'] = 3;
653
char_map['n'] = char_map['n'] = 5;
656
for (i = 1, Hashmask = 1 ; i<BSize*LOG_DNA; i++) Hashmask = (Hashmask << 1) + 1 ;
657
MEMBER_D = (char *) malloc((Hashmask+1) * sizeof(char));
659
printf("BSize = %d", BSize);
661
for (i=0; i <= Hashmask; i++) MEMBER_D[i] = 0;
662
for (j=0; j < BSize; j++) {
663
for(i=m-1; i >= j; i--) {
665
for(k=0; k <= j; k++)
666
hash = (hash << LOG_DNA) +char_map[Pattern[i-k]];
668
printf("< %d >, ", hash);
681
for (i = 1; exp < m; i++) exp = exp * base;