1
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
3
/* multipattern matcher */
17
#define uchar unsigned char
24
#define MAXMEMBER1 32768
25
/* #define MAXMEMBER1 262144 */ /*2^18 */
26
#define MAXPATFILE 600000
27
#define BLOCKSIZE 16384
29
/* #define MAXHASH 262144 */
31
#define max_num MAX_DASHF_FILES
38
#define Hbits 5 /* how much to shift to perform the hash */
40
extern char aduplicates[MAXNUM_PAT][MAXNUM_PAT]; /* tells what other patterns are exactly equal to the i-th one */
41
extern char tc_aduplicates[MAXNUM_PAT][MAXNUM_PAT]; /* tells what other patterns are exactly equal to the i-th one */
42
extern ParseTree aterminals[MAXNUM_PAT];
43
extern int AComplexBoolean;
44
extern int LIMITOUTPUT, LIMITPERFILE;
45
extern int BYTECOUNT, PRINTOFFSET, PRINTRECORD, CurrentByteOffset;
46
extern int MULTI_OUTPUT; /* used by glimpse only if OR, never for AND */
48
extern CHAR D_pattern[MaxDelimit*2];
50
extern CHAR tc_D_pattern[MaxDelimit*2];
51
extern int tc_D_length;
52
extern COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched, PRINTFILETIME;
53
extern INVERSE, OUTTAIL;
54
extern WORDBOUND, WHOLELINE, NOUPPER;
55
extern ParseTree *AParse;
56
extern int AComplexPattern;
57
extern unsigned char CurrentFileName[], Progname[];
58
extern long CurrentFileTime;
60
extern agrep_initialfd;
61
extern int EXITONERROR;
62
extern int PRINTPATTERN;
63
extern int agrep_inlen;
64
extern CHAR *agrep_inbuffer;
65
extern FILE *agrep_finalfp;
66
extern int agrep_outpointer;
67
extern int agrep_outlen;
68
extern CHAR * agrep_outbuffer;
70
extern int NEW_FILE, POST_FILTER;
72
extern int tuncompressible();
73
extern int quick_tcompress();
74
extern int quick_tuncompress();
75
extern int TCOMPRESSED;
76
extern int EASYSEARCH;
77
extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];
78
extern char PAT_FILE_NAME[MAX_LINE_LEN];
80
uchar SHIFT1[MAXMEMBER1];
90
uchar *PatPtr[max_num];
91
uchar *pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */
94
int pat_indices[max_num]; /* pat_indices[p] gives the actual index in amatched_terminals: used only with AParse != 0 */
97
extern char amatched_terminals[MAXNUM_PAT]; /* which patterns have been matched in the current line? Used only with AParse != 0, so max_num is not needed */
98
extern int anum_terminals;
99
extern int AComplexBoolean;
100
static void countline();
101
void acompute_duplicates();
103
/* Equivalent variables for compression search */
104
uchar tc_SHIFT1[MAXMEMBER1];
111
uchar tc_tr1[MAXSYM];
112
int tc_HASH[MAXHASH];
113
int tc_Hash2[max_num];
114
uchar *tc_PatPtr[max_num];
115
uchar *tc_pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */
116
uchar *tc_patt[max_num];
117
int tc_pat_len[max_num];
118
int tc_pat_indices[max_num]; /* tc_pat_indices[p] gives the actual index in amatched_terminals: used only with AParse != 0 */
119
int tc_num_pat; /* must be the same as num_pat */
120
#endif /*DOTCOMPRESSED*/
122
static void f_prep();
123
static void f_prep1();
124
static void accumulate();
126
static void tc_f_prep();
127
static void tc_f_prep1();
128
static void tc_accumulate();
132
int cshift=0, cshift0=0, chash=0;
136
* General idea behind output processing with delimiters, inverse, compression, etc.
137
* CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,.
138
* Attempts to search for complex patterns / with errors might lead to spurious matches.
139
* 1. Once we find the match, go back and forward to get the delimiters that surround
140
* the matched region.
141
* 2. If it is a compressed file, verify that the match is "real" (compressed files
142
* can have pseudo matches hence this filtering step is required).
143
* 3. Increment num_of_matched.
144
* 4. Process some output options which print stuff before the matched region is
146
* 5. If there is compression, decompress and output the matched region. Otherwise
147
* just output it as is. Remember, from step (1) we know the matched region.
148
* 6. If inverse is set, then we must keep track of the end of the last matched region
149
* in the variable lastout. When there is a match, we must print everything from
150
* lastout to the beginning of the current matched region (curtextbegin) and then
151
* update lastout to point to the end of the current matched region (curtextend).
152
* ALSO: if we exit from the main loops, we must output everything from the end
153
* of the last matched region to the end of the input buffer.
154
* 7. Delimiter handling in complex patterns is different: there the search is done
155
* for a boolean and of the delimiter pattern and the actual pattern.
156
* 8. For convenience and speed, the multipattern matching routines to handle
157
* compressed files have been separated from their (normal) counterparts.
158
* 9. One special note on handling complicated boolean patterns: the parse
159
* tree will be the same for both compressed and uncompressed patterns and the
160
* same amatched_terminals array will be used in both. BUT, the pat_spool and
161
* pat_index, etc., will be different as they refer to the individual terminals.
165
prepf(mfp, mbuf, mlen)
169
int length=0, i, p=1;
175
int j, k; /* to implement \\ */
177
if ((mfp == -1) && ((mbuf == NULL) || (mlen <= 0))) return -1;
180
if (fstat(mfp, &stbuf) == -1) {
181
fprintf(stderr, "%s: cannot stat file: %s\n", Progname, PAT_FILE_NAME);
184
if (!S_ISREG(stbuf.st_mode)) {
185
fprintf(stderr, "%s: pattern file not regular file: %s\n", Progname, PAT_FILE_NAME);
188
if (stbuf.st_size*2 > MAXPATFILE + 2*max_num) {
189
fprintf(stderr, "%s: pattern file too large (> %d B): %s\n", Progname, (MAXPATFILE+2*max_num)/2, PAT_FILE_NAME);
192
if (pat_spool != NULL) free(pat_spool);
193
pat_ptr = pat_spool = (unsigned char *)malloc(stbuf.st_size*2 + MAXPAT);
194
alloc_buf(mfp, &buf, MAXPATFILE+2*BlockSize);
195
while((num_read = fill_buf(mfp, buf+length, 2*BlockSize)) > 0) {
196
length = length + num_read;
197
if(length > MAXPATFILE) {
198
fprintf(stderr, "%s: maximum pattern file size is %d\n", Progname, MAXPATFILE);
211
if (mlen*2 > MAXPATFILE + 2*max_num) {
212
fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2);
215
if (pat_spool != NULL) free(pat_spool);
216
pat_ptr = pat_spool = (unsigned char *)malloc(mlen*2 + MAXPAT);
219
/* Now all the patterns are in buf */
222
/* removed by Udi 11/8/94 - we now do WORDBOUND "by hand"
226
*pat_ptr++ = W_DELIM;
227
while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
228
*pat_ptr++ = W_DELIM;
238
*pat_ptr++ = L_DELIM;
239
while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
240
*pat_ptr++ = L_DELIM;
248
while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
254
/* Now, the patterns have been copied into patt[] */
256
fprintf(stderr, "%s: maximum number of patterns is %d\n", Progname, max_num);
266
for(i=1; i<20; i++) *pat_ptr = i; /* boundary safety zone */
268
/* I might have to keep changing tr s.t. mgrep won't get confused with W_DELIM */
269
for(i=0; i< MAXSYM; i++) tr[i] = i;
271
for (i=0; i<MAXSYM; i++)
272
if (isupper(i)) tr[i] = tr[tolower(i)];
273
/* for(i='A'; i<= 'Z'; i++) tr[i] = i + 'a' - 'A'; */
277
for(i=1; i<MAXSYM; i++) if(!isalnum(i)) tr[i] = W_DELIM;
279
removed by Udi 11/8/94 - the trick of using W-delim was too buggy.
280
we now do it "by hand" after we find a match
283
for(i=0; i< MAXSYM; i++) tr1[i] = tr[i]&Mask;
286
for(i=1; i<=num_pat; i++) {
288
if ((patt[i][0] == '^') || (patt[i][0] == '$')) patt[i][0] = '\n';
289
if ((p > 1) && ((patt[i][p-1] == '^') || (patt[i][p-1] == '$')) && (patt[i][p-2] != '\\')) patt[i][p-1] = '\n';
291
/* Added by bg, Dec 2nd 1994 */
292
for (k=0; k<p; k++) {
293
if (patt[i][k] == '\\') {
295
patt[i][j] = patt[i][j+1]; /* including '\0' */
302
pat_len[i] = (WORDBOUND?(p-2>0?p-2:1):p); changed by Udi 11/8/94
305
printf("prepf(): patt[%d]=%s, pat_len[%d]=%d\n", i, patt[i], i, pat_len[i]);
307
if(p!=0 && p < p_size) p_size = p; /* MIN */
310
fprintf(stderr, "%s: the pattern file is empty\n", Progname);
318
if(length > 400 && p_size > 2) LONG = 1;
319
if(p_size == 1) SHORT = 1;
320
for(i=0; i<MAXMEMBER1; i++) SHIFT1[i] = p_size - 1 - LONG;
321
for(i=0; i<MAXHASH; i++) {
324
for(i=1; i<=num_pat; i++) f_prep(i, patt[i]);
326
memset(pat_indices, '\0', sizeof(int) * (num_pat + 1));
327
for(i=1; i<=num_pat; i++) f_prep1(i, patt[i]);
330
/* prepf for compression */
331
if (-1 == tc_prepf(buf, length)) {
335
#endif /*DOTCOMPRESSED*/
337
acompute_duplicates(aduplicates, aterminals, anum_terminals, tr);
343
* Compression equivalent of prepf: called right after prepf.
344
* 1. Read patt and SHIFT1
345
* 2. Call tcompress on the patterns in patt and put in tc_patt.
346
* 3. Use these patterns to compute tc_SHIFT (ignore WDELIM, LDELIM, case sensitivity, etc.)
347
* 4. Process other variables/functions (pat_spool, tr, tr1, pat_len, accumulate, SHIFT1, f_prep, f_prep1, pat_indices) appropriately.
350
tc_prepf(buf, length)
358
unsigned char tc_buf[MAXPAT * 2]; /* maximum length of the compressed pattern */
359
static struct timeval initt, finalt;
361
if (length*2 > MAXPATFILE + 2*max_num) {
362
fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2);
365
if (tc_pat_spool != NULL) free(tc_pat_spool);
366
pat_ptr = tc_pat_spool = (unsigned char *)malloc(length*2 + MAXPAT);
369
gettimeofday(&initt, NULL);
370
#endif /*MEASURE_TIMES*/
374
tc_patt[p] = pat_ptr;
375
while((*pat_ptr = buf[i++]) != '\n') pat_ptr++;
377
if ((tc_length = quick_tcompress(FREQ_FILE, HASH_FILE, tc_patt[p], strlen(tc_patt[p]), tc_buf, MAXPAT * 2 - 8, TC_EASYSEARCH)) > 0) {
378
memcpy(tc_patt[p], tc_buf, tc_length);
379
tc_patt[p][tc_length] = '\0';
380
pat_ptr = tc_patt[p] + tc_length + 1; /* character after '\0' */
385
for(i=1; i<20; i++) *pat_ptr = i; /* boundary safety zone */
387
/* Ignore all other options: it is automatically W_DELIM */
388
for(i=0; i< MAXSYM; i++) tc_tr[i] = i;
389
for(i=0; i< MAXSYM; i++) tc_tr1[i] = tc_tr[i]&Mask;
392
for(i=1; i<=num_pat; i++) {
393
p = strlen(tc_patt[i]);
396
printf("prepf(): tc_patt[%d]=%s, tc_pat_len[%d]=%d\n", i, tc_patt[i], i, tc_pat_len[i]);
398
if(p!=0 && p < tc_p_size) tc_p_size = p; /* MIN */
400
if(tc_p_size == 0) { /* cannot happen NOW */
401
fprintf(stderr, "%s: the pattern file is empty\n", Progname);
408
if(length > 400 && tc_p_size > 2) tc_LONG = 1;
409
if(tc_p_size == 1) tc_SHORT = 1;
410
for(i=0; i<MAXMEMBER1; i++) tc_SHIFT1[i] = tc_p_size - 1 - LONG;
411
for(i=0; i<MAXHASH; i++) {
414
for(i=1; i<=tc_num_pat; i++) tc_f_prep(i, tc_patt[i]);
416
memset(tc_pat_indices, '\0', sizeof(int) * (tc_num_pat + 1));
417
for(i=1; i<=tc_num_pat; i++) tc_f_prep1(i, tc_patt[i]);
419
acompute_duplicates(tc_aduplicates, aterminals, anum_terminals, tc_tr);
421
gettimeofday(&finalt, NULL);
422
INFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);
423
#endif /*MEASURE_TIMES*/
426
#endif /*DOTCOMPRESSED*/
432
register char r_newline = '\n';
434
register int buf_end, num_read, start, end, residue = 0;
435
int oldCurrentByteOffset;
440
#endif /*AGREP_POINTER*/
441
alloc_buf(fd, &text, 2*BlockSize+Max_record);
442
text[Max_record-1] = '\n'; /* initial case */
445
while( (num_read = fill_buf(fd, text+Max_record, 2*BlockSize)) > 0)
447
buf_end = end = Max_record + num_read -1 ;
448
oldCurrentByteOffset = CurrentByteOffset;
451
if ((TCOMPRESSED == ON) && tuncompressible(text+Max_record, num_read)) {
452
EASYSEARCH = text[Max_record+SIGNATURE_LEN-1];
453
start += SIGNATURE_LEN;
454
CurrentByteOffset += SIGNATURE_LEN;
456
fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);
459
else TCOMPRESSED = OFF;
464
while(text[end] != r_newline && end > Max_record) end--;
465
text[start-1] = r_newline;
468
unsigned char *newbuf = text + end + 1;
469
newbuf = backward_delimiter(newbuf, text+Max_record, D_pattern, D_length, OUTTAIL); /* see agrep.c/'d' */
470
if (newbuf < text+Max_record+D_length) newbuf = text + end + 1;
471
end = newbuf - text - 1;
472
memcpy(text+start-D_length, D_pattern, D_length);
474
residue = buf_end - end + 1 ;
475
if(INVERSE && COUNT) countline(text+Max_record, num_read);
478
if (TCOMPRESSED) { /* separate functions since separate globals => too many if-statements within a single function makes it slow */
480
if(tc_SHORT) { if (-1 == tc_m_short(text, start, end)) {free_buf(fd, text); return -1;}}
481
else { if (-1 == tc_monkey1(text, start, end)) {free_buf(fd, text); return -1;}}
482
#endif /*DOTCOMPRESSED*/
485
if(SHORT) { if (-1 == m_short(text, start, end)) {free_buf(fd, text); return -1;}}
486
else { if (-1 == monkey1(text, start, end)) {free_buf(fd, text); return -1;}}
488
if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {
489
if (agrep_finalfp != NULL)
490
fprintf(agrep_finalfp, "%s", CurrentFileName);
493
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
494
(CurrentFileName[outindex] != '\0'); outindex++) {
495
agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
497
if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
502
agrep_outpointer += outindex;
505
char *s = aprint_file_time(CurrentFileTime);
506
if (agrep_finalfp != NULL)
507
fprintf(agrep_finalfp, "%s", s);
510
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
511
(s[outindex] != '\0'); outindex++) {
512
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
514
if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
519
agrep_outpointer += outindex;
522
if (agrep_finalfp != NULL)
523
fprintf(agrep_finalfp, "\n");
525
if (agrep_outpointer+1>=agrep_outlen) {
530
else agrep_outbuffer[agrep_outpointer++] = '\n';
538
CurrentByteOffset = oldCurrentByteOffset + end - start + 1;
539
start = Max_record - residue;
543
strncpy(text+start, text+end, residue);
545
if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||
546
((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) {
550
} /* end of while(num_read = ... */
552
text[start-1] = '\n';
553
text[start+residue] = '\n';
556
if (start > D_length) memcpy(text+start-D_length, D_pattern, D_length);
557
memcpy(text+start+residue, D_pattern, D_length);
559
end = start + residue;
563
if(tc_SHORT) tc_m_short(text, start, end);
564
else tc_monkey1(text, start, end);
565
#endif /*DOTCOMPRESSED*/
568
if(SHORT) m_short(text, start, end);
569
else monkey1(text, start, end);
571
if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {
572
if (agrep_finalfp != NULL)
573
fprintf(agrep_finalfp, "%s", CurrentFileName);
576
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
577
(CurrentFileName[outindex] != '\0'); outindex++) {
578
agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
580
if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
585
agrep_outpointer += outindex;
588
char *s = aprint_file_time(CurrentFileTime);
589
if (agrep_finalfp != NULL)
590
fprintf(agrep_finalfp, "%s", s);
593
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
594
(s[outindex] != '\0'); outindex++) {
595
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
597
if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
602
agrep_outpointer += outindex;
605
if (agrep_finalfp != NULL)
606
fprintf(agrep_finalfp, "\n");
608
if (agrep_outpointer+1>=agrep_outlen) {
613
else agrep_outbuffer[agrep_outpointer++] = '\n';
626
text = (unsigned char *)agrep_inbuffer;
627
num_read = agrep_inlen;
629
buf_end = end = num_read - 1;
631
oldCurrentByteOffset = CurrentByteOffset;
634
if ((TCOMPRESSED == ON) && tuncompressible(text+Max_record, num_read)) {
635
EASYSEARCH = text[Max_record+SIGNATURE_LEN-1];
636
start += SIGNATURE_LEN;
637
CurrentByteOffset += SIGNATURE_LEN;
639
fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);
642
else TCOMPRESSED = OFF;
647
while(text[end] != r_newline && end > 1) end--;
649
unsigned char *newbuf = text + end + 1;
650
newbuf = backward_delimiter(newbuf, text, D_pattern, D_length, OUTTAIL); /* see agrep.c/'d' */
651
if (newbuf < text+D_length) newbuf = text + end + 1;
652
end = newbuf - text - 1;
654
/* text[0] = text[end] = r_newline; : the user must ensure that the delimiter is there at text[0] and occurs somewhere before text[end] */
656
if (INVERSE && COUNT) countline(text, num_read);
658
/* An exact copy of the above MGREP_PROCESS */
659
if (TCOMPRESSED) { /* separate functions since separate globals => too many if-statements within a single function makes it slow */
661
if(tc_SHORT) { if (-1 == tc_m_short(text, start, end)) {free_buf(fd, text); return -1;}}
662
else { if (-1 == tc_monkey1(text, start, end)) {free_buf(fd, text); return -1;}}
663
#endif /*DOTCOMPRESSED*/
666
if(SHORT) { if (-1 == m_short(text, start, end)) {free_buf(fd, text); return -1;}}
667
else { if (-1 == monkey1(text, start, end)) {free_buf(fd, text); return -1;}}
669
if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {
670
if (agrep_finalfp != NULL)
671
fprintf(agrep_finalfp, "%s", CurrentFileName);
674
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
675
(CurrentFileName[outindex] != '\0'); outindex++) {
676
agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
678
if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
683
agrep_outpointer += outindex;
686
char *s = aprint_file_time(CurrentFileTime);
687
if (agrep_finalfp != NULL)
688
fprintf(agrep_finalfp, "%s", s);
691
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
692
(s[outindex] != '\0'); outindex++) {
693
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
695
if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
700
agrep_outpointer += outindex;
703
if (agrep_finalfp != NULL)
704
fprintf(agrep_finalfp, "\n");
706
if (agrep_outpointer+1>=agrep_outlen) {
711
else agrep_outbuffer[agrep_outpointer++] = '\n';
721
#endif /*AGREP_POINTER*/
723
fprintf(stderr,"Shifted %d times; shift=0 %d times; hash was = %d times\n",cshift, cshift0, chash);
730
unsigned char *text; int len;
733
for (i=0; i<len; i++) if(text[i] == '\n') total_line++;
736
/* Stuff that always needs to be printed whenever there is a match in all functions in this file */
738
print_options(pat_index, text, curtextbegin, curtextend)
740
unsigned char *text, *curtextbegin, *curtextend;
743
if (SILENT) return 0;
744
if(FNAME && (NEW_FILE || !POST_FILTER)) {
745
char nextchar = (POST_FILTER == ON)?'\n':' ';
746
char *prevstring = (POST_FILTER == ON)?"\n":"";
748
if (agrep_finalfp != NULL)
749
fprintf(agrep_finalfp, "%s%s", prevstring, CurrentFileName);
752
if (prevstring[0] != '\0') {
753
if(agrep_outpointer + 1 >= agrep_outlen) {
757
else agrep_outbuffer[agrep_outpointer ++] = prevstring[0];
759
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
760
(CurrentFileName[outindex] != '\0'); outindex++) {
761
agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
763
if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
767
agrep_outpointer += outindex;
770
char *s = aprint_file_time(CurrentFileTime);
771
if (agrep_finalfp != NULL)
772
fprintf(agrep_finalfp, "%s", s);
775
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
776
(s[outindex] != '\0'); outindex++) {
777
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
779
if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {
783
agrep_outpointer += outindex;
786
if (agrep_finalfp != NULL)
787
fprintf(agrep_finalfp, ":%c", nextchar);
789
if (agrep_outpointer+2>= agrep_outlen) {
794
agrep_outbuffer[agrep_outpointer++] = ':';
795
agrep_outbuffer[agrep_outpointer++] = nextchar;
804
if (agrep_finalfp != NULL)
805
fprintf(agrep_finalfp, "%d- ", pat_index);
809
sprintf(s, "%d- ", pat_index);
810
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
811
(s[outindex] != '\0'); outindex++) {
812
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
814
if (s[outindex] != '\0') {
818
agrep_outpointer += outindex;
824
if (agrep_finalfp != NULL)
825
fprintf(agrep_finalfp, "%d= ", CurrentByteOffset);
829
sprintf(s, "%d= ", CurrentByteOffset);
830
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
831
(s[outindex] != '\0'); outindex++) {
832
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
834
if (s[outindex] != '\0') {
838
agrep_outpointer += outindex;
844
if (agrep_finalfp != NULL)
845
fprintf(agrep_finalfp, "@%d{%d} ", CurrentByteOffset - (text -curtextbegin), curtextend-curtextbegin);
849
sprintf(s, "@%d{%d} ", CurrentByteOffset - (text -curtextbegin), curtextend-curtextbegin);
850
for (outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
851
(s[outindex] != '\0'); outindex ++) {
852
agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
854
if (s[outindex] != '\0') {
858
agrep_outpointer += outindex;
866
monkey1( text, start, end )
867
int start, end; register unsigned char *text;
871
unsigned char *oldtext;
873
register uchar *textend;
874
unsigned char *textbegin;
875
unsigned char *curtextend;
876
unsigned char *curtextbegin;
877
register unsigned hash;
878
register uchar shift;
879
register int m1, Long=LONG;
883
register int p, p_end;
891
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
892
textbegin = text + start;
893
textend = text + end;
895
lastout = text+start;
896
text = text + start + m1 -1 ;
897
/* -1 to allow match to the first \n in case the pattern has ^ in front of it */
899
if (WORDBOUND || WHOLELINE) text = text-1;
900
if (WHOLELINE) text = text-1;
902
/* to accommodate the extra 2 W_delim */
903
while (text <= textend) {
905
hash=(hash<<Hbits)+(tr1[*(text-1)]);
906
if(Long) hash=(hash<<Hbits)+(tr1[*(text-2)]);
907
shift = SHIFT1[hash];
913
hash2 = (tr[*(text-m1)]<<8) + tr[*(text-m1+1)];
918
p_end = HASH[hash+1];
920
printf("hash=%d, p=%d, p_end=%d\n", hash, p, p_end);
923
if(hash2 != Hash2[p]) continue;
927
if (((pat_index = pat_indices[p]) <= 0) || (pat_len[pat_index] <= 0)) continue;
930
while((*px!=0)&&(tr[*px] == tr[*qx])) {
935
if(text > textend) return 0;
937
if (isalnum(*(unsigned char *)qx)) goto skip_output;
938
if (isalnum(*(unsigned char *)(text-m1-1))) goto skip_output;
941
/* Don't update CurrentByteOffset here: only before outputting properly */
943
curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));
944
if (*curtextbegin == '\n') curtextbegin ++;
945
curtextend = curtextbegin /*text-m1*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;
946
if (*curtextend == '\n') curtextend ++;
949
curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL);
950
curtextend = forward_delimiter(curtextbegin /*text-m1*/, textend, D_pattern, D_length, OUTTAIL);
952
if (!OUTTAIL || INVERSE) textbegin = curtextend;
953
else if (DELIMITER) textbegin = curtextend - D_length;
954
else textbegin = curtextend - 1;
958
if (pat_index <= anum_terminals) {
960
amatched_terminals[pat_index - 1] = 1;
961
for (iii=0; iii<anum_terminals; iii++)
962
if (aduplicates[pat_index - 1][iii])
963
amatched_terminals[iii] = 1;
965
if (AComplexBoolean) {
966
/* Can output only after all the matches in the current record have been identified: just like filter_output */
968
CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);
969
text = oldtext + pat_len[pat_index] - 1;
973
else if ((long)AParse & AND_EXP) {
974
for (j=0; j<anum_terminals; j++) if (!amatched_terminals[j]) break;
975
if (j<anum_terminals) goto skip_output;
978
oldtext = text; /* only for MULTI_OUTPUT */
981
#define DO_OUTPUT(change_text)\
983
if(FILENAMEONLY || SILENT) return 0;\
986
if ((PRINTED = print_options(pat_index, text, curtextbegin, curtextend)) == -1) return -1;\
989
if (agrep_finalfp != NULL) {\
990
fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp);\
993
if (agrep_outpointer + curtextend - curtextbegin>= agrep_outlen) {\
998
memcpy(agrep_outbuffer + agrep_outpointer, curtextbegin, curtextend-curtextbegin);\
999
agrep_outpointer += curtextend - curtextbegin;\
1003
else if (PRINTED) {\
1004
if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);\
1005
else agrep_outbuffer[agrep_outpointer ++] = '\n';\
1008
if ((change_text) && MULTI_OUTPUT) { /* next match starting from end of current */\
1009
CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);\
1010
text = oldtext + pat_len[pat_index] - 1;\
1013
else if (change_text) {\
1014
CurrentByteOffset += textbegin - text;\
1018
else { /* INVERSE */\
1019
/* if(lastout < curtextbegin) OUT=1; */\
1021
if (agrep_finalfp != NULL)\
1022
fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp);\
1024
if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) {\
1028
memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout);\
1029
agrep_outpointer += (curtextbegin-lastout);\
1034
CurrentByteOffset += textbegin - text;\
1039
else if (change_text) { /* COUNT */\
1040
CurrentByteOffset += textbegin - text;\
1043
if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||\
1044
((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */\
1050
if (MATCHED && !MULTI_OUTPUT && !AComplexBoolean) break; /* else look for more possible matches since we never know how many will match */
1051
if (DOWITHMASK && (text >= curtextend - 1)) {
1053
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1056
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1059
/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */
1060
if (DOWITHMASK && (text >= curtextend - 1)) {
1062
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1065
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1067
if(!MATCHED) shift = 1; /* || MULTI_OUTPUT is implicit */
1070
shift = m1 - 1 > 0 ? m1 - 1 : 1;
1074
/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */
1075
if (DOWITHMASK && (text >= curtextend - 1)) {
1077
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1080
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1084
CurrentByteOffset += shift;
1087
/* Do residual stuff: check if there was a match at the end of the line | check if rest of the buffer needs to be output due to inverse */
1089
if (DOWITHMASK && (text >= curtextend - 1)) {
1091
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1094
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1097
if(INVERSE && !COUNT && (lastout <= textend)) {
1099
if (agrep_finalfp != NULL) {
1100
while(lastout <= textend) fputc(*lastout++, agrep_finalfp);
1103
if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {
1107
memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout+1);
1108
agrep_outpointer += (textend-lastout+1);
1119
tc_monkey1( text, start, end )
1121
register unsigned char *text;
1124
unsigned char *oldtext;
1126
register uchar *textend;
1127
unsigned char *textbegin;
1128
unsigned char *curtextend;
1129
unsigned char *curtextbegin;
1130
register unsigned hash;
1131
register uchar shift;
1132
register int m1, Long=LONG;
1136
register int p, p_end;
1142
struct timeval initt, finalt;
1146
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1147
textbegin = text + start;
1148
textend = text + end;
1150
lastout = text+start;
1151
text = text + start + m1 -1;
1152
/* -1 to allow match to the first \n in case the pattern has ^ in front of it */
1153
/* WORDBOUND adjustment not required */
1154
while (text <= textend) {
1156
hash=(hash<<Hbits)+(tc_tr1[*(text-1)]);
1157
if(Long) hash=(hash<<Hbits)+(tc_tr1[*(text-2)]);
1158
shift = tc_SHIFT1[hash];
1164
hash2 = (tc_tr[*(text-m1)]<<8) + tc_tr[*(text-m1+1)];
1169
p_end = tc_HASH[hash+1];
1171
printf("hash=%d, p=%d, p_end=%d\n", hash, p, p_end);
1173
while(p++ < p_end) {
1174
if(hash2 != tc_Hash2[p]) continue;
1178
if (((pat_index = tc_pat_indices[p]) <= 0) || (tc_pat_len[pat_index] <= 0)) continue;
1182
while((*px!=0)&&(tc_tr[*px] == tc_tr[*qx])) {
1187
if(text > textend) return 0;
1189
/* Don't update CurrentByteOffset here: only before outputting properly */
1191
curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));
1192
if (*curtextbegin == '\n') curtextbegin ++;
1193
curtextend = curtextbegin /*text-m1*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;
1194
if (*curtextend == '\n') curtextend ++;
1197
curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL);
1198
curtextend = forward_delimiter(curtextbegin /*text-m1*/, textend, tc_D_pattern, tc_D_length, OUTTAIL);
1201
/* else prev curtextbegin is OK: if full AND isn't found, DOWITHMASK is 0-ed so that we search at most 1 line below */
1203
gettimeofday(&initt, NULL);
1204
#endif /*MEASURE_TIMES*/
1205
/* Was it really a match in the compressed line from prev line in text to text + strlen(tc_pat_len[pat_index]? */
1206
if (-1==exists_tcompressed_word(tc_PatPtr[p], tc_pat_len[pat_index], curtextbegin, text - curtextbegin + tc_pat_len[pat_index], EASYSEARCH))
1209
gettimeofday(&finalt, NULL);
1210
FILTERALGO_ms += (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);
1211
#endif /*MEASURE_TIMES*/
1213
if (!OUTTAIL || INVERSE) textbegin = curtextend;
1214
else if (DELIMITER) textbegin = curtextend - D_length;
1215
else textbegin = curtextend - 1;
1218
if (pat_index <= anum_terminals) {
1220
amatched_terminals[pat_index - 1] = 1;
1221
for (iii=0; iii<anum_terminals; iii++)
1222
if (aduplicates[pat_index - 1][iii])
1223
amatched_terminals[iii] = 1;
1225
if (AComplexBoolean) {
1226
/* Can output only after all the matches in the current record have been identified: just like filter_output */
1228
CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);
1229
text = oldtext + pat_len[pat_index] - 1;
1233
else if ((long)AParse & AND_EXP) {
1234
for (j=0; j<anum_terminals; j++) if (!amatched_terminals[j]) break;
1235
if (j<anum_terminals) goto skip_output;
1239
oldtext = text; /* only for MULTI_OUTPUT */
1242
#define DO_OUTPUT(change_text)\
1244
if(FILENAMEONLY || SILENT) return 0;\
1246
if ((PRINTED = print_options(pat_index, text, curtextbegin, curtextend)) == -1) return -1;\
1249
/* #if MEASURE_TIMES\
1250
gettimeofday(&initt, NULL);\
1251
*/ /*#endif MEASURE_TIMES */\
1252
if (agrep_finalfp != NULL)\
1253
newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_finalfp, -1, EASYSEARCH);\
1255
if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {\
1256
if (newlen + agrep_outpointer >= agrep_outlen) {\
1260
agrep_outpointer += newlen;\
1263
/* #if MEASURE_TIMES\
1264
gettimeofday(&finalt, NULL);\
1265
OUTFILTER_ms += (finalt.tv_sec* 1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);\
1266
*/ /*#endif MEASURE_TIMES */\
1268
else if (PRINTED) {\
1269
if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);\
1270
else agrep_outbuffer[agrep_outpointer ++] = '\n';\
1273
if ((change_text) && MULTI_OUTPUT) { /* next match starting from end of current */\
1274
CurrentByteOffset += (oldtext + tc_pat_len[pat_index] - 1 - text);\
1275
text = oldtext + tc_pat_len[pat_index] - 1;\
1278
else if (change_text) {\
1279
CurrentByteOffset += textbegin - text;\
1283
else { /* INVERSE: Don't care about filtering time */\
1284
/* if(lastout < curtextbegin) OUT=1; */\
1286
if (agrep_finalfp != NULL)\
1287
newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH);\
1289
if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {\
1290
if (newlen + agrep_outpointer >= agrep_outlen) {\
1294
agrep_outpointer += newlen;\
1300
CurrentByteOffset += textbegin - text;\
1305
else if (change_text) {\
1306
CurrentByteOffset += textbegin - text;\
1309
if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||\
1310
((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */\
1316
if (MATCHED && !MULTI_OUTPUT && !AComplexBoolean) break; /* else look for more possible matches since we never know how many will match */
1317
if (DOWITHMASK && (text >= curtextend - 1)) {
1319
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1322
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1325
/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */
1326
if (DOWITHMASK && (text >= curtextend - 1)) {
1328
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1331
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1333
if(!MATCHED) shift = 1; /* || MULTI_OUTPUT is implicit */
1336
shift = m1 - 1 > 0 ? m1 - 1 : 1;
1340
/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */
1341
if (DOWITHMASK && (text >= curtextend - 1)) {
1343
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1346
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1350
CurrentByteOffset += shift;
1353
/* Do residual stuff: check if there was a match at the end of the line | check if rest of the buffer needs to be output due to inverse */
1355
if (DOWITHMASK && (text >= curtextend - 1)) {
1357
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1360
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1363
if (INVERSE && !COUNT && (lastout <= textend)) {
1365
if (agrep_finalfp != NULL)
1366
newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH);
1368
if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {
1369
if (newlen + agrep_outpointer >= agrep_outlen) {
1373
agrep_outpointer += newlen;
1381
#endif /*DOTCOMPRESSED*/
1383
/* shift is always 1: slight change in MATCHED semantics: it is set to 1 even if COUNT is set: previously, it wasn't set. Will it affect m_short? */
1385
/*
 * m_short: multi-pattern scan of text[start..end] using a one-byte hash.
 *
 * For each byte the translated value tr[*text] selects the slot range
 * HASH[c] .. HASH[c+1] of pat_indices[]; every slot holding a positive
 * pattern index with a positive pat_len is compared against the text through
 * the tr[] translation table.  On a hit the enclosing record is located
 * (bounded by '\n', or by D_pattern via backward_delimiter() and
 * forward_delimiter() -- presumably selected by DELIMITER), the terminal and
 * its aduplicates[] are flagged in amatched_terminals[], and the record is
 * emitted through the DO_OUTPUT macro.
 *
 * text       - buffer being scanned.
 * start, end - offsets into text; end is inclusive (the scan loop runs while
 *              ++text <= textend).
 *
 * Returns -1 when print_options() fails.  The early exits (FILENAMEONLY or
 * SILENT, LIMITOUTPUT/LIMITPERFILE reached, the text >= textend check) all
 * return 0.
 */
m_short(text, start, end)
1386
int start, end; register uchar *text;
1391
unsigned char *oldtext;
1392
register uchar *textend;
1393
unsigned char *textbegin;
1394
unsigned char *curtextend;
1395
unsigned char *curtextbegin;
1396
register int p, p_end;
1406
/* Start with no terminal marked as matched. */
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1407
textend = text + end;
1408
lastout = text + start;
1409
textbegin = text + start;
1410
/* Back up one byte because the scan loop pre-increments text. */
text = text + start - 1 ;
1412
if (WORDBOUND || WHOLELINE) text = text-1;
1414
if (WHOLELINE) text = text-1;
1415
/* to accommodate the extra 2 W_delim */
1416
while (++text <= textend) {
1417
CurrentByteOffset ++;
1418
/* Candidate slots for this byte are HASH[c] .. HASH[c+1], c = tr[*text]. */
p = HASH[tr[*text]];
1419
p_end = HASH[tr[*text]+1];
1420
while(p++ < p_end) {
1421
/* Skip empty slots and patterns without a positive length. */
if (((pat_index = pat_indices[p]) <= 0) || (pat_len[pat_index] <= 0)) continue;
1423
printf("m_short(): p=%d pat_index=%d off=%d\n", p, pat_index, textend - text);
1427
/* Compare pattern (px) and text (qx) byte by byte through tr[]. */
while((*px!=0)&&(tr[*px] == tr[*qx])) {
1432
if(text >= textend) return 0;
1434
/* Reject the hit when an alphanumeric byte adjoins it (presumably only under WORDBOUND). */
if (isalnum(*(unsigned char *)qx)) goto skip_output;
1435
if (isalnum(*(unsigned char *)(text-1))) goto skip_output;
1438
/* Don't update CurrentByteOffset here: only before outputting properly */
1440
/* Newline-bounded record: back up to the previous '\n' (or textbegin), then run to just past the next '\n' (or textend). */
curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));
1441
if (*curtextbegin == '\n') curtextbegin ++;
1442
curtextend = curtextbegin /*text-m1*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;
1443
if (*curtextend == '\n') curtextend ++;
1446
/* Delimiter-bounded record, located with D_pattern instead of '\n'. */
curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL);
1447
curtextend = forward_delimiter(curtextbegin /*text-m1*/, textend, D_pattern, D_length, OUTTAIL);
1449
/* New lower bound for later backward scans: the record end, less the delimiter length (or one byte) when OUTTAIL is set without INVERSE. */
if (!OUTTAIL || INVERSE) textbegin = curtextend;
1450
else if (DELIMITER) textbegin = curtextend - D_length;
1451
else textbegin = curtextend - 1;
1453
/* else prev curtextbegin is OK: if full AND isn't found, DOWITHMASK is 0-ed so that we search at most 1 line below */
1456
if (pat_index <= anum_terminals) {
1458
/* Flag this terminal and every terminal recorded as its duplicate. */
amatched_terminals[pat_index - 1] = 1;
1459
for (iii=0; iii<anum_terminals; iii++)
1460
if (aduplicates[pat_index - 1][iii])
1461
amatched_terminals[iii] = 1;
1463
if (AComplexBoolean) {
1464
/* Can output only after all the matches in the current record have been identified: just like filter_output */
1466
CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);
1467
text = oldtext + pat_len[pat_index] - 1;
1471
/* Plain AND: no output until every terminal has been seen. */
else if ((long)AParse & AND_EXP) {
1472
for (j=0; j<anum_terminals; j++) if (!amatched_terminals[j]) break;
1473
if (j<anum_terminals) goto skip_output;
1477
oldtext = text; /* used only if MULTI_OUTPUT */
1480
/*
 * DO_OUTPUT(change_text): emit the record [curtextbegin, curtextend) -- or,
 * in the INVERSE part, the span [lastout, curtextbegin) -- to agrep_finalfp
 * when it is non-NULL, otherwise into agrep_outbuffer after a bounds check
 * against agrep_outlen.  Returns 0 from the enclosing function when
 * FILENAMEONLY/SILENT is set or an output limit is reached, and -1 when
 * print_options() fails.  When change_text is non-zero, text and
 * CurrentByteOffset are advanced as well.
 */
#define DO_OUTPUT(change_text)\
1482
if(FILENAMEONLY || SILENT) return 0;\
1484
if ((PRINTED = print_options(pat_index, text, curtextbegin, curtextend)) == -1) return -1;\
1487
if (agrep_finalfp != NULL) {\
1488
fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp);\
1491
if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) {\
1496
memcpy(agrep_outbuffer + agrep_outpointer, curtextbegin, curtextend-curtextbegin);\
1497
agrep_outpointer += curtextend - curtextbegin;\
1501
else if (PRINTED) {\
1502
if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);\
1503
else agrep_outbuffer[agrep_outpointer ++] = '\n';\
1506
if ((change_text) && MULTI_OUTPUT) { /* next match starting from end of current */\
1507
CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);\
1508
text = oldtext + pat_len[pat_index] - 1;\
1511
else if (change_text) {\
1512
CurrentByteOffset += textbegin - text;\
1517
/* if(lastout < curtextbegin) OUT=1; */\
1519
if (agrep_finalfp != NULL)\
1520
fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp);\
1522
if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) {\
1526
memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout);\
1527
agrep_outpointer += (curtextbegin-lastout);\
1532
CurrentByteOffset += textbegin - text;\
1537
else if (change_text) {\
1538
CurrentByteOffset += textbegin - text;\
1541
if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||\
1542
((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */\
1548
if(MATCHED && !MULTI_OUTPUT && !AComplexBoolean) break; /* else look for more possible matches */
1549
if (DOWITHMASK && (text >= curtextend - 1)) {
1551
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1554
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1557
/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */
1558
if (DOWITHMASK && (text >= curtextend - 1)) {
1560
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1563
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1565
if (MATCHED) text --;
1568
CurrentByteOffset ++;
1570
/* Do residual stuff: check if there was a match at the end of the line | check if rest of the buffer needs to be output due to inverse */
1572
if (DOWITHMASK && (text >= curtextend - 1)) {
1574
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1577
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1580
/* INVERSE: flush whatever follows the last emitted position, through textend inclusive. */
if(INVERSE && !COUNT && (lastout <= textend)) {
1582
if (agrep_finalfp != NULL) {
1583
while(lastout <= textend) fputc(*lastout++, agrep_finalfp);
1586
if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {
1590
/*
 * Copy exactly the textend - lastout + 1 bytes that were bounds-checked
 * above, the same span the fputc() branch writes.  text cannot be used as
 * the end marker here: the scan loop only exits once text is past textend,
 * so text-lastout+1 would copy at least one unchecked extra byte.
 */
memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout+1);
1591
agrep_outpointer += (textend-lastout+1);
1601
/* shift is always 1: slight change in MATCHED semantics: it is set to 1 even if COUNT is set: previously, it wasn't set. Will it affect m_short? */
1603
/*
 * tc_m_short: counterpart of m_short for compressed text.
 *
 * Each byte's translated value tc_tr[*text] selects the slot range
 * tc_HASH[c] .. tc_HASH[c+1] of tc_pat_indices[]; slots with a positive
 * pattern index and positive tc_pat_len are compared through tc_tr[].  A
 * candidate is then checked with exists_tcompressed_word() against the
 * record it lies in (bounded by '\n' or by tc_D_pattern), matched terminals
 * are flagged in amatched_terminals[], and the record is written through
 * DO_OUTPUT, which expands it with quick_tuncompress().
 *
 * text       - buffer being scanned.
 * start, end - offsets into text; end is inclusive (the scan loop runs while
 *              ++text <= textend).
 *
 * Returns -1 when print_options() fails.  The early exits (FILENAMEONLY or
 * SILENT, LIMITOUTPUT/LIMITPERFILE reached, the text >= textend check) all
 * return 0.
 */
tc_m_short(text, start, end)
1604
int start, end; register uchar *text;
1609
unsigned char *oldtext;
1610
register uchar *textend;
1611
unsigned char *textbegin;
1612
unsigned char *curtextend;
1613
unsigned char *curtextbegin;
1614
register int p, p_end;
1622
struct timeval initt, finalt;
1626
/* Start with no terminal marked as matched. */
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1627
textend = text + end;
1628
lastout = text + start;
1629
/*
 * textbegin must be derived from the caller's text pointer BEFORE text is
 * backed up for the pre-incrementing scan loop, exactly as m_short does.
 * Computing it afterwards gave text + 2*start - 1, a wrong lower bound for
 * the backward record scans below.
 */
textbegin = text + start;
1630
text = text + start - 1 ;
1631
/* WORDBOUND adjustment not required */
1632
while (++text <= textend) {
1633
CurrentByteOffset ++;
1634
/* Candidate slots for this byte are tc_HASH[c] .. tc_HASH[c+1], c = tc_tr[*text]. */
p = tc_HASH[tc_tr[*text]];
1635
p_end = tc_HASH[tc_tr[*text]+1];
1636
while(p++ < p_end) {
1637
/* Skip empty slots and patterns without a positive length. */
if (((pat_index = tc_pat_indices[p]) <= 0) || (tc_pat_len[pat_index] <= 0)) continue;
1639
printf("m_short(): p=%d pat_index=%d off=%d\n", p, pat_index, textend - text);
1643
/* Compare pattern (px) and text (qx) byte by byte through tc_tr[]. */
while((*px!=0)&&(tc_tr[*px] == tc_tr[*qx])) {
1648
if(text >= textend) return 0;
1651
/* Don't update CurrentByteOffset here: only before outputting properly */
1653
/* Newline-bounded record: back up to the previous '\n' (or textbegin), then run to just past the next '\n' (or textend). */
curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));
1654
if (*curtextbegin == '\n') curtextbegin ++;
1655
curtextend = curtextbegin /*text-m1*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;
1656
if (*curtextend == '\n') curtextend ++;
1659
/* Delimiter-bounded record, located with tc_D_pattern instead of '\n'. */
curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL);
1660
curtextend = forward_delimiter(curtextbegin /*text-m1*/, textend, tc_D_pattern, tc_D_length, OUTTAIL);
1663
/* else prev curtextbegin is OK: if full AND isn't found, DOWITHMASK is 0-ed so that we search at most 1 line below */
1665
gettimeofday(&initt, NULL);
1666
#endif /*MEASURE_TIMES*/
1667
/* Was it really a match in the compressed line from prev line in text to text + strlen(tc_pat_len[pat_index]? */
1668
/* A -1 result means the candidate is rejected (presumably by skipping the output step). */
if (-1 == exists_tcompressed_word(tc_PatPtr[p], tc_pat_len[pat_index], curtextbegin, text - curtextbegin + tc_pat_len[pat_index], EASYSEARCH))
1671
gettimeofday(&finalt, NULL);
1672
FILTERALGO_ms += (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);
1673
#endif /*MEASURE_TIMES*/
1676
/* New lower bound for later backward scans: the record end, less D_length (or one byte) when OUTTAIL is set without INVERSE. */
if (!OUTTAIL || INVERSE) textbegin = curtextend;
1677
else if (DELIMITER) textbegin = curtextend - D_length;
1678
else textbegin = curtextend - 1;
1681
if (pat_index <= anum_terminals) {
1683
/* Flag this terminal and every terminal recorded as its duplicate. */
/* NOTE(review): this reads aduplicates[], not tc_aduplicates[] -- confirm which table is intended here. */
amatched_terminals[pat_index - 1] = 1;
1684
for (iii=0; iii<anum_terminals; iii++)
1685
if (aduplicates[pat_index - 1][iii])
1686
amatched_terminals[iii] = 1;
1688
if (AComplexBoolean) {
1689
/* Can output only after all the matches in the current record have been identified: just like filter_output */
1691
/* NOTE(review): uses pat_len[] whereas DO_OUTPUT below advances by tc_pat_len[] -- confirm which length is intended. */
CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);
1692
text = oldtext + pat_len[pat_index] - 1;
1696
/* Plain AND: no output until every terminal has been seen. */
else if ((long)AParse & AND_EXP) {
1697
for (j=0; j<anum_terminals; j++) if (!amatched_terminals[j]) break;
1698
if (j<anum_terminals) goto skip_output;
1702
oldtext = text; /* used only if MULTI_OUTPUT */
1705
/*
 * DO_OUTPUT(change_text): expand the record [curtextbegin, curtextend) -- or,
 * in the INVERSE part, the span [lastout, curtextbegin) -- with
 * quick_tuncompress(), either straight to agrep_finalfp when it is non-NULL
 * or into agrep_outbuffer followed by a bounds check against agrep_outlen.
 * Returns 0 from the enclosing function when FILENAMEONLY/SILENT is set or
 * an output limit is reached, and -1 when print_options() fails.  When
 * change_text is non-zero, text and CurrentByteOffset are advanced as well.
 */
#define DO_OUTPUT(change_text)\
1707
if(FILENAMEONLY || SILENT) return 0;\
1709
if ((PRINTED = print_options(pat_index, text, curtextbegin, curtextend)) == -1) return -1;\
1712
/* #if MEASURE_TIMES\
1713
gettimeofday(&initt, NULL);\
1714
*/ /*#endif MEASURE_TIMES*/\
1715
if (agrep_finalfp != NULL)\
1716
newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_finalfp, -1, EASYSEARCH);\
1718
if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {\
1719
if (newlen + agrep_outpointer >= agrep_outlen) {\
1723
agrep_outpointer += newlen;\
1726
/*#if MEASURE_TIMES\
1727
gettimeofday(&finalt, NULL);\
1728
OUTFILTER_ms += (finalt.tv_sec* 1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);\
1729
*/ /*#endif MEASURE_TIMES*/\
1731
else if (PRINTED) {\
1732
if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);\
1733
else agrep_outbuffer[agrep_outpointer ++] = '\n';\
1736
if ((change_text) && MULTI_OUTPUT) { /* next match starting from end of current */\
1737
CurrentByteOffset += (oldtext + tc_pat_len[pat_index] - 1 - text);\
1738
text = oldtext + tc_pat_len[pat_index] - 1;\
1741
else if (change_text) {\
1742
CurrentByteOffset += textbegin - text;\
1746
else { /* INVERSE: Don't care about filtering time */\
1747
/* if(lastout < curtextbegin) OUT=1; */\
1749
if (agrep_finalfp != NULL)\
1750
newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH);\
1752
if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {\
1753
if (newlen + agrep_outpointer >= agrep_outlen) {\
1757
agrep_outpointer += newlen;\
1763
CurrentByteOffset += textbegin - text;\
1768
else if (change_text) {\
1769
CurrentByteOffset += textbegin - text;\
1772
if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||\
1773
((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0; /* done */\
1779
if(MATCHED && !MULTI_OUTPUT && !AComplexBoolean) break; /* else look for more possible matches */
1780
if (DOWITHMASK && (text >= curtextend - 1)) {
1782
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1785
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1788
/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */
1789
if (DOWITHMASK && (text >= curtextend - 1)) {
1791
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1794
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1796
if (MATCHED) text--;
1799
CurrentByteOffset ++;
1801
/* Do residual stuff: check if there was a match at the end of the line | check if rest of the buffer needs to be output due to inverse */
1803
if (DOWITHMASK && (text >= curtextend - 1)) {
1805
if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {
1808
if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);
1811
/* INVERSE: expand and flush whatever follows the last emitted position, through textend inclusive. */
if (INVERSE && !COUNT && (lastout <= textend)) {
1813
if (agrep_finalfp != NULL)
1814
newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH);
1816
if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {
1817
if (newlen + agrep_outpointer >= agrep_outlen) {
1821
agrep_outpointer += newlen;
1829
#endif /*DOTCOMPRESSED*/
1832
/*
 * f_prep: fold one pattern into the SHIFT1[] shift table and compute the
 * hash value of the pattern.
 *
 * pat_index - index of the pattern (not referenced in the statements below).
 * Pattern   - the pattern bytes.
 *
 * m is presumably the number of pattern characters taken into account --
 * TODO confirm where it is assigned.
 */
f_prep(pat_index, Pattern)
1833
uchar *Pattern; int pat_index;
1836
register unsigned hash=0;
1841
/*
 * Walk the pattern from its end: hash each block of two characters (three
 * when LONG) through tr1[] and lower SHIFT1[hash] to m-1-i, the block's
 * distance from the pattern end, whenever that is smaller.
 */
for (i=m-1; i>=(1+LONG); i--) {
1842
hash = (tr1[Pattern[i]]);
1843
hash = (hash << Hbits) + (tr1[Pattern[i-1]]);
1844
if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );
1845
if(SHIFT1[hash] >= m-1-i) SHIFT1[hash] = m-1-i;
1848
/* Hash of the block at index i, built the same way as inside the loop. */
hash = (tr1[Pattern[i]]);
1849
hash = (hash << Hbits) + (tr1[Pattern[i-1]]);
1850
if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );
1851
/* SHORT mode hashes on the translated first byte only, the key m_short looks up as HASH[tr[*text]]. */
if(SHORT) hash=tr[Pattern[0]];
1853
printf("hash = %d\n", hash);
1861
/*
 * tc_f_prep: same computation as f_prep, but on the tc_ tables
 * (tc_tr1[], tc_tr[], tc_SHIFT1[]) and the tc_LONG / tc_SHORT flags.
 *
 * pat_index - index of the pattern (not referenced in the statements below).
 * Pattern   - the pattern bytes.
 *
 * m is presumably the number of pattern characters taken into account --
 * TODO confirm where it is assigned.
 */
tc_f_prep(pat_index, Pattern)
1862
uchar *Pattern; int pat_index;
1865
register unsigned hash=0;
1870
/*
 * Walk the pattern from its end: hash each block of two characters (three
 * when tc_LONG) through tc_tr1[] and lower tc_SHIFT1[hash] to m-1-i, the
 * block's distance from the pattern end, whenever that is smaller.
 */
for (i=m-1; i>=(1+tc_LONG); i--) {
1871
hash = (tc_tr1[Pattern[i]]);
1872
hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);
1873
if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );
1874
if(tc_SHIFT1[hash] >= m-1-i) tc_SHIFT1[hash] = m-1-i;
1877
/* Hash of the block at index i, built the same way as inside the loop. */
hash = (tc_tr1[Pattern[i]]);
1878
hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);
1879
if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );
1880
/* tc_SHORT mode hashes on the translated first byte only, the key tc_m_short looks up as tc_HASH[tc_tr[*text]]. */
if(tc_SHORT) hash=tc_tr[Pattern[0]];
1882
printf("hash = %d\n", hash);
1887
#endif /*DOTCOMPRESSED*/
1890
/*
 * f_prep1: recompute the pattern's hash as f_prep does, then record the
 * pattern in the slot HASH[hash] of the lookup tables.
 *
 * pat_index - index stored in pat_indices[] for this pattern.
 * Pattern   - the pattern bytes; the pointer itself is stored in PatPtr[].
 *
 * m is presumably the number of pattern characters taken into account --
 * TODO confirm where it is assigned.
 */
f_prep1(pat_index, Pattern)
1891
uchar *Pattern; int pat_index;
1895
register unsigned hash;
1900
/*
 * Walk the pattern from its end: hash each block of two characters (three
 * when LONG) through tr1[] and lower SHIFT1[hash] to m-1-i, the block's
 * distance from the pattern end, whenever that is smaller.
 */
for (i=m-1; i>=(1+LONG); i--) {
1901
hash = (tr1[Pattern[i]]);
1902
hash = (hash << Hbits) + (tr1[Pattern[i-1]]);
1903
if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );
1904
if(SHIFT1[hash] >= m-1-i) SHIFT1[hash] = m-1-i;
1907
/* Hash of the block at index i, built the same way as inside the loop. */
hash = (tr1[Pattern[i]]);
1908
hash = (hash << Hbits) + (tr1[Pattern[i-1]]);
1909
if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );
1910
/* SHORT mode hashes on the translated first byte only. */
if(SHORT) hash=tr[Pattern[0]];
1911
/* Secondary key: the translated first two pattern bytes packed into 16 bits. */
hash2 = (tr[Pattern[0]] << 8) + tr[Pattern[1]];
1913
printf("hash = %d, HASH[hash] = %d\n", hash, HASH[hash]);
1915
/* Record the pattern pointer, its index and its secondary key in slot HASH[hash]. */
PatPtr[HASH[hash]] = Pattern;
1916
pat_indices[HASH[hash]] = pat_index;
1917
Hash2[HASH[hash]] = hash2;
1924
/*
 * tc_f_prep1: same as f_prep1, but on the tc_ tables: recompute the
 * pattern's hash, then record the pattern in slot tc_HASH[hash].
 *
 * pat_index - index stored in tc_pat_indices[] for this pattern.
 * Pattern   - the pattern bytes; the pointer itself is stored in tc_PatPtr[].
 *
 * m is presumably the number of pattern characters taken into account --
 * TODO confirm where it is assigned.
 */
tc_f_prep1(pat_index, Pattern)
1925
uchar *Pattern; int pat_index;
1929
register unsigned hash;
1934
/*
 * Walk the pattern from its end: hash each block of two characters (three
 * when tc_LONG) through tc_tr1[] and lower tc_SHIFT1[hash] to m-1-i, the
 * block's distance from the pattern end, whenever that is smaller.
 */
for (i=m-1; i>=(1+tc_LONG); i--) {
1935
hash = (tc_tr1[Pattern[i]]);
1936
hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);
1937
if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );
1938
if(tc_SHIFT1[hash] >= m-1-i) tc_SHIFT1[hash] = m-1-i;
1941
/* Hash of the block at index i, built the same way as inside the loop. */
hash = (tc_tr1[Pattern[i]]);
1942
hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);
1943
if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );
1944
/* tc_SHORT mode hashes on the translated first byte only. */
if(tc_SHORT) hash=tc_tr[Pattern[0]];
1945
/* Secondary key: the translated first two pattern bytes packed into 16 bits. */
hash2 = (tc_tr[Pattern[0]] << 8) + tc_tr[Pattern[1]];
1947
printf("hash = %d, tc_HASH[hash] = %d\n", hash, tc_HASH[hash]);
1949
/* Record the pattern pointer, its index and its secondary key in slot tc_HASH[hash]. */
tc_PatPtr[tc_HASH[hash]] = Pattern;
1950
tc_pat_indices[tc_HASH[hash]] = pat_index;
1951
tc_Hash2[tc_HASH[hash]] = hash2;
1955
#endif /*DOTCOMPRESSED*/
1962
/*
 * Running sum over HASH[]: each entry from index 1 up to MAXHASH-1 has the
 * already-updated previous entry added to it, so HASH[i] ends up as the sum
 * of the original HASH[0..i].  Presumably this turns per-bucket counts into
 * the slot offsets that m_short reads as HASH[c] .. HASH[c+1] -- TODO confirm.
 */
for(i=1; i<MAXHASH; i++) {
1964
printf("%d, ", HASH[i]);
1966
HASH[i] = HASH[i-1] + HASH[i];
1978
/*
 * Running sum over tc_HASH[]: each entry from index 1 up to MAXHASH-1 has
 * the already-updated previous entry added to it, so tc_HASH[i] ends up as
 * the sum of the original tc_HASH[0..i].
 */
for(i=1; i<MAXHASH; i++) {
1980
/* NOTE(review): this prints HASH[i] although the loop updates tc_HASH[i] -- confirm whether tc_HASH[i] was meant. */
printf("%d, ", HASH[i]);
1982
tc_HASH[i] = tc_HASH[i-1] + tc_HASH[i];
1987
#endif /*DOTCOMPRESSED*/
1989
/* Compute duplicate strings using tr's info, not strcmp! */
1991
acompute_duplicates(aduplicates, aterminals, anum_terminals, tr)
1992
char aduplicates[MAXNUM_PAT][MAXNUM_PAT];
1993
ParseTree aterminals[];
1997
int i, j, k, leni, lenj, initk;
1999
for (i=0; i<MAXNUM_PAT; i++) memset(aduplicates[i], '\0', MAXNUM_PAT);
2000
for (i=0; i<anum_terminals; i++) {
2001
leni = strlen(aterminals[i].data.leaf.value);
2002
for (j=i; j<anum_terminals; j++) {
2004
aduplicates[i][j] = 1;
2007
lenj = strlen(aterminals[j].data.leaf.value);
2008
if (lenj != leni) continue;
2009
for (k=0; k<lenj; k++) {
2010
if (tr[aterminals[i].data.leaf.value[k]] != tr[aterminals[j].data.leaf.value[k]]) break;
2012
if (k < lenj) continue;
2013
aduplicates[i][j] = 1;
2014
aduplicates[j][i] = 1;