1
/*************************************************
2
* PCRE testing program *
3
*************************************************/
5
/* This program was hacked up as a tester for PCRE. I really should have
6
written it more tidily in the first place. Will I ever learn? It has grown and
7
been extended and consequently is now rather untidy in places.
9
-----------------------------------------------------------------------------
10
Redistribution and use in source and binary forms, with or without
11
modification, are permitted provided that the following conditions are met:
13
* Redistributions of source code must retain the above copyright notice,
14
this list of conditions and the following disclaimer.
16
* Redistributions in binary form must reproduce the above copyright
17
notice, this list of conditions and the following disclaimer in the
18
documentation and/or other materials provided with the distribution.
20
* Neither the name of the University of Cambridge nor the names of its
21
contributors may be used to endorse or promote products derived from
22
this software without specific prior written permission.
24
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34
POSSIBILITY OF SUCH DAMAGE.
35
-----------------------------------------------------------------------------
47
/* We need the internal info for displaying the results of pcre_study(). Also
48
for getting the opcodes for showing compiled code. */
50
#define PCRE_SPY /* For Win32 build, import data, not export */
53
/* It is possible to compile this test program without including support for
54
testing the POSIX interface, though this is not available via the standard
58
#include "pcreposix.h"
61
#ifndef CLOCKS_PER_SEC
63
#define CLOCKS_PER_SEC CLK_TCK
65
#define CLOCKS_PER_SEC 100
69
#define LOOPREPEAT 500000
71
#define BUFFER_SIZE 30000
72
#define PBUFFER_SIZE BUFFER_SIZE
73
#define DBUFFER_SIZE BUFFER_SIZE
77
/* File-scope state shared between main() and the callout/malloc hooks. */

/* Per-pattern flag: reset from the command-line default for each pattern and
forced to 1 by the /M pattern option. */
static int log_store = 0;
78
/* Incremented by callout() each time the callout number equals
callout_fail_id. */
static int callout_count;
79
/* When non-zero, callout() prints full details on every call. */
static int callout_extra;
80
/* callout() yields 1 once callout_count reaches this value. */
static int callout_fail_count;
81
/* The callout number that counts towards callout_fail_count. */
static int callout_fail_id;
82
/* When non-zero, callout() re-prints the subject string. */
static int first_callout;
83
/* NOTE(review): presumably gates the tracing output of the local malloc/free
wrappers - the test is not visible in this view; confirm. */
static int show_malloc;
85
/* Read by main() straight after a new_malloc() call to remember how much store
was obtained for the regex. NOTE(review): presumably written by new_malloc();
the assignment is not visible in this view. */
static size_t gotten_store;
87
/* Holds a copy of the current pattern; callout() indexes it by pattern
position to show the next pattern item. Allocated in main(). */
static uschar *pbuffer = NULL;
90
/* Largest character value for each UTF-8 encoding length: entry i is the upper
limit for a sequence with i additional bytes. Scanned in order by ord2utf8()
to choose the length and by utf82ord() to check that the shortest encoding was
used. */
static const int utf8_table1[] = {
91
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
93
/* Marker bits that ord2utf8() ORs into the first byte of a sequence, indexed
in the same way as utf8_table1 (entry 0 adds no marker bits). */
static const int utf8_table2[] = {
94
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
96
/* Masks that utf82ord() applies to the first byte of a sequence to extract its
data bits, indexed by the number of additional bytes that follow. */
static const int utf8_table3[] = {
97
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
101
/*************************************************
102
* Print compiled regex *
103
*************************************************/
105
/* The code for doing this is held in a separate file that is also included in
106
pcre.c when it is compiled with the debug switch. It defines a function called
107
print_internals(), which uses a table of opcode lengths defined by the macro
108
OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
109
Unicode property names to numbers; this is kept in a separate file. */
111
static uschar OP_lengths[] = { OP_LENGTHS };
114
#include "ucptypetable.c"
115
#include "printint.c"
119
/*************************************************
120
* Read number from string *
121
*************************************************/
123
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
124
around with conditional compilation, just do the job by hand. It is only used
125
for unpicking the -o argument, so just keep it simple.
128
str string to be converted
129
endptr where to put the end pointer
131
Returns: the unsigned long
135
get_value(unsigned char *str, unsigned char **endptr)
138
while(*str != 0 && isspace(*str)) str++;
139
while (isdigit(*str)) result = result * 10 + (int)(*str++ - '0');
146
/*************************************************
147
* Convert character value to UTF-8 *
148
*************************************************/
150
/* This function takes an integer value in the range 0 - 0x7fffffff
151
and encodes it as a UTF-8 character in 0 to 6 bytes.
154
cvalue the character value
155
buffer pointer to buffer for result - at least 6 bytes long
157
Returns: number of characters placed in the buffer
158
-1 if input character is negative
159
0 if input character is positive but too big (only when
160
int is longer than 32 bits)
164
ord2utf8(int cvalue, unsigned char *buffer)
167
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
168
if (cvalue <= utf8_table1[i]) break;
169
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
170
if (cvalue < 0) return -1;
173
for (j = i; j > 0; j--)
175
*buffer-- = 0x80 | (cvalue & 0x3f);
178
*buffer = utf8_table2[i] | cvalue;
183
/*************************************************
184
* Convert UTF-8 string to value *
185
*************************************************/
187
/* This function takes one or more bytes that represents a UTF-8 character,
188
and returns the value of the character.
191
buffer a pointer to the byte vector
192
vptr a pointer to an int to receive the value
194
Returns: > 0 => the number of bytes consumed
195
-6 to 0 => malformed UTF-8 character at offset = (-return)
199
utf82ord(unsigned char *buffer, int *vptr)
205
for (i = -1; i < 6; i++) /* i is number of additional bytes */
207
if ((d & 0x80) == 0) break;
211
if (i == -1) { *vptr = c; return 1; } /* ascii character */
212
if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
214
/* i now has a value in the range 1-5 */
217
d = (c & utf8_table3[i]) << s;
219
for (j = 0; j < i; j++)
222
if ((c & 0xc0) != 0x80) return -(j+1);
224
d |= (c & 0x3f) << s;
227
/* Check that encoding was the correct unique one */
229
for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
230
if (d <= utf8_table1[j]) break;
231
if (j != i) return -(i+1);
241
/*************************************************
242
* Print character string *
243
*************************************************/
245
/* Character string printing function. Must handle UTF-8 strings in utf8
246
mode. Yields number of characters printed. If handed a NULL file, just counts
247
chars without printing. */
249
static int pchars(unsigned char *p, int length, FILE *f)
258
int rc = utf82ord(p, &c);
260
if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
264
if (c < 256 && isprint(c))
266
if (f != NULL) fprintf(f, "%c", c);
272
if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
279
/* Not UTF-8, or malformed UTF-8 */
281
if (isprint(c = *(p++)))
283
if (f != NULL) fprintf(f, "%c", c);
288
if (f != NULL) fprintf(f, "\\x%02x", c);
298
/*************************************************
300
*************************************************/
302
/* Called from PCRE as a result of the (?C) item. We print out where we are in
303
the match. Yield zero unless more callouts than the fail count, or the callout
306
static int callout(pcre_callout_block *cb)
308
FILE *f = (first_callout | callout_extra)? outfile : NULL;
309
int i, pre_start, post_start, subject_length;
313
fprintf(f, "Callout %d: last capture = %d\n",
314
cb->callout_number, cb->capture_last);
316
for (i = 0; i < cb->capture_top * 2; i += 2)
318
if (cb->offset_vector[i] < 0)
319
fprintf(f, "%2d: <unset>\n", i/2);
322
fprintf(f, "%2d: ", i/2);
323
(void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
324
cb->offset_vector[i+1] - cb->offset_vector[i], f);
330
/* Re-print the subject in canonical form, the first time or if giving full
331
details. On subsequent calls in the same match, we use pchars just to find the
332
printed lengths of the substrings. */
334
if (f != NULL) fprintf(f, "--->");
336
pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
337
post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
338
cb->current_position - cb->start_match, f);
340
subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
342
(void)pchars((unsigned char *)(cb->subject + cb->current_position),
343
cb->subject_length - cb->current_position, f);
345
if (f != NULL) fprintf(f, "\n");
347
/* Always print appropriate indicators, with callout number if not already
348
shown. For automatic callouts, show the pattern offset. */
350
if (cb->callout_number == 255)
352
fprintf(outfile, "%+3d ", cb->pattern_position);
353
if (cb->pattern_position > 99) fprintf(outfile, "\n ");
357
if (callout_extra) fprintf(outfile, " ");
358
else fprintf(outfile, "%3d ", cb->callout_number);
361
for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
362
fprintf(outfile, "^");
366
for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
367
fprintf(outfile, "^");
370
for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
371
fprintf(outfile, " ");
373
fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
374
pbuffer + cb->pattern_position);
376
fprintf(outfile, "\n");
379
if (cb->callout_data != NULL)
381
int callout_data = *((int *)(cb->callout_data));
382
if (callout_data != 0)
384
fprintf(outfile, "Callout data = %d\n", callout_data);
389
return (cb->callout_number != callout_fail_id)? 0 :
390
(++callout_count >= callout_fail_count)? 1 : 0;
394
/*************************************************
395
* Local malloc functions *
396
*************************************************/
398
/* Alternative malloc function, to test functionality and show the size of the
401
static void *new_malloc(size_t size)
403
void *block = malloc(size);
406
fprintf(outfile, "malloc %3d %p\n", size, block);
410
static void new_free(void *block)
413
fprintf(outfile, "free %p\n", block);
418
/* For recursion malloc/free, to test stacking calls */
420
static void *stack_malloc(size_t size)
422
void *block = malloc(size);
424
fprintf(outfile, "stack_malloc %3d %p\n", size, block);
428
static void stack_free(void *block)
431
fprintf(outfile, "stack_free %p\n", block);
436
/*************************************************
437
* Call pcre_fullinfo() *
438
*************************************************/
440
/* Get one piece of information from the pcre_fullinfo() function */
442
static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
445
if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
446
fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
451
/*************************************************
452
* Byte flipping function *
453
*************************************************/
456
byteflip(long int value, int n)
458
if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
459
return ((value & 0x000000ff) << 24) |
460
((value & 0x0000ff00) << 8) |
461
((value & 0x00ff0000) >> 8) |
462
((value & 0xff000000) >> 24);
468
/*************************************************
470
*************************************************/
472
/* Read lines from named file or stdin and write to named file or stdout; lines
473
consist of a regular expression, in delimiters and optionally followed by
474
options, followed by a set of test data, terminated by an empty line. */
476
int main(int argc, char **argv)
478
FILE *infile = stdin;
480
int study_options = 0;
485
int size_offsets = 45;
486
int size_offsets_max;
494
unsigned char *buffer;
495
unsigned char *dbuffer;
497
/* Get buffers from malloc() so that Electric Fence will check their misuse
498
when I am debugging. */
500
buffer = (unsigned char *)malloc(BUFFER_SIZE);
501
dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
502
pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
504
/* The outfile variable is static so that new_malloc can use it. The _setmode()
505
stuff is some magic that I don't understand, but which apparently does good
506
things in Windows. It's related to line terminations. */
508
#if defined(_WIN32) || defined(WIN32)
509
_setmode( _fileno( stdout ), 0x8000 );
510
#endif /* defined(_WIN32) || defined(WIN32) */
516
while (argc > 1 && argv[op][0] == '-')
518
unsigned char *endptr;
520
if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
522
else if (strcmp(argv[op], "-t") == 0) timeit = 1;
523
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
524
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
525
else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
526
((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
533
else if (strcmp(argv[op], "-p") == 0) posix = 1;
535
else if (strcmp(argv[op], "-C") == 0)
538
printf("PCRE version %s\n", pcre_version());
539
printf("Compiled with\n");
540
(void)pcre_config(PCRE_CONFIG_UTF8, &rc);
541
printf(" %sUTF-8 support\n", rc? "" : "No ");
542
(void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
543
printf(" %sUnicode properties support\n", rc? "" : "No ");
544
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
545
printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
546
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
547
printf(" Internal link size = %d\n", rc);
548
(void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
549
printf(" POSIX malloc threshold = %d\n", rc);
550
(void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc);
551
printf(" Default match limit = %d\n", rc);
552
(void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
553
printf(" Match recursion uses %s\n", rc? "stack" : "heap");
558
printf("** Unknown or malformed option %s\n", argv[op]);
559
printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
560
printf(" -C show PCRE compile-time options and exit\n");
561
printf(" -d debug: show compiled code; implies -i\n"
562
" -i show information about compiled pattern\n"
563
" -m output memory used information\n"
564
" -o <n> set size of offsets vector to <n>\n");
566
printf(" -p use POSIX interface\n");
568
printf(" -s output store (memory) used information\n"
569
" -t time compilation and execution\n");
576
/* Get the store for the offsets vector, and remember what it was */
578
size_offsets_max = size_offsets;
579
offsets = (int *)malloc(size_offsets_max * sizeof(int));
582
printf("** Failed to get %d bytes of memory for offsets vector\n",
583
size_offsets_max * sizeof(int));
587
/* Sort out the input and output files */
591
infile = fopen(argv[op], "rb");
594
printf("** Failed to open %s\n", argv[op]);
601
outfile = fopen(argv[op+1], "wb");
604
printf("** Failed to open %s\n", argv[op+1]);
609
/* Set alternative malloc function */
611
pcre_malloc = new_malloc;
612
pcre_free = new_free;
613
pcre_stack_malloc = stack_malloc;
614
pcre_stack_free = stack_free;
616
/* Heading line, then prompt for first regex if stdin */
618
fprintf(outfile, "PCRE version %s\n\n", pcre_version());
625
pcre_extra *extra = NULL;
627
#if !defined NOPOSIX /* There are still compilers that require no indent */
633
unsigned char *p, *pp, *ppp;
634
unsigned char *to_file = NULL;
635
const unsigned char *tables = NULL;
636
unsigned long int true_size, true_study_size = 0;
637
size_t size, regex_gotten_store;
639
int do_debug = debug;
642
int do_showinfo = showinfo;
645
int erroroffset, len, delimiter;
649
if (infile == stdin) printf(" re> ");
650
if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
651
if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
655
while (isspace(*p)) p++;
656
if (*p == 0) continue;
658
/* See if the pattern is to be loaded pre-compiled from a file. */
660
if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
662
unsigned long int magic;
667
pp = p + (int)strlen((char *)p);
668
while (isspace(pp[-1])) pp--;
671
f = fopen((char *)p, "rb");
674
fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
678
if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
681
(sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
683
(sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
685
re = (real_pcre *)new_malloc(true_size);
686
regex_gotten_store = gotten_store;
688
if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
690
magic = ((real_pcre *)re)->magic_number;
691
if (magic != MAGIC_NUMBER)
693
if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
699
fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
705
fprintf(outfile, "Compiled regex%s loaded from %s\n",
706
do_flip? " (byte-inverted)" : "", p);
708
/* Need to know if UTF-8 for printing data strings */
710
new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
711
use_utf8 = (options & PCRE_UTF8) != 0;
713
/* Now see if there is any following study data */
715
if (true_study_size != 0)
717
pcre_study_data *psd;
719
extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
720
extra->flags = PCRE_EXTRA_STUDY_DATA;
722
psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
723
extra->study_data = psd;
725
if (fread(psd, 1, true_study_size, f) != true_study_size)
728
fprintf(outfile, "Failed to read data from %s\n", p);
729
if (extra != NULL) new_free(extra);
730
if (re != NULL) new_free(re);
734
fprintf(outfile, "Study data loaded from %s\n", p);
735
do_study = 1; /* To get the data output if requested */
737
else fprintf(outfile, "No study data\n");
743
/* In-line pattern (the usual case). Get the delimiter and seek the end of
744
the pattern; if it isn't complete, read more. */
748
if (isalnum(delimiter) || delimiter == '\\')
750
fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
760
if (*pp == '\\' && pp[1] != 0) pp++;
761
else if (*pp == delimiter) break;
766
len = BUFFER_SIZE - (pp - buffer);
769
fprintf(outfile, "** Expression too long - missing delimiter?\n");
773
if (infile == stdin) printf(" > ");
774
if (fgets((char *)pp, len, infile) == NULL)
776
fprintf(outfile, "** Unexpected EOF\n");
780
if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
783
/* If the first character after the delimiter is backslash, make
784
the pattern end with backslash. This is purely to provide a way
785
of testing for the error message when a pattern ends with backslash. */
787
if (pp[1] == '\\') *pp++ = '\\';
789
/* Terminate the pattern at the delimiter, and save a copy of the pattern
793
strcpy((char *)pbuffer, (char *)p);
795
/* Look for options after final delimiter */
799
log_store = showstore; /* default from command line */
805
case 'g': do_g = 1; break;
806
case 'i': options |= PCRE_CASELESS; break;
807
case 'm': options |= PCRE_MULTILINE; break;
808
case 's': options |= PCRE_DOTALL; break;
809
case 'x': options |= PCRE_EXTENDED; break;
811
case '+': do_showrest = 1; break;
812
case 'A': options |= PCRE_ANCHORED; break;
813
case 'C': options |= PCRE_AUTO_CALLOUT; break;
814
case 'D': do_debug = do_showinfo = 1; break;
815
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
816
case 'F': do_flip = 1; break;
817
case 'G': do_G = 1; break;
818
case 'I': do_showinfo = 1; break;
819
case 'M': log_store = 1; break;
820
case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
823
case 'P': do_posix = 1; break;
826
case 'S': do_study = 1; break;
827
case 'U': options |= PCRE_UNGREEDY; break;
828
case 'X': options |= PCRE_EXTRA; break;
829
case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
830
case '?': options |= PCRE_NO_UTF8_CHECK; break;
834
while (*ppp != '\n' && *ppp != ' ') ppp++;
836
if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
838
fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
841
tables = pcre_maketables();
847
while (*pp != 0) pp++;
848
while (isspace(pp[-1])) pp--;
852
case '\n': case ' ': break;
855
fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
860
/* Handle compiling via the POSIX interface, which doesn't support the
861
timing, showing, or debugging options, nor the ability to pass over
862
local character tables. */
865
if (posix || do_posix)
870
if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
871
if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
872
rc = regcomp(&preg, (char *)p, cflags);
874
/* Compilation failed; go back for another re, skipping to blank line
875
if non-interactive. */
879
(void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
880
fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
885
/* Handle compiling via the native interface */
888
#endif /* !defined NOPOSIX */
895
clock_t start_time = clock();
896
for (i = 0; i < LOOPREPEAT; i++)
898
re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
899
if (re != NULL) free(re);
901
time_taken = clock() - start_time;
902
fprintf(outfile, "Compile time %.3f milliseconds\n",
903
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
904
(double)CLOCKS_PER_SEC);
907
re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
909
/* Compilation failed; go back for another re, skipping to blank line
910
if non-interactive. */
914
fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
920
if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
925
len = (int)strlen((char *)buffer);
926
while (len > 0 && isspace(buffer[len-1])) len--;
929
fprintf(outfile, "\n");
934
/* Compilation succeeded; print data if required. There are now two
935
info-returning functions. The old one has a limited interface and
936
returns only limited data. Check that it agrees with the newer one. */
939
fprintf(outfile, "Memory allocation (code space): %d\n",
942
((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
944
/* Extract the size for possible writing before possibly flipping it,
945
and remember the store that was got. */
947
true_size = ((real_pcre *)re)->size;
948
regex_gotten_store = gotten_store;
950
/* If /S was present, study the regexp to generate additional info to
951
help with the matching. */
959
clock_t start_time = clock();
960
for (i = 0; i < LOOPREPEAT; i++)
961
extra = pcre_study(re, study_options, &error);
962
time_taken = clock() - start_time;
963
if (extra != NULL) free(extra);
964
fprintf(outfile, " Study time %.3f milliseconds\n",
965
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
966
(double)CLOCKS_PER_SEC);
968
extra = pcre_study(re, study_options, &error);
970
fprintf(outfile, "Failed to study: %s\n", error);
971
else if (extra != NULL)
972
true_study_size = ((pcre_study_data *)(extra->study_data))->size;
975
/* If the 'F' option was present, we flip the bytes of all the integer
976
fields in the regex data block and the study block. This is to make it
977
possible to test PCRE's handling of byte-flipped patterns, e.g. those
978
compiled on a different architecture. */
982
real_pcre *rre = (real_pcre *)re;
983
rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
984
rre->size = byteflip(rre->size, sizeof(rre->size));
985
rre->options = byteflip(rre->options, sizeof(rre->options));
986
rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
987
rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
988
rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
989
rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
990
rre->name_table_offset = byteflip(rre->name_table_offset,
991
sizeof(rre->name_table_offset));
992
rre->name_entry_size = byteflip(rre->name_entry_size,
993
sizeof(rre->name_entry_size));
994
rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
998
pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
999
rsd->size = byteflip(rsd->size, sizeof(rsd->size));
1000
rsd->options = byteflip(rsd->options, sizeof(rsd->options));
1004
/* Extract information from the compiled data if required */
1010
unsigned long int get_options, all_options;
1011
int old_first_char, old_options, old_count;
1012
int count, backrefmax, first_char, need_char;
1013
int nameentrysize, namecount;
1014
const uschar *nametable;
1018
fprintf(outfile, "------------------------------------------------------------------\n");
1019
print_internals(re, outfile);
1022
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1023
new_info(re, NULL, PCRE_INFO_SIZE, &size);
1024
new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
1025
new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
1026
new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
1027
new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
1028
new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
1029
new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
1030
new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
1032
old_count = pcre_info(re, &old_options, &old_first_char);
1033
if (count < 0) fprintf(outfile,
1034
"Error %d from pcre_info()\n", count);
1037
if (old_count != count) fprintf(outfile,
1038
"Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
1041
if (old_first_char != first_char) fprintf(outfile,
1042
"First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
1043
first_char, old_first_char);
1045
if (old_options != (int)get_options) fprintf(outfile,
1046
"Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
1047
get_options, old_options);
1050
if (size != regex_gotten_store) fprintf(outfile,
1051
"Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
1052
size, regex_gotten_store);
1054
fprintf(outfile, "Capturing subpattern count = %d\n", count);
1056
fprintf(outfile, "Max back reference = %d\n", backrefmax);
1060
fprintf(outfile, "Named capturing subpatterns:\n");
1061
while (namecount-- > 0)
1063
fprintf(outfile, " %s %*s%3d\n", nametable + 2,
1064
nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
1065
GET2(nametable, 0));
1066
nametable += nameentrysize;
1070
/* The NOPARTIAL bit is a private bit in the options, so we have
1071
to fish it out via our back door */
1073
all_options = ((real_pcre *)re)->options;
1076
all_options = byteflip(all_options, sizeof(all_options));
1079
if ((all_options & PCRE_NOPARTIAL) != 0)
1080
fprintf(outfile, "Partial matching not supported\n");
1082
if (get_options == 0) fprintf(outfile, "No options\n");
1083
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
1084
((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
1085
((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
1086
((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
1087
((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
1088
((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
1089
((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
1090
((get_options & PCRE_EXTRA) != 0)? " extra" : "",
1091
((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
1092
((get_options & PCRE_UTF8) != 0)? " utf8" : "",
1093
((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "");
1095
if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
1096
fprintf(outfile, "Case state changes\n");
1098
if (first_char == -1)
1100
fprintf(outfile, "First char at start or follows \\n\n");
1102
else if (first_char < 0)
1104
fprintf(outfile, "No first char\n");
1108
int ch = first_char & 255;
1109
const char *caseless = ((first_char & REQ_CASELESS) == 0)?
1112
fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
1114
fprintf(outfile, "First char = %d%s\n", ch, caseless);
1119
fprintf(outfile, "No need char\n");
1123
int ch = need_char & 255;
1124
const char *caseless = ((need_char & REQ_CASELESS) == 0)?
1127
fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
1129
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
1132
/* Don't output study size; at present it is in any case a fixed
1133
value, but it varies, depending on the computer architecture, and
1134
so messes up the test suite. (And with the /F option, it might be
1140
fprintf(outfile, "Study returned NULL\n");
1143
uschar *start_bits = NULL;
1144
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
1146
if (start_bits == NULL)
1147
fprintf(outfile, "No starting byte set\n");
1152
fprintf(outfile, "Starting byte set: ");
1153
for (i = 0; i < 256; i++)
1155
if ((start_bits[i/8] & (1<<(i&7))) != 0)
1159
fprintf(outfile, "\n ");
1162
if (isprint(i) && i != ' ')
1164
fprintf(outfile, "%c ", i);
1169
fprintf(outfile, "\\x%02x ", i);
1174
fprintf(outfile, "\n");
1180
/* If the '>' option was present, we write out the regex to a file, and
1181
that is all. The first 8 bytes of the file are the regex length and then
1182
the study length, in big-endian order. */
1184
if (to_file != NULL)
1186
FILE *f = fopen((char *)to_file, "wb");
1189
fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
1194
sbuf[0] = (true_size >> 24) & 255;
1195
sbuf[1] = (true_size >> 16) & 255;
1196
sbuf[2] = (true_size >> 8) & 255;
1197
sbuf[3] = (true_size) & 255;
1199
sbuf[4] = (true_study_size >> 24) & 255;
1200
sbuf[5] = (true_study_size >> 16) & 255;
1201
sbuf[6] = (true_study_size >> 8) & 255;
1202
sbuf[7] = (true_study_size) & 255;
1204
if (fwrite(sbuf, 1, 8, f) < 8 ||
1205
fwrite(re, 1, true_size, f) < true_size)
1207
fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
1211
fprintf(outfile, "Compiled regex written to %s\n", to_file);
1214
if (fwrite(extra->study_data, 1, true_study_size, f) <
1217
fprintf(outfile, "Write error on %s: %s\n", to_file,
1220
else fprintf(outfile, "Study data written to %s\n", to_file);
1225
continue; /* With next regex */
1227
} /* End of non-POSIX compile */
1229
/* Read data lines and test them */
1234
unsigned char *bptr = dbuffer;
1235
int *use_offsets = offsets;
1236
int use_size_offsets = size_offsets;
1237
int callout_data = 0;
1238
int callout_data_set = 0;
1240
int copystrings = 0;
1241
int find_match_limit = 0;
1245
int start_offset = 0;
1250
pcre_callout = callout;
1254
callout_fail_count = 999999;
1255
callout_fail_id = -1;
1258
if (infile == stdin) printf("data> ");
1259
if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
1264
if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
1266
len = (int)strlen((char *)buffer);
1267
while (len > 0 && isspace(buffer[len-1])) len--;
1269
if (len == 0) break;
1272
while (isspace(*p)) p++;
1275
while ((c = *p++) != 0)
1280
if (c == '\\') switch ((c = *p++))
1282
case 'a': c = 7; break;
1283
case 'b': c = '\b'; break;
1284
case 'e': c = 27; break;
1285
case 'f': c = '\f'; break;
1286
case 'n': c = '\n'; break;
1287
case 'r': c = '\r'; break;
1288
case 't': c = '\t'; break;
1289
case 'v': c = '\v'; break;
1291
case '0': case '1': case '2': case '3':
1292
case '4': case '5': case '6': case '7':
1294
while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
1295
c = c * 8 + *p++ - '0';
1300
/* Handle \x{..} specially - new Perl thing for utf8 */
1304
unsigned char *pt = p;
1306
while (isxdigit(*(++pt)))
1307
c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
1310
unsigned char buff8[8];
1312
utn = ord2utf8(c, buff8);
1313
for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
1314
c = buff8[ii]; /* Last byte */
1318
/* Not correct form; fall through */
1324
while (i++ < 2 && isxdigit(*p))
1326
c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
1331
case 0: /* \ followed by EOF allows for an empty line */
1336
while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
1339
case 'A': /* Option setting */
1340
options |= PCRE_ANCHORED;
1344
options |= PCRE_NOTBOL;
1348
if (isdigit(*p)) /* Set copy string */
1350
while(isdigit(*p)) n = n * 10 + *p++ - '0';
1351
copystrings |= 1 << n;
1353
else if (isalnum(*p))
1357
while (isalnum(*p)) *npp++ = *p++;
1359
n = pcre_get_stringnumber(re, (char *)name);
1361
fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1362
else copystrings |= 1 << n;
1371
pcre_callout = NULL;
1376
callout_fail_id = 0;
1379
callout_fail_id = callout_fail_id * 10 + *p++ - '0';
1380
callout_fail_count = 0;
1385
callout_fail_count = callout_fail_count * 10 + *p++ - '0';
1392
if (*(++p) == '-') { sign = -1; p++; }
1394
callout_data = callout_data * 10 + *p++ - '0';
1395
callout_data *= sign;
1396
callout_data_set = 1;
1403
while(isdigit(*p)) n = n * 10 + *p++ - '0';
1404
getstrings |= 1 << n;
1406
else if (isalnum(*p))
1410
while (isalnum(*p)) *npp++ = *p++;
1412
n = pcre_get_stringnumber(re, (char *)name);
1414
fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1415
else getstrings |= 1 << n;
1424
find_match_limit = 1;
1428
options |= PCRE_NOTEMPTY;
1432
while(isdigit(*p)) n = n * 10 + *p++ - '0';
1433
if (n > size_offsets_max)
1435
size_offsets_max = n;
1437
use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
1438
if (offsets == NULL)
1440
printf("** Failed to get %d bytes of memory for offsets vector\n",
1441
size_offsets_max * sizeof(int));
1445
use_size_offsets = n;
1446
if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
1450
options |= PCRE_PARTIAL;
1458
options |= PCRE_NOTEOL;
1462
options |= PCRE_NO_UTF8_CHECK;
1470
/* Handle matching via the POSIX interface, which does not
1471
support timing or playing with the match limit or callout data. */
1473
#if !defined NOPOSIX
1474
if (posix || do_posix)
1478
regmatch_t *pmatch = NULL;
1479
if (use_size_offsets > 0)
1480
pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
1481
if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1482
if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1484
rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1488
(void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
1489
fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1494
for (i = 0; i < (size_t)use_size_offsets; i++)
1496
if (pmatch[i].rm_so >= 0)
1498
fprintf(outfile, "%2d: ", (int)i);
1499
(void)pchars(dbuffer + pmatch[i].rm_so,
1500
pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
1501
fprintf(outfile, "\n");
1502
if (i == 0 && do_showrest)
1504
fprintf(outfile, " 0+ ");
1505
(void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
1507
fprintf(outfile, "\n");
1515
/* Handle matching via the native interface - repeats for /g and /G */
1518
#endif /* !defined NOPOSIX */
1520
for (;; gmatched++) /* Loop for /g or /G */
1526
clock_t start_time = clock();
1527
for (i = 0; i < LOOPREPEAT; i++)
1528
count = pcre_exec(re, extra, (char *)bptr, len,
1529
start_offset, options | g_notempty, use_offsets, use_size_offsets);
1530
time_taken = clock() - start_time;
1531
fprintf(outfile, "Execute time %.3f milliseconds\n",
1532
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
1533
(double)CLOCKS_PER_SEC);
1536
/* If find_match_limit is set, we want to do repeated matches with
1537
varying limits in order to find the minimum value. */
1539
if (find_match_limit)
1547
extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1550
extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
1554
extra->match_limit = mid;
1555
count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1556
options | g_notempty, use_offsets, use_size_offsets);
1557
if (count == PCRE_ERROR_MATCHLIMIT)
1559
/* fprintf(outfile, "Testing match limit = %d\n", mid); */
1561
mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1563
else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
1564
count == PCRE_ERROR_PARTIAL)
1568
fprintf(outfile, "Minimum match limit = %d\n", mid);
1571
/* fprintf(outfile, "Testing match limit = %d\n", mid); */
1573
mid = (min + mid)/2;
1575
else break; /* Some other error */
1578
extra->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
1581
/* If callout_data is set, use the interface with additional data */
1583
else if (callout_data_set)
1587
extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1590
extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
1591
extra->callout_data = &callout_data;
1592
count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1593
options | g_notempty, use_offsets, use_size_offsets);
1594
extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
1597
/* The normal case is just to do the match once, with the default
1598
value of match_limit. */
1602
count = pcre_exec(re, extra, (char *)bptr, len,
1603
start_offset, options | g_notempty, use_offsets, use_size_offsets);
1608
fprintf(outfile, "Matched, but too many substrings\n");
1609
count = use_size_offsets/3;
1617
for (i = 0; i < count * 2; i += 2)
1619
if (use_offsets[i] < 0)
1620
fprintf(outfile, "%2d: <unset>\n", i/2);
1623
fprintf(outfile, "%2d: ", i/2);
1624
(void)pchars(bptr + use_offsets[i],
1625
use_offsets[i+1] - use_offsets[i], outfile);
1626
fprintf(outfile, "\n");
1631
fprintf(outfile, " 0+ ");
1632
(void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
1634
fprintf(outfile, "\n");
1640
for (i = 0; i < 32; i++)
1642
if ((copystrings & (1 << i)) != 0)
1644
char copybuffer[16];
1645
int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1646
i, copybuffer, sizeof(copybuffer));
1648
fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1650
fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1654
for (i = 0; i < 32; i++)
1656
if ((getstrings & (1 << i)) != 0)
1658
const char *substring;
1659
int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1662
fprintf(outfile, "get substring %d failed %d\n", i, rc);
1665
fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1666
/* free((void *)substring); */
1667
pcre_free_substring(substring);
1674
const char **stringlist;
1675
int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1678
fprintf(outfile, "get substring list failed %d\n", rc);
1681
for (i = 0; i < count; i++)
1682
fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1683
if (stringlist[i] != NULL)
1684
fprintf(outfile, "string list not terminated by NULL\n");
1685
/* free((void *)stringlist); */
1686
pcre_free_substring_list(stringlist);
1691
/* There was a partial match */
1693
else if (count == PCRE_ERROR_PARTIAL)
1695
fprintf(outfile, "Partial match\n");
1696
break; /* Out of the /g loop */
1699
/* Failed to match. If this is a /g or /G loop and we previously set
1700
g_notempty after a null match, this is not necessarily the end.
1701
We want to advance the start offset, and continue. In the case of UTF-8
1702
matching, the advance must be one character, not one byte. Fudge the
1703
offset values to achieve this. We won't be at the end of the string -
1704
that was checked before setting g_notempty. */
1708
if (g_notempty != 0)
1711
use_offsets[0] = start_offset;
1714
while (start_offset + onechar < len)
1716
int tb = bptr[start_offset+onechar];
1717
if (tb <= 127) break;
1719
if (tb != 0 && tb != 0xc0) onechar++;
1722
use_offsets[1] = start_offset + onechar;
1726
if (count == PCRE_ERROR_NOMATCH)
1728
if (gmatched == 0) fprintf(outfile, "No match\n");
1730
else fprintf(outfile, "Error %d\n", count);
1731
break; /* Out of the /g loop */
1735
/* If not /g or /G we are done */
1737
if (!do_g && !do_G) break;
1739
/* If we have matched an empty string, first check to see if we are at
1740
the end of the subject. If so, the /g loop is over. Otherwise, mimic
1741
what Perl's /g option does. This turns out to be rather cunning. First
1742
we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1743
same point. If this fails (picked up above) we advance to the next
1747
if (use_offsets[0] == use_offsets[1])
1749
if (use_offsets[0] == len) break;
1750
g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1753
/* For /g, update the start offset, leaving the rest alone */
1755
if (do_g) start_offset = use_offsets[1];
1757
/* For /G, update the pointer and length */
1761
bptr += use_offsets[1];
1762
len -= use_offsets[1];
1764
} /* End of loop for /g and /G */
1765
} /* End of loop for data lines */
1769
#if !defined NOPOSIX
1770
if (posix || do_posix) regfree(&preg);
1773
if (re != NULL) free(re);
1774
if (extra != NULL) free(extra);
1777
free((void *)tables);
1778
setlocale(LC_CTYPE, "C");
1782
if (infile == stdin) fprintf(outfile, "\n");