1
/* $Cambridge: exim/exim-src/src/pcre/pcretest.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */
1
3
/*************************************************
2
4
* PCRE testing program *
3
5
*************************************************/
5
7
/* This program was hacked up as a tester for PCRE. I really should have
6
8
written it more tidily in the first place. Will I ever learn? It has grown and
7
been extended and consequently is now rather untidy in places. */
9
been extended and consequently is now rather, er, *very* untidy in places.
11
-----------------------------------------------------------------------------
12
Redistribution and use in source and binary forms, with or without
13
modification, are permitted provided that the following conditions are met:
15
* Redistributions of source code must retain the above copyright notice,
16
this list of conditions and the following disclaimer.
18
* Redistributions in binary form must reproduce the above copyright
19
notice, this list of conditions and the following disclaimer in the
20
documentation and/or other materials provided with the distribution.
22
* Neither the name of the University of Cambridge nor the names of its
23
contributors may be used to endorse or promote products derived from
24
this software without specific prior written permission.
26
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36
POSSIBILITY OF SUCH DAMAGE.
37
-----------------------------------------------------------------------------
52
96
static int use_utf8;
53
97
static size_t gotten_store;
56
/* Upper bound on the character value for each encoded length. ord2utf8()
scans this table and stops at the first index i whose entry is >= the value;
it then writes i continuation bytes followed by one more byte, so entry i
corresponds to an encoding of i + 1 bytes. */
static const int utf8_table1[] = {
57
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
59
/* Indexed the same way as utf8_table1. ord2utf8() ORs entry i with the value
bits that remain after the continuation bytes have been written, and stores
the result as the final byte it writes (presumably the lead byte of the
sequence - confirm against the full function). */
static const int utf8_table2[] = {
60
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
62
/* NOTE(review): not referenced in the code visible here. The values look like
masks for the data bits of a lead byte, for use when decoding UTF-8 - confirm
against the decoding function. */
static const int utf8_table3[] = {
63
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
67
/*************************************************
68
* Print compiled regex *
69
*************************************************/
71
/* The code for doing this is held in a separate file that is also included in
72
pcre.c when it is compiled with the debug switch. It defines a function called
73
print_internals(), which uses a table of opcode lengths defined by the macro
74
OP_LENGTHS, whose name must be OP_lengths. */
76
static uschar OP_lengths[] = { OP_LENGTHS };
99
static uschar *pbuffer = NULL;
109
/*************************************************
110
* Convert character value to UTF-8 *
111
*************************************************/
113
/* This function takes an integer value in the range 0 - 0x7fffffff
114
and encodes it as a UTF-8 character in 0 to 6 bytes.
117
cvalue the character value
118
buffer pointer to buffer for result - at least 6 bytes long
120
Returns: number of bytes placed in the buffer
121
-1 if input character is negative
122
0 if input character is positive but too big (only when
123
int is longer than 32 bits)
127
ord2utf8(int cvalue, unsigned char *buffer)
130
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
131
if (cvalue <= utf8_table1[i]) break;
132
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
133
if (cvalue < 0) return -1;
136
for (j = i; j > 0; j--)
138
*buffer-- = 0x80 | (cvalue & 0x3f);
141
*buffer = utf8_table2[i] | cvalue;
146
131
/*************************************************
147
132
* Convert UTF-8 string to value *
300
291
post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
301
292
cb->current_position - cb->start_match, f);
294
subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
303
296
(void)pchars((unsigned char *)(cb->subject + cb->current_position),
304
297
cb->subject_length - cb->current_position, f);
306
299
if (f != NULL) fprintf(f, "\n");
308
301
/* Always print appropriate indicators, with callout number if not already
302
shown. For automatic callouts, show the pattern offset. */
311
if (callout_extra) fprintf(outfile, " ");
312
else fprintf(outfile, "%3d ", cb->callout_number);
304
if (cb->callout_number == 255)
306
fprintf(outfile, "%+3d ", cb->pattern_position);
307
if (cb->pattern_position > 99) fprintf(outfile, "\n ");
311
if (callout_extra) fprintf(outfile, " ");
312
else fprintf(outfile, "%3d ", cb->callout_number);
314
315
for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
315
316
fprintf(outfile, "^");
479
517
printf("** Unknown or malformed option %s\n", argv[op]);
480
518
printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
481
519
printf(" -C show PCRE compile-time options and exit\n");
482
printf(" -d debug: show compiled code; implies -i\n"
483
" -i show information about compiled pattern\n"
520
printf(" -d debug: show compiled code; implies -i\n");
522
printf(" -dfa force DFA matching for all subjects\n");
524
printf(" -i show information about compiled pattern\n"
525
" -m output memory used information\n"
484
526
" -o <n> set size of offsets vector to <n>\n");
485
527
#if !defined NOPOSIX
486
528
printf(" -p use POSIX interface\n");
488
printf(" -s output store information\n"
530
printf(" -s output store (memory) used information\n"
489
531
" -t time compilation and execution\n");
571
621
while (isspace(*p)) p++;
572
622
if (*p == 0) continue;
574
/* Get the delimiter and seek the end of the pattern; if it isn't
575
complete, read more. */
624
/* See if the pattern is to be loaded pre-compiled from a file. */
626
if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
628
unsigned long int magic;
633
pp = p + (int)strlen((char *)p);
634
while (isspace(pp[-1])) pp--;
637
f = fopen((char *)p, "rb");
640
fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
644
if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
647
(sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
649
(sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
651
re = (real_pcre *)new_malloc(true_size);
652
regex_gotten_store = gotten_store;
654
if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
656
magic = ((real_pcre *)re)->magic_number;
657
if (magic != MAGIC_NUMBER)
659
if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
665
fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
671
fprintf(outfile, "Compiled regex%s loaded from %s\n",
672
do_flip? " (byte-inverted)" : "", p);
674
/* Need to know if UTF-8 for printing data strings */
676
new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
677
use_utf8 = (options & PCRE_UTF8) != 0;
679
/* Now see if there is any following study data */
681
if (true_study_size != 0)
683
pcre_study_data *psd;
685
extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
686
extra->flags = PCRE_EXTRA_STUDY_DATA;
688
psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
689
extra->study_data = psd;
691
if (fread(psd, 1, true_study_size, f) != true_study_size)
694
fprintf(outfile, "Failed to read data from %s\n", p);
695
if (extra != NULL) new_free(extra);
696
if (re != NULL) new_free(re);
700
fprintf(outfile, "Study data loaded from %s\n", p);
701
do_study = 1; /* To get the data output if requested */
703
else fprintf(outfile, "No study data\n");
709
/* In-line pattern (the usual case). Get the delimiter and seek the end of
710
the pattern; if it isn't complete, read more. */
577
712
delimiter = *p++;
759
913
sizeof(real_pcre) -
760
914
((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
916
/* Extract the size for possible writing before possibly flipping it,
917
and remember the store that was got. */
919
true_size = ((real_pcre *)re)->size;
920
regex_gotten_store = gotten_store;
922
/* If /S was present, study the regexp to generate additional info to
923
help with the matching. */
931
clock_t start_time = clock();
932
for (i = 0; i < LOOPREPEAT; i++)
933
extra = pcre_study(re, study_options, &error);
934
time_taken = clock() - start_time;
935
if (extra != NULL) free(extra);
936
fprintf(outfile, " Study time %.3f milliseconds\n",
937
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
938
(double)CLOCKS_PER_SEC);
940
extra = pcre_study(re, study_options, &error);
942
fprintf(outfile, "Failed to study: %s\n", error);
943
else if (extra != NULL)
944
true_study_size = ((pcre_study_data *)(extra->study_data))->size;
947
/* If the 'F' option was present, we flip the bytes of all the integer
948
fields in the regex data block and the study block. This is to make it
949
possible to test PCRE's handling of byte-flipped patterns, e.g. those
950
compiled on a different architecture. */
954
real_pcre *rre = (real_pcre *)re;
955
rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
956
rre->size = byteflip(rre->size, sizeof(rre->size));
957
rre->options = byteflip(rre->options, sizeof(rre->options));
958
rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
959
rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
960
rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
961
rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
962
rre->name_table_offset = byteflip(rre->name_table_offset,
963
sizeof(rre->name_table_offset));
964
rre->name_entry_size = byteflip(rre->name_entry_size,
965
sizeof(rre->name_entry_size));
966
rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
970
pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
971
rsd->size = byteflip(rsd->size, sizeof(rsd->size));
972
rsd->options = byteflip(rsd->options, sizeof(rsd->options));
976
/* Extract information from the compiled data if required */
764
unsigned long int get_options;
982
unsigned long int get_options, all_options;
983
#if !defined NOINFOCHECK
765
984
int old_first_char, old_options, old_count;
766
986
int count, backrefmax, first_char, need_char;
767
987
int nameentrysize, namecount;
768
988
const uschar *nametable;
773
992
fprintf(outfile, "------------------------------------------------------------------\n");
774
print_internals(re, outfile);
993
_pcre_printint(re, outfile);
777
996
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1046
/* The NOPARTIAL bit is a private bit in the options, so we have
1047
to fish it out via our back door */
1049
all_options = ((real_pcre *)re)->options;
1052
all_options = byteflip(all_options, sizeof(all_options));
1055
if ((all_options & PCRE_NOPARTIAL) != 0)
1056
fprintf(outfile, "Partial matching not supported\n");
825
1058
if (get_options == 0) fprintf(outfile, "No options\n");
826
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
1059
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s\n",
827
1060
((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
828
1061
((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
829
1062
((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
830
1063
((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
1064
((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "",
831
1065
((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
832
1066
((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
833
1067
((get_options & PCRE_EXTRA) != 0)? " extra" : "",
872
1106
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
876
/* If /S was present, study the regexp to generate additional info to
877
help with the matching. */
885
clock_t start_time = clock();
886
for (i = 0; i < LOOPREPEAT; i++)
887
extra = pcre_study(re, study_options, &error);
888
time_taken = clock() - start_time;
889
if (extra != NULL) free(extra);
890
fprintf(outfile, " Study time %.3f milliseconds\n",
891
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
892
(double)CLOCKS_PER_SEC);
895
extra = pcre_study(re, study_options, &error);
897
fprintf(outfile, "Failed to study: %s\n", error);
898
else if (extra == NULL)
899
fprintf(outfile, "Study returned NULL\n");
901
1109
/* Don't output study size; at present it is in any case a fixed
902
1110
value, but it varies, depending on the computer architecture, and
903
so messes up the test suite. */
905
else if (do_showinfo)
908
uschar *start_bits = NULL;
909
new_info(re, extra, PCRE_INFO_STUDYSIZE, &size);
910
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
911
/* fprintf(outfile, "Study size = %d\n", size); */
912
if (start_bits == NULL)
913
fprintf(outfile, "No starting character set\n");
918
fprintf(outfile, "Starting character set: ");
919
for (i = 0; i < 256; i++)
921
if ((start_bits[i/8] & (1<<(i%8))) != 0)
925
fprintf(outfile, "\n ");
928
if (isprint(i) && i != ' ')
930
fprintf(outfile, "%c ", i);
935
fprintf(outfile, "\\x%02x ", i);
940
fprintf(outfile, "\n");
1111
so messes up the test suite. (And with the /F option, it might be
1117
fprintf(outfile, "Study returned NULL\n");
1120
uschar *start_bits = NULL;
1121
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
1123
if (start_bits == NULL)
1124
fprintf(outfile, "No starting byte set\n");
1129
fprintf(outfile, "Starting byte set: ");
1130
for (i = 0; i < 256; i++)
1132
if ((start_bits[i/8] & (1<<(i&7))) != 0)
1136
fprintf(outfile, "\n ");
1139
if (isprint(i) && i != ' ')
1141
fprintf(outfile, "%c ", i);
1146
fprintf(outfile, "\\x%02x ", i);
1151
fprintf(outfile, "\n");
1157
/* If the '>' option was present, we write out the regex to a file, and
1158
that is all. The first 8 bytes of the file are the regex length and then
1159
the study length, in big-endian order. */
1161
if (to_file != NULL)
1163
FILE *f = fopen((char *)to_file, "wb");
1166
fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
1171
sbuf[0] = (true_size >> 24) & 255;
1172
sbuf[1] = (true_size >> 16) & 255;
1173
sbuf[2] = (true_size >> 8) & 255;
1174
sbuf[3] = (true_size) & 255;
1176
sbuf[4] = (true_study_size >> 24) & 255;
1177
sbuf[5] = (true_study_size >> 16) & 255;
1178
sbuf[6] = (true_study_size >> 8) & 255;
1179
sbuf[7] = (true_study_size) & 255;
1181
if (fwrite(sbuf, 1, 8, f) < 8 ||
1182
fwrite(re, 1, true_size, f) < true_size)
1184
fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
1188
fprintf(outfile, "Compiled regex written to %s\n", to_file);
1191
if (fwrite(extra->study_data, 1, true_study_size, f) <
1194
fprintf(outfile, "Write error on %s: %s\n", to_file,
1197
else fprintf(outfile, "Study data written to %s\n", to_file);
1204
if (extra != NULL) new_free(extra);
1205
if (tables != NULL) new_free((void *)tables);
1206
continue; /* With next regex */
1208
} /* End of non-POSIX compile */
946
1210
/* Read data lines and test them */
1233
1536
register int i;
1234
1537
clock_t time_taken;
1235
1538
clock_t start_time = clock();
1541
if (all_use_dfa || use_dfa)
1543
int workspace[1000];
1544
for (i = 0; i < LOOPREPEAT; i++)
1545
count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
1546
options | g_notempty, use_offsets, use_size_offsets, workspace,
1547
sizeof(workspace)/sizeof(int));
1236
1552
for (i = 0; i < LOOPREPEAT; i++)
1237
1553
count = pcre_exec(re, extra, (char *)bptr, len,
1238
1554
start_offset, options | g_notempty, use_offsets, use_size_offsets);
1239
1556
time_taken = clock() - start_time;
1240
1557
fprintf(outfile, "Execute time %.3f milliseconds\n",
1241
1558
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
1305
1623
/* The normal case is just to do the match once, with the default
1306
1624
value of match_limit. */
1308
else count = pcre_exec(re, extra, (char *)bptr, len,
1309
start_offset, options | g_notempty, use_offsets, use_size_offsets);
1627
else if (all_use_dfa || use_dfa)
1629
int workspace[1000];
1630
count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
1631
options | g_notempty, use_offsets, use_size_offsets, workspace,
1632
sizeof(workspace)/sizeof(int));
1635
fprintf(outfile, "Matched, but too many subsidiary matches\n");
1636
count = use_size_offsets/2;
1313
fprintf(outfile, "Matched, but too many substrings\n");
1314
count = use_size_offsets/3;
1643
count = pcre_exec(re, extra, (char *)bptr, len,
1644
start_offset, options | g_notempty, use_offsets, use_size_offsets);
1647
fprintf(outfile, "Matched, but too many substrings\n");
1648
count = use_size_offsets/3;