2
tre-compile.c - TRE regex compiler
4
Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>
6
This library is free software; you can redistribute it and/or
7
modify it under the terms of the GNU Lesser General Public
8
License as published by the Free Software Foundation; either
9
version 2.1 of the License, or (at your option) any later version.
11
This library is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
Lesser General Public License for more details.
16
You should have received a copy of the GNU Lesser General Public
17
License along with this library; if not, write to the Free Software
18
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
- Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
31
#endif /* HAVE_CONFIG_H */
36
#include "tre-internal.h"
38
#include "tre-stack.h"
40
#include "tre-parse.h"
41
#include "tre-compile.h"
46
Algorithms to setup tags so that submatch addressing can be done.
50
/* Inserts a catenation node to the root of the tree given in `node'.
51
As the left child a new tag with number `tag_id' to `node' is added,
52
and the right child is the old root. */
54
tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
58
DPRINT(("add_tag_left: tag %d\n", tag_id));
60
c = tre_mem_alloc(mem, sizeof(*c));
63
c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
66
c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
70
c->right->obj = node->obj;
71
c->right->type = node->type;
72
c->right->nullable = -1;
73
c->right->submatch_id = -1;
74
c->right->firstpos = NULL;
75
c->right->lastpos = NULL;
76
c->right->num_tags = 0;
78
node->type = CATENATION;
82
/* Inserts a catenation node to the root of the tree given in `node'.
83
As the right child a new tag with number `tag_id' to `node' is added,
84
and the left child is the old root. */
86
tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
90
DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
92
c = tre_mem_alloc(mem, sizeof(*c));
95
c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
98
c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
102
c->left->obj = node->obj;
103
c->left->type = node->type;
104
c->left->nullable = -1;
105
c->left->submatch_id = -1;
106
c->left->firstpos = NULL;
107
c->left->lastpos = NULL;
108
c->left->num_tags = 0;
110
node->type = CATENATION;
116
ADDTAGS_AFTER_ITERATION,
117
ADDTAGS_AFTER_UNION_LEFT,
118
ADDTAGS_AFTER_UNION_RIGHT,
119
ADDTAGS_AFTER_CAT_LEFT,
120
ADDTAGS_AFTER_CAT_RIGHT,
121
ADDTAGS_SET_SUBMATCH_END
122
} tre_addtags_symbol_t;
130
/* Adds tags to appropriate locations in the parse tree in `tree', so that
131
subexpressions marked for submatch addressing can be traced. */
133
tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
136
reg_errcode_t status = REG_OK;
137
tre_addtags_symbol_t symbol;
138
tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
139
int bottom = tre_stack_num_objects(stack);
140
/* True for first pass (counting number of needed tags) */
141
int first_pass = (mem == NULL || tnfa == NULL);
142
int *regset, *orig_regset;
143
int num_tags = 0; /* Total number of tags. */
144
int num_minimals = 0; /* Number of special minimal tags. */
145
int tag = 0; /* The tag that is to be added next. */
146
int next_tag = 1; /* Next tag to use after this one. */
147
int *parents; /* Stack of submatches the current submatch is
149
int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
150
tre_tag_states_t *saved_states;
152
tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
156
tnfa->minimal_tags[0] = -1;
159
regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
163
orig_regset = regset;
165
parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
173
saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
174
if (saved_states == NULL)
183
for (i = 0; i <= tnfa->num_submatches; i++)
184
saved_states[i].tag = -1;
187
STACK_PUSH(stack, voidptr, node);
188
STACK_PUSH(stack, int, ADDTAGS_RECURSE);
190
while (tre_stack_num_objects(stack) > bottom)
192
if (status != REG_OK)
195
symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
199
case ADDTAGS_SET_SUBMATCH_END:
201
int id = tre_stack_pop_int(stack);
204
/* Add end of this submatch to regset. */
205
for (i = 0; regset[i] >= 0; i++);
206
regset[i] = id * 2 + 1;
209
/* Pop this submatch from the parents stack. */
210
for (i = 0; parents[i] >= 0; i++);
215
case ADDTAGS_RECURSE:
216
node = tre_stack_pop_voidptr(stack);
218
if (node->submatch_id >= 0)
220
int id = node->submatch_id;
224
/* Add start of this submatch to regset. */
225
for (i = 0; regset[i] >= 0; i++);
231
for (i = 0; parents[i] >= 0; i++);
232
tnfa->submatch_data[id].parents = NULL;
235
int *p = xmalloc(sizeof(*p) * (i + 1));
241
assert(tnfa->submatch_data[id].parents == NULL);
242
tnfa->submatch_data[id].parents = p;
243
for (i = 0; parents[i] >= 0; i++)
249
/* Add end of this submatch to regset after processing this
251
STACK_PUSHX(stack, int, node->submatch_id);
252
STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
259
tre_literal_t *lit = node->obj;
261
if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
264
DPRINT(("Literal %d-%d\n",
265
(int)lit->code_min, (int)lit->code_max));
268
/* Regset is not empty, so add a tag before the
269
literal or backref. */
272
status = tre_add_tag_left(mem, node, tag);
273
tnfa->tag_directions[tag] = direction;
274
if (minimal_tag >= 0)
276
DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
277
for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
278
tnfa->minimal_tags[i] = tag;
279
tnfa->minimal_tags[i + 1] = minimal_tag;
280
tnfa->minimal_tags[i + 2] = -1;
284
/* Go through the regset and set submatch data for
285
submatches that are using this tag. */
286
for (i = 0; regset[i] >= 0; i++)
288
int id = regset[i] / 2;
289
int start = !(regset[i] % 2);
290
DPRINT((" Using tag %d for %s offset of "
291
"submatch %d\n", tag,
292
start ? "start" : "end", id));
294
tnfa->submatch_data[id].so_tag = tag;
296
tnfa->submatch_data[id].eo_tag = tag;
301
DPRINT((" num_tags = 1\n"));
305
DPRINT((" num_tags++\n"));
314
assert(!IS_TAG(lit));
320
tre_catenation_t *cat = node->obj;
321
tre_ast_node_t *left = cat->left;
322
tre_ast_node_t *right = cat->right;
323
int reserved_tag = -1;
324
DPRINT(("Catenation, next_tag = %d\n", next_tag));
327
/* After processing right child. */
328
STACK_PUSHX(stack, voidptr, node);
329
STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
331
/* Process right child. */
332
STACK_PUSHX(stack, voidptr, right);
333
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
335
/* After processing left child. */
336
STACK_PUSHX(stack, int, next_tag + left->num_tags);
337
DPRINT((" Pushing %d for after left\n",
338
next_tag + left->num_tags));
339
if (left->num_tags > 0 && right->num_tags > 0)
341
/* Reserve the next tag to the right child. */
342
DPRINT((" Reserving next_tag %d to right child\n",
344
reserved_tag = next_tag;
347
STACK_PUSHX(stack, int, reserved_tag);
348
STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
350
/* Process left child. */
351
STACK_PUSHX(stack, voidptr, left);
352
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
358
tre_iteration_t *iter = node->obj;
359
DPRINT(("Iteration\n"));
363
STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
367
STACK_PUSHX(stack, int, tag);
368
STACK_PUSHX(stack, int, iter->minimal);
370
STACK_PUSHX(stack, voidptr, node);
371
STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
373
STACK_PUSHX(stack, voidptr, iter->arg);
374
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
376
/* Regset is not empty, so add a tag here. */
377
if (regset[0] >= 0 || iter->minimal)
382
status = tre_add_tag_left(mem, node, tag);
384
tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
386
tnfa->tag_directions[tag] = direction;
387
if (minimal_tag >= 0)
389
DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
390
for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
391
tnfa->minimal_tags[i] = tag;
392
tnfa->minimal_tags[i + 1] = minimal_tag;
393
tnfa->minimal_tags[i + 2] = -1;
397
/* Go through the regset and set submatch data for
398
submatches that are using this tag. */
399
for (i = 0; regset[i] >= 0; i++)
401
int id = regset[i] / 2;
402
int start = !(regset[i] % 2);
403
DPRINT((" Using tag %d for %s offset of "
404
"submatch %d\n", tag,
405
start ? "start" : "end", id));
407
tnfa->submatch_data[id].so_tag = tag;
409
tnfa->submatch_data[id].eo_tag = tag;
413
DPRINT((" num_tags++\n"));
419
direction = TRE_TAG_MINIMIZE;
424
tre_union_t *uni = node->obj;
425
tre_ast_node_t *left = uni->left;
426
tre_ast_node_t *right = uni->right;
433
right_tag = next_tag + 1;
438
right_tag = next_tag;
443
/* After processing right child. */
444
STACK_PUSHX(stack, int, right_tag);
445
STACK_PUSHX(stack, int, left_tag);
446
STACK_PUSHX(stack, voidptr, regset);
447
STACK_PUSHX(stack, int, regset[0] >= 0);
448
STACK_PUSHX(stack, voidptr, node);
449
STACK_PUSHX(stack, voidptr, right);
450
STACK_PUSHX(stack, voidptr, left);
451
STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
453
/* Process right child. */
454
STACK_PUSHX(stack, voidptr, right);
455
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
457
/* After processing left child. */
458
STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
460
/* Process left child. */
461
STACK_PUSHX(stack, voidptr, left);
462
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
464
/* Regset is not empty, so add a tag here. */
470
status = tre_add_tag_left(mem, node, tag);
471
tnfa->tag_directions[tag] = direction;
472
if (minimal_tag >= 0)
474
DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
475
for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
476
tnfa->minimal_tags[i] = tag;
477
tnfa->minimal_tags[i + 1] = minimal_tag;
478
tnfa->minimal_tags[i + 2] = -1;
482
/* Go through the regset and set submatch data for
483
submatches that are using this tag. */
484
for (i = 0; regset[i] >= 0; i++)
486
int id = regset[i] / 2;
487
int start = !(regset[i] % 2);
488
DPRINT((" Using tag %d for %s offset of "
489
"submatch %d\n", tag,
490
start ? "start" : "end", id));
492
tnfa->submatch_data[id].so_tag = tag;
494
tnfa->submatch_data[id].eo_tag = tag;
498
DPRINT((" num_tags++\n"));
505
if (node->num_submatches > 0)
507
/* The next two tags are reserved for markers. */
517
if (node->submatch_id >= 0)
520
/* Push this submatch on the parents stack. */
521
for (i = 0; parents[i] >= 0; i++);
522
parents[i] = node->submatch_id;
526
break; /* end case: ADDTAGS_RECURSE */
528
case ADDTAGS_AFTER_ITERATION:
532
node = tre_stack_pop_voidptr(stack);
535
node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
536
+ tre_stack_pop_int(stack);
541
minimal = tre_stack_pop_int(stack);
542
enter_tag = tre_stack_pop_int(stack);
544
minimal_tag = enter_tag;
547
DPRINT(("After iteration\n"));
550
DPRINT((" Setting direction to %s\n",
551
minimal ? "minimize" : "maximize"));
553
direction = TRE_TAG_MINIMIZE;
555
direction = TRE_TAG_MAXIMIZE;
560
case ADDTAGS_AFTER_CAT_LEFT:
562
int new_tag = tre_stack_pop_int(stack);
563
next_tag = tre_stack_pop_int(stack);
564
DPRINT(("After cat left, tag = %d, next_tag = %d\n",
568
DPRINT((" Setting tag to %d\n", new_tag));
574
case ADDTAGS_AFTER_CAT_RIGHT:
575
DPRINT(("After cat right\n"));
576
node = tre_stack_pop_voidptr(stack);
578
node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
579
+ ((tre_catenation_t *)node->obj)->right->num_tags;
582
case ADDTAGS_AFTER_UNION_LEFT:
583
DPRINT(("After union left\n"));
584
/* Lift the bottom of the `regset' array so that when processing
585
the right operand the items currently in the array are
586
invisible. The original bottom was saved at ADDTAGS_UNION and
587
will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
592
case ADDTAGS_AFTER_UNION_RIGHT:
594
int added_tags, tag_left, tag_right;
595
tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
596
tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
597
DPRINT(("After union right\n"));
598
node = tre_stack_pop_voidptr(stack);
599
added_tags = tre_stack_pop_int(stack);
602
node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
603
+ ((tre_union_t *)node->obj)->right->num_tags + added_tags
604
+ ((node->num_submatches > 0) ? 2 : 0);
606
regset = tre_stack_pop_voidptr(stack);
607
tag_left = tre_stack_pop_int(stack);
608
tag_right = tre_stack_pop_int(stack);
610
/* Add tags after both children, the left child gets a smaller
611
tag than the right child. This guarantees that we prefer
612
the left child over the right child. */
613
/* XXX - This is not always necessary (if the children have
614
tags which must be seen for every match of that child). */
615
/* XXX - Check if this is the only place where tre_add_tag_right
616
is used. If so, use tre_add_tag_left (putting the tag before
617
the child as opposed after the child) and throw away
618
tre_add_tag_right. */
619
if (node->num_submatches > 0)
623
status = tre_add_tag_right(mem, left, tag_left);
624
tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
625
status = tre_add_tag_right(mem, right, tag_right);
626
tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
628
DPRINT((" num_tags += 2\n"));
631
direction = TRE_TAG_MAXIMIZE;
639
} /* end switch(symbol) */
640
} /* end while(tre_stack_num_objects(stack) > bottom) */
645
/* Go through the regset and set submatch data for
646
submatches that are using this tag. */
647
for (i = 0; regset[i] >= 0; i++)
649
int id = regset[i] / 2;
650
int start = !(regset[i] % 2);
651
DPRINT((" Using tag %d for %s offset of "
652
"submatch %d\n", num_tags,
653
start ? "start" : "end", id));
655
tnfa->submatch_data[id].so_tag = num_tags;
657
tnfa->submatch_data[id].eo_tag = num_tags;
661
if (!first_pass && minimal_tag >= 0)
664
DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
665
for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
666
tnfa->minimal_tags[i] = tag;
667
tnfa->minimal_tags[i + 1] = minimal_tag;
668
tnfa->minimal_tags[i + 2] = -1;
673
DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
674
first_pass? "First pass" : "Second pass", num_tags));
676
assert(tree->num_tags == num_tags);
677
tnfa->end_tag = num_tags;
678
tnfa->num_tags = num_tags;
679
tnfa->num_minimals = num_minimals;
689
AST to TNFA compilation routines.
695
} tre_copyast_symbol_t;
697
/* Flags for tre_copy_ast(). */
698
#define COPY_REMOVE_TAGS 1
699
#define COPY_MAXIMIZE_FIRST_TAG 2
702
tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
703
int flags, int *pos_add, tre_tag_direction_t *tag_directions,
704
tre_ast_node_t **copy, int *max_pos)
706
reg_errcode_t status = REG_OK;
707
int bottom = tre_stack_num_objects(stack);
710
tre_ast_node_t **result = copy;
711
tre_copyast_symbol_t symbol;
713
STACK_PUSH(stack, voidptr, ast);
714
STACK_PUSH(stack, int, COPY_RECURSE);
716
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
718
tre_ast_node_t *node;
719
if (status != REG_OK)
722
symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
725
case COPY_SET_RESULT_PTR:
726
result = tre_stack_pop_voidptr(stack);
729
node = tre_stack_pop_voidptr(stack);
734
tre_literal_t *lit = node->obj;
735
int pos = lit->position;
736
int min = lit->code_min;
737
int max = lit->code_max;
738
if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
740
/* XXX - e.g. [ab] has only one position but two
741
nodes, so we are creating holes in the state space
742
here. Not fatal, just wastes memory. */
746
else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
748
/* Change this tag to empty. */
752
else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
755
/* Maximize the first tag. */
756
tag_directions[max] = TRE_TAG_MAXIMIZE;
759
*result = tre_ast_new_literal(mem, min, max, pos);
769
tre_union_t *uni = node->obj;
771
*result = tre_ast_new_union(mem, uni->left, uni->right);
777
tmp = (*result)->obj;
779
STACK_PUSHX(stack, voidptr, uni->right);
780
STACK_PUSHX(stack, int, COPY_RECURSE);
781
STACK_PUSHX(stack, voidptr, &tmp->right);
782
STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
783
STACK_PUSHX(stack, voidptr, uni->left);
784
STACK_PUSHX(stack, int, COPY_RECURSE);
789
tre_catenation_t *cat = node->obj;
790
tre_catenation_t *tmp;
791
*result = tre_ast_new_catenation(mem, cat->left, cat->right);
797
tmp = (*result)->obj;
802
STACK_PUSHX(stack, voidptr, cat->right);
803
STACK_PUSHX(stack, int, COPY_RECURSE);
804
STACK_PUSHX(stack, voidptr, &tmp->right);
805
STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
806
STACK_PUSHX(stack, voidptr, cat->left);
807
STACK_PUSHX(stack, int, COPY_RECURSE);
812
tre_iteration_t *iter = node->obj;
813
STACK_PUSHX(stack, voidptr, iter->arg);
814
STACK_PUSHX(stack, int, COPY_RECURSE);
815
*result = tre_ast_new_iter(mem, iter->arg, iter->min,
816
iter->max, iter->minimal);
822
iter = (*result)->obj;
833
*pos_add += num_copied;
840
} tre_expand_ast_symbol_t;
842
/* Expands each iteration node that has a finite nonzero minimum or maximum
843
iteration count to a catenated sequence of copies of the node. */
845
tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
846
int *position, tre_tag_direction_t *tag_directions,
849
reg_errcode_t status = REG_OK;
850
int bottom = tre_stack_num_objects(stack);
852
int pos_add_total = 0;
854
/* Current approximate matching parameters. */
855
int params[TRE_PARAM_LAST];
856
/* Approximate parameter nesting level. */
857
int params_depth = 0;
861
for (i = 0; i < TRE_PARAM_LAST; i++)
862
params[i] = TRE_PARAM_DEFAULT;
864
STACK_PUSHR(stack, voidptr, ast);
865
STACK_PUSHR(stack, int, EXPAND_RECURSE);
866
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
868
tre_ast_node_t *node;
869
tre_expand_ast_symbol_t symbol;
871
if (status != REG_OK)
874
DPRINT(("pos_add %d\n", pos_add));
876
symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
877
node = tre_stack_pop_voidptr(stack);
885
tre_literal_t *lit= node->obj;
886
if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
888
lit->position += pos_add;
889
if (lit->position > max_pos)
890
max_pos = lit->position;
896
tre_union_t *uni = node->obj;
897
STACK_PUSHX(stack, voidptr, uni->right);
898
STACK_PUSHX(stack, int, EXPAND_RECURSE);
899
STACK_PUSHX(stack, voidptr, uni->left);
900
STACK_PUSHX(stack, int, EXPAND_RECURSE);
905
tre_catenation_t *cat = node->obj;
906
STACK_PUSHX(stack, voidptr, cat->right);
907
STACK_PUSHX(stack, int, EXPAND_RECURSE);
908
STACK_PUSHX(stack, voidptr, cat->left);
909
STACK_PUSHX(stack, int, EXPAND_RECURSE);
914
tre_iteration_t *iter = node->obj;
915
STACK_PUSHX(stack, int, pos_add);
916
STACK_PUSHX(stack, voidptr, node);
917
STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
918
STACK_PUSHX(stack, voidptr, iter->arg);
919
STACK_PUSHX(stack, int, EXPAND_RECURSE);
920
/* If we are going to expand this node at EXPAND_AFTER_ITER
921
then don't increase the `pos' fields of the nodes now, it
922
will get done when expanding. */
923
if (iter->min > 1 || iter->max > 1)
934
case EXPAND_AFTER_ITER:
936
tre_iteration_t *iter = node->obj;
938
pos_add = tre_stack_pop_int(stack);
939
pos_add_last = pos_add;
940
if (iter->min > 1 || iter->max > 1)
942
tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
944
int pos_add_save = pos_add;
946
/* Create a catenated sequence of copies of the node. */
947
for (i = 0; i < iter->min; i++)
949
tre_ast_node_t *copy;
950
/* Remove tags from all but the last copy. */
951
int flags = ((i + 1 < iter->min)
953
: COPY_MAXIMIZE_FIRST_TAG);
954
DPRINT((" pos_add %d\n", pos_add));
955
pos_add_save = pos_add;
956
status = tre_copy_ast(mem, stack, iter->arg, flags,
957
&pos_add, tag_directions, ©,
959
if (status != REG_OK)
962
seq1 = tre_ast_new_catenation(mem, seq1, copy);
971
/* No upper limit. */
972
pos_add_save = pos_add;
973
status = tre_copy_ast(mem, stack, iter->arg, 0,
974
&pos_add, NULL, &seq2, &max_pos);
975
if (status != REG_OK)
977
seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
983
for (i = iter->min; i < iter->max; i++)
985
tre_ast_node_t *tmp, *copy;
986
pos_add_save = pos_add;
987
status = tre_copy_ast(mem, stack, iter->arg, 0,
988
&pos_add, NULL, ©, &max_pos);
989
if (status != REG_OK)
992
seq2 = tre_ast_new_catenation(mem, copy, seq2);
997
tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1000
seq2 = tre_ast_new_union(mem, tmp, seq2);
1006
pos_add = pos_add_save;
1009
else if (seq2 != NULL)
1010
seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1013
node->obj = seq1->obj;
1014
node->type = seq1->type;
1018
pos_add_total += pos_add - pos_add_last;
1019
if (iter_depth == 0)
1020
pos_add = pos_add_total;
1022
/* If approximate parameters are specified, surround the result
1023
with two parameter setting nodes. The one on the left sets
1024
the specified parameters, and the one on the right restores
1025
the old parameters. */
1028
tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
1031
tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1034
((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
1035
iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
1036
tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1039
old_params = tre_mem_alloc(mem, sizeof(*old_params)
1043
for (i = 0; i < TRE_PARAM_LAST; i++)
1044
old_params[i] = params[i];
1045
((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1046
old_params[TRE_PARAM_DEPTH] = params_depth;
1047
/* XXX - this is the only place where ast_new_node is
1048
needed -- should be moved inside AST module. */
1049
node_copy = tre_ast_new_node(mem, ITERATION,
1050
sizeof(tre_iteration_t));
1053
node_copy->obj = node->obj;
1054
tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1057
tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1060
/* Replace the contents of `node' with `tmp_node'. */
1061
memcpy(node, tmp_node, sizeof(*node));
1062
node->obj = tmp_node->obj;
1063
node->type = tmp_node->type;
1065
if (params_depth > *max_depth)
1066
*max_depth = params_depth;
1076
*position += pos_add_total;
1078
/* `max_pos' should never be larger than `*position' if the above
1079
code works, but just an extra safeguard let's make sure
1080
`*position' is set large enough so enough memory will be
1081
allocated for the transition table. */
1082
if (max_pos > *position)
1083
*position = max_pos;
1086
DPRINT(("Expanded AST:\n"));
1088
DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1094
static tre_pos_and_tags_t *
1095
tre_set_empty(tre_mem_t mem)
1097
tre_pos_and_tags_t *new_set;
1099
new_set = tre_mem_calloc(mem, sizeof(*new_set));
1100
if (new_set == NULL)
1103
new_set[0].position = -1;
1104
new_set[0].code_min = -1;
1105
new_set[0].code_max = -1;
1110
static tre_pos_and_tags_t *
1111
tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1112
tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1114
tre_pos_and_tags_t *new_set;
1116
new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1117
if (new_set == NULL)
1120
new_set[0].position = position;
1121
new_set[0].code_min = code_min;
1122
new_set[0].code_max = code_max;
1123
new_set[0].class = class;
1124
new_set[0].neg_classes = neg_classes;
1125
new_set[0].backref = backref;
1126
new_set[1].position = -1;
1127
new_set[1].code_min = -1;
1128
new_set[1].code_max = -1;
1133
static tre_pos_and_tags_t *
1134
tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1135
int *tags, int assertions, int *params)
1138
tre_pos_and_tags_t *new_set;
1142
for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1143
for (s1 = 0; set1[s1].position >= 0; s1++);
1144
for (s2 = 0; set2[s2].position >= 0; s2++);
1145
new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1149
for (s1 = 0; set1[s1].position >= 0; s1++)
1151
new_set[s1].position = set1[s1].position;
1152
new_set[s1].code_min = set1[s1].code_min;
1153
new_set[s1].code_max = set1[s1].code_max;
1154
new_set[s1].assertions = set1[s1].assertions | assertions;
1155
new_set[s1].class = set1[s1].class;
1156
new_set[s1].neg_classes = set1[s1].neg_classes;
1157
new_set[s1].backref = set1[s1].backref;
1158
if (set1[s1].tags == NULL && tags == NULL)
1159
new_set[s1].tags = NULL;
1162
for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1163
new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1164
* (i + num_tags + 1)));
1165
if (new_tags == NULL)
1167
for (j = 0; j < i; j++)
1168
new_tags[j] = set1[s1].tags[j];
1169
for (i = 0; i < num_tags; i++)
1170
new_tags[j + i] = tags[i];
1171
new_tags[j + i] = -1;
1172
new_set[s1].tags = new_tags;
1174
if (set1[s1].params)
1175
new_set[s1].params = set1[s1].params;
1178
if (!new_set[s1].params)
1179
new_set[s1].params = params;
1182
new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1184
if (!new_set[s1].params)
1186
for (i = 0; i < TRE_PARAM_LAST; i++)
1187
if (params[i] != TRE_PARAM_UNSET)
1188
new_set[s1].params[i] = params[i];
1193
for (s2 = 0; set2[s2].position >= 0; s2++)
1195
new_set[s1 + s2].position = set2[s2].position;
1196
new_set[s1 + s2].code_min = set2[s2].code_min;
1197
new_set[s1 + s2].code_max = set2[s2].code_max;
1198
/* XXX - why not | assertions here as well? */
1199
new_set[s1 + s2].assertions = set2[s2].assertions;
1200
new_set[s1 + s2].class = set2[s2].class;
1201
new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1202
new_set[s1 + s2].backref = set2[s2].backref;
1203
if (set2[s2].tags == NULL)
1204
new_set[s1 + s2].tags = NULL;
1207
for (i = 0; set2[s2].tags[i] >= 0; i++);
1208
new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1209
if (new_tags == NULL)
1211
for (j = 0; j < i; j++)
1212
new_tags[j] = set2[s2].tags[j];
1214
new_set[s1 + s2].tags = new_tags;
1216
if (set2[s2].params)
1217
new_set[s1 + s2].params = set2[s2].params;
1220
if (!new_set[s1 + s2].params)
1221
new_set[s1 + s2].params = params;
1224
new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1226
if (!new_set[s1 + s2].params)
1228
for (i = 0; i < TRE_PARAM_LAST; i++)
1229
if (params[i] != TRE_PARAM_UNSET)
1230
new_set[s1 + s2].params[i] = params[i];
1234
new_set[s1 + s2].position = -1;
1238
/* Finds the empty path through `node' which is the one that should be
1239
taken according to POSIX.2 rules, and adds the tags on that path to
1240
`tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
1241
set to the number of tags seen on the path. */
1242
static reg_errcode_t
1243
tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1244
int *assertions, int *params, int *num_tags_seen,
1249
tre_catenation_t *cat;
1250
tre_iteration_t *iter;
1252
int bottom = tre_stack_num_objects(stack);
1253
reg_errcode_t status = REG_OK;
1259
status = tre_stack_push_voidptr(stack, node);
1261
/* Walk through the tree recursively. */
1262
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1264
node = tre_stack_pop_voidptr(stack);
1269
lit = (tre_literal_t *)node->obj;
1270
switch (lit->code_min)
1273
if (lit->code_max >= 0)
1277
/* Add the tag to `tags'. */
1278
for (i = 0; tags[i] >= 0; i++)
1279
if (tags[i] == lit->code_max)
1283
tags[i] = lit->code_max;
1292
assert(lit->code_max >= 1
1293
|| lit->code_max <= ASSERT_LAST);
1294
if (assertions != NULL)
1295
*assertions |= lit->code_max;
1299
for (i = 0; i < TRE_PARAM_LAST; i++)
1300
params[i] = lit->u.params[i];
1301
if (params_seen != NULL)
1313
/* Subexpressions starting earlier take priority over ones
1314
starting later, so we prefer the left subexpression over the
1315
right subexpression. */
1316
uni = (tre_union_t *)node->obj;
1317
if (uni->left->nullable)
1318
STACK_PUSHX(stack, voidptr, uni->left)
1319
else if (uni->right->nullable)
1320
STACK_PUSHX(stack, voidptr, uni->right)
1326
/* The path must go through both children. */
1327
cat = (tre_catenation_t *)node->obj;
1328
assert(cat->left->nullable);
1329
assert(cat->right->nullable);
1330
STACK_PUSHX(stack, voidptr, cat->left);
1331
STACK_PUSHX(stack, voidptr, cat->right);
1335
/* A match with an empty string is preferred over no match at
1336
all, so we go through the argument if possible. */
1337
iter = (tre_iteration_t *)node->obj;
1338
if (iter->arg->nullable)
1339
STACK_PUSHX(stack, voidptr, iter->arg);
1355
NFL_POST_CATENATION,
1357
} tre_nfl_stack_symbol_t;
1360
/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1361
the nodes of the AST `tree'. */
1362
static reg_errcode_t
1363
tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1365
int bottom = tre_stack_num_objects(stack);
1367
STACK_PUSHR(stack, voidptr, tree);
1368
STACK_PUSHR(stack, int, NFL_RECURSE);
1370
while (tre_stack_num_objects(stack) > bottom)
1372
tre_nfl_stack_symbol_t symbol;
1373
tre_ast_node_t *node;
1375
symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1376
node = tre_stack_pop_voidptr(stack);
1384
tre_literal_t *lit = (tre_literal_t *)node->obj;
1385
if (IS_BACKREF(lit))
1387
/* Back references: nullable = false, firstpos = {i},
1390
node->firstpos = tre_set_one(mem, lit->position, 0,
1391
TRE_CHAR_MAX, 0, NULL, -1);
1392
if (!node->firstpos)
1394
node->lastpos = tre_set_one(mem, lit->position, 0,
1395
TRE_CHAR_MAX, 0, NULL,
1400
else if (lit->code_min < 0)
1402
/* Tags, empty strings, params, and zero width assertions:
1403
nullable = true, firstpos = {}, and lastpos = {}. */
1405
node->firstpos = tre_set_empty(mem);
1406
if (!node->firstpos)
1408
node->lastpos = tre_set_empty(mem);
1414
/* Literal at position i: nullable = false, firstpos = {i},
1418
tre_set_one(mem, lit->position, lit->code_min,
1419
lit->code_max, 0, NULL, -1);
1420
if (!node->firstpos)
1422
node->lastpos = tre_set_one(mem, lit->position,
1423
lit->code_min, lit->code_max,
1424
lit->u.class, lit->neg_classes,
1433
/* Compute the attributes for the two subtrees, and after that
1435
STACK_PUSHR(stack, voidptr, node);
1436
STACK_PUSHR(stack, int, NFL_POST_UNION);
1437
STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1438
STACK_PUSHR(stack, int, NFL_RECURSE);
1439
STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1440
STACK_PUSHR(stack, int, NFL_RECURSE);
1444
/* Compute the attributes for the two subtrees, and after that
1446
STACK_PUSHR(stack, voidptr, node);
1447
STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1448
STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1449
STACK_PUSHR(stack, int, NFL_RECURSE);
1450
STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1451
STACK_PUSHR(stack, int, NFL_RECURSE);
1455
/* Compute the attributes for the subtree, and after that for
1457
STACK_PUSHR(stack, voidptr, node);
1458
STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1459
STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1460
STACK_PUSHR(stack, int, NFL_RECURSE);
1463
break; /* end case: NFL_RECURSE */
1465
case NFL_POST_UNION:
1467
tre_union_t *uni = (tre_union_t *)node->obj;
1468
node->nullable = uni->left->nullable || uni->right->nullable;
1469
node->firstpos = tre_set_union(mem, uni->left->firstpos,
1470
uni->right->firstpos, NULL, 0, NULL);
1471
if (!node->firstpos)
1473
node->lastpos = tre_set_union(mem, uni->left->lastpos,
1474
uni->right->lastpos, NULL, 0, NULL);
1480
case NFL_POST_ITERATION:
1482
tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1484
if (iter->min == 0 || iter->arg->nullable)
1488
node->firstpos = iter->arg->firstpos;
1489
node->lastpos = iter->arg->lastpos;
1493
case NFL_POST_CATENATION:
1495
int num_tags, *tags, assertions, params_seen;
1497
reg_errcode_t status;
1498
tre_catenation_t *cat = node->obj;
1499
node->nullable = cat->left->nullable && cat->right->nullable;
1501
/* Compute firstpos. */
1502
if (cat->left->nullable)
1504
/* The left side matches the empty string. Make a first pass
1505
with tre_match_empty() to get the number of tags and
1507
status = tre_match_empty(stack, cat->left,
1508
NULL, NULL, NULL, &num_tags,
1510
if (status != REG_OK)
1512
/* Allocate arrays for the tags and parameters. */
1513
tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1521
params = tre_mem_alloc(mem, sizeof(*params)
1529
/* Second pass with tre_mach_empty() to get the list of
1530
tags and parameters. */
1531
status = tre_match_empty(stack, cat->left, tags,
1532
&assertions, params, NULL, NULL);
1533
if (status != REG_OK)
1539
tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1540
tags, assertions, params);
1542
if (!node->firstpos)
1547
node->firstpos = cat->left->firstpos;
1550
/* Compute lastpos. */
1551
if (cat->right->nullable)
1553
/* The right side matches the empty string. Make a first pass
1554
with tre_match_empty() to get the number of tags and
1556
status = tre_match_empty(stack, cat->right,
1557
NULL, NULL, NULL, &num_tags,
1559
if (status != REG_OK)
1561
/* Allocate arrays for the tags and parameters. */
1562
tags = xmalloc(sizeof(int) * (num_tags + 1));
1570
params = tre_mem_alloc(mem, sizeof(*params)
1578
/* Second pass with tre_mach_empty() to get the list of
1579
tags and parameters. */
1580
status = tre_match_empty(stack, cat->right, tags,
1581
&assertions, params, NULL, NULL);
1582
if (status != REG_OK)
1588
tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1589
tags, assertions, params);
1596
node->lastpos = cat->right->lastpos;
1611
/* Adds a transition from each position in `p1' to each position in `p2'. */
1612
static reg_errcode_t
1613
tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1614
tre_tnfa_transition_t *transitions,
1615
int *counts, int *offs)
1617
tre_pos_and_tags_t *orig_p2 = p2;
1618
tre_tnfa_transition_t *trans;
1619
int i, j, k, l, dup, prev_p2_pos;
1621
if (transitions != NULL)
1622
while (p1->position >= 0)
1626
while (p2->position >= 0)
1628
/* Optimization: if this position was already handled, skip it. */
1629
if (p2->position == prev_p2_pos)
1634
prev_p2_pos = p2->position;
1635
/* Set `trans' to point to the next unused transition from
1636
position `p1->position'. */
1637
trans = transitions + offs[p1->position];
1638
while (trans->state != NULL)
1641
/* If we find a previous transition from `p1->position' to
1642
`p2->position', it is overwritten. This can happen only
1643
if there are nested loops in the regexp, like in "((a)*)*".
1644
In POSIX.2 repetition using the outer loop is always
1645
preferred over using the inner loop. Therefore the
1646
transition for the inner loop is useless and can be thrown
1648
/* XXX - The same position is used for all nodes in a bracket
1649
expression, so this optimization cannot be used (it will
1650
break bracket expressions) unless I figure out a way to
1652
if (trans->state_id == p2->position)
1661
if (trans->state == NULL)
1662
(trans + 1)->state = NULL;
1663
/* Use the character ranges, assertions, etc. from `p1' for
1664
the transition from `p1' to `p2'. */
1665
trans->code_min = p1->code_min;
1666
trans->code_max = p1->code_max;
1667
trans->state = transitions + offs[p2->position];
1668
trans->state_id = p2->position;
1669
trans->assertions = p1->assertions | p2->assertions
1670
| (p1->class ? ASSERT_CHAR_CLASS : 0)
1671
| (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1672
if (p1->backref >= 0)
1674
assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1675
assert(p2->backref < 0);
1676
trans->u.backref = p1->backref;
1677
trans->assertions |= ASSERT_BACKREF;
1680
trans->u.class = p1->class;
1681
if (p1->neg_classes != NULL)
1683
for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1684
trans->neg_classes =
1685
xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1686
if (trans->neg_classes == NULL)
1688
for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1689
trans->neg_classes[i] = p1->neg_classes[i];
1690
trans->neg_classes[i] = (tre_ctype_t)0;
1693
trans->neg_classes = NULL;
1695
/* Find out how many tags this transition has. */
1697
if (p1->tags != NULL)
1698
while(p1->tags[i] >= 0)
1701
if (p2->tags != NULL)
1702
while(p2->tags[j] >= 0)
1705
/* If we are overwriting a transition, free the old tag array. */
1706
if (trans->tags != NULL)
1710
/* If there were any tags, allocate an array and fill it. */
1713
trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1717
if (p1->tags != NULL)
1718
while(p1->tags[i] >= 0)
1720
trans->tags[i] = p1->tags[i];
1725
if (p2->tags != NULL)
1726
while (p2->tags[j] >= 0)
1728
/* Don't add duplicates. */
1730
for (k = 0; k < i; k++)
1731
if (trans->tags[k] == p2->tags[j])
1737
trans->tags[l++] = p2->tags[j];
1740
trans->tags[l] = -1;
1743
/* Set the parameter array. If both `p2' and `p1' have same
1744
parameters, the values in `p2' override those in `p1'. */
1745
if (p1->params || p2->params)
1748
trans->params = xmalloc(sizeof(*trans->params)
1752
for (i = 0; i < TRE_PARAM_LAST; i++)
1754
trans->params[i] = TRE_PARAM_UNSET;
1755
if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1756
trans->params[i] = p1->params[i];
1757
if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1758
trans->params[i] = p2->params[i];
1764
xfree(trans->params);
1765
trans->params = NULL;
1773
DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
1775
if (p1->code_max != p1->code_min)
1776
DPRINT(("-%3d", p1->code_max));
1780
DPRINT((", tags ["));
1783
DPRINT(("%d", *tags));
1790
if (trans->assertions)
1791
DPRINT((", assert %d", trans->assertions));
1792
if (trans->assertions & ASSERT_BACKREF)
1793
DPRINT((", backref %d", trans->u.backref));
1794
else if (trans->u.class)
1795
DPRINT((", class %ld", (long)trans->u.class));
1796
if (trans->neg_classes)
1797
DPRINT((", neg_classes %p", trans->neg_classes));
1801
tre_print_params(trans->params);
1805
#endif /* TRE_DEBUG */
1811
/* Compute a maximum limit for the number of transitions leaving
1813
while (p1->position >= 0)
1816
while (p2->position >= 0)
1818
counts[p1->position]++;
1826
/* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
1827
labelled with one character range (there are no transitions on empty
1828
strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
1830
static reg_errcode_t
1831
tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1832
int *counts, int *offs)
1835
tre_catenation_t *cat;
1836
tre_iteration_t *iter;
1837
reg_errcode_t errcode = REG_OK;
1839
/* XXX - recurse using a stack!. */
1845
uni = (tre_union_t *)node->obj;
1846
errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
1847
if (errcode != REG_OK)
1849
errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
1853
cat = (tre_catenation_t *)node->obj;
1854
/* Add a transition from each position in cat->left->lastpos
1855
to each position in cat->right->firstpos. */
1856
errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1857
transitions, counts, offs);
1858
if (errcode != REG_OK)
1860
errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
1861
if (errcode != REG_OK)
1863
errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
1867
iter = (tre_iteration_t *)node->obj;
1868
assert(iter->max == -1 || iter->max == 1);
1870
if (iter->max == -1)
1872
assert(iter->min == 0 || iter->min == 1);
1873
/* Add a transition from each last position in the iterated
1874
expression to each first position. */
1875
errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1876
transitions, counts, offs);
1877
if (errcode != REG_OK)
1880
errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
1887
#define ERROR_EXIT(err) \
1891
if (1) goto error_exit; \
1897
tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1900
tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1901
tre_pos_and_tags_t *p;
1902
int *counts = NULL, *offs = NULL;
1904
tre_tnfa_transition_t *transitions, *initial;
1905
tre_tnfa_t *tnfa = NULL;
1906
tre_submatch_data_t *submatch_data;
1907
tre_tag_direction_t *tag_directions = NULL;
1908
reg_errcode_t errcode;
1911
/* Parse context. */
1912
tre_parse_ctx_t parse_ctx;
1914
/* Allocate a stack used throughout the compilation process for various
1916
stack = tre_stack_new(512, 10240, 128);
1919
/* Allocate a fast memory allocator. */
1920
mem = tre_mem_new();
1923
tre_stack_destroy(stack);
1927
/* Parse the regexp. */
1928
memset(&parse_ctx, 0, sizeof(parse_ctx));
1929
parse_ctx.mem = mem;
1930
parse_ctx.stack = stack;
1931
parse_ctx.re = regex;
1933
parse_ctx.cflags = cflags;
1934
parse_ctx.max_backref = -1;
1935
DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1936
errcode = tre_parse(&parse_ctx);
1937
if (errcode != REG_OK)
1938
ERROR_EXIT(errcode);
1939
preg->re_nsub = parse_ctx.submatch_id - 1;
1940
tree = parse_ctx.result;
1942
/* Back references and approximate matching cannot currently be used
1943
in the same regexp. */
1944
if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1945
ERROR_EXIT(REG_BADPAT);
1948
tre_ast_print(tree);
1949
#endif /* TRE_DEBUG */
1951
/* Referring to nonexistent subexpressions is illegal. */
1952
if (parse_ctx.max_backref > (int)preg->re_nsub)
1953
ERROR_EXIT(REG_ESUBREG);
1955
/* Allocate the TNFA struct. */
1956
tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1958
ERROR_EXIT(REG_ESPACE);
1959
tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1960
tnfa->have_approx = parse_ctx.have_approx;
1961
tnfa->num_submatches = parse_ctx.submatch_id;
1963
/* Set up tags for submatch addressing. If REG_NOSUB is set and the
1964
regexp does not have back references, this can be skipped. */
1965
if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1967
DPRINT(("tre_compile: setting up tags\n"));
1969
/* Figure out how many tags we will need. */
1970
errcode = tre_add_tags(NULL, stack, tree, tnfa);
1971
if (errcode != REG_OK)
1972
ERROR_EXIT(errcode);
1974
tre_ast_print(tree);
1975
#endif /* TRE_DEBUG */
1977
if (tnfa->num_tags > 0)
1979
tag_directions = xmalloc(sizeof(*tag_directions)
1980
* (tnfa->num_tags + 1));
1981
if (tag_directions == NULL)
1982
ERROR_EXIT(REG_ESPACE);
1983
tnfa->tag_directions = tag_directions;
1984
memset(tag_directions, -1,
1985
sizeof(*tag_directions) * (tnfa->num_tags + 1));
1987
tnfa->minimal_tags = xcalloc(tnfa->num_tags * 2 + 1,
1988
sizeof(tnfa->minimal_tags));
1989
if (tnfa->minimal_tags == NULL)
1990
ERROR_EXIT(REG_ESPACE);
1992
submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
1993
if (submatch_data == NULL)
1994
ERROR_EXIT(REG_ESPACE);
1995
tnfa->submatch_data = submatch_data;
1997
errcode = tre_add_tags(mem, stack, tree, tnfa);
1998
if (errcode != REG_OK)
1999
ERROR_EXIT(errcode);
2002
for (i = 0; i < parse_ctx.submatch_id; i++)
2003
DPRINT(("pmatch[%d] = {t%d, t%d}\n",
2004
i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
2005
for (i = 0; i < tnfa->num_tags; i++)
2006
DPRINT(("t%d is %s\n", i,
2007
tag_directions[i] == TRE_TAG_MINIMIZE ?
2008
"minimized" : "maximized"));
2009
#endif /* TRE_DEBUG */
2012
/* Expand iteration nodes. */
2013
errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2014
tag_directions, &tnfa->params_depth);
2015
if (errcode != REG_OK)
2016
ERROR_EXIT(errcode);
2018
/* Add a dummy node for the final state.
2019
XXX - For certain patterns this dummy node can be optimized away,
2020
for example "a*" or "ab*". Figure out a simple way to detect
2021
this possibility. */
2023
tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2024
if (tmp_ast_r == NULL)
2025
ERROR_EXIT(REG_ESPACE);
2027
tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2029
ERROR_EXIT(REG_ESPACE);
2032
tre_ast_print(tree);
2033
DPRINT(("Number of states: %d\n", parse_ctx.position));
2034
#endif /* TRE_DEBUG */
2036
errcode = tre_compute_nfl(mem, stack, tree);
2037
if (errcode != REG_OK)
2038
ERROR_EXIT(errcode);
2040
counts = xmalloc(sizeof(int) * parse_ctx.position);
2042
ERROR_EXIT(REG_ESPACE);
2044
offs = xmalloc(sizeof(int) * parse_ctx.position);
2046
ERROR_EXIT(REG_ESPACE);
2048
for (i = 0; i < parse_ctx.position; i++)
2050
tre_ast_to_tnfa(tree, NULL, counts, NULL);
2053
for (i = 0; i < parse_ctx.position; i++)
2056
add += counts[i] + 1;
2059
transitions = xcalloc(add + 1, sizeof(*transitions));
2060
if (transitions == NULL)
2061
ERROR_EXIT(REG_ESPACE);
2062
tnfa->transitions = transitions;
2063
tnfa->num_transitions = add;
2065
DPRINT(("Converting to TNFA:\n"));
2066
errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2067
if (errcode != REG_OK)
2068
ERROR_EXIT(errcode);
2070
/* If in eight bit mode, compute a table of characters that can be the
2071
first character of a match. */
2072
tnfa->first_char = -1;
2073
if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2077
DPRINT(("Characters that can start a match:"));
2078
tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2079
if (tnfa->firstpos_chars == NULL)
2080
ERROR_EXIT(REG_ESPACE);
2081
for (p = tree->firstpos; p->position >= 0; p++)
2083
tre_tnfa_transition_t *j = transitions + offs[p->position];
2084
while (j->state != NULL)
2086
for (k = j->code_min; k <= j->code_max && k < 256; k++)
2089
tnfa->firstpos_chars[k] = 1;
2096
#define TRE_OPTIMIZE_FIRST_CHAR 1
2097
#if TRE_OPTIMIZE_FIRST_CHAR
2100
for (k = 0; k < 256; k++)
2101
if (tnfa->firstpos_chars[k])
2103
DPRINT(("first char must be %d\n", k));
2104
tnfa->first_char = k;
2105
xfree(tnfa->firstpos_chars);
2106
tnfa->firstpos_chars = NULL;
2114
tnfa->firstpos_chars = NULL;
2119
while (p->position >= 0)
2126
DPRINT(("initial: %d", p->position));
2134
DPRINT(("%d", *tags));
2140
DPRINT((", assert %d", p->assertions));
2144
tre_print_params(p->params);
2148
#endif /* TRE_DEBUG */
2153
initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
2154
if (initial == NULL)
2155
ERROR_EXIT(REG_ESPACE);
2156
tnfa->initial = initial;
2159
for (p = tree->firstpos; p->position >= 0; p++)
2161
initial[i].state = transitions + offs[p->position];
2162
initial[i].state_id = p->position;
2163
initial[i].tags = NULL;
2164
/* Copy the arrays p->tags, and p->params, they are allocated
2165
from a tre_mem object. */
2169
for (j = 0; p->tags[j] >= 0; j++);
2170
initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2171
if (!initial[i].tags)
2172
ERROR_EXIT(REG_ESPACE);
2173
memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2175
initial[i].params = NULL;
2178
initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2179
if (!initial[i].params)
2180
ERROR_EXIT(REG_ESPACE);
2181
memcpy(initial[i].params, p->params,
2182
sizeof(*p->params) * TRE_PARAM_LAST);
2184
initial[i].assertions = p->assertions;
2187
initial[i].state = NULL;
2189
tnfa->num_transitions = add;
2190
tnfa->final = transitions + offs[tree->lastpos[0].position];
2191
tnfa->num_states = parse_ctx.position;
2192
tnfa->cflags = cflags;
2194
DPRINT(("final state %p\n", (void *)tnfa->final));
2196
tre_mem_destroy(mem);
2197
tre_stack_destroy(stack);
2201
preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2205
/* Free everything that was allocated and return the error code. */
2206
tre_mem_destroy(mem);
2208
tre_stack_destroy(stack);
2213
preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2222
tre_free(regex_t *preg)
2226
tre_tnfa_transition_t *trans;
2228
tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2232
for (i = 0; i < tnfa->num_transitions; i++)
2233
if (tnfa->transitions[i].state)
2235
if (tnfa->transitions[i].tags)
2236
xfree(tnfa->transitions[i].tags);
2237
if (tnfa->transitions[i].neg_classes)
2238
xfree(tnfa->transitions[i].neg_classes);
2239
if (tnfa->transitions[i].params)
2240
xfree(tnfa->transitions[i].params);
2242
if (tnfa->transitions)
2243
xfree(tnfa->transitions);
2247
for (trans = tnfa->initial; trans->state; trans++)
2252
xfree(trans->params);
2254
xfree(tnfa->initial);
2257
if (tnfa->submatch_data)
2259
for (i = 0; i < tnfa->num_submatches; i++)
2260
if (tnfa->submatch_data[i].parents)
2261
xfree(tnfa->submatch_data[i].parents);
2262
xfree(tnfa->submatch_data);
2265
if (tnfa->tag_directions)
2266
xfree(tnfa->tag_directions);
2267
if (tnfa->firstpos_chars)
2268
xfree(tnfa->firstpos_chars);
2269
if (tnfa->minimal_tags)
2270
xfree(tnfa->minimal_tags);
2277
static char str[256];
2282
tre_config(TRE_CONFIG_VERSION, &version);
2283
sprintf(str, "TRE %s (LGPL)", version);
2289
tre_config(int query, void *result)
2291
int *int_result = result;
2292
char **string_result = result;
2296
case TRE_CONFIG_APPROX:
2299
#else /* !TRE_APPROX */
2301
#endif /* !TRE_APPROX */
2304
case TRE_CONFIG_WCHAR:
2307
#else /* !TRE_WCHAR */
2309
#endif /* !TRE_WCHAR */
2312
case TRE_CONFIG_MULTIBYTE:
2313
#ifdef TRE_MULTIBYTE
2315
#else /* !TRE_MULTIBYTE */
2317
#endif /* !TRE_MULTIBYTE */
2320
case TRE_CONFIG_SYSTEM_ABI:
2321
#ifdef TRE_CONFIG_SYSTEM_ABI
2323
#else /* !TRE_CONFIG_SYSTEM_ABI */
2325
#endif /* !TRE_CONFIG_SYSTEM_ABI */
2328
case TRE_CONFIG_VERSION:
2329
*string_result = TRE_VERSION;