1
/**********************************************************************
2
regparse.c - Oniguruma (regular expression library)
3
**********************************************************************/
5
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions
11
* 1. Redistributions of source code must retain the above copyright
12
* notice, this list of conditions and the following disclaimer.
13
* 2. Redistributions in binary form must reproduce the above copyright
14
* notice, this list of conditions and the following disclaimer in the
15
* documentation and/or other materials provided with the distribution.
17
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
#define WARN_BUFSIZE 256
34
/* Ruby syntax definition.  The three visible initializers are, in order:
   a mask of ONIG_SYN_OP_* operator flags, a mask of ONIG_SYN_OP2_* operator
   flags, and a mask of behavior flags built on SYN_GNU_REGEX_BV. */
OnigSyntaxType OnigSyntaxRuby = {
35
/* GNU-regex operator set plus the listed extras, with
   ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out. */
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
36
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
37
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
38
ONIG_SYN_OP_ESC_C_CONTROL )
39
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
40
/* second initializer: ONIG_SYN_OP2_* flags */
, ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
41
ONIG_SYN_OP2_OPTION_RUBY |
42
ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
43
ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
44
ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
45
ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
46
ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
47
ONIG_SYN_OP2_ESC_H_XDIGIT )
48
/* third initializer: SYN_GNU_REGEX_BV plus ONIG_SYN_* behavior flags */
, ( SYN_GNU_REGEX_BV |
49
ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
50
ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
51
ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
52
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
53
ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
54
ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
55
ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
59
/* Global default syntax pointer; initialized to ONIG_SYNTAX_RUBY. */
OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
61
/* Warning sink that discards its message: s is accepted and ignored. */
extern void onig_null_warn(const char* s)
{
  (void )s;
}
65
onig_rb_warn(const char* s)
71
onig_rb_warning(const char* s)
77
/* File-local warning hooks.  Each hook has two visible initializers: the
   project-supplied DEFAULT_*_FUNCTION macro when it is defined, and the
   no-op onig_null_warn (the #else / #endif lines separating the two are
   not visible in this excerpt). */
#ifdef DEFAULT_WARN_FUNCTION
78
static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80
static OnigWarnFunc onig_warn = onig_null_warn;
83
#ifdef DEFAULT_VERB_WARN_FUNCTION
84
static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86
static OnigWarnFunc onig_verb_warn = onig_null_warn;
89
extern void onig_set_warn_func(OnigWarnFunc f)
94
extern void onig_set_verb_warn_func(OnigWarnFunc f)
100
/* bbuf_free: when bbuf is non-NULL, release its data area bbuf->p if one
   is attached.  Accepts NULL. */
bbuf_free(BBuf* bbuf)
102
if (IS_NOT_NULL(bbuf)) {
103
if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
109
/* bbuf_clone: allocate a new BBuf, publish it through *rto, initialize it
   with BBUF_INIT using from->alloc, then copy the used count and
   from->used bytes of data.  Bails out with ONIGERR_MEMORY (through
   CHECK_NULL_RETURN_VAL) when the BBuf cannot be allocated, or with the
   non-zero BBUF_INIT result. */
bbuf_clone(BBuf** rto, BBuf* from)
114
*rto = to = (BBuf* )xmalloc(sizeof(BBuf));
115
CHECK_NULL_RETURN_VAL(to, ONIGERR_MEMORY);
116
r = BBUF_INIT(to, from->alloc);
117
/* NOTE(review): on this path *rto still points at the freshly allocated
   BBuf and nothing visible here frees it -- confirm callers release it. */
if (r != 0) return r;
118
to->used = from->used;
119
xmemcpy(to->p, from->p, from->used);
123
/* ONOFF(v, f, negative): clear the bits f in v when negative is non-zero,
   otherwise set them; evaluates to the updated v.  The whole expansion is
   parenthesized so the macro stays a single operand when it appears inside
   a larger expression (e.g. `1 + ONOFF(...)` or `!ONOFF(...)`). */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
125
/* MBCODE_START_POS(enc): start of the multi-byte code range for enc (per
   the macro name): 0 when the encoding's minimum character length exceeds
   one byte, otherwise 0x80; the result is cast to OnigCodePoint. */
#define MBCODE_START_POS(enc) \
126
(OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
128
/* SET_ALL_MULTI_BYTE_RANGE(enc, pbuf): add the range from
   MBCODE_START_POS(enc) up to the all-bits-set OnigCodePoint value to pbuf
   through add_code_range_to_buf; evaluates to that call's result. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
129
add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
131
/* ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf): statement macro.  When enc is not a
   single-byte encoding, runs SET_ALL_MULTI_BYTE_RANGE on &mbuf and stores
   the result in `r`, which must be in scope at the expansion site.  The
   remainder of the macro is not visible in this excerpt. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
132
if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
133
r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
139
#define BITSET_IS_EMPTY(bs,empty) do {\
142
for (i = 0; i < BITSET_SIZE; i++) {\
150
/* bitset_set_range: set every bit i of bs with from <= i <= to; the loop
   also stops as soon as i reaches SINGLE_BYTE_SIZE. */
bitset_set_range(BitSetRef bs, int from, int to)
153
for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
154
BITSET_SET_BIT(bs, i);
160
bitset_set_all(BitSetRef bs)
163
for (i = 0; i < BITSET_SIZE; i++) {
170
bitset_invert(BitSetRef bs)
173
for (i = 0; i < BITSET_SIZE; i++) {
179
bitset_invert_to(BitSetRef from, BitSetRef to)
182
for (i = 0; i < BITSET_SIZE; i++) {
188
bitset_and(BitSetRef dest, BitSetRef bs)
191
for (i = 0; i < BITSET_SIZE; i++) {
197
bitset_or(BitSetRef dest, BitSetRef bs)
200
for (i = 0; i < BITSET_SIZE; i++) {
206
bitset_copy(BitSetRef dest, BitSetRef bs)
209
for (i = 0; i < BITSET_SIZE; i++) {
215
onig_strncmp(const UChar* s1, const UChar* s2, int n)
227
/* k_strcpy: copy len bytes from src into dest and store a zero byte at
   dest[len], so dest needs room for len + 1 bytes.  len is computed on a
   line not visible in this excerpt (presumably end - src). */
k_strcpy(UChar* dest, const UChar* src, const UChar* end)
231
xmemcpy(dest, src, len);
232
dest[len] = (UChar )0;
237
/* strdup_with_null: allocate slen + term_len bytes, where term_len is the
   encoding's minimum character length, and zero the trailing term_len
   bytes so the buffer ends in a terminator as wide as the encoding's
   shortest character.  The assignment of slen and the copy of the string
   bytes happen on lines not visible in this excerpt. */
strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
239
int slen, term_len, i;
243
term_len = ONIGENC_MBC_MINLEN(enc);
245
r = (UChar* )xmalloc(slen + term_len);
246
CHECK_NULL_RETURN(r);
249
for (i = 0; i < term_len; i++)
250
r[slen + i] = (UChar )0;
256
/* scan pattern methods */
259
#define PFETCH_READY UChar* pfetch_prev
260
/* PEND: 1 once the scan pointer p is no longer below end, else 0.
   Relies on `p` and `end` being in scope at the expansion site. */
#define PEND         (!(p < end))
261
#define PUNFETCH p = pfetch_prev
264
p += ONIGENC_MBC_ENC_LEN(enc, p); \
266
/* PFETCH(c): statement macro.  Decodes the code point at p into c with
   ONIGENC_MBC_TO_CODE, then advances p by the encoded length of that
   character.  Uses `p`, `end` and `enc` from the expansion site; one line
   of the macro is not visible in this excerpt. */
#define PFETCH(c) do { \
267
c = ONIGENC_MBC_TO_CODE(enc, p, end); \
269
p += ONIGENC_MBC_ENC_LEN(enc, p); \
272
/* PPEEK: code point at p without advancing p; yields PEND_VALUE when p is
   not below end.  Uses `p`, `end` and `enc` from the expansion site. */
#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
273
/* PPEEK_IS(c): non-zero when the code point produced by PPEEK equals c.
   The argument is parenthesized before the cast so that an expression
   argument (for example a conditional expression) is converted as a whole
   instead of having only its first operand cast. */
#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )(c))
276
/* k_strcat_capa: obtain a buffer of capa + 1 bytes -- one visible branch
   grows dest with xrealloc, the other allocates a fresh block with xmalloc
   (the condition choosing between them is not visible here) -- and then
   copy src..src_end, terminated by k_strcpy, at the offset where the old
   string ended (dest_end - dest). */
k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
282
r = (UChar* )xrealloc(dest, capa + 1);
284
r = (UChar* )xmalloc(capa + 1);
286
CHECK_NULL_RETURN(r);
287
k_strcpy(r + (dest_end - dest), src, src_end);
291
/* dest on static area */
293
/* strcat_capa_from_static: build the concatenation of dest..dest_end and
   src..src_end in a newly allocated buffer of capa + 1 bytes.  Unlike
   k_strcat_capa the existing bytes are copied rather than reallocated;
   dest itself is not freed on the visible lines. */
strcat_capa_from_static(UChar* dest, UChar* dest_end,
294
const UChar* src, const UChar* src_end, int capa)
298
r = (UChar* )xmalloc(capa + 1);
299
CHECK_NULL_RETURN(r);
300
k_strcpy(r, dest, dest_end);
301
k_strcpy(r + (dest_end - dest), src, src_end);
305
#ifdef USE_NAMED_GROUP
307
#define INIT_NAME_BACKREFS_ALLOC_NUM 8
311
int name_len; /* byte length */
312
int back_num; /* number of backrefs */
318
#ifdef USE_ST_HASH_TABLE
327
static int strend_cmp(st_strend_key*, st_strend_key*);
328
static int strend_hash(st_strend_key*);
330
static struct st_hash_type type_strend_hash = {
336
/* Create a hash table of the requested initial size that uses the
   type_strend_hash descriptor (start/end delimited string keys). */
onig_st_init_strend_table_with_size(int size)
338
return onig_st_init_table_with_size(&type_strend_hash, size);
342
/* Look up the key delimited by str_key..end_key.  A temporary key object
   is filled in locally and passed by address to onig_st_lookup, whose
   result is returned; the found value is delivered through *value by that
   call. */
onig_st_lookup_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t *value)
346
key.s = (unsigned char* )str_key;
347
key.end = (unsigned char* )end_key;
349
return onig_st_lookup(table, (st_data_t )(&key), value);
353
/* Insert value under the key delimited by str_key..end_key.  The key
   record is heap-allocated here and handed to onig_st_insert; it stores
   the caller's pointers, not a copy of the bytes, so the string must
   outlive the table entry. */
onig_st_insert_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t value)
358
key = (st_strend_key* )xmalloc(sizeof(st_strend_key));
/* NOTE(review): key is dereferenced on the next line with no visible NULL
   check after xmalloc -- an allocation failure would crash here. */
359
key->s = (unsigned char* )str_key;
360
key->end = (unsigned char* )end_key;
361
result = onig_st_insert(table, (st_data_t )key, value);
369
/* strend_cmp: key comparison for the strend hash type.  Keys whose byte
   lengths differ are handled first (the value returned on that path is not
   visible here); otherwise bytes are compared in order and the first
   non-zero difference *p - *q is returned. */
strend_cmp(st_strend_key* x, st_strend_key* y)
371
unsigned char *p, *q;
374
if ((x->end - x->s) != (y->end - y->s))
380
c = (int )*p - (int )*q;
381
if (c != 0) return c;
390
/* strend_hash: hash for the strend key type.  Each key byte is folded in
   with val = val * 997 + byte, and the result is mixed once more as
   val + (val >> 5) before being returned. */
strend_hash(st_strend_key* x)
398
val = val * 997 + (int )*p++;
401
return val + (val >> 5);
404
typedef st_table NameTable;
405
typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
407
#define NAMEBUF_SIZE 24
408
#define NAMEBUF_SIZE_1 25
412
/* i_print_name_entry: table-iteration callback used by onig_print_names.
   arg carries the FILE* to write to.  Prints "<name>: " followed by the
   entry's group numbers: back_ref1 when there is exactly one, otherwise
   the comma-separated back_refs[] list.  The output for back_num == 0 is
   on a line not visible here. */
i_print_name_entry(UChar* key, NameEntry* e, void* arg)
415
FILE* fp = (FILE* )arg;
417
fprintf(fp, "%s: ", e->name);
418
if (e->back_num == 0)
420
else if (e->back_num == 1)
421
fprintf(fp, "%d", e->back_ref1);
423
for (i = 0; i < e->back_num; i++) {
424
if (i > 0) fprintf(fp, ", ");
425
fprintf(fp, "%d", e->back_refs[i]);
433
/* onig_print_names (hash-table build): when reg has a name table, write
   the "name table" heading to fp and print every entry through
   i_print_name_entry. */
onig_print_names(FILE* fp, regex_t* reg)
435
NameTable* t = (NameTable* )reg->name_table;
437
if (IS_NOT_NULL(t)) {
438
fprintf(fp, "name table\n");
439
onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
447
/* i_free_name_entry: table-iteration callback used by names_clear.  The
   visible line releases the entry's back_refs array when present; other
   frees, if any, are on lines not visible here. */
i_free_name_entry(UChar* key, NameEntry* e, void* arg)
450
if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
457
/* names_clear (hash-table build): when reg has a name table, run
   i_free_name_entry over every entry.  The table object itself is not
   freed on the visible lines. */
names_clear(regex_t* reg)
459
NameTable* t = (NameTable* )reg->name_table;
461
if (IS_NOT_NULL(t)) {
462
onig_st_foreach(t, i_free_name_entry, 0);
468
/* onig_names_free (hash-table build): clear the entries with names_clear,
   free the table with onig_st_free_table when one exists, and reset
   reg->name_table to NULL so the regex no longer refers to it. */
onig_names_free(regex_t* reg)
473
r = names_clear(reg);
476
t = (NameTable* )reg->name_table;
477
if (IS_NOT_NULL(t)) onig_st_free_table(t);
478
reg->name_table = (void* )NULL;
483
/* name_find (hash-table build): look up the entry for name..name_end.
   e starts as NULL and is filled in by onig_st_lookup_strend when the
   table exists, so a missing table or missing name leaves it NULL. */
name_find(regex_t* reg, const UChar* name, const UChar* name_end)
486
NameTable* t = (NameTable* )reg->name_table;
488
e = (NameEntry* )NULL;
489
if (IS_NOT_NULL(t)) {
490
onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
496
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
504
/* i_names: table-iteration callback used by onig_foreach_name.  Invokes
   the user callback stored in arg->func with the entry's name start, its
   end (name + name_len) and a pointer to its group numbers: back_refs when
   the entry has more than one, otherwise the address of back_ref1.  The
   remaining arguments are on lines not visible here. */
i_names(UChar* key, NameEntry* e, INamesArg* arg)
506
int r = (*(arg->func))(e->name,
507
/*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */
508
e->name + e->name_len,
510
(e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
520
/* onig_foreach_name (hash-table build): when reg has a name table, pack
   the callback and context into an INamesArg (narg) and walk the table
   with i_names.  narg.enc is taken from reg->enc. */
onig_foreach_name(regex_t* reg,
521
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
525
NameTable* t = (NameTable* )reg->name_table;
528
if (IS_NOT_NULL(t)) {
532
narg.enc = reg->enc; /* should be pattern encoding. */
533
onig_st_foreach(t, i_names, (HashDataType )&narg);
539
/* i_renumber_name: table-iteration callback that rewrites an entry's
   group numbers through map[old].new_val -- every element of back_refs
   when the entry has several, or back_ref1 when it has exactly one. */
i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map)
543
if (e->back_num > 1) {
544
for (i = 0; i < e->back_num; i++) {
545
e->back_refs[i] = map[e->back_refs[i]].new_val;
548
else if (e->back_num == 1) {
549
e->back_ref1 = map[e->back_ref1].new_val;
556
/* onig_renumber_name_table: apply the group-number remapping in map to
   every entry of reg's name table, if the table exists. */
onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
558
NameTable* t = (NameTable* )reg->name_table;
560
if (IS_NOT_NULL(t)) {
561
onig_st_foreach(t, i_renumber_name, (HashDataType )map);
568
/* onig_number_of_names (hash-table build): report the table's
   num_entries.  How a missing table is handled is on lines not visible in
   this excerpt. */
onig_number_of_names(regex_t* reg)
570
NameTable* t = (NameTable* )reg->name_table;
573
return t->num_entries;
578
#else /* USE_ST_HASH_TABLE */
580
#define INIT_NAMES_ALLOC_NUM 8
591
/* onig_print_names (array build, used when USE_ST_HASH_TABLE is off):
   when the table exists and holds at least one entry, write the
   "name table" heading and then, per entry, "<name>: " followed by
   back_ref1 (one reference) or the comma-separated back_refs[] list.  The
   output for back_num == 0 is on a line not visible here. */
onig_print_names(FILE* fp, regex_t* reg)
595
NameTable* t = (NameTable* )reg->name_table;
597
if (IS_NOT_NULL(t) && t->num > 0) {
598
fprintf(fp, "name table\n");
599
for (i = 0; i < t->num; i++) {
601
fprintf(fp, "%s: ", e->name);
602
if (e->back_num == 0) {
605
else if (e->back_num == 1) {
606
fprintf(fp, "%d", e->back_ref1);
609
for (j = 0; j < e->back_num; j++) {
610
if (j > 0) fprintf(fp, ", ");
611
fprintf(fp, "%d", e->back_refs[j]);
623
/* names_clear (array build): walk the t->num entries; for entries that
   have a name, release back_refs when present and reset it to NULL, then
   deal with the entry array t->e when it is non-NULL.  The statements that
   free the name and the array are on lines not visible here. */
names_clear(regex_t* reg)
627
NameTable* t = (NameTable* )reg->name_table;
629
if (IS_NOT_NULL(t)) {
630
for (i = 0; i < t->num; i++) {
632
if (IS_NOT_NULL(e->name)) {
638
if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
639
e->back_refs = (int* )NULL;
642
if (IS_NOT_NULL(t->e)) {
652
/* onig_names_free (array build): clear the entries with names_clear, free
   the table object with xfree when one exists, and reset reg->name_table
   to NULL. */
onig_names_free(regex_t* reg)
657
r = names_clear(reg);
660
t = (NameTable* )reg->name_table;
661
if (IS_NOT_NULL(t)) xfree(t);
662
reg->name_table = NULL;
667
/* name_find (array build): linear search of the t->num entries for one
   whose name_len equals the byte length of name..name_end and whose bytes
   compare equal under onig_strncmp.  Returns NULL when nothing matches
   (the return of the matching entry is on a line not visible here). */
name_find(regex_t* reg, UChar* name, UChar* name_end)
671
NameTable* t = (NameTable* )reg->name_table;
673
if (IS_NOT_NULL(t)) {
674
len = name_end - name;
675
for (i = 0; i < t->num; i++) {
677
if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
681
return (NameEntry* )NULL;
685
/* onig_foreach_name (array build): call func once per table entry with the
   name start, the name end (name + name_len), the number of group numbers
   and a pointer to them (back_refs when more than one, else &back_ref1).
   Iteration stops at the first non-zero callback result, which is
   returned. */
onig_foreach_name(regex_t* reg,
686
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
691
NameTable* t = (NameTable* )reg->name_table;
693
if (IS_NOT_NULL(t)) {
694
for (i = 0; i < t->num; i++) {
696
r = (*func)(e->name, e->name + e->name_len, e->back_num,
697
(e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
699
if (r != 0) return r;
706
onig_number_of_names(regex_t* reg)
708
NameTable* t = (NameTable* )reg->name_table;
716
#endif /* else USE_ST_HASH_TABLE */
719
/* name_add: register group number `backref` under the name
   name..name_end in reg's name table.
   - An empty name is rejected with ONIGERR_EMPTY_GROUP_NAME.
   - The entry is looked up with name_find; the code that follows creates
     the table and/or the entry, with one variant for the st hash table
     (USE_ST_HASH_TABLE) and one for the growable array table.
   - A name that already has a reference is rejected with
     ONIGERR_MULTIPLEX_DEFINED_NAME unless the syntax sets
     ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME.
   - The first reference is kept inline in back_ref1; from the second one
     on, references live in the heap array back_refs, which starts at
     INIT_NAME_BACKREFS_ALLOC_NUM slots and doubles when full.
   Several control-flow lines (else / closing braces / the back_num
   increment) are not visible in this excerpt. */
name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
723
NameTable* t = (NameTable* )reg->name_table;
725
if (name_end - name <= 0)
726
return ONIGERR_EMPTY_GROUP_NAME;
728
e = name_find(reg, name, name_end);
730
#ifdef USE_ST_HASH_TABLE
732
t = onig_st_init_strend_table_with_size(5);
733
reg->name_table = (void* )t;
735
e = (NameEntry* )xmalloc(sizeof(NameEntry));
736
CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY);
738
e->name = strdup_with_null(reg->enc, name, name_end);
739
/* NOTE(review): e was allocated just above and is not freed on this
   return, so a failed name copy leaks the entry. */
if (IS_NULL(e->name)) return ONIGERR_MEMORY;
740
onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
743
e->name_len = name_end - name;
746
e->back_refs = (int* )NULL;
751
alloc = INIT_NAMES_ALLOC_NUM;
752
t = (NameTable* )xmalloc(sizeof(NameTable));
753
CHECK_NULL_RETURN_VAL(t, ONIGERR_MEMORY);
758
t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
761
return ONIGERR_MEMORY;
767
else if (t->num == t->alloc) {
770
alloc = t->alloc * 2;
771
/* NOTE(review): the xrealloc result overwrites t->e directly; if it
   returns NULL the previous array is no longer reachable. */
t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
772
CHECK_NULL_RETURN_VAL(t->e, ONIGERR_MEMORY);
776
for (i = t->num; i < t->alloc; i++) {
778
t->e[i].name_len = 0;
779
t->e[i].back_num = 0;
780
t->e[i].back_alloc = 0;
781
t->e[i].back_refs = (int* )NULL;
786
/* NOTE(review): no NULL check is visible after this allocation. */
e->name = strdup_with_null(reg->enc, name, name_end);
787
e->name_len = name_end - name;
791
if (e->back_num >= 1 &&
792
! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
793
onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
795
return ONIGERR_MULTIPLEX_DEFINED_NAME;
799
if (e->back_num == 1) {
800
e->back_ref1 = backref;
803
if (e->back_num == 2) {
804
alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
805
e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
806
CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
807
e->back_alloc = alloc;
808
/* migrate the inline first reference into the array */
e->back_refs[0] = e->back_ref1;
809
e->back_refs[1] = backref;
812
if (e->back_num > e->back_alloc) {
813
alloc = e->back_alloc * 2;
814
/* NOTE(review): same pattern as above -- a failed xrealloc loses the
   old back_refs array. */
e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
815
CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
816
e->back_alloc = alloc;
818
e->back_refs[e->back_num - 1] = backref;
826
/* onig_name_to_group_numbers: resolve a group name to its group numbers.
   Returns ONIGERR_UNDEFINED_NAME_REFERENCE when the name is unknown.
   Otherwise *nums is pointed at the entry's own storage -- &back_ref1 or
   the back_refs array, chosen by a switch on back_num -- so the caller
   must not free it.  The case labels and the return value are on lines
   not visible here. */
onig_name_to_group_numbers(regex_t* reg, const UChar* name,
827
const UChar* name_end, int** nums)
831
e = name_find(reg, name, name_end);
832
if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
834
switch (e->back_num) {
838
*nums = &(e->back_ref1);
841
*nums = e->back_refs;
848
/* onig_name_to_backref_number: pick one group number for a name.  The
   numbers come from onig_name_to_group_numbers.  When a region is given,
   the candidates are scanned from the last one backwards for a group
   whose region->beg entry is not ONIG_REGION_NOTPOS.  One path returns
   ONIGERR_PARSER_BUG; the conditions and the remaining returns are on
   lines not visible here. */
onig_name_to_backref_number(regex_t* reg, const UChar* name,
849
const UChar* name_end, OnigRegion *region)
853
n = onig_name_to_group_numbers(reg, name, name_end, &nums);
857
return ONIGERR_PARSER_BUG;
861
if (IS_NOT_NULL(region)) {
862
for (i = n - 1; i >= 0; i--) {
863
if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
871
#else /* USE_NAMED_GROUP */
874
onig_name_to_group_numbers(regex_t* reg, const UChar* name,
875
const UChar* name_end, int** nums)
877
return ONIG_NO_SUPPORT_CONFIG;
881
onig_name_to_backref_number(regex_t* reg, const UChar* name,
882
const UChar* name_end, OnigRegion* region)
884
return ONIG_NO_SUPPORT_CONFIG;
888
onig_foreach_name(regex_t* reg,
889
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
892
return ONIG_NO_SUPPORT_CONFIG;
896
onig_number_of_names(regex_t* reg)
900
#endif /* else USE_NAMED_GROUP */
903
/* onig_noname_group_capture_is_active: decide whether unnamed groups
   capture for this regex.  Two conditions are tested: the
   ONIG_OPTION_DONT_CAPTURE_GROUP option, and (with USE_NAMED_GROUP) the
   combination "names exist, syntax has ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP,
   and ONIG_OPTION_CAPTURE_GROUP is off".  The values returned on each
   path are on lines not visible here. */
onig_noname_group_capture_is_active(regex_t* reg)
905
if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
908
#ifdef USE_NAMED_GROUP
909
if (onig_number_of_names(reg) > 0 &&
910
IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
911
!ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
920
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
923
/* scan_env_clear: reset a ScanEnv to its empty state -- clears the four
   bit-status words, the error span, the dynamic memory-node pointer and
   every static memory-node slot, and (when
   USE_COMBINATION_EXPLOSION_CHECK is defined) the combination-explosion
   counters.  Further resets are on lines not visible here. */
scan_env_clear(ScanEnv* env)
927
BIT_STATUS_CLEAR(env->capture_history);
928
BIT_STATUS_CLEAR(env->bt_mem_start);
929
BIT_STATUS_CLEAR(env->bt_mem_end);
930
BIT_STATUS_CLEAR(env->backrefed_mem);
931
env->error = (UChar* )NULL;
932
env->error_end = (UChar* )NULL;
935
#ifdef USE_NAMED_GROUP
939
env->mem_nodes_dynamic = (Node** )NULL;
941
for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
942
env->mem_nodes_static[i] = NULL_NODE;
944
#ifdef USE_COMBINATION_EXPLOSION_CHECK
945
env->num_comb_exp_check = 0;
946
env->comb_exp_max_regnum = 0;
947
env->curr_max_regnum = 0;
948
env->has_recursion = 0;
953
/* scan_env_add_mem_entry: make room for one more memory (capture) node.
   While the needed index fits in the static array nothing is allocated.
   Once need reaches SCANENV_MEMNODES_SIZE and the current allocation is
   too small, the node table moves to the heap: the first time it is
   allocated with INIT_SCANENV_MEMNODES_ALLOC_SIZE slots and seeded from
   the static array, afterwards it is doubled with xrealloc.  The table
   pointer and mem_alloc are then updated. */
scan_env_add_mem_entry(ScanEnv* env)
958
need = env->num_mem + 1;
959
if (need >= SCANENV_MEMNODES_SIZE) {
960
if (env->mem_alloc <= need) {
961
if (IS_NULL(env->mem_nodes_dynamic)) {
962
alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
963
p = (Node** )xmalloc(sizeof(Node*) * alloc);
964
/* NOTE(review): p is written here, but it is only checked for NULL by
   the CHECK_NULL_RETURN_VAL further down -- a failed xmalloc would be
   dereferenced by this copy. */
xmemcpy(p, env->mem_nodes_static,
965
sizeof(Node*) * SCANENV_MEMNODES_SIZE);
968
alloc = env->mem_alloc * 2;
969
p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
971
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
973
for (i = env->num_mem + 1; i < alloc; i++)
976
env->mem_nodes_dynamic = p;
977
env->mem_alloc = alloc;
986
/* scan_env_set_mem_node: store node in memory-node slot num.  The store
   only happens when num does not exceed env->num_mem; the other visible
   outcome is ONIGERR_PARSER_BUG. */
scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
988
if (env->num_mem >= num)
989
SCANENV_MEM_NODES(env)[num] = node;
991
return ONIGERR_PARSER_BUG;
996
#ifdef USE_RECYCLE_NODE
997
typedef struct _FreeNode {
998
struct _FreeNode* next;
1001
static FreeNode* FreeNodeList = (FreeNode* )NULL;
1005
/* onig_node_free: release a parse-tree node and what it owns.  NULL is
   accepted and ignored.  The switch on the node type shows these
   per-type actions (the case labels are on lines not visible here):
   - string: free the text buffer unless it is the node's embedded buf;
   - cons (list/alt): free the left child recursively and keep the right
     child in next_node instead of recursing on it;
   - character class: inspect the shared flag, free the multi-byte buffer;
   - quantifier / effect / anchor: free the target subtree when present;
   - back-reference: free the dynamically allocated reference array.
   With USE_RECYCLE_NODE the node is pushed onto FreeNodeList inside a
   THREAD_ATOMIC section rather than returned to the allocator. */
onig_node_free(Node* node)
1008
if (IS_NULL(node)) return ;
1010
switch (NTYPE(node)) {
1012
if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1013
xfree(NSTRING(node).s);
1019
onig_node_free(NCONS(node).left);
1020
/* onig_node_free(NCONS(node).right); */
1022
Node* next_node = NCONS(node).right;
1024
#ifdef USE_RECYCLE_NODE
1026
FreeNode* n = (FreeNode* )node;
1028
THREAD_ATOMIC_START;
1029
n->next = FreeNodeList;
1044
CClassNode* cc = &(NCCLASS(node));
1046
if (IS_CCLASS_SHARE(cc))
1050
bbuf_free(cc->mbuf);
1055
if (NQUANTIFIER(node).target)
1056
onig_node_free(NQUANTIFIER(node).target);
1060
if (NEFFECT(node).target)
1061
onig_node_free(NEFFECT(node).target);
1065
if (IS_NOT_NULL(NBACKREF(node).back_dynamic))
1066
xfree(NBACKREF(node).back_dynamic);
1070
if (NANCHOR(node).target)
1071
onig_node_free(NANCHOR(node).target);
1075
#ifdef USE_RECYCLE_NODE
1077
FreeNode* n = (FreeNode* )node;
1079
THREAD_ATOMIC_START;
1080
n->next = FreeNodeList;
1089
#ifdef USE_RECYCLE_NODE
1091
/* onig_free_node_list: drain the recycled-node list by repeatedly
   unlinking the head of FreeNodeList until it is empty (the statement that
   frees each unlinked node is on a line not visible here).  The
   THREAD_ATOMIC guards are commented out in this function. */
onig_free_node_list(void)
1095
/* THREAD_ATOMIC_START; */
1096
while (IS_NOT_NULL(FreeNodeList)) {
1098
FreeNodeList = FreeNodeList->next;
1101
/* THREAD_ATOMIC_END; */
1111
#ifdef USE_RECYCLE_NODE
1112
THREAD_ATOMIC_START;
1113
if (IS_NOT_NULL(FreeNodeList)) {
1114
node = (Node* )FreeNodeList;
1115
FreeNodeList = FreeNodeList->next;
1122
node = (Node* )xmalloc(sizeof(Node));
1128
/* initialize_cclass: put a character-class node into its empty state; the
   visible line clears the single-byte bit set. */
initialize_cclass(CClassNode* cc)
1130
BITSET_CLEAR(cc->bs);
1136
/* node_new_cclass: allocate a node, tag it N_CCLASS and initialize its
   class data with initialize_cclass.  A failed allocation is handled by
   CHECK_NULL_RETURN. */
node_new_cclass(void)
1138
Node* node = node_new();
1139
CHECK_NULL_RETURN(node);
1140
node->type = N_CCLASS;
1142
initialize_cclass(&(NCCLASS(node)));
1147
/* node_new_cclass_by_codepoint_range: build an N_CCLASS node from two
   pre-built code-range tables.
   not : non-zero marks the class as negated.
   sbr : single-byte ranges; every code in each range is set in the bit
         set.  May be NULL.
   mbr : multi-byte ranges; when it holds at least one range a BBuf is
         created whose data pointer refers directly to mbr (no copy). */
node_new_cclass_by_codepoint_range(int not,
1148
const OnigCodePoint sbr[], const OnigCodePoint mbr[])
1153
Node* node = node_new();
1154
CHECK_NULL_RETURN(node);
1155
node->type = N_CCLASS;
1157
cc = &(NCCLASS(node));
1159
if (not != 0) CCLASS_SET_NOT(cc);
1161
BITSET_CLEAR(cc->bs);
1162
if (IS_NOT_NULL(sbr)) {
1163
n = ONIGENC_CODE_RANGE_NUM(sbr);
1164
for (i = 0; i < n; i++) {
1165
for (j = ONIGENC_CODE_RANGE_FROM(sbr, i);
1166
j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
1167
BITSET_SET_BIT(cc->bs, j);
1179
n = ONIGENC_CODE_RANGE_NUM(mbr);
1180
if (n == 0) goto is_null;
1182
bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1183
/* NOTE(review): node was allocated above and is not released on this
   early return, so a failed BBuf allocation leaks the node. */
CHECK_NULL_RETURN_VAL(bbuf, NULL);
1184
bbuf->alloc = n + 1;
1186
/* the buffer aliases the caller's table instead of owning a copy */
bbuf->p = (UChar* )((void* )mbr);
1195
/* node_new_ctype: allocate an N_CTYPE node carrying the given
   character-type code. */
node_new_ctype(int type)
1197
Node* node = node_new();
1198
CHECK_NULL_RETURN(node);
1199
node->type = N_CTYPE;
1200
NCTYPE(node).type = type;
1205
/* node_new_anychar: allocate a node tagged N_ANYCHAR. */
node_new_anychar(void)
1207
Node* node = node_new();
1208
CHECK_NULL_RETURN(node);
1209
node->type = N_ANYCHAR;
1214
/* node_new_list: allocate an N_LIST cons node with the given left and
   right children.  The children are stored as given, not copied. */
node_new_list(Node* left, Node* right)
1216
Node* node = node_new();
1217
CHECK_NULL_RETURN(node);
1218
node->type = N_LIST;
1219
NCONS(node).left = left;
1220
NCONS(node).right = right;
1225
/* onig_node_new_list: exported wrapper that forwards to node_new_list. */
onig_node_new_list(Node* left, Node* right)
1227
return node_new_list(left, right);
1231
/* node_new_alt: allocate a cons node for an alternation with the given
   left and right children (the line assigning the node type is not
   visible in this excerpt). */
node_new_alt(Node* left, Node* right)
1233
Node* node = node_new();
1234
CHECK_NULL_RETURN(node);
1236
NCONS(node).left = left;
1237
NCONS(node).right = right;
1242
/* onig_node_new_anchor: allocate an N_ANCHOR node of the given anchor
   type with no target and char_len initialized to -1. */
onig_node_new_anchor(int type)
1244
Node* node = node_new();
1245
CHECK_NULL_RETURN(node);
1246
node->type = N_ANCHOR;
1247
NANCHOR(node).type = type;
1248
NANCHOR(node).target = NULL;
1249
NANCHOR(node).char_len = -1;
1254
/* node_new_backref: allocate an N_BACKREF node referring to the back_num
   group numbers in backrefs[].
   - NST_NAME_REF is set on one path (presumably when by_name is non-zero;
     the test is not visible here).
   - With USE_BACKREF_AT_LEVEL, a non-zero exist_level records nest_level
     and sets NST_NEST_LEVEL.
   - NST_RECURSION is set when a referenced group number is within
     env->num_mem but has no memory node registered yet.
   - Up to NODE_BACKREFS_SIZE numbers are stored inline in back_static;
     larger sets go into a heap array (back_dynamic).  One path in the
     heap branch frees the node (presumably on allocation failure). */
node_new_backref(int back_num, int* backrefs, int by_name,
1255
#ifdef USE_BACKREF_AT_LEVEL
1256
int exist_level, int nest_level,
1261
Node* node = node_new();
1263
CHECK_NULL_RETURN(node);
1264
node->type = N_BACKREF;
1265
NBACKREF(node).state = 0;
1266
NBACKREF(node).back_num = back_num;
1267
NBACKREF(node).back_dynamic = (int* )NULL;
1269
NBACKREF(node).state |= NST_NAME_REF;
1271
#ifdef USE_BACKREF_AT_LEVEL
1272
if (exist_level != 0) {
1273
NBACKREF(node).state |= NST_NEST_LEVEL;
1274
NBACKREF(node).nest_level = nest_level;
1278
for (i = 0; i < back_num; i++) {
1279
if (backrefs[i] <= env->num_mem &&
1280
IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1281
NBACKREF(node).state |= NST_RECURSION; /* /...(\1).../ */
1286
if (back_num <= NODE_BACKREFS_SIZE) {
1287
for (i = 0; i < back_num; i++)
1288
NBACKREF(node).back_static[i] = backrefs[i];
1291
int* p = (int* )xmalloc(sizeof(int) * back_num);
1293
onig_node_free(node);
1296
NBACKREF(node).back_dynamic = p;
1297
for (i = 0; i < back_num; i++)
1303
#ifdef USE_SUBEXP_CALL
1305
/* node_new_call: allocate an N_CALL node for the group named
   name..name_end.  The name pointers are stored as given (not copied);
   the reference number starts as CALLNODE_REFNUM_UNDEF and the target as
   NULL_NODE. */
node_new_call(UChar* name, UChar* name_end)
1307
Node* node = node_new();
1308
CHECK_NULL_RETURN(node);
1310
node->type = N_CALL;
1311
NCALL(node).state = 0;
1312
NCALL(node).ref_num = CALLNODE_REFNUM_UNDEF;
1313
NCALL(node).target = NULL_NODE;
1314
NCALL(node).name = name;
1315
NCALL(node).name_end = name_end;
1321
/* node_new_quantifier: allocate an N_QUANTIFIER node with the given
   lower/upper repeat bounds.  The node starts greedy, without a target,
   with target_empty_info = NQ_TARGET_ISNOT_EMPTY and no head-exact hints.
   NST_BY_NUMBER is set on one path (presumably when by_number is
   non-zero; the test is not visible here). */
node_new_quantifier(int lower, int upper, int by_number)
1323
Node* node = node_new();
1324
CHECK_NULL_RETURN(node);
1325
node->type = N_QUANTIFIER;
1326
NQUANTIFIER(node).state = 0;
1327
NQUANTIFIER(node).target = NULL;
1328
NQUANTIFIER(node).lower = lower;
1329
NQUANTIFIER(node).upper = upper;
1330
NQUANTIFIER(node).greedy = 1;
1331
NQUANTIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1332
NQUANTIFIER(node).head_exact = NULL_NODE;
1333
NQUANTIFIER(node).next_head_exact = NULL_NODE;
1334
NQUANTIFIER(node).is_refered = 0;
1336
NQUANTIFIER(node).state |= NST_BY_NUMBER;
1338
#ifdef USE_COMBINATION_EXPLOSION_CHECK
1339
NQUANTIFIER(node).comb_exp_check_num = 0;
1346
/* node_new_effect: allocate an N_EFFECT node of the given effect type with
   every other field at its neutral value: state, regnum, option and
   opt_count 0, no target, call_addr -1. */
node_new_effect(int type)
1348
Node* node = node_new();
1349
CHECK_NULL_RETURN(node);
1350
node->type = N_EFFECT;
1351
NEFFECT(node).type = type;
1352
NEFFECT(node).state = 0;
1353
NEFFECT(node).regnum = 0;
1354
NEFFECT(node).option = 0;
1355
NEFFECT(node).target = NULL;
1356
NEFFECT(node).call_addr = -1;
1357
NEFFECT(node).opt_count = 0;
1362
/* onig_node_new_effect: exported wrapper that forwards to
   node_new_effect. */
onig_node_new_effect(int type)
1364
return node_new_effect(type);
1368
/* node_new_effect_memory: create an EFFECT_MEMORY (capture group) node.
   NST_NAMED_GROUP is set on one path (presumably when is_named is
   non-zero; the test is not visible here).  With USE_SUBEXP_CALL the
   option value is recorded on the node. */
node_new_effect_memory(OnigOptionType option, int is_named)
1370
Node* node = node_new_effect(EFFECT_MEMORY);
1371
CHECK_NULL_RETURN(node);
1373
SET_EFFECT_STATUS(node, NST_NAMED_GROUP);
1375
#ifdef USE_SUBEXP_CALL
1376
NEFFECT(node).option = option;
1382
/* node_new_option: create an EFFECT_OPTION node carrying the given option
   bits. */
node_new_option(OnigOptionType option)
1384
Node* node = node_new_effect(EFFECT_OPTION);
1385
CHECK_NULL_RETURN(node);
1386
NEFFECT(node).option = option;
1391
/* onig_node_str_cat: append the bytes s..end to a string node.
   - While the node still uses its embedded buffer (capa == 0) and the
     result fits in NODE_STR_BUF_SIZE - 1 bytes, the bytes are copied in
     place.
   - Otherwise the text lives on the heap: if the current capacity already
     covers len + addlen + NODE_STR_MARGIN the bytes are copied in place,
     else a larger buffer is obtained -- via strcat_capa_from_static when
     the text is still in the embedded buffer, via k_strcat_capa when it is
     already on the heap -- and s / capa are updated.
   Finally end is moved to s + len + addlen.  An allocation failure is
   reported as ONIGERR_MEMORY. */
onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1393
int addlen = end - s;
1396
int len = NSTRING(node).end - NSTRING(node).s;
1398
if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1400
int capa = len + addlen + NODE_STR_MARGIN;
1402
if (capa <= NSTRING(node).capa) {
1403
k_strcpy(NSTRING(node).s + len, s, end);
1406
if (NSTRING(node).s == NSTRING(node).buf)
1407
p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end,
1410
p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa);
1412
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
1413
NSTRING(node).s = p;
1414
NSTRING(node).capa = capa;
1418
k_strcpy(NSTRING(node).s + len, s, end);
1420
NSTRING(node).end = NSTRING(node).s + len + addlen;
1427
/* node_str_cat_char: append a single byte to a string node by passing a
   one-byte span s..s+1 to onig_node_str_cat (s is set up on lines not
   visible here, presumably holding c). */
node_str_cat_char(Node* node, UChar c)
1432
return onig_node_str_cat(node, s, s + 1);
1436
/* onig_node_conv_to_str_node: retag an existing node as an empty N_STRING
   node with the given flag, pointing s and end at the embedded buffer and
   marking it as not heap-backed (capa 0).  Nothing the node previously
   owned is released on the visible lines. */
onig_node_conv_to_str_node(Node* node, int flag)
1438
node->type = N_STRING;
1440
NSTRING(node).flag = flag;
1441
NSTRING(node).capa = 0;
1442
NSTRING(node).s = NSTRING(node).buf;
1443
NSTRING(node).end = NSTRING(node).buf;
1447
/* onig_node_str_clear: empty a string node.  A heap-backed text buffer
   (capa != 0, s non-NULL and not the embedded buf) is freed first; the
   node is then reset to an empty string in its embedded buffer with
   capa and flag 0. */
onig_node_str_clear(Node* node)
1449
if (NSTRING(node).capa != 0 &&
1450
IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1451
xfree(NSTRING(node).s);
1454
NSTRING(node).capa = 0;
1455
NSTRING(node).flag = 0;
1456
NSTRING(node).s = NSTRING(node).buf;
1457
NSTRING(node).end = NSTRING(node).buf;
1461
/* node_new_str: allocate an N_STRING node and initialize it with a copy
   of s..end.  The node starts as an empty string in its embedded buffer
   and the bytes are appended with onig_node_str_cat; if that fails the
   node is freed again. */
node_new_str(const UChar* s, const UChar* end)
1463
Node* node = node_new();
1464
CHECK_NULL_RETURN(node);
1466
node->type = N_STRING;
1467
NSTRING(node).capa = 0;
1468
NSTRING(node).flag = 0;
1469
NSTRING(node).s = NSTRING(node).buf;
1470
NSTRING(node).end = NSTRING(node).buf;
1471
if (onig_node_str_cat(node, s, end)) {
1472
onig_node_free(node);
1479
/* onig_node_new_str: exported wrapper that forwards to node_new_str. */
onig_node_new_str(const UChar* s, const UChar* end)
1481
return node_new_str(s, end);
1484
#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
1486
/* node_new_str_raw: create a string node for s..end and mark it raw with
   NSTRING_SET_RAW. */
node_new_str_raw(UChar* s, UChar* end)
1488
Node* node = node_new_str(s, end);
/* NOTE(review): no NULL check is visible between the allocation and
   NSTRING_SET_RAW -- if the macro dereferences node, a failed
   node_new_str would crash here; confirm. */
1489
NSTRING_SET_RAW(node);
1495
/* node_new_empty: create an empty string node by calling node_new_str
   with a NULL..NULL span. */
node_new_empty(void)
1497
return node_new_str(NULL, NULL);
1501
/* node_new_str_char: create a string node holding a single byte by
   passing the one-byte span p..p+1 to node_new_str (p is set up on lines
   not visible here, presumably holding c). */
node_new_str_char(UChar c)
1506
return node_new_str(p, p + 1);
1510
/* str_node_split_last_char: detach the last character of a string node.
   When the string is non-empty and the head of its last character (found
   with onigenc_get_prev_char_head) lies after the start, a new string
   node is created from that character and sn->end is pulled back to its
   head.  The raw flag of sn is tested so it can be carried over (the
   statement doing so is on a line not visible here).  n stays NULL_NODE
   when no split is possible. */
str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1513
Node* n = NULL_NODE;
1515
if (sn->end > sn->s) {
1516
p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1517
if (p && p > sn->s) { /* can be split. */
1518
n = node_new_str(p, sn->end);
1519
if ((sn->flag & NSTR_RAW) != 0)
1521
sn->end = (UChar* )p;
1528
/* str_node_can_be_split: for a non-empty string, yield 1 when the encoded
   length of its first character is smaller than the whole byte length
   (i.e. more than one character is present), else 0. */
str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1530
if (sn->end > sn->s) {
1531
return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1536
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1538
/* node_str_head_pad: shift the node's text right by num bytes.  The
   current text is saved in a local buffer of NODE_STR_BUF_SIZE bytes and
   copied back starting at sn->s[num]; a loop over the first num positions
   follows (its body, presumably storing val, is not visible here). */
node_str_head_pad(StrNode* sn, int num, UChar val)
1540
UChar buf[NODE_STR_BUF_SIZE];
1543
len = sn->end - sn->s;
1544
onig_strcpy(buf, sn->s, sn->end);
1545
onig_strcpy(&(sn->s[num]), buf, buf + len);
1548
for (i = 0; i < num; i++) {
1555
/* onig_scan_unsigned_number: accumulate a decimal number from the digits
   at *src.  Before each num * 10 + val step the value is checked against
   INT_MAX_LIMIT, and -1 is returned if the step would exceed it. */
onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1557
unsigned int num, val;
1565
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1566
val = (unsigned int )DIGITVAL(c);
1567
if ((INT_MAX_LIMIT - val) / 10UL < num)
1568
return -1; /* overflow */
1570
num = num * 10 + val;
1582
/* scan_unsigned_hexadecimal_number: accumulate a hexadecimal number from
   at most maxlen hex digits at *src, stopping at the end of input.
   Returns -1 if adding another digit would exceed INT_MAX_LIMIT. */
scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1586
unsigned int num, val;
1591
while (!PEND && maxlen-- != 0) {
1593
if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1594
val = (unsigned int )XDIGITVAL(enc,c);
1595
if ((INT_MAX_LIMIT - val) / 16UL < num)
1596
return -1; /* overflow */
1598
/* NOTE(review): val already holds XDIGITVAL(enc,c); the digit value is
   computed a second time here instead of reusing val. */
num = (num << 4) + XDIGITVAL(enc,c);
1610
/* scan_unsigned_octal_number: accumulate an octal number from at most
   maxlen digits at *src; a character counts as an octal digit when it is
   a digit and below '8'.  Returns -1 if adding another digit would exceed
   INT_MAX_LIMIT. */
scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1614
unsigned int num, val;
1619
while (!PEND && maxlen-- != 0) {
1621
if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1623
if ((INT_MAX_LIMIT - val) / 8UL < num)
1624
return -1; /* overflow */
1626
num = (num << 3) + val;
1638
/* BBUF_WRITE_CODE_POINT(bbuf, pos, code): write SIZE_CODE_POINT bytes of
   `code` into bbuf at byte offset pos.  code must be an lvalue because
   its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1639
BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1642
[n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1643
(all data size is OnigCodePoint)
1646
/* new_code_range: create an empty code-range buffer in *pbuf.  The BBuf
   is allocated, initialized with room for five code points
   (INIT_MULTI_BYTE_RANGE_SIZE) and the range count n is written at
   offset 0 (n is set on a line not visible here, presumably to 0). */
new_code_range(BBuf** pbuf)
1648
#define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1653
bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1654
CHECK_NULL_RETURN_VAL(*pbuf, ONIGERR_MEMORY);
1655
r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1659
BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1664
/* add_code_range_to_buf: merge the range from..to into the sorted range
   list held in *pbuf, creating the buffer on first use.
   Steps visible below:
   1. from and to are swapped on one path (presumably when from > to).
   2. The current range count n is read from the start of the buffer.
   3. Two binary searches locate `low` and `high`, the span of existing
      ranges that the new range overlaps or touches.
   4. inc_n = low + 1 - high is the change in range count; exceeding
      ONIG_MAX_MULTI_BYTE_RANGES_NUM yields
      ONIGERR_TOO_MANY_MULTI_BYTE_RANGES.
   5. from/to are widened to cover the absorbed ranges.
   6. The tail of the list is shifted right or left to make the list one
      range wider or to close the gap.
   7. The merged pair is written at slot `low` and the count at offset 0. */
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1667
int low, high, bound, x;
1668
OnigCodePoint n, *data;
1672
n = from; from = to; to = n;
1675
if (IS_NULL(*pbuf)) {
1676
r = new_code_range(pbuf);
1683
GET_CODE_POINT(n, bbuf->p);
1685
data = (OnigCodePoint* )(bbuf->p);
1688
for (low = 0, bound = n; low < bound; ) {
1689
x = (low + bound) >> 1;
1690
if (from > data[x*2 + 1])
1696
for (high = low, bound = n; high < bound; ) {
1697
x = (high + bound) >> 1;
1698
if (to >= data[x*2] - 1)
1704
inc_n = low + 1 - high;
1705
if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1706
return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1709
if (from > data[low*2])
1711
if (to < data[(high - 1)*2 + 1])
1712
to = data[(high - 1)*2 + 1];
1715
if (inc_n != 0 && (OnigCodePoint )high < n) {
1716
int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1717
int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1718
int size = (n - high) * 2 * SIZE_CODE_POINT;
1721
BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1724
BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1728
pos = SIZE_CODE_POINT * (1 + low * 2);
1729
BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1730
BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1731
BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1733
BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1739
/* add_code_range: syntax-aware front end to add_code_range_to_buf.  The
   syntax flag ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC is consulted and one path
   returns ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS (the surrounding test,
   presumably from > to, is not visible here); otherwise the range is
   added. */
add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1742
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1745
return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1748
return add_code_range_to_buf(pbuf, from, to);
1752
/* not_code_range_buf: build in *pbuf the complement of the range list
   bbuf within the multi-byte code space of enc.  A NULL or empty input
   yields the full multi-byte range.  Otherwise the gaps are emitted:
   starting from pre = MBCODE_START_POS(enc), each gap pre..from-1 before
   a range is added, the walk stops early when a range ends at the
   all-bits-set code point, and a final gap to+1..max is added when the
   last range ends below it. */
not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1755
OnigCodePoint pre, from, *data, to = 0;
1757
*pbuf = (BBuf* )NULL;
1758
if (IS_NULL(bbuf)) {
1760
return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1763
data = (OnigCodePoint* )(bbuf->p);
1764
GET_CODE_POINT(n, data);
1766
if (n <= 0) goto set_all;
1769
pre = MBCODE_START_POS(enc);
1770
for (i = 0; i < n; i++) {
1773
if (pre <= from - 1) {
1774
r = add_code_range_to_buf(pbuf, pre, from - 1);
1775
if (r != 0) return r;
1777
if (to == ~((OnigCodePoint )0)) break;
1780
if (to < ~((OnigCodePoint )0)) {
1781
r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1786
/* SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2): statement macro that exchanges
   the two (buffer, negation flag) pairs, using temporaries named tnot and
   tbuf (declared on a line not visible here). */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
1789
tnot = not1; not1 = not2; not2 = tnot; \
1790
tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
1794
/* or_code_range_buf: build in *pbuf the union of two multi-byte range
   lists, each optionally negated (not1 / not2).
   - Both lists NULL: the full multi-byte range when either side is
     negated.
   - One list NULL: after swapping so bbuf1 is the NULL one, the result is
     the full range, a clone of bbuf2 or the complement of bbuf2,
     depending on the flags (the selecting tests are not visible here).
   - Both present: the result is seeded from bbuf2 -- cloned when neither
     side is negated, complemented when only side 2 is negated -- and every
     range of bbuf1 is then added to it. */
or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1795
BBuf* bbuf2, int not2, BBuf** pbuf)
1798
OnigCodePoint i, n1, *data1;
1799
OnigCodePoint from, to;
1801
*pbuf = (BBuf* )NULL;
1802
if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1803
if (not1 != 0 || not2 != 0)
1804
return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1810
SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1812
if (IS_NULL(bbuf1)) {
1814
return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1818
return bbuf_clone(pbuf, bbuf2);
1821
return not_code_range_buf(enc, bbuf2, pbuf);
1827
SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1829
data1 = (OnigCodePoint* )(bbuf1->p);
1830
GET_CODE_POINT(n1, data1);
1833
if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1834
r = bbuf_clone(pbuf, bbuf2);
1836
else if (not1 == 0) { /* 1 OR (not 2) */
1837
r = not_code_range_buf(enc, bbuf2, pbuf);
1839
if (r != 0) return r;
1841
for (i = 0; i < n1; i++) {
1844
r = add_code_range_to_buf(pbuf, from, to);
1845
if (r != 0) return r;
1851
/* and_code_range1: add to *pbuf the parts of from1..to1 that are not
   covered by any of the n ranges in data (i.e. range AND NOT list).  For
   each listed range from2..to2: ranges entirely below from1 are skipped;
   when a listed range starts inside from1..to1, the uncovered prefix
   from1..from2-1 is emitted.  The walk stops once from1 has moved past
   to1; a remaining from1..to1 piece is emitted at the end.  The updates of
   from1/to1 are on lines not visible here. */
and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1852
OnigCodePoint* data, int n)
1855
OnigCodePoint from2, to2;
1857
for (i = 0; i < n; i++) {
1860
if (from2 < from1) {
1861
if (to2 < from1) continue;
1866
else if (from2 <= to1) {
1868
if (from1 <= from2 - 1) {
1869
r = add_code_range_to_buf(pbuf, from1, from2-1);
1870
if (r != 0) return r;
1881
if (from1 > to1) break;
1884
r = add_code_range_to_buf(pbuf, from1, to1);
1885
if (r != 0) return r;
1891
/* and_code_range_buf: build in *pbuf the intersection of two multi-byte
   range lists, each optionally negated.
   - bbuf1 NULL: the result is a clone of bbuf2 when side 1 is negated and
     bbuf2 exists.
   - bbuf2 NULL: one path clones bbuf1 (the selecting test is not visible
     here).
   - Both present: after an optional swap, either the pairwise overlap of
     the two lists is emitted (neither negated; the inner loop breaks once
     a range of list 2 starts beyond to1), or each range of list 1 is
     reduced by list 2 through and_code_range1 (only side 2 negated). */
and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1894
OnigCodePoint i, j, n1, n2, *data1, *data2;
1895
OnigCodePoint from, to, from1, to1, from2, to2;
1897
*pbuf = (BBuf* )NULL;
1898
if (IS_NULL(bbuf1)) {
1899
if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1900
return bbuf_clone(pbuf, bbuf2);
1903
else if (IS_NULL(bbuf2)) {
1905
return bbuf_clone(pbuf, bbuf1);
1910
SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1912
data1 = (OnigCodePoint* )(bbuf1->p);
1913
data2 = (OnigCodePoint* )(bbuf2->p);
1914
GET_CODE_POINT(n1, data1);
1915
GET_CODE_POINT(n2, data2);
1919
if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1920
for (i = 0; i < n1; i++) {
1923
for (j = 0; j < n2; j++) {
1926
if (from2 > to1) break;
1927
if (to2 < from1) continue;
1928
from = MAX(from1, from2);
1930
r = add_code_range_to_buf(pbuf, from, to);
1931
if (r != 0) return r;
1935
else if (not1 == 0) { /* 1 AND (not 2) */
1936
for (i = 0; i < n1; i++) {
1939
r = and_code_range1(pbuf, from1, to1, data2, n2);
1940
if (r != 0) return r;
1948
/*
 * Combines character class cc into dest with AND semantics.
 * Visible steps:
 *   - not1/not2 record whether dest and cc are negated classes;
 *   - bitset_invert_to writes inverted copies (bs1/bs2) that bsr1/bsr2
 *     presumably point at when the class is negated -- TODO confirm;
 *   - bitset_and(bsr1, bsr2) combines the bitsets, the result is copied
 *     to dest->bs when bsr1 is not already dest->bs, and dest->bs is
 *     inverted again under a condition not visible here;
 *   - for non-single-byte encodings the code-range buffers are combined:
 *     with both sides negated an OR of the un-negated buffers is used,
 *     otherwise and_code_range_buf; when dest is negated the result is
 *     passed through not_code_range_buf.
 * A non-zero r is returned to the caller.
 * NOTE(review): declarations of r, not1, not2, bs1, bs2, tbuf and the
 * final store into dest are outside this excerpt.
 */
and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1951
BBuf *buf1, *buf2, *pbuf;
1952
BitSetRef bsr1, bsr2;
1955
not1 = IS_CCLASS_NOT(dest);
1958
not2 = IS_CCLASS_NOT(cc);
1963
bitset_invert_to(bsr1, bs1);
1967
bitset_invert_to(bsr2, bs2);
1970
bitset_and(bsr1, bsr2);
1971
if (bsr1 != dest->bs) {
1972
bitset_copy(dest->bs, bsr1);
1976
bitset_invert(dest->bs);
1979
if (! ONIGENC_IS_SINGLEBYTE(enc)) {
1980
if (not1 != 0 && not2 != 0) {
1981
/* both negated: OR the plain buffers (flags passed as 0) */
r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
1984
r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
1985
if (r == 0 && not1 != 0) {
1987
r = not_code_range_buf(enc, pbuf, &tbuf);
1996
if (r != 0) return r;
2006
/*
 * Combines character class cc into dest with OR semantics; the structure
 * mirrors and_cclass.
 * Visible steps:
 *   - not1/not2 record whether dest and cc are negated classes;
 *   - bitset_invert_to fills inverted copies (bs1/bs2);
 *   - bitset_or(bsr1, bsr2) combines the bitsets, the result is copied to
 *     dest->bs when bsr1 is not already dest->bs, and dest->bs is inverted
 *     again under a condition not visible here;
 *   - for non-single-byte encodings: with both sides negated an AND of the
 *     un-negated buffers is used, otherwise or_code_range_buf; when dest is
 *     negated the result is passed through not_code_range_buf.
 * A non-zero r is returned to the caller.
 * NOTE(review): declarations of r, not1, not2, bs1, bs2, tbuf and the
 * final store into dest are outside this excerpt.
 */
or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2009
BBuf *buf1, *buf2, *pbuf;
2010
BitSetRef bsr1, bsr2;
2013
not1 = IS_CCLASS_NOT(dest);
2016
not2 = IS_CCLASS_NOT(cc);
2021
bitset_invert_to(bsr1, bs1);
2025
bitset_invert_to(bsr2, bs2);
2028
bitset_or(bsr1, bsr2);
2029
if (bsr1 != dest->bs) {
2030
bitset_copy(dest->bs, bsr1);
2034
bitset_invert(dest->bs);
2037
if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2038
if (not1 != 0 && not2 != 0) {
2039
/* both negated: AND the plain buffers (flags passed as 0) */
r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2042
r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2043
if (r == 0 && not1 != 0) {
2045
r = not_code_range_buf(enc, pbuf, &tbuf);
2054
if (r != 0) return r;
2065
/*
 * Translates the letter following a backslash into the control character
 * it denotes.  The mapping below is only consulted when the syntax has
 * ONIG_SYN_OP_ESC_CONTROL_CHARS enabled.
 * NOTE(review): the switch header, the handling guarded by
 * ONIG_SYN_OP2_ESC_V_VTAB and the fall-back return are not visible here.
 */
conv_backslash_value(int c, ScanEnv* env)
2067
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2069
case 'n': return '\n';
2070
case 't': return '\t';
2071
case 'r': return '\r';
2072
case 'f': return '\f';
2073
case 'a': return '\007';
2074
case 'b': return '\010';
2075
case 'e': return '\033';
2077
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2089
/*
 * Decides, by node type, whether node may not be the target of a
 * quantifier (non-zero result = invalid target).
 * Visible rules:
 *   - an effect node of type EFFECT_OPTION is judged by its target;
 *   - a list is reported valid (0) as soon as one element is a valid
 *     target;
 *   - an alternation is reported invalid (1) as soon as one alternative
 *     is an invalid target.
 * NOTE(review): the remaining case labels and the default results are not
 * part of this excerpt.
 */
is_invalid_quantifier_target(Node* node)
2091
switch (NTYPE(node)) {
2097
if (NEFFECT(node).type == EFFECT_OPTION)
2098
return is_invalid_quantifier_target(NEFFECT(node).target);
2101
case N_LIST: /* ex. (?:\G\A)* */
2103
if (! is_invalid_quantifier_target(NCONS(node).left)) return 0;
2104
} while (IS_NOT_NULL(node = NCONS(node).right));
2108
case N_ALT: /* ex. (?:abc|\A)* */
2110
if (is_invalid_quantifier_target(NCONS(node).left)) return 1;
2111
} while (IS_NOT_NULL(node = NCONS(node).right));
2120
/*
 * Classifies a quantifier by its bounds and returns a small index:
 *   ?:0, *:1, +:2, ??:3, *?:4, +?:5
 * The first group of tests below yields 0..2, the second 3..5.
 * NOTE(review): the test that selects between the two groups (presumably
 * on qf->greedy) and the result for quantifiers outside this list are not
 * visible in this excerpt.
 */
2122
popular_quantifier_num(QuantifierNode* qf)
2125
if (qf->lower == 0) {
2126
if (qf->upper == 1) return 0;
2127
else if (IS_REPEAT_INFINITE(qf->upper)) return 1;
2129
else if (qf->lower == 1) {
2130
if (IS_REPEAT_INFINITE(qf->upper)) return 2;
2134
if (qf->lower == 0) {
2135
if (qf->upper == 1) return 3;
2136
else if (IS_REPEAT_INFINITE(qf->upper)) return 4;
2138
else if (qf->lower == 1) {
2139
if (IS_REPEAT_INFINITE(qf->upper)) return 5;
2147
/*
 * Actions stored in ReduceTypeTable; the trailing comment of each member
 * names the quantifier form the nested pair is rewritten to.
 * NOTE(review): the enum header and the RQ_A member used by the table are
 * on lines missing from this excerpt.
 */
RQ_ASIS = 0, /* as is */
2148
RQ_DEL = 1, /* delete parent */
2150
RQ_AQ, /* to '*?' */
2151
RQ_QQ, /* to '??' */
2152
RQ_P_QQ, /* to '+)??' */
2153
RQ_PQ_Q /* to '+?)?' */
2156
/*
 * Reduction action for a quantifier nested directly inside another one.
 * onig_reduce_nested_quantifier indexes it as
 * ReduceTypeTable[child][parent], where both indices come from
 * popular_quantifier_num (?:0, *:1, +:2, ??:3, *?:4, +?:5).
 * The comment at the end of each row names the quantifier of that row.
 */
static enum ReduceType ReduceTypeTable[6][6] = {
2157
{RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
2158
{RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
2159
{RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
2160
{RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
2161
{RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
2162
{RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
2166
/*
 * Simplifies a quantifier (pnode) whose target is itself a quantifier
 * (cnode).  Both are classified with popular_quantifier_num and the
 * action is looked up in ReduceTypeTable[cnum][pnum].
 * In the visible branches the parent either takes over the child's
 * target with new bounds, or parent and child are both given new bounds.
 * At the visible end the child's target pointer is cleared before cnode
 * is released with onig_node_free.
 * NOTE(review): the case labels, break/return statements and the
 * declarations of pnum/cnum are not part of this excerpt, so which branch
 * belongs to which ReduceType value cannot be confirmed here.
 */
onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2169
QuantifierNode *p, *c;
2171
p = &(NQUANTIFIER(pnode));
2172
c = &(NQUANTIFIER(cnode));
2173
pnum = popular_quantifier_num(p);
2174
cnum = popular_quantifier_num(c);
2176
switch(ReduceTypeTable[cnum][pnum]) {
2181
/* parent takes the child's target and becomes greedy {0,infinite} */
p->target = c->target;
2182
p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2185
/* parent takes the child's target and becomes non-greedy {0,infinite} */
p->target = c->target;
2186
p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2189
/* parent takes the child's target and becomes non-greedy {0,1} */
p->target = c->target;
2190
p->lower = 0; p->upper = 1; p->greedy = 0;
2194
/* parent: non-greedy {0,1}; child: greedy {1,infinite} */
p->lower = 0; p->upper = 1; p->greedy = 0;
2195
c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2200
/* parent: greedy {0,1}; child: non-greedy {1,infinite} */
p->lower = 0; p->upper = 1; p->greedy = 1;
2201
c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2210
/* detach the target first so that freeing cnode leaves it alone */
c->target = NULL_NODE;
2211
onig_node_free(cnode);
2216
/*
 * Token kinds; fetch_token and fetch_token_in_cc store these in tok->type.
 * NOTE(review): most members and the enum header are on lines missing
 * from this excerpt.
 */
TK_EOT = 0, /* end of token */
2228
TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */
2234
TK_CHAR_PROPERTY, /* \p{...}, \P{...} */
2238
TK_POSIX_BRACKET_OPEN,
2240
TK_CC_CC_OPEN /* [ */
2244
/*
 * Fields of the token record (presumably the OnigToken struct filled in
 * by the fetch_* functions -- the struct header is not visible here).
 */
enum TokenSyms type;
2246
int base; /* is number: 8, 16 (used in [....]) */
2265
#ifdef USE_BACKREF_AT_LEVEL
2267
int level; /* \k<name+n> */
2282
/*
 * Parses the text of an interval quantifier ({n}, {n,}, {n,m}, and {,n}
 * when ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV is set) and fills tok.
 *
 * On success tok->type is TK_INTERVAL and tok->u.repeat.lower/upper hold
 * the bounds.
 * Result codes visible below:
 *   0  normal {n,m}
 *   2  fixed {n}
 *   1  returned on a path marked "OK" (fetch_token treats 1 as
 *      "normal char")
 *   ONIGERR_* value on malformed or out-of-range input.
 * NOTE(review): declarations of c, p, r, the PFETCH calls, the `invalid`
 * label and the conditions around the first two returns are not visible
 * in this excerpt.
 */
fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2284
int low, up, syn_allow, non_low = 0;
2287
OnigEncoding enc = env->enc;
2291
syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2295
return 1; /* "....{" : OK! */
2297
return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2302
if (c == ')' || c == '(' || c == '|') {
2303
return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2307
/* lower bound: negative scan result or a value above the limit is an error */
low = onig_scan_unsigned_number(&p, end, env->enc);
2308
if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2309
if (low > ONIG_MAX_REPEAT_NUM)
2310
return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2312
if (p == *src) { /* can't read low */
2313
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2314
/* allow {,n} as {0,n} */
2322
if (PEND) goto invalid;
2326
/* upper bound: same checks as for the lower bound */
up = onig_scan_unsigned_number(&p, end, env->enc);
2327
if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2328
if (up > ONIG_MAX_REPEAT_NUM)
2329
return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2334
up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2342
up = low; /* {n} : exact n times */
2346
if (PEND) goto invalid;
2348
/* with escaped-brace syntax the closing brace must be escaped as well */
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2349
if (c != MC_ESC(enc)) goto invalid;
2352
if (c != '}') goto invalid;
2354
if (!IS_REPEAT_INFINITE(up) && low > up) {
2355
return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2358
tok->type = TK_INTERVAL;
2359
tok->u.repeat.lower = low;
2360
tok->u.repeat.upper = up;
2362
return r; /* 0: normal {n,m}, 2: fixed {n} */
2368
return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2371
/*
 * Reads the value of an escape sequence: \M-, \C-, \c, or \...
 *
 * Visible behaviour:
 *   - running out of pattern yields ONIGERR_END_PATTERN_AT_ESCAPE,
 *     ONIGERR_END_PATTERN_AT_META or ONIGERR_END_PATTERN_AT_CONTROL,
 *     depending on where the pattern ends;
 *   - the meta form is only recognised with
 *     ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META and requires a '-'; its operand
 *     may itself be an escape (recursive call); the result is
 *     (c & 0xff) | 0x80;
 *   - the control forms are gated by ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL
 *     and ONIG_SYN_OP_ESC_C_CONTROL; their operand may also be an escape;
 *   - other letters go through conv_backslash_value.
 * A negative value from the recursive call is passed straight back.
 * NOTE(review): the switch/case lines, the control-code computation and
 * the final return are not part of this excerpt.
 */
2373
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2377
OnigEncoding enc = env->enc;
2381
if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2386
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2387
if (PEND) return ONIGERR_END_PATTERN_AT_META;
2389
if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2390
if (PEND) return ONIGERR_END_PATTERN_AT_META;
2392
if (c == MC_ESC(enc)) {
2393
v = fetch_escaped_value(&p, end, env);
2394
if (v < 0) return v;
2395
c = (OnigCodePoint )v;
2397
/* keep the low byte and set the top bit */
c = ((c & 0xff) | 0x80);
2404
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2405
if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2407
if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2414
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2416
if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2422
if (c == MC_ESC(enc)) {
2423
v = fetch_escaped_value(&p, end, env);
2424
if (v < 0) return v;
2425
c = (OnigCodePoint )v;
2436
c = conv_backslash_value(c, env);
2445
static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2447
#ifdef USE_NAMED_GROUP
2448
#ifdef USE_BACKREF_AT_LEVEL
2450
\k<name+n>, \k<name-n>
2453
/*
 * Reads a group name that may carry a relative level suffix, as in
 * \k<name+n> or \k<name-n>.
 *
 * Visible behaviour:
 *   - an empty name yields ONIGERR_EMPTY_GROUP_NAME;
 *   - characters that are not word characters set
 *     r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
 *   - the name ends at '>', ')', '+' or '-';
 *   - after '+' or '-' a digit is required; the number is scanned with
 *     onig_scan_unsigned_number and stored, signed, in *level
 *     (ONIGERR_TOO_BIG_NUMBER when the scan result is negative);
 *   - a first character that is an ASCII upper-case letter sets
 *     r = ONIGERR_INVALID_GROUP_NAME (the surrounding condition is not
 *     visible);
 *   - on success *rname_end receives the end of the name and the result
 *     is 1 when a level was present, 0 otherwise;
 *   - on failure the offending text is recorded with
 *     onig_scan_env_set_error_string.
 * NOTE(review): declarations of p, name_end, num, the loops and the `err`
 * label are on lines missing from this excerpt.
 */
fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end
2454
, ScanEnv* env, int* level)
2456
int r, exist_level = 0;
2457
OnigCodePoint c = 0;
2458
OnigCodePoint first_code;
2459
OnigEncoding enc = env->enc;
2467
return ONIGERR_EMPTY_GROUP_NAME;
2473
return ONIGERR_EMPTY_GROUP_NAME;
2475
if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2476
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2483
if (c == '>' || c == ')' || c == '+' || c == '-') break;
2485
if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2486
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2491
if (c == '+' || c == '-') {
2493
/* sign of the level: '-' gives a negative level */
int flag = (c == '-' ? -1 : 1);
2496
if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2498
num = onig_scan_unsigned_number(&p, end, enc);
2499
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2500
*level = (num * flag);
2509
r = ONIGERR_INVALID_GROUP_NAME;
2514
if (ONIGENC_IS_CODE_ASCII(first_code) &&
2515
ONIGENC_IS_CODE_UPPER(enc, first_code))
2516
r = ONIGERR_INVALID_GROUP_NAME;
2520
*rname_end = name_end;
2522
return (exist_level ? 1 : 0);
2525
onig_scan_env_set_error_string(env, r, *src, name_end);
2529
#endif /* USE_BACKREF_AT_LEVEL */
2532
def: 0 -> define name (don't allow number name)
2533
1 -> reference name (allow number name)
2536
/*
 * Reads a group name or group number up to '>' or ')'.
 * ref selects the mode described just above this function:
 *   0 -> define name (a numeric name is not allowed)
 *   1 -> reference name (a numeric name is allowed)
 *
 * Visible behaviour:
 *   - an empty name yields ONIGERR_EMPTY_GROUP_NAME;
 *   - a leading digit and a leading non-word character are distinguished;
 *     the former can set r = ONIGERR_INVALID_GROUP_NAME, the latter sets
 *     r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
 *   - later characters are validated as digits or word characters;
 *   - a first character that is an ASCII upper-case letter sets
 *     r = ONIGERR_INVALID_GROUP_NAME (the surrounding condition is not
 *     visible);
 *   - on success *rname_end receives the end of the name; on failure the
 *     text is recorded with onig_scan_env_set_error_string.
 * NOTE(review): declarations of r, p, name_end, the loops and the return
 * statements are on lines missing from this excerpt.
 */
fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2539
OnigCodePoint c = 0;
2540
OnigCodePoint first_code;
2541
OnigEncoding enc = env->enc;
2550
return ONIGERR_EMPTY_GROUP_NAME;
2556
return ONIGERR_EMPTY_GROUP_NAME;
2558
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2562
r = ONIGERR_INVALID_GROUP_NAME;
2565
else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2566
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2573
if (c == '>' || c == ')') break;
2576
if (! ONIGENC_IS_CODE_DIGIT(enc, c)) {
2577
if (!ONIGENC_IS_CODE_WORD(enc, c))
2578
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2580
r = ONIGERR_INVALID_GROUP_NAME;
2584
if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2585
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2591
r = ONIGERR_INVALID_GROUP_NAME;
2595
if (ONIGENC_IS_CODE_ASCII(first_code) &&
2596
ONIGENC_IS_CODE_UPPER(enc, first_code))
2597
r = ONIGERR_INVALID_GROUP_NAME;
2601
*rname_end = name_end;
2606
onig_scan_env_set_error_string(env, r, *src, name_end);
2612
/*
 * Second definition of fetch_name with the same signature; presumably the
 * variant compiled when USE_NAMED_GROUP is not defined (the preprocessor
 * line selecting it is not visible -- TODO confirm).
 *
 * Visible behaviour: only digits are accepted up to '>' or ')'.  A
 * multi-byte character or a non-digit sets
 * r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; another visible path sets
 * r = ONIGERR_INVALID_GROUP_NAME.  On success *rname_end receives the end
 * of the name; on failure the text is recorded with
 * onig_scan_env_set_error_string.
 */
fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2615
OnigCodePoint c = 0;
2617
OnigEncoding enc = env->enc;
2624
if (enc_len(enc, p) > 1)
2625
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2628
if (c == '>' || c == ')') break;
2629
if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2630
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2633
r = ONIGERR_INVALID_GROUP_NAME;
2638
*rname_end = name_end;
2644
onig_scan_env_set_error_string(env, r, *src, name_end);
2651
/*
 * Reports, through the onig_warn callback, that the character class
 * contains the string c without an escape.
 * Nothing happens when onig_warn is onig_null_warn, or unless the syntax
 * has both ONIG_SYN_WARN_CC_OP_NOT_ESCAPED and
 * ONIG_SYN_BACKSLASH_ESCAPE_IN_CC set.
 * The message is formatted, together with the pattern, into a local
 * buffer of WARN_BUFSIZE bytes.
 */
CC_ESC_WARN(ScanEnv* env, UChar *c)
2653
if (onig_warn == onig_null_warn) return ;
2655
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2656
IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2657
UChar buf[WARN_BUFSIZE];
2658
onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2659
env->pattern, env->pattern_end,
2660
(UChar* )"character class has '%s' without escape", c);
2661
(*onig_warn)((char* )buf);
2666
/*
 * Reports, through the onig_warn callback, that the regular expression
 * contains the string c without an escape.
 * Nothing happens when onig_warn is onig_null_warn, or unless the syntax
 * has ONIG_SYN_WARN_CC_OP_NOT_ESCAPED set.
 * The message is formatted, together with the pattern, into a local
 * buffer of WARN_BUFSIZE bytes.
 */
CCEND_ESC_WARN(ScanEnv* env, UChar* c)
2668
if (onig_warn == onig_null_warn) return ;
2670
if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2671
UChar buf[WARN_BUFSIZE];
2672
onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2673
(env)->pattern, (env)->pattern_end,
2674
(UChar* )"regular expression has '%s' without escape", c);
2675
(*onig_warn)((char* )buf);
2680
/*
 * Searches the text between from and to for the sequence of n code points
 * in s.  At a candidate position the first character is decoded, then the
 * following characters are compared with s[1..n-1] while staying below
 * `to`.  `next` is optional (it is tested against NULL before use).
 * NOTE(review): the outer loop, the comparison with s[0], the store
 * through `next` and the return values are not visible in this excerpt.
 */
find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2681
UChar **next, OnigEncoding enc)
2689
x = ONIGENC_MBC_TO_CODE(enc, p, to);
2690
q = p + enc_len(enc, p);
2692
for (i = 1; i < n && q < to; i++) {
2693
x = ONIGENC_MBC_TO_CODE(enc, q, to);
2694
if (x != s[i]) break;
2695
q += enc_len(enc, q);
2698
if (IS_NOT_NULL(next))
2709
/*
 * Scans the text between from and to for the n code points in s while
 * keeping track of escapes.
 * Visible results: 1 as soon as all n code points matched (i >= n), 0 as
 * soon as the code point `bad` is met.  Seeing the escape character sets
 * in_esc.
 * NOTE(review): the outer loop, the handling of in_esc, the comparison
 * with s[0] and the result when the text is exhausted are not visible in
 * this excerpt.
 */
str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2710
OnigCodePoint bad, OnigEncoding enc)
2721
p += enc_len(enc, p);
2724
x = ONIGENC_MBC_TO_CODE(enc, p, to);
2725
q = p + enc_len(enc, p);
2727
/* compare the remaining code points of s with the following characters */
for (i = 1; i < n && q < to; i++) {
2728
x = ONIGENC_MBC_TO_CODE(enc, q, to);
2729
if (x != s[i]) break;
2730
q += enc_len(enc, q);
2732
if (i >= n) return 1;
2733
p += enc_len(enc, p);
2736
x = ONIGENC_MBC_TO_CODE(enc, p, to);
2737
if (x == bad) return 0;
2738
else if (x == MC_ESC(enc)) in_esc = 1;
2747
/*
 * Tokenizer used inside a character class: reads from *src and describes
 * the next token in tok.
 *
 * Token kinds assigned in the visible code: TK_CHAR, TK_CC_CLOSE,
 * TK_CC_RANGE ('-'), TK_CHAR_TYPE (with a CTYPE_* subtype),
 * TK_CHAR_PROPERTY, TK_CODE_POINT, TK_RAW_BYTE, TK_POSIX_BRACKET_OPEN,
 * TK_CC_CC_OPEN and TK_CC_AND.
 * Errors are returned as ONIGERR_* values.
 * NOTE(review): case labels, PFETCH/PINC calls, declarations of p, prev,
 * num and the final return are on lines missing from this excerpt, so the
 * notes below describe only the statements that are visible.
 */
fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2750
OnigCodePoint c, c2;
2751
OnigSyntaxType* syn = env->syntax;
2752
OnigEncoding enc = env->enc;
2763
tok->type = TK_CHAR;
2769
tok->type = TK_CC_CLOSE;
2771
else if (c == '-') {
2772
tok->type = TK_CC_RANGE;
2774
else if (c == MC_ESC(enc)) {
2775
/* backslash handling applies only when the syntax enables it in classes */
if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2778
if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2785
tok->type = TK_CHAR_TYPE;
2786
tok->u.subtype = CTYPE_WORD;
2789
tok->type = TK_CHAR_TYPE;
2790
tok->u.subtype = CTYPE_NOT_WORD;
2793
tok->type = TK_CHAR_TYPE;
2794
tok->u.subtype = CTYPE_DIGIT;
2797
tok->type = TK_CHAR_TYPE;
2798
tok->u.subtype = CTYPE_NOT_DIGIT;
2801
tok->type = TK_CHAR_TYPE;
2802
tok->u.subtype = CTYPE_WHITE_SPACE;
2805
tok->type = TK_CHAR_TYPE;
2806
tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
2809
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2810
tok->type = TK_CHAR_TYPE;
2811
tok->u.subtype = CTYPE_XDIGIT;
2814
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2815
tok->type = TK_CHAR_TYPE;
2816
tok->u.subtype = CTYPE_NOT_XDIGIT;
2823
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2825
/* character property: the capital letter form is the negated one */
tok->type = TK_CHAR_PROPERTY;
2826
tok->u.prop.not = (c == 'P' ? 1 : 0);
2828
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2831
/* flips the negation flag */
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2843
/* braced hexadecimal code point, at most 8 digits */
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
2845
num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
2846
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
2849
if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
2850
return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
2853
if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) {
2855
tok->type = TK_CODE_POINT;
2857
tok->u.code = (OnigCodePoint )num;
2860
/* can't read nothing or invalid format */
2864
/* two-digit hexadecimal byte */
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
2865
num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
2866
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2867
if (p == prev) { /* can't read nothing. */
2868
num = 0; /* but, it's not error */
2870
tok->type = TK_RAW_BYTE;
2880
/* four-digit hexadecimal code point */
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
2881
num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
2882
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2883
if (p == prev) { /* can't read nothing. */
2884
num = 0; /* but, it's not error */
2886
tok->type = TK_CODE_POINT;
2888
tok->u.code = (OnigCodePoint )num;
2893
case '1': case '2': case '3': case '4': case '5': case '6': case '7':
2894
/* octal byte, at most 3 digits */
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
2897
num = scan_unsigned_octal_number(&p, end, 3, enc);
2898
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2899
if (p == prev) { /* can't read nothing. */
2900
num = 0; /* but, it's not error */
2902
tok->type = TK_RAW_BYTE;
2910
/* any other escape: the token becomes a code point if the value changed */
num = fetch_escaped_value(&p, end, env);
2911
if (num < 0) return num;
2912
if (tok->u.c != num) {
2913
tok->u.code = (OnigCodePoint )num;
2914
tok->type = TK_CODE_POINT;
2919
else if (c == '[') {
2920
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
2921
/* terminator sequence searched for by str_exist_check_with_esc */
OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
2922
tok->backp = p; /* point at '[' is readed */
2924
if (str_exist_check_with_esc(send, 2, p, end,
2925
(OnigCodePoint )']', enc)) {
2926
tok->type = TK_POSIX_BRACKET_OPEN;
2935
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
2936
tok->type = TK_CC_CC_OPEN;
2939
CC_ESC_WARN(env, (UChar* )"[");
2943
else if (c == '&') {
2944
/* a doubled '&' is the set-intersection operator when set ops are enabled */
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
2945
!PEND && (PPEEK_IS('&'))) {
2947
tok->type = TK_CC_AND;
2957
/*
 * Main tokenizer: reads from *src and describes the next token in tok.
 *
 * The visible code has two large parts: handling of escaped characters
 * (entered when IS_MC_ESC_CODE is true) and handling of unescaped
 * meta characters.  Every operator is gated by a syntax flag
 * (IS_SYNTAX_OP / IS_SYNTAX_OP2 / IS_SYNTAX_BV); when the flag is off the
 * code breaks out of the case and leaves the token as set earlier.
 * Token kinds assigned in the visible code include TK_STRING,
 * TK_OP_REPEAT, TK_INTERVAL (through fetch_range_quantifier),
 * TK_SUBEXP_OPEN/CLOSE, TK_CHAR_TYPE, TK_ANCHOR, TK_CODE_POINT,
 * TK_RAW_BYTE, TK_BACKREF, TK_CALL, TK_QUOTE_OPEN, TK_CHAR_PROPERTY,
 * TK_ANYCHAR, TK_ANYCHAR_ANYTIME and TK_CC_OPEN.
 * Errors are returned as ONIGERR_* values.
 * NOTE(review): case labels, goto labels, PFETCH/PINC calls, declarations
 * of p, prev, c, r, num, i, backs, name_end and the final return are on
 * lines missing from this excerpt.
 */
fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2961
OnigEncoding enc = env->enc;
2962
OnigSyntaxType* syn = env->syntax;
2973
tok->type = TK_STRING;
2978
/* ---- escaped characters ---- */
if (IS_MC_ESC_CODE(c, enc, syn)) {
2979
if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2988
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
2989
tok->type = TK_OP_REPEAT;
2990
tok->u.repeat.lower = 0;
2991
tok->u.repeat.upper = REPEAT_INFINITE;
2996
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
2997
tok->type = TK_OP_REPEAT;
2998
tok->u.repeat.lower = 1;
2999
tok->u.repeat.upper = REPEAT_INFINITE;
3004
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3005
tok->type = TK_OP_REPEAT;
3006
tok->u.repeat.lower = 0;
3007
tok->u.repeat.upper = 1;
3009
/* a following '?' makes the repeat non-greedy when the syntax allows it */
if (!PEND && PPEEK_IS('?') &&
3010
IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3012
tok->u.repeat.greedy = 0;
3013
tok->u.repeat.possessive = 0;
3017
/* a following '+' makes it possessive; separate flags for repeats and intervals */
if (!PEND && PPEEK_IS('+') &&
3018
((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3019
tok->type != TK_INTERVAL) ||
3020
(IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3021
tok->type == TK_INTERVAL))) {
3023
tok->u.repeat.greedy = 1;
3024
tok->u.repeat.possessive = 1;
3027
tok->u.repeat.greedy = 1;
3028
tok->u.repeat.possessive = 0;
3034
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3035
r = fetch_range_quantifier(&p, end, tok, env);
3036
if (r < 0) return r; /* error */
3037
if (r == 0) goto greedy_check;
3038
else if (r == 2) { /* {n} */
3039
if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3040
goto possessive_check;
3044
/* r == 1 : normal char */
3048
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3053
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3054
tok->type = TK_SUBEXP_OPEN;
3058
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3059
tok->type = TK_SUBEXP_CLOSE;
3063
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3064
tok->type = TK_CHAR_TYPE;
3065
tok->u.subtype = CTYPE_WORD;
3069
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3070
tok->type = TK_CHAR_TYPE;
3071
tok->u.subtype = CTYPE_NOT_WORD;
3075
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3076
tok->type = TK_ANCHOR;
3077
tok->u.anchor = ANCHOR_WORD_BOUND;
3081
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3082
tok->type = TK_ANCHOR;
3083
tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3086
#ifdef USE_WORD_BEGIN_END
3088
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3089
tok->type = TK_ANCHOR;
3090
tok->u.anchor = ANCHOR_WORD_BEGIN;
3094
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3095
tok->type = TK_ANCHOR;
3096
tok->u.anchor = ANCHOR_WORD_END;
3101
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3102
tok->type = TK_CHAR_TYPE;
3103
tok->u.subtype = CTYPE_WHITE_SPACE;
3107
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3108
tok->type = TK_CHAR_TYPE;
3109
tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
3113
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3114
tok->type = TK_CHAR_TYPE;
3115
tok->u.subtype = CTYPE_DIGIT;
3119
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3120
tok->type = TK_CHAR_TYPE;
3121
tok->u.subtype = CTYPE_NOT_DIGIT;
3125
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3126
tok->type = TK_CHAR_TYPE;
3127
tok->u.subtype = CTYPE_XDIGIT;
3131
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3132
tok->type = TK_CHAR_TYPE;
3133
tok->u.subtype = CTYPE_NOT_XDIGIT;
3137
/* NOTE(review): the buffer anchors below are stored through u.subtype,
   whereas the word anchors above use u.anchor */
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3139
tok->type = TK_ANCHOR;
3140
tok->u.subtype = ANCHOR_BEGIN_BUF;
3144
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3145
tok->type = TK_ANCHOR;
3146
tok->u.subtype = ANCHOR_SEMI_END_BUF;
3150
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3152
tok->type = TK_ANCHOR;
3153
tok->u.subtype = ANCHOR_END_BUF;
3157
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3158
tok->type = TK_ANCHOR;
3159
tok->u.subtype = ANCHOR_BEGIN_POSITION;
3163
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3168
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3176
/* braced hexadecimal code point, at most 8 digits */
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3178
num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3179
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3181
if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3182
return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3185
if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) {
3187
tok->type = TK_CODE_POINT;
3188
tok->u.code = (OnigCodePoint )num;
3191
/* can't read nothing or invalid format */
3195
/* two-digit hexadecimal byte */
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3196
num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3197
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3198
if (p == prev) { /* can't read nothing. */
3199
num = 0; /* but, it's not error */
3201
tok->type = TK_RAW_BYTE;
3211
/* four-digit hexadecimal code point */
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3212
num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3213
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3214
if (p == prev) { /* can't read nothing. */
3215
num = 0; /* but, it's not error */
3217
tok->type = TK_CODE_POINT;
3219
tok->u.code = (OnigCodePoint )num;
3223
/* decimal digits: a back-reference if possible, otherwise octal below */
case '1': case '2': case '3': case '4':
3224
case '5': case '6': case '7': case '8': case '9':
3227
num = onig_scan_unsigned_number(&p, end, enc);
3228
if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3232
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3233
(num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3234
/* strict mode: the referenced group must already exist */
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3235
if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3236
return ONIGERR_INVALID_BACKREF;
3239
tok->type = TK_BACKREF;
3240
tok->u.backref.num = 1;
3241
tok->u.backref.ref1 = num;
3242
tok->u.backref.by_name = 0;
3243
#ifdef USE_BACKREF_AT_LEVEL
3244
tok->u.backref.exist_level = 0;
3250
if (c == '8' || c == '9') {
3259
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3261
/* after a leading '0' at most 2 more octal digits are read, otherwise 3 */
num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3262
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3263
if (p == prev) { /* can't read nothing. */
3264
num = 0; /* but, it's not error */
3266
tok->type = TK_RAW_BYTE;
3270
else if (c != '0') {
3275
#ifdef USE_NAMED_GROUP
3277
/* named back-reference */
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3285
#ifdef USE_BACKREF_AT_LEVEL
3286
name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3287
r = fetch_name_with_level(&p, end, &name_end, env, &tok->u.backref.level);
3288
if (r == 1) tok->u.backref.exist_level = 1;
3289
else tok->u.backref.exist_level = 0;
3291
r = fetch_name(&p, end, &name_end, env, 1);
3293
if (r < 0) return r;
3295
/* resolve the name to the list of group numbers carrying it */
num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3297
onig_scan_env_set_error_string(env,
3298
ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3299
return ONIGERR_UNDEFINED_NAME_REFERENCE;
3301
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3303
for (i = 0; i < num; i++) {
3304
if (backs[i] > env->num_mem ||
3305
IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3306
return ONIGERR_INVALID_BACKREF;
3310
tok->type = TK_BACKREF;
3311
tok->u.backref.by_name = 1;
3313
/* a single group is stored inline in ref1, several via the refs pointer */
tok->u.backref.num = 1;
3314
tok->u.backref.ref1 = backs[0];
3317
tok->u.backref.num = num;
3318
tok->u.backref.refs = backs;
3327
#ifdef USE_SUBEXP_CALL
3329
/* subexpression call: the token keeps pointers to the name text */
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3335
r = fetch_name(&p, end, &name_end, env, 1);
3336
if (r < 0) return r;
3338
tok->type = TK_CALL;
3339
tok->u.call.name = prev;
3340
tok->u.call.name_end = name_end;
3349
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3350
tok->type = TK_QUOTE_OPEN;
3356
if (PPEEK_IS('{') &&
3357
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3359
/* character property: the capital letter form is the negated one */
tok->type = TK_CHAR_PROPERTY;
3360
tok->u.prop.not = (c == 'P' ? 1 : 0);
3362
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3365
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3375
/* any other escape: the token becomes a code point if the value changed */
num = fetch_escaped_value(&p, end, env);
3376
if (num < 0) return num;
3378
if (tok->u.c != num) {
3379
tok->type = TK_CODE_POINT;
3380
tok->u.code = (OnigCodePoint )num;
3383
/* otherwise continue right after the single character at backp */
p = tok->backp + enc_len(enc, tok->backp);
3392
/* ---- unescaped meta characters ---- */
#ifdef USE_VARIABLE_META_CHARS
3393
if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3394
IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3395
if (c == MC_ANYCHAR(enc))
3397
else if (c == MC_ANYTIME(enc))
3399
else if (c == MC_ZERO_OR_ONE_TIME(enc))
3400
goto zero_or_one_time;
3401
else if (c == MC_ONE_OR_MORE_TIME(enc))
3402
goto one_or_more_time;
3403
else if (c == MC_ANYCHAR_ANYTIME(enc)) {
3404
tok->type = TK_ANYCHAR_ANYTIME;
3412
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3413
#ifdef USE_VARIABLE_META_CHARS
3416
tok->type = TK_ANYCHAR;
3420
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3421
#ifdef USE_VARIABLE_META_CHARS
3424
tok->type = TK_OP_REPEAT;
3425
tok->u.repeat.lower = 0;
3426
tok->u.repeat.upper = REPEAT_INFINITE;
3431
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3432
#ifdef USE_VARIABLE_META_CHARS
3435
tok->type = TK_OP_REPEAT;
3436
tok->u.repeat.lower = 1;
3437
tok->u.repeat.upper = REPEAT_INFINITE;
3442
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3443
#ifdef USE_VARIABLE_META_CHARS
3446
tok->type = TK_OP_REPEAT;
3447
tok->u.repeat.lower = 0;
3448
tok->u.repeat.upper = 1;
3453
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3454
r = fetch_range_quantifier(&p, end, tok, env);
3455
if (r < 0) return r; /* error */
3456
if (r == 0) goto greedy_check;
3457
else if (r == 2) { /* {n} */
3458
if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3459
goto possessive_check;
3463
/* r == 1 : normal char */
3467
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3472
/* a '?' then '#' after the open paren starts a comment group, read up to ')' */
if (PPEEK_IS('?') &&
3473
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3475
if (PPEEK_IS('#')) {
3478
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3480
if (c == MC_ESC(enc)) {
3481
if (!PEND) PFETCH(c);
3484
if (c == ')') break;
3492
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3493
tok->type = TK_SUBEXP_OPEN;
3497
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3498
tok->type = TK_SUBEXP_CLOSE;
3502
/* line anchors turn into buffer anchors under the single-line option */
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3503
tok->type = TK_ANCHOR;
3504
tok->u.subtype = (IS_SINGLELINE(env->option)
3505
? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3509
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3510
tok->type = TK_ANCHOR;
3511
tok->u.subtype = (IS_SINGLELINE(env->option)
3512
? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3516
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3517
tok->type = TK_CC_OPEN;
3521
if (*src > env->pattern) /* /].../ is allowed. */
3522
CCEND_ESC_WARN(env, (UChar* )"]");
3526
/* the two checks below apply only under the extended option */
if (IS_EXTEND(env->option)) {
3529
if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3537
case ' ': case '\t': case '\n': case '\r': case '\f':
3538
if (IS_EXTEND(env->option))
3548
#ifdef USE_VARIABLE_META_CHARS
3556
/*
 * Adds a character type, given as two code-range tables, to cc:
 * sbr describes the ranges that are set as bits in cc->bs, mbr the ranges
 * that are appended to cc->mbuf.
 *
 * Two parts are visible.  The first copies every range as is.  The second
 * walks the same tables with a running `prev` and adds the gaps between
 * the ranges instead (presumably the branch for `not` != 0 -- the test on
 * `not` is not visible in this excerpt).
 * A non-zero result of add_code_range_to_buf is returned immediately.
 */
add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc,
3557
const OnigCodePoint sbr[], const OnigCodePoint mbr[])
3562
int nsb = ONIGENC_CODE_RANGE_NUM(sbr);
3563
int nmb = ONIGENC_CODE_RANGE_NUM(mbr);
3566
/* copy the single-byte ranges bit by bit */
for (i = 0; i < nsb; i++) {
3567
for (j = ONIGENC_CODE_RANGE_FROM(sbr, i);
3568
j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
3569
BITSET_SET_BIT(cc->bs, j);
3573
/* copy the multi-byte ranges */
for (i = 0; i < nmb; i++) {
3574
r = add_code_range_to_buf(&(cc->mbuf),
3575
ONIGENC_CODE_RANGE_FROM(mbr, i),
3576
ONIGENC_CODE_RANGE_TO(mbr, i));
3577
if (r != 0) return r;
3581
OnigCodePoint prev = 0;
3583
if (ONIGENC_MBC_MINLEN(enc) == 1) {
3584
/* set the bits that lie in front of each single-byte range */
for (i = 0; i < nsb; i++) {
3586
j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) {
3587
BITSET_SET_BIT(cc->bs, j);
3589
prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1;
3592
/* NOTE(review): the bound is exclusive, so bit 0x7f itself is never
   set by this loop -- confirm that this is intended */
for (j = prev; j < 0x7f; j++) {
3593
BITSET_SET_BIT(cc->bs, j);
3600
/* add the gaps in front of each multi-byte range */
for (i = 0; i < nmb; i++) {
3601
if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3602
r = add_code_range_to_buf(&(cc->mbuf), prev,
3603
ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3604
if (r != 0) return r;
3606
prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3608
/* and the tail behind the last range, up to 0x7fffffff */
if (prev < 0x7fffffff) {
3609
r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3610
if (r != 0) return r;
3618
/*
 * Adds the characters of ctype (or, with `not`, its complement) to cc.
 *
 * The encoding is first asked for range tables through
 * ONIGENC_GET_CTYPE_CODE_RANGE; when tables are available they are handed
 * to add_ctype_to_cc_by_range.  A result other than
 * ONIG_NO_SUPPORT_CONFIG takes a separate branch whose body is not
 * visible here.  Otherwise the single-byte code points 0..SINGLE_BYTE_SIZE-1
 * are tested one by one, in three groups of ctypes:
 *   - ALPHA .. ALNUM: one loop sets the bits that fail the test and is
 *     followed by ADD_ALL_MULTI_BYTE_RANGE, the other sets the bits that
 *     pass it;
 *   - GRAPH, PRINT: here ADD_ALL_MULTI_BYTE_RANGE follows the loop that
 *     sets the bits that pass the test;
 *   - WORD: uses ONIGENC_IS_CODE_SB_WORD for one loop; the other loop
 *     sets code points that are valid in the encoding and are not word
 *     characters.
 * Any other ctype yields ONIGERR_PARSER_BUG.
 * NOTE(review): the tests on `not` that choose between the paired loops,
 * the switch header and the final return are not visible in this excerpt.
 */
add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3621
const OnigCodePoint *sbr, *mbr;
3622
OnigEncoding enc = env->enc;
3624
r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr);
3626
return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr);
3628
else if (r != ONIG_NO_SUPPORT_CONFIG) {
3634
case ONIGENC_CTYPE_ALPHA:
3635
case ONIGENC_CTYPE_BLANK:
3636
case ONIGENC_CTYPE_CNTRL:
3637
case ONIGENC_CTYPE_DIGIT:
3638
case ONIGENC_CTYPE_LOWER:
3639
case ONIGENC_CTYPE_PUNCT:
3640
case ONIGENC_CTYPE_SPACE:
3641
case ONIGENC_CTYPE_UPPER:
3642
case ONIGENC_CTYPE_XDIGIT:
3643
case ONIGENC_CTYPE_ASCII:
3644
case ONIGENC_CTYPE_ALNUM:
3646
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3647
if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3648
BITSET_SET_BIT(cc->bs, c);
3650
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3653
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3654
if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3655
BITSET_SET_BIT(cc->bs, c);
3660
case ONIGENC_CTYPE_GRAPH:
3661
case ONIGENC_CTYPE_PRINT:
3663
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3664
if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3665
BITSET_SET_BIT(cc->bs, c);
3669
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3670
if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3671
BITSET_SET_BIT(cc->bs, c);
3673
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3677
case ONIGENC_CTYPE_WORD:
3679
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3680
if (ONIGENC_IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3682
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3685
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3686
if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* 0: invalid code point */
3687
&& ! ONIGENC_IS_CODE_WORD(enc, c))
3688
BITSET_SET_BIT(cc->bs, c);
3694
return ONIGERR_PARSER_BUG;
3702
/*
 * Maps a parser-level character type (CTYPE_*) to the encoding-level
 * type (ONIGENC_CTYPE_*).  Each positive type and its CTYPE_NOT_*
 * counterpart map to the same ONIGENC_CTYPE_* value; the difference is
 * presumably reported through *not (the stores are not visible in this
 * excerpt -- TODO confirm).
 * An unknown pctype yields ONIGERR_PARSER_BUG.
 */
parse_ctype_to_enc_ctype(int pctype, int* not)
3708
ctype = ONIGENC_CTYPE_WORD;
3711
case CTYPE_NOT_WORD:
3712
ctype = ONIGENC_CTYPE_WORD;
3715
case CTYPE_WHITE_SPACE:
3716
ctype = ONIGENC_CTYPE_SPACE;
3719
case CTYPE_NOT_WHITE_SPACE:
3720
ctype = ONIGENC_CTYPE_SPACE;
3724
ctype = ONIGENC_CTYPE_DIGIT;
3727
case CTYPE_NOT_DIGIT:
3728
ctype = ONIGENC_CTYPE_DIGIT;
3732
ctype = ONIGENC_CTYPE_XDIGIT;
3735
case CTYPE_NOT_XDIGIT:
3736
ctype = ONIGENC_CTYPE_XDIGIT;
3740
return ONIGERR_PARSER_BUG;
3750
} PosixBracketEntryType;
3753
/*
 * Parses a POSIX bracket name and adds the matching character type to cc.
 *
 * Visible behaviour:
 *   - a leading '^' is checked for (its handling is not visible here);
 *   - if fewer than POSIX_BRACKET_NAME_MAX_LEN + 2 characters remain the
 *     text is treated as "not a POSIX bracket";
 *   - the name is looked up in the table PBS by an ASCII comparison; a
 *     known name must be followed by ":]", otherwise
 *     ONIGERR_INVALID_POSIX_BRACKET_TYPE is returned; on a match
 *     add_ctype_to_cc is called and a non-zero result is passed back;
 *   - for an unknown name the text is scanned up to ':' or ']' for at
 *     most POSIX_BRACKET_CHECK_LIMIT_LENGTH characters; one visible path
 *     from there yields ONIGERR_INVALID_POSIX_BRACKET_TYPE;
 *   - result 1 means: not a POSIX bracket, but no error.
 * NOTE(review): declarations of p, c, i, r, not, the update of *src and
 * the success return are on lines missing from this excerpt.
 */
parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3755
#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3756
#define POSIX_BRACKET_NAME_MAX_LEN 6
3758
/* name, character type, name length; terminated by a NULL name */
static PosixBracketEntryType PBS[] = {
3759
{ (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3760
{ (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3761
{ (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3762
{ (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3763
{ (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3764
{ (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3765
{ (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3766
{ (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3767
{ (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3768
{ (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3769
{ (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3770
{ (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3771
{ (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3772
{ (UChar* )NULL, -1, 0 }
3775
PosixBracketEntryType *pb;
3778
OnigEncoding enc = env->enc;
3782
if (PPEEK_IS('^')) {
3789
if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2)
3790
goto not_posix_bracket;
3792
for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3793
if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3794
/* step over the name; ":]" has to follow directly */
p = (UChar* )onigenc_step(enc, p, end, pb->len);
3795
if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3796
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3798
r = add_ctype_to_cc(cc, pb->ctype, not, env);
3799
if (r != 0) return r;
3810
/* unknown name: look ahead a bounded distance for ':' or ']' */
while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3812
if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3814
if (c == ':' && ! PEND) {
3819
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3823
return 1; /* 1: is not POSIX bracket, but no error. */
3827
/*
 * Looks up the property name in [p, end) in the table PBS.  An entry
 * matches when the text has exactly the entry's length and compares equal
 * under onigenc_with_ascii_strncmp.
 * NOTE(review): the return statements are not visible here;
 * fetch_char_property_to_ctype treats a negative result as "no such
 * property".
 */
property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc)
3829
/* name, character type, name length; terminated by a NULL name */
static PosixBracketEntryType PBS[] = {
3830
{ (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
3831
{ (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
3832
{ (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
3833
{ (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3834
{ (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
3835
{ (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
3836
{ (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
3837
{ (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
3838
{ (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
3839
{ (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
3840
{ (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
3841
{ (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
3842
{ (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
3843
{ (UChar* )NULL, -1, 0 }
3846
PosixBracketEntryType *pb;
3849
len = onigenc_strlen(enc, p, end);
3850
for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3851
if (len == pb->len &&
3852
onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
3860
/* Read a character property name starting at *src and convert it with
 * property_name_to_ctype().  On the visible error path the error string
 * is recorded in env and ONIGERR_INVALID_CHAR_PROPERTY_NAME is returned. */
fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3864
OnigEncoding enc = env->enc;
3865
UChar *prev, *start, *p = *src;
3868
/* 'IsXXXX' => 'XXXX' */
3870
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) {
3890
/* the candidate name is the text from start up to prev */
ctype = property_name_to_ctype(start, prev, enc);
3891
if (ctype < 0) break;
3896
/* NOTE(review): these characters appear to end the scan as an invalid
 * name; the branch body is not visible here -- confirm */
else if (c == '(' || c == ')' || c == '{' || c == '|')
3900
onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME,
3902
return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
3906
/* Build a character-class node for a char-property token.  The property
 * is resolved to a ctype, added to a new cclass node stored in *np, and
 * the class is negated when tok->u.prop.not is non-zero.  A negative
 * ctype or a non-zero add_ctype_to_cc() result is returned unchanged;
 * ONIGERR_MEMORY is returned when the node cannot be allocated. */
parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
3912
ctype = fetch_char_property_to_ctype(src, end, env);
3913
if (ctype < 0) return ctype;
3915
*np = node_new_cclass();
3916
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
3917
cc = &(NCCLASS(*np));
3918
/* add the positive class first; negation is applied to the whole node */
r = add_ctype_to_cc(cc, ctype, 0, env);
3919
if (r != 0) return r;
3920
if (tok->u.prop.not != 0) CCLASS_SET_NOT(cc);
3940
/* State step used after a class item has been added to cc.  A pending
 * range (CCS_RANGE) is rejected with
 * ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE.  A pending single value *vs
 * is flushed: into the bitset for CCV_SB, or into the multibyte range
 * buffer for CCV_CODE_POINT. */
next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
3941
enum CCSTATE* state, ScanEnv* env)
3945
if (*state == CCS_RANGE)
3946
return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
3948
/* a value is still pending from the previous token: store it now */
if (*state == CCS_VALUE && *type != CCV_CLASS) {
3949
if (*type == CCV_SB)
3950
BITSET_SET_BIT(cc->bs, (int )(*vs));
3951
else if (*type == CCV_CODE_POINT) {
3952
r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3953
if (r < 0) return r;
3963
/* State step used when a value v of kind intype arrives inside a
 * character class.  *vs / *type describe the previously seen value.
 * Visible behaviour: a pending value is stored in the bitset or in the
 * multibyte range buffer; a range *vs..v is stored the same way after
 * validity checks; on the visible paths *state becomes CCS_COMPLETE and
 * *vs_israw takes v_israw.  Negative results of add_code_range() and the
 * ONIGERR_* codes below are returned to the caller. */
next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
3964
int* vs_israw, int v_israw,
3965
enum CCVALTYPE intype, enum CCVALTYPE* type,
3966
enum CCSTATE* state, ScanEnv* env)
3972
/* flush the previously pending single value */
if (*type == CCV_SB)
3973
BITSET_SET_BIT(cc->bs, (int )(*vs));
3974
else if (*type == CCV_CODE_POINT) {
3975
r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3976
if (r < 0) return r;
3981
/* both range ends are of the same kind */
if (intype == *type) {
3982
if (intype == CCV_SB) {
3983
/* single-byte range ends must fit in one byte */
if (*vs > 0xff || v > 0xff)
3984
return ONIGERR_INVALID_WIDE_CHAR_VALUE;
3987
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3990
return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3992
bitset_set_range(cc->bs, (int )*vs, (int )v);
3995
r = add_code_range(&(cc->mbuf), env, *vs, v);
3996
if (r < 0) return r;
4001
/* range that starts single-byte and ends at a code point */
if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4004
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4007
return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4009
/* the bitset part is clamped at 0xff; the full range also goes to mbuf */
bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4010
r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4011
if (r < 0) return r;
4015
return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4019
*state = CCS_COMPLETE;
4031
*vs_israw = v_israw;
4038
/* Search the text from..end for code point c; returns 1 when it is found.
 * in_esc is set once the escape character has been seen and is consulted
 * when ignore_escaped is non-zero. */
code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4048
if (ignore_escaped && in_esc) {
4053
if (code == c) return 1;
4054
if (code == MC_ESC(enc)) in_esc = 1;
4061
/* Parse a bracket expression into a character-class node stored in *np.
 * Tokens are read with fetch_token_in_cc(); values, ranges, POSIX
 * brackets, ctype escapes, char properties, nested classes and "&&" are
 * folded into the class.  Errors inside the token loop leave through
 * "goto err", whose visible cleanup frees a detached mbuf and the node
 * in *np.  ONIGERR_EMPTY_CHAR_CLASS and ONIGERR_MEMORY are returned
 * directly before the loop starts. */
parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4064
int r, neg, len, fetched, and_start;
4065
OnigCodePoint v, vs;
4068
CClassNode *cc, *prev_cc;
4072
enum CCVALTYPE val_type, in_type;
4073
int val_israw, in_israw;
4075
prev_cc = (CClassNode* )NULL;
4077
r = fetch_token_in_cc(tok, src, end, env);
4078
/* an unescaped leading '^' is consumed before the class body */
if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4080
r = fetch_token_in_cc(tok, src, end, env);
4086
if (r < 0) return r;
4087
if (r == TK_CC_CLOSE) {
4088
/* "[]" with no later ']' in the pattern is an empty class */
if (! code_exist_check((OnigCodePoint )']',
4089
*src, env->pattern_end, 1, env->enc))
4090
return ONIGERR_EMPTY_CHAR_CLASS;
4092
CC_ESC_WARN(env, (UChar* )"]");
4093
r = tok->type = TK_CHAR; /* allow []...] */
4096
*np = node = node_new_cclass();
4097
CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY);
4098
cc = &(NCCLASS(node));
4103
while (r != TK_CC_CLOSE) {
4107
len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4109
in_type = CCV_CODE_POINT;
4115
v = (OnigCodePoint )tok->u.c;
4121
/* tok->base != 0 : octal or hexadecimal */
4122
/* raw bytes in a multibyte encoding are collected into one character */
if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4123
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4124
UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4126
int i, base = tok->base;
4129
for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4130
r = fetch_token_in_cc(tok, &p, end, env);
4131
if (r < 0) goto err;
4132
/* stop at the first token that is not a raw byte of the same base */
if (r != TK_RAW_BYTE || tok->base != base) {
4139
if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4140
r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4144
len = enc_len(env->enc, buf);
4146
r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4149
else if (i > len) { /* fetch back */
4151
for (i = 1; i < len; i++) {
4152
r = fetch_token_in_cc(tok, &p, end, env);
4158
v = (OnigCodePoint )buf[0];
4162
v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4163
in_type = CCV_CODE_POINT;
4167
v = (OnigCodePoint )tok->u.c;
4179
len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4184
/* one-byte codes go to the bitset, longer ones to the range buffer */
in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4186
r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4188
if (r != 0) goto err;
4191
case TK_POSIX_BRACKET_OPEN:
4192
r = parse_posix_bracket(cc, &p, end, env);
4193
if (r < 0) goto err;
4194
if (r == 1) { /* is not POSIX bracket */
4195
CC_ESC_WARN(env, (UChar* )"[");
4197
v = (OnigCodePoint )tok->u.c;
4207
/* `not` receives the negation flag that is passed on to add_ctype_to_cc() */
ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not);
4208
r = add_ctype_to_cc(cc, ctype, not, env);
4209
/* leave through err so the class node is released like on other failures */
if (r != 0) goto err;
4213
r = next_state_class(cc, &vs, &val_type, &state, env);
4214
if (r != 0) goto err;
4217
case TK_CHAR_PROPERTY:
4221
ctype = fetch_char_property_to_ctype(&p, end, env);
4222
if (ctype < 0) { r = ctype; goto err; }
4223
r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4224
if (r != 0) goto err;
4230
if (state == CCS_VALUE) {
4231
r = fetch_token_in_cc(tok, &p, end, env);
4232
if (r < 0) goto err;
4234
if (r == TK_CC_CLOSE) { /* allow [x-] */
4236
v = (OnigCodePoint )'-';
4240
else if (r == TK_CC_AND) {
4241
CC_ESC_WARN(env, (UChar* )"-");
4246
else if (state == CCS_START) {
4247
/* [-xa] is allowed */
4248
v = (OnigCodePoint )tok->u.c;
4251
r = fetch_token_in_cc(tok, &p, end, env);
4252
if (r < 0) goto err;
4254
/* [--x] or [a&&-x] is warned. */
4255
if (r == TK_CC_RANGE || and_start != 0)
4256
CC_ESC_WARN(env, (UChar* )"-");
4260
else if (state == CCS_RANGE) {
4261
CC_ESC_WARN(env, (UChar* )"-");
4262
goto sb_char; /* [!--x] is allowed */
4264
else { /* CCS_COMPLETE */
4265
r = fetch_token_in_cc(tok, &p, end, env);
4266
if (r < 0) goto err;
4268
if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4269
else if (r == TK_CC_AND) {
4270
CC_ESC_WARN(env, (UChar* )"-");
4274
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4275
CC_ESC_WARN(env, (UChar* )"-");
4276
goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4278
r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4283
case TK_CC_CC_OPEN: /* [ */
4288
/* nested class: parse it recursively, then OR it into this class */
r = parse_char_class(&anode, tok, &p, end, env);
4289
if (r != 0) goto cc_open_err;
4290
acc = &(NCCLASS(anode));
4291
r = or_cclass(cc, acc, env->enc);
4293
onig_node_free(anode);
4295
if (r != 0) goto err;
4299
case TK_CC_AND: /* && */
4301
/* store a pending value before the class is combined */
if (state == CCS_VALUE) {
4302
r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4303
&val_type, &state, env);
4304
if (r != 0) goto err;
4306
/* initialize local variables */
4310
if (IS_NOT_NULL(prev_cc)) {
4311
r = and_cclass(prev_cc, cc, env->enc);
4312
if (r != 0) goto err;
4313
bbuf_free(cc->mbuf);
4319
initialize_cclass(cc);
4324
r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4328
r = ONIGERR_PARSER_BUG;
4336
r = fetch_token_in_cc(tok, &p, end, env);
4337
if (r < 0) goto err;
4341
/* after the loop: store a value that is still pending */
if (state == CCS_VALUE) {
4342
r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4343
&val_type, &state, env);
4344
if (r != 0) goto err;
4347
if (IS_NOT_NULL(prev_cc)) {
4348
r = and_cclass(prev_cc, cc, env->enc);
4349
if (r != 0) goto err;
4350
bbuf_free(cc->mbuf);
4357
CCLASS_CLEAR_NOT(cc);
4358
if (IS_CCLASS_NOT(cc) &&
4359
IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4362
is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4364
BITSET_IS_EMPTY(cc->bs, is_empty);
4366
/* for a non-empty negated class, add the newline code to the class */
if (is_empty == 0) {
4367
#define NEWLINE_CODE 0x0a
4369
if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4370
if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4371
BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4373
add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4381
/* cleanup: cc may point at a class that is not the one inside *np */
if (cc != &(NCCLASS(*np)))
4382
bbuf_free(cc->mbuf);
4383
onig_node_free(*np);
4387
/* Forward declaration: parse_subexp() is called by the group and
 * expression parsers that are defined before it. */
static int parse_subexp(Node** top, OnigToken* tok, int term,
4388
UChar** src, UChar* end, ScanEnv* env);
4391
/* Parse what follows an opening parenthesis and store the resulting node
 * in *np.  Visible return values: 1 for a grouping-only "(?:...)",
 * 2 for an option-only group, negative ONIGERR_* codes (or negative
 * results of the helpers called here) on failure.  For memory
 * (capturing) groups the node is registered with
 * scan_env_set_mem_node() after the sub-expression has been parsed. */
parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4397
OnigOptionType option;
4398
OnigEncoding enc = env->enc;
4404
if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4406
option = env->option;
4407
/* "(?" introduces an extended group when the syntax enables it */
if (PPEEK_IS('?') &&
4408
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4410
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4414
case ':': /* (?:...) grouping only */
4416
r = fetch_token(tok, &p, end, env);
4417
if (r < 0) return r;
4418
r = parse_subexp(np, tok, term, &p, end, env);
4419
if (r < 0) return r;
4421
return 1; /* group */
4425
*np = onig_node_new_anchor(ANCHOR_PREC_READ);
4427
case '!': /* preceding read */
4428
*np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4430
case '>': /* (?>...) stop backtrack */
4431
*np = node_new_effect(EFFECT_STOP_BACKTRACK);
4434
case '<': /* look behind (?<=...), (?<!...) */
4437
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4439
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4440
#ifdef USE_NAMED_GROUP
4441
else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4450
r = fetch_name(&p, end, &name_end, env, 0);
4451
if (r < 0) return r;
4453
/* reserve a group number, then bind the name to it */
num = scan_env_add_mem_entry(env);
4454
if (num < 0) return num;
4455
if (list_capture != 0 && num >= BIT_STATUS_BITS_NUM)
4456
return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4458
r = name_add(env->reg, name, name_end, num, env);
4459
if (r != 0) return r;
4460
*np = node_new_effect_memory(env->option, 1);
4461
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4462
NEFFECT(*np).regnum = num;
4463
if (list_capture != 0)
4464
BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4469
return ONIGERR_UNDEFINED_GROUP_OPTION;
4473
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4474
#ifdef USE_NAMED_GROUP
4475
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4479
goto named_group; /* (?@<name>...) */
4484
*np = node_new_effect_memory(env->option, 0);
4485
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4486
num = scan_env_add_mem_entry(env);
4488
/* the node allocated above is released on the failure paths here */
onig_node_free(*np);
4491
else if (num >= BIT_STATUS_BITS_NUM) {
4492
onig_node_free(*np);
4493
return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4495
NEFFECT(*np).regnum = num;
4496
BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4499
return ONIGERR_UNDEFINED_GROUP_OPTION;
4503
#ifdef USE_POSIXLINE_OPTION
4506
case '-': case 'i': case 'm': case 's': case 'x':
4516
/* option letters toggle bits in `option`; '-' sets neg for later letters */
case '-': neg = 1; break;
4517
case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4518
case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4520
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4521
ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4524
return ONIGERR_UNDEFINED_GROUP_OPTION;
4528
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4529
/* note the inverted flag passed for SINGLELINE under Perl syntax */
ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4531
else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4532
ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4535
return ONIGERR_UNDEFINED_GROUP_OPTION;
4537
#ifdef USE_POSIXLINE_OPTION
4539
ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4543
return ONIGERR_UNDEFINED_GROUP_OPTION;
4547
*np = node_new_option(option);
4548
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4550
return 2; /* option only */
4552
else if (c == ':') {
4553
OnigOptionType prev = env->option;
4555
/* the new options are in effect while the group body is parsed */
env->option = option;
4556
r = fetch_token(tok, &p, end, env);
4557
if (r < 0) return r;
4558
r = parse_subexp(&target, tok, term, &p, end, env);
4560
if (r < 0) return r;
4561
*np = node_new_option(option);
4562
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4563
NEFFECT(*np).target = target;
4568
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4575
return ONIGERR_UNDEFINED_GROUP_OPTION;
4579
if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4582
*np = node_new_effect_memory(env->option, 0);
4583
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4584
num = scan_env_add_mem_entry(env);
4585
/* NOTE(review): *np is not released on this early return, unlike the
 * capture-history branch above -- confirm whether this leaks */
if (num < 0) return num;
4586
NEFFECT(*np).regnum = num;
4589
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4590
r = fetch_token(tok, &p, end, env);
4591
if (r < 0) return r;
4592
r = parse_subexp(&target, tok, term, &p, end, env);
4593
if (r < 0) return r;
4595
/* attach the parsed body to the anchor or effect node created above */
if (NTYPE(*np) == N_ANCHOR)
4596
NANCHOR(*np).target = target;
4598
NEFFECT(*np).target = target;
4599
if (NEFFECT(*np).type == EFFECT_MEMORY) {
4600
/* Don't move this before the parse_subexp() call */
4601
r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np);
4602
if (r != 0) return r;
4610
/* Printable forms of the common quantifiers, indexed by the value of
 * popular_quantifier_num(); used in the nested-repeat warning text. */
static const char* PopularQStr[] = {
4611
"?", "*", "+", "??", "*?", "+?"
4614
/* Replacement text for a reduced nested repeat, indexed by the
 * ReduceTypeTable entry for the (target, nested) quantifier pair. */
static const char* ReduceQStr[] = {
4615
"", "", "*", "*?", "??", "+ and ??", "+? and ?"
4619
/* Attach target to the quantifier node qnode.  Visible behaviour: a
 * string target may be split so that only its last character is
 * repeated; a quantifier applied to another quantifier may trigger a
 * warning and be reduced with onig_reduce_nested_quantifier(); finally
 * qn->target is set to target. */
set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4623
qn = &(NQUANTIFIER(qnode));
4624
if (qn->lower == 1 && qn->upper == 1) {
4628
switch (NTYPE(target)) {
4631
StrNode* sn = &(NSTRING(target));
4632
if (str_node_can_be_split(sn, env->enc)) {
4633
Node* n = str_node_split_last_char(sn, env->enc);
4634
if (IS_NOT_NULL(n)) {
4643
{ /* check redundant double repeat. */
4644
/* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4645
QuantifierNode* qnt = &(NQUANTIFIER(target));
4646
int nestq_num = popular_quantifier_num(qn);
4647
int targetq_num = popular_quantifier_num(qnt);
4649
#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4650
if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4651
IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4652
UChar buf[WARN_BUFSIZE];
4654
switch(ReduceTypeTable[targetq_num][nestq_num]) {
4659
/* skip message formatting when the verbose warning hook is the null one */
if (onig_verb_warn != onig_null_warn) {
4660
onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4661
env->pattern, env->pattern_end,
4662
(UChar* )"redundant nested repeat operator");
4663
(*onig_verb_warn)((char* )buf);
4669
if (onig_verb_warn != onig_null_warn) {
4670
onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4671
env->pattern, env->pattern_end,
4672
(UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4673
PopularQStr[targetq_num], PopularQStr[nestq_num],
4674
ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4675
(*onig_verb_warn)((char* )buf);
4684
if (targetq_num >= 0) {
4685
if (nestq_num >= 0) {
4686
onig_reduce_nested_quantifier(qnode, target);
4689
else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4690
/* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4691
if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4692
qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4703
qn->target = target;
4708
#ifdef USE_SHARED_CCLASS_TABLE
4710
#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4712
/* for ctype node hash table */
4720
/* Key comparison for the shared cclass table: returns 1 as soon as the
 * type, enc or not fields of the two keys differ. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
4722
if (x->type != y->type) return 1;
4723
if (x->enc != y->enc) return 1;
4724
if (x->not != y->not) return 1;
4728
/* Hash for a shared cclass key: folds the raw bytes of key->enc and then
 * key->type into val with a multiply-by-997 step, and returns
 * val + (val >> 5). */
static int type_cclass_hash(type_cclass_key* key)
4735
/* bytes of the enc field */
p = (unsigned char* )&(key->enc);
4736
for (i = 0; i < sizeof(key->enc); i++) {
4737
val = val * 997 + (int )*p++;
4740
/* bytes of the type field */
p = (unsigned char* )(&key->type);
4741
for (i = 0; i < sizeof(key->type); i++) {
4742
val = val * 997 + (int )*p++;
4746
return val + (val >> 5);
4749
/* hash-type descriptor handed to onig_st_init_table_with_size() */
static struct st_hash_type type_type_cclass_hash = {
4754
/* table of shared cclass nodes; created on first use while still NULL */
static st_table* OnigTypeCClassTable;
4758
/* Per-entry callback passed to onig_st_foreach(): releases the mbuf of
 * the cclass node and the key, each only when non-NULL. */
i_free_shared_class(type_cclass_key* key, Node* node, void* arg)
4760
if (IS_NOT_NULL(node)) {
4761
CClassNode* cc = &(NCCLASS(node));
4762
if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4766
if (IS_NOT_NULL(key)) xfree(key);
4771
/* Release the shared cclass table: every entry is passed to
 * i_free_shared_class(), the table itself is freed, and the global
 * pointer is reset to NULL.  Does nothing when the table is NULL. */
onig_free_shared_cclass_table(void)
4773
if (IS_NOT_NULL(OnigTypeCClassTable)) {
4774
onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4775
onig_st_free_table(OnigTypeCClassTable);
4776
/* reset so a later lookup sees an absent table */
OnigTypeCClassTable = NULL;
4782
#endif /* USE_SHARED_CCLASS_TABLE */
4786
/* Parse one expression (an atom plus any repeat operators that follow)
 * and store its node in *np.  The node is built according to tok->type;
 * afterwards the next token is fetched and, while it is a repeat
 * operator or interval, a quantifier node is wrapped around the target.
 * Negative ONIGERR_* codes and negative helper results are returned to
 * the caller. */
parse_exp(Node** np, OnigToken* tok, int term,
4787
UChar** src, UChar* end, ScanEnv* env)
4789
int r, len, group = 0;
4794
if (tok->type == term)
4797
switch (tok->type) {
4801
*np = node_new_empty();
4805
case TK_SUBEXP_OPEN:
4806
r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env);
4807
if (r < 0) return r;
4808
if (r == 1) group = 1;
4809
else if (r == 2) { /* option only */
4811
OnigOptionType prev = env->option;
4813
/* the option node's flags apply to the rest of the enclosing expression */
env->option = NEFFECT(*np).option;
4814
r = fetch_token(tok, src, end, env);
4815
if (r < 0) return r;
4816
r = parse_subexp(&target, tok, term, src, end, env);
4818
if (r < 0) return r;
4819
NEFFECT(*np).target = target;
4824
case TK_SUBEXP_CLOSE:
4825
if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
4826
return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
4828
if (tok->escaped) goto tk_raw_byte;
4835
*np = node_new_str(tok->backp, *src);
4836
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4839
/* keep appending while the following tokens are strings too */
r = fetch_token(tok, src, end, env);
4840
if (r < 0) return r;
4841
if (r != TK_STRING) break;
4843
r = onig_node_str_cat(*np, tok->backp, *src);
4844
if (r < 0) return r;
4856
*np = node_new_str_char((UChar )tok->u.c);
4857
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4860
/* raw bytes are gathered until they form a complete character */
if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
4861
if (len == enc_len(env->enc, NSTRING(*np).s)) {
4862
r = fetch_token(tok, src, end, env);
4867
r = fetch_token(tok, src, end, env);
4868
if (r < 0) return r;
4869
if (r != TK_RAW_BYTE) {
4870
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
4872
if (len < ONIGENC_MBC_MINLEN(env->enc)) {
4873
rem = ONIGENC_MBC_MINLEN(env->enc) - len;
4874
(void )node_str_head_pad(&NSTRING(*np), rem, (UChar )0);
4875
if (len + rem == enc_len(env->enc, NSTRING(*np).s)) {
4880
return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4883
r = node_str_cat_char(*np, (UChar )tok->u.c);
4884
if (r < 0) return r;
4893
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4894
/* encode the code point into bytes and build a string node from them */
int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
4895
if (num < 0) return num;
4896
#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
4897
*np = node_new_str_raw(buf, buf + num);
4899
*np = node_new_str(buf, buf + num);
4901
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4907
OnigCodePoint end_op[2];
4908
UChar *qstart, *qend, *nextp;
4910
/* the terminator searched for is the escape character followed by 'E' */
end_op[0] = (OnigCodePoint )MC_ESC(env->enc);
4911
end_op[1] = (OnigCodePoint )'E';
4913
qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
4914
if (IS_NULL(qend)) {
4917
*np = node_new_str(qstart, qend);
4918
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4925
switch (tok->u.subtype) {
4927
case CTYPE_NOT_WORD:
4928
*np = node_new_ctype(tok->u.subtype);
4929
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4932
case CTYPE_WHITE_SPACE:
4933
case CTYPE_NOT_WHITE_SPACE:
4935
case CTYPE_NOT_DIGIT:
4937
case CTYPE_NOT_XDIGIT:
4942
#ifdef USE_SHARED_CCLASS_TABLE
4943
const OnigCodePoint *sbr, *mbr;
4945
/* `not` receives the negation flag for the ctype */
ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not);
4946
r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr);
4948
ONIGENC_CODE_RANGE_NUM(mbr)
4949
>= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
4950
type_cclass_key key;
4951
type_cclass_key* new_key;
4957
THREAD_ATOMIC_START;
4959
/* create the shared table on first use */
if (IS_NULL(OnigTypeCClassTable)) {
4961
= onig_st_init_table_with_size(&type_type_cclass_hash, 10);
4962
if (IS_NULL(OnigTypeCClassTable)) {
4964
return ONIGERR_MEMORY;
4968
if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
4975
*np = node_new_cclass_by_codepoint_range(not, sbr, mbr);
4978
return ONIGERR_MEMORY;
4981
/* mark the node as shared and register it under a heap-allocated key */
CCLASS_SET_SHARE(&(NCCLASS(*np)));
4982
new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
4983
onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
4990
ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not);
4991
*np = node_new_cclass();
4992
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4993
cc = &(NCCLASS(*np));
4994
add_ctype_to_cc(cc, ctype, 0, env);
4995
if (not != 0) CCLASS_SET_NOT(cc);
4996
#ifdef USE_SHARED_CCLASS_TABLE
5003
return ONIGERR_PARSER_BUG;
5009
case TK_CHAR_PROPERTY:
5010
r = parse_char_property(np, tok, src, end, env);
5011
if (r != 0) return r;
5018
r = parse_char_class(np, tok, src, end, env);
5019
if (r != 0) return r;
5021
cc = &(NCCLASS(*np));
5023
/* under ignore-case, extend the class with case-ambiguous counterparts */
if (IS_IGNORECASE(env->option)) {
5025
const OnigPairAmbigCodes* ccs;
5026
BitSetRef bs = cc->bs;
5029
for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
5030
if ((amb & env->ambig_flag) == 0) continue;
5032
n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs);
5033
for (i = 0; i < n; i++) {
5034
in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc);
5036
if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) ||
5037
(in_cc == 0 && IS_CCLASS_NOT(cc))) {
5038
if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
5039
ccs[i].from >= SINGLE_BYTE_SIZE) {
5040
/* if (cc->not) clear_not_flag_cclass(cc, env->enc); */
5041
add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to);
5044
if (BITSET_AT(bs, ccs[i].from)) {
5045
/* /(?i:[^A-C])/.match("a") ==> fail. */
5046
BITSET_SET_BIT(bs, ccs[i].to);
5048
if (BITSET_AT(bs, ccs[i].to)) {
5049
BITSET_SET_BIT(bs, ccs[i].from);
5060
*np = node_new_anychar();
5061
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5064
case TK_ANYCHAR_ANYTIME:
5065
/* an any-char node wrapped in a {0,infinite} quantifier */
*np = node_new_anychar();
5066
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5067
qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5068
CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5069
NQUANTIFIER(qn).target = *np;
5074
len = tok->u.backref.num;
5075
/* a single reference is stored inline in ref1, several in refs */
*np = node_new_backref(len,
5076
(len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5077
tok->u.backref.by_name,
5078
#ifdef USE_BACKREF_AT_LEVEL
5079
tok->u.backref.exist_level,
5080
tok->u.backref.level,
5083
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5086
#ifdef USE_SUBEXP_CALL
5088
*np = node_new_call(tok->u.call.name, tok->u.call.name_end);
5089
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5095
*np = onig_node_new_anchor(tok->u.anchor);
5100
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5101
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5102
return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5104
*np = node_new_empty();
5112
return ONIGERR_PARSER_BUG;
5120
r = fetch_token(tok, src, end, env);
5121
if (r < 0) return r;
5124
/* a repeat operator or interval follows: wrap the target in a quantifier */
if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5125
if (is_invalid_quantifier_target(*targetp))
5126
return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5128
qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5129
(r == TK_INTERVAL ? 1 : 0));
5130
CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5131
NQUANTIFIER(qn).greedy = tok->u.repeat.greedy;
5132
r = set_quantifier(qn, *targetp, group, env);
5133
if (r < 0) return r;
5135
/* a possessive repeat is the quantifier inside a stop-backtrack effect */
if (tok->u.repeat.possessive != 0) {
5137
en = node_new_effect(EFFECT_STOP_BACKTRACK);
5138
CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
5139
NEFFECT(en).target = qn;
5146
else if (r == 2) { /* split case: /abc+/ */
5149
*targetp = node_new_list(*targetp, NULL);
5150
CHECK_NULL_RETURN_VAL(*targetp, ONIGERR_MEMORY);
5151
tmp = NCONS(*targetp).right = node_new_list(qn, NULL);
5152
CHECK_NULL_RETURN_VAL(tmp, ONIGERR_MEMORY);
5153
targetp = &(NCONS(tmp).left);
5163
/* Parse a sequence of expressions (one alternative).  Expressions are
 * read with parse_exp() until the result is TK_EOT, term or TK_ALT; when
 * more than one is present they are chained into a list whose head is
 * stored in *top.  Negative parse_exp() results are returned. */
parse_branch(Node** top, OnigToken* tok, int term,
5164
UChar** src, UChar* end, ScanEnv* env)
5167
Node *node, **headp;
5170
r = parse_exp(&node, tok, term, src, end, env);
5171
if (r < 0) return r;
5173
if (r == TK_EOT || r == term || r == TK_ALT) {
5177
*top = node_new_list(node, NULL);
5178
/* headp always points at the link where the next list cell goes */
headp = &(NCONS(*top).right);
5179
while (r != TK_EOT && r != term && r != TK_ALT) {
5180
r = parse_exp(&node, tok, term, src, end, env);
5181
if (r < 0) return r;
5183
if (NTYPE(node) == N_LIST) {
5185
/* the expression is already a list: continue after its last cell */
while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right;
5186
headp = &(NCONS(node).right);
5189
*headp = node_new_list(node, NULL);
5190
headp = &(NCONS(*headp).right);
5198
/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5200
/* Parse alternatives separated by TK_ALT.  Each branch is read with
 * parse_branch(); when an alternation is present the branches are
 * chained into an alt list whose head is stored in *top.  When the
 * terminating token is not term, an ONIGERR_* code is returned. */
parse_subexp(Node** top, OnigToken* tok, int term,
5201
UChar** src, UChar* end, ScanEnv* env)
5204
Node *node, **headp;
5207
r = parse_branch(&node, tok, term, src, end, env);
5209
onig_node_free(node);
5216
else if (r == TK_ALT) {
5217
*top = node_new_alt(node, NULL);
5218
/* headp always points at the link where the next alt cell goes */
headp = &(NCONS(*top).right);
5219
while (r == TK_ALT) {
5220
r = fetch_token(tok, src, end, env);
5221
if (r < 0) return r;
5222
r = parse_branch(&node, tok, term, src, end, env);
5223
if (r < 0) return r;
5225
*headp = node_new_alt(node, NULL);
5226
headp = &(NCONS(*headp).right);
5229
if (tok->type != term)
5234
if (term == TK_SUBEXP_CLOSE)
5235
return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5237
return ONIGERR_PARSER_BUG;
5244
/* Parse a whole pattern: fetch the first token, then parse a
 * sub-expression terminated by TK_EOT into *top.  Negative results of
 * either step are returned. */
parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5249
r = fetch_token(&tok, src, end, env);
5250
if (r < 0) return r;
5251
r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5252
if (r < 0) return r;
5257
/* Entry point of the parser: initialises the scan environment from reg
 * and the pattern bounds, parses pattern..end into *root with
 * parse_regexp(), and copies the resulting group count into
 * reg->num_mem. */
onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg,
5263
#ifdef USE_NAMED_GROUP
5267
scan_env_clear(env);
5268
/* copy the settings of the regex object into the scan environment */
env->option = reg->options;
5269
env->ambig_flag = reg->ambig_flag;
5270
env->enc = reg->enc;
5271
env->syntax = reg->syntax;
5272
env->pattern = (UChar* )pattern;
5273
env->pattern_end = (UChar* )end;
5277
p = (UChar* )pattern;
5278
r = parse_regexp(root, &p, (UChar* )end, env);
5279
reg->num_mem = env->num_mem;
5284
onig_scan_env_set_error_string(ScanEnv* env, int ecode,
5285
UChar* arg, UChar* arg_end)
5288
env->error_end = arg_end;