4861
4863
self->weakreflist = NULL;
4865
if (!_validate(self)) {
4863
4870
return (PyObject*) self;
4866
4873
/* -------------------------------------------------------------------- */
4874
/* Code validation */
4876
/* To learn more about this code, have a look at the _compile() function in
4877
Lib/sre_compile.py. The validation functions below check the code array
4878
for conformance with the code patterns generated there.
4880
The nice thing about the generated code is that it is position-independent:
4881
all jumps are relative jumps forward. Also, jumps don't cross each other:
4882
the target of a later jump is always earlier than the target of an earlier
4883
jump. IOW, this is okay:
4885
J---------J-------T--------T
4887
\______________________/
4891
J---------J-------T--------T
4895
It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
4896
bytes wide (the latter if Python is compiled for "wide" Unicode support).
4899
/* Defining this one enables tracing of the validator */
4902
/* Trace macro for the validator */
4903
#if defined(VVERBOSE)
4904
#define VTRACE(v) printf v
4909
/* Report failure */
4910
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
4912
/* Extract opcode, argument, or skip count from code array */
4915
VTRACE(("%p: ", code)); \
4916
if (code >= end) FAIL; \
4918
VTRACE(("%lu (op)\n", (unsigned long)op)); \
4922
VTRACE(("%p= ", code)); \
4923
if (code >= end) FAIL; \
4925
VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
4927
#define GET_SKIP_ADJ(adj) \
4929
VTRACE(("%p= ", code)); \
4930
if (code >= end) FAIL; \
4932
VTRACE(("%lu (skip to %p)\n", \
4933
(unsigned long)skip, code+skip)); \
4934
if (code+skip-adj < code || code+skip-adj > end)\
4938
#define GET_SKIP GET_SKIP_ADJ(0)
4941
_validate_charset(SRE_CODE *code, SRE_CODE *end)
4943
/* Some variables are manipulated by the macros above */
4949
while (code < end) {
4956
case SRE_OP_LITERAL:
4965
case SRE_OP_CHARSET:
4966
offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
4967
if (code+offset < code || code+offset > end)
4972
case SRE_OP_BIGCHARSET:
4973
GET_ARG; /* Number of blocks */
4974
offset = 256/sizeof(SRE_CODE); /* 256-byte table */
4975
if (code+offset < code || code+offset > end)
4977
/* Make sure that each byte points to a valid block */
4978
for (i = 0; i < 256; i++) {
4979
if (((unsigned char *)code)[i] >= arg)
4983
offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
4984
if (code+offset < code || code+offset > end)
4989
case SRE_OP_CATEGORY:
4992
case SRE_CATEGORY_DIGIT:
4993
case SRE_CATEGORY_NOT_DIGIT:
4994
case SRE_CATEGORY_SPACE:
4995
case SRE_CATEGORY_NOT_SPACE:
4996
case SRE_CATEGORY_WORD:
4997
case SRE_CATEGORY_NOT_WORD:
4998
case SRE_CATEGORY_LINEBREAK:
4999
case SRE_CATEGORY_NOT_LINEBREAK:
5000
case SRE_CATEGORY_LOC_WORD:
5001
case SRE_CATEGORY_LOC_NOT_WORD:
5002
case SRE_CATEGORY_UNI_DIGIT:
5003
case SRE_CATEGORY_UNI_NOT_DIGIT:
5004
case SRE_CATEGORY_UNI_SPACE:
5005
case SRE_CATEGORY_UNI_NOT_SPACE:
5006
case SRE_CATEGORY_UNI_WORD:
5007
case SRE_CATEGORY_UNI_NOT_WORD:
5008
case SRE_CATEGORY_UNI_LINEBREAK:
5009
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
5026
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
5028
/* Some variables are manipulated by the macros above */
5033
VTRACE(("code=%p, end=%p\n", code, end));
5038
while (code < end) {
5043
/* We don't check whether marks are properly nested; the
5044
sre_match() code is robust even if they aren't, and the worst
5045
you can get is nonsensical match results. */
5047
if (arg > 2*groups+1) {
5048
VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
5053
case SRE_OP_LITERAL:
5054
case SRE_OP_NOT_LITERAL:
5055
case SRE_OP_LITERAL_IGNORE:
5056
case SRE_OP_NOT_LITERAL_IGNORE:
5058
/* The arg is just a character, nothing to check */
5061
case SRE_OP_SUCCESS:
5062
case SRE_OP_FAILURE:
5063
/* Nothing to check; these normally end the matching process */
5069
case SRE_AT_BEGINNING:
5070
case SRE_AT_BEGINNING_STRING:
5071
case SRE_AT_BEGINNING_LINE:
5073
case SRE_AT_END_LINE:
5074
case SRE_AT_END_STRING:
5075
case SRE_AT_BOUNDARY:
5076
case SRE_AT_NON_BOUNDARY:
5077
case SRE_AT_LOC_BOUNDARY:
5078
case SRE_AT_LOC_NON_BOUNDARY:
5079
case SRE_AT_UNI_BOUNDARY:
5080
case SRE_AT_UNI_NON_BOUNDARY:
5088
case SRE_OP_ANY_ALL:
5089
/* These have no operands */
5093
case SRE_OP_IN_IGNORE:
5095
/* Stop 1 before the end; we check the FAILURE below */
5096
if (!_validate_charset(code, code+skip-2))
5098
if (code[skip-2] != SRE_OP_FAILURE)
5105
/* A minimal info field is
5106
<INFO> <1=skip> <2=flags> <3=min> <4=max>;
5107
If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
5109
SRE_CODE flags, min, max, i;
5112
newcode = code+skip-1;
5113
GET_ARG; flags = arg;
5116
/* Check that only valid flags are present */
5117
if ((flags & ~(SRE_INFO_PREFIX |
5119
SRE_INFO_CHARSET)) != 0)
5121
/* PREFIX and CHARSET are mutually exclusive */
5122
if ((flags & SRE_INFO_PREFIX) &&
5123
(flags & SRE_INFO_CHARSET))
5125
/* LITERAL implies PREFIX */
5126
if ((flags & SRE_INFO_LITERAL) &&
5127
!(flags & SRE_INFO_PREFIX))
5129
/* Validate the prefix */
5130
if (flags & SRE_INFO_PREFIX) {
5131
SRE_CODE prefix_len, prefix_skip;
5132
GET_ARG; prefix_len = arg;
5133
GET_ARG; prefix_skip = arg;
5134
/* Here comes the prefix string */
5135
if (code+prefix_len < code || code+prefix_len > newcode)
5138
/* And here comes the overlap table */
5139
if (code+prefix_len < code || code+prefix_len > newcode)
5141
/* Each overlap value should be < prefix_len */
5142
for (i = 0; i < prefix_len; i++) {
5143
if (code[i] >= prefix_len)
5148
/* Validate the charset */
5149
if (flags & SRE_INFO_CHARSET) {
5150
if (!_validate_charset(code, newcode-1))
5152
if (newcode[-1] != SRE_OP_FAILURE)
5156
else if (code != newcode) {
5157
VTRACE(("code=%p, newcode=%p\n", code, newcode));
5165
SRE_CODE *target = NULL;
5170
/* Stop 2 before the end; we check the JUMP below */
5171
if (!_validate_inner(code, code+skip-3, groups))
5174
/* Check that it ends with a JUMP, and that each JUMP
5175
has the same target */
5177
if (op != SRE_OP_JUMP)
5181
target = code+skip-1;
5182
else if (code+skip-1 != target)
5188
case SRE_OP_REPEAT_ONE:
5189
case SRE_OP_MIN_REPEAT_ONE:
5197
#ifdef Py_UNICODE_WIDE
5201
if (!_validate_inner(code, code+skip-4, groups))
5205
if (op != SRE_OP_SUCCESS)
5218
#ifdef Py_UNICODE_WIDE
5222
if (!_validate_inner(code, code+skip-3, groups))
5226
if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
5231
case SRE_OP_GROUPREF:
5232
case SRE_OP_GROUPREF_IGNORE:
5238
case SRE_OP_GROUPREF_EXISTS:
5239
/* The regex syntax for this is: '(?(group)then|else)', where
5240
'group' is either an integer group number or a group name,
5241
'then' and 'else' are sub-regexes, and 'else' is optional. */
5246
code--; /* The skip is relative to the first arg! */
5247
/* There are two possibilities here: if there is both a 'then'
5248
part and an 'else' part, the generated code looks like:
5256
(<skipyes> jumps here)
5258
(<skipno> jumps here)
5260
If there is only a 'then' part, it looks like:
5268
There is no direct way to decide which it is, and we don't want
5269
to allow arbitrary jumps anywhere in the code; so we just look
5270
for a JUMP opcode preceding our skip target.
5272
if (skip >= 3 && code+skip-3 >= code &&
5273
code[skip-3] == SRE_OP_JUMP)
5275
VTRACE(("both then and else parts present\n"));
5276
if (!_validate_inner(code+1, code+skip-3, groups))
5278
code += skip-2; /* Position after JUMP, at <skipno> */
5280
if (!_validate_inner(code, code+skip-1, groups))
5285
VTRACE(("only a then part present\n"));
5286
if (!_validate_inner(code+1, code+skip-1, groups))
5293
case SRE_OP_ASSERT_NOT:
5295
GET_ARG; /* 0 for lookahead, width for lookbehind */
5296
code--; /* Back up over arg to simplify math below */
5297
if (arg & 0x80000000)
5298
FAIL; /* Width too large */
5299
/* Stop 1 before the end; we check the SUCCESS below */
5300
if (!_validate_inner(code+1, code+skip-2, groups))
5304
if (op != SRE_OP_SUCCESS)
5319
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
5321
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
5323
if (groups == 0) /* fix for simplejson */
5324
groups = 100; /* 100 groups should always be safe */
5325
return _validate_inner(code, end-1, groups);
5329
_validate(PatternObject *self)
5331
if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
5333
PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
5337
VTRACE(("Success!\n"));
5341
/* -------------------------------------------------------------------- */
4867
5342
/* match methods */