/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * 1999-10-24 fl  created (based on existing template matcher code)
 * 2000-03-06 fl  first alpha, sort of
 * 2000-08-01 fl  fixes for 1.6b1
 * 2000-08-07 fl  use PyOS_CheckStack() if available
 * 2000-09-20 fl  added expand method
 * 2001-03-20 fl  lots of fixes for 2.1b2
 * 2001-04-15 fl  export copyright as Python attribute, not global
 * 2001-04-28 fl  added __copy__ methods (work in progress)
 * 2001-05-14 fl  fixes for 1.5.2 compatibility
 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl  added sub/subn primitive
 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl  fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn  implemented non recursive scheme
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
39
/* Version and copyright banner string for this engine. */
static char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42
#define PY_SSIZE_T_CLEAN
45
#include "structmember.h" /* offsetof */
51
/* name of this module, minus the leading underscore */
52
#if !defined(SRE_MODULE)
53
#define SRE_MODULE "sre"
56
#define SRE_PY_MODULE "re"
58
/* defining this one enables tracing */
61
#if PY_VERSION_HEX >= 0x01060000
62
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63
/* defining this enables unicode support (default under 1.6a1 and later) */
68
/* -------------------------------------------------------------------- */
69
/* optional features */
71
/* enables fast searching */
72
#define USE_FAST_SEARCH
74
/* enables aggressive inlining (always on for Visual C) */
77
/* enables copy/deepcopy handling (work in progress) */
78
#undef USE_BUILTIN_COPY
80
#if PY_VERSION_HEX < 0x01060000
81
#define PyObject_DEL(op) PyMem_DEL((op))
84
/* -------------------------------------------------------------------- */
87
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89
/* fastest possible local call under MSVC */
90
#define LOCAL(type) static __inline type __fastcall
91
#elif defined(USE_INLINE)
92
#define LOCAL(type) static inline type
94
#define LOCAL(type) static type
98
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99
#define SRE_ERROR_STATE -2 /* illegal state */
100
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101
#define SRE_ERROR_MEMORY -9 /* out of memory */
102
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
105
#define TRACE(v) printf v
110
/* -------------------------------------------------------------------- */
111
/* search engine state */
113
/* default character predicates (run sre_chars.py to regenerate tables) */
115
/* Bit flags stored per character in sre_char_info[]. */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Character class flags for codes 0..127, sixteen entries per row.
   Each entry is an OR of the SRE_*_MASK bits above:
   2 = space, 6 = space|linebreak, 16 = word only,
   24 = alnum|word, 25 = digit|alnum|word. */
static char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* Lower-case mapping for codes 0..127: codes 65..90 map to 97..122,
   every other code maps to itself. */
static char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* Table-driven predicates.  Each yields the (nonzero) mask bit when the
   character has the property and 0 otherwise; codes >= 128 never match.
   Note that `ch` is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
152
static unsigned int sre_lower(unsigned int ch)
154
return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
157
/* locale-specific character predicates */
158
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */

/* Predicates built on the <ctype.h> classifiers.  Codes above 255 never
   match (except that the linebreak test is a plain comparison with
   '\n').  Note that `ch` is evaluated more than once. */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
166
/* Lower-case a character code with tolower().  Codes >= 256 are
   returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}
171
/* unicode-specific character predicates */
173
#if defined(HAVE_UNICODE)

/* Predicates that delegate to the Py_UNICODE_* classifiers; the
   argument is cast to Py_UNICODE first. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case a character code with Py_UNICODE_TOLOWER(). */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif
189
sre_category(SRE_CODE category, unsigned int ch)
193
case SRE_CATEGORY_DIGIT:
194
return SRE_IS_DIGIT(ch);
195
case SRE_CATEGORY_NOT_DIGIT:
196
return !SRE_IS_DIGIT(ch);
197
case SRE_CATEGORY_SPACE:
198
return SRE_IS_SPACE(ch);
199
case SRE_CATEGORY_NOT_SPACE:
200
return !SRE_IS_SPACE(ch);
201
case SRE_CATEGORY_WORD:
202
return SRE_IS_WORD(ch);
203
case SRE_CATEGORY_NOT_WORD:
204
return !SRE_IS_WORD(ch);
205
case SRE_CATEGORY_LINEBREAK:
206
return SRE_IS_LINEBREAK(ch);
207
case SRE_CATEGORY_NOT_LINEBREAK:
208
return !SRE_IS_LINEBREAK(ch);
210
case SRE_CATEGORY_LOC_WORD:
211
return SRE_LOC_IS_WORD(ch);
212
case SRE_CATEGORY_LOC_NOT_WORD:
213
return !SRE_LOC_IS_WORD(ch);
215
#if defined(HAVE_UNICODE)
216
case SRE_CATEGORY_UNI_DIGIT:
217
return SRE_UNI_IS_DIGIT(ch);
218
case SRE_CATEGORY_UNI_NOT_DIGIT:
219
return !SRE_UNI_IS_DIGIT(ch);
220
case SRE_CATEGORY_UNI_SPACE:
221
return SRE_UNI_IS_SPACE(ch);
222
case SRE_CATEGORY_UNI_NOT_SPACE:
223
return !SRE_UNI_IS_SPACE(ch);
224
case SRE_CATEGORY_UNI_WORD:
225
return SRE_UNI_IS_WORD(ch);
226
case SRE_CATEGORY_UNI_NOT_WORD:
227
return !SRE_UNI_IS_WORD(ch);
228
case SRE_CATEGORY_UNI_LINEBREAK:
229
return SRE_UNI_IS_LINEBREAK(ch);
230
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231
return !SRE_UNI_IS_LINEBREAK(ch);
233
case SRE_CATEGORY_UNI_DIGIT:
234
return SRE_IS_DIGIT(ch);
235
case SRE_CATEGORY_UNI_NOT_DIGIT:
236
return !SRE_IS_DIGIT(ch);
237
case SRE_CATEGORY_UNI_SPACE:
238
return SRE_IS_SPACE(ch);
239
case SRE_CATEGORY_UNI_NOT_SPACE:
240
return !SRE_IS_SPACE(ch);
241
case SRE_CATEGORY_UNI_WORD:
242
return SRE_LOC_IS_WORD(ch);
243
case SRE_CATEGORY_UNI_NOT_WORD:
244
return !SRE_LOC_IS_WORD(ch);
245
case SRE_CATEGORY_UNI_LINEBREAK:
246
return SRE_IS_LINEBREAK(ch);
247
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248
return !SRE_IS_LINEBREAK(ch);
257
/* Release the state's data stack, if one is allocated, and reset its
   size and base offset to 0. */
static void
data_stack_dealloc(SRE_STATE* state)
{
    if (state->data_stack) {
        PyMem_FREE(state->data_stack);
        state->data_stack = NULL;
    }
    state->data_stack_size = state->data_stack_base = 0;
}
267
/* Make sure the data stack can hold `size` more bytes above the
   current base offset.

   If the stack is too small it is reallocated to the required size
   plus 25% plus 1024 bytes.  The buffer may move, so pointers into the
   old stack must be recomputed by the caller.

   Returns 0 on success.  On allocation failure the whole stack is
   released and SRE_ERROR_MEMORY is returned. */
static int
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
{
    Py_ssize_t minsize, cursize;
    minsize = state->data_stack_base+size;
    cursize = state->data_stack_size;
    if (cursize < minsize) {
        void* stack;
        cursize = minsize+minsize/4+1024;
        /* cursize is a Py_ssize_t, so plain %d would be a format
           mismatch where Py_ssize_t is wider than int */
        TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
        /* keep the result in a temporary so the old buffer is still
           reachable (and gets freed) if the reallocation fails */
        stack = PyMem_REALLOC(state->data_stack, cursize);
        if (!stack) {
            data_stack_dealloc(state);
            return SRE_ERROR_MEMORY;
        }
        state->data_stack = (char *)stack;
        state->data_stack_size = cursize;
    }
    return 0;
}
287
/* generate 8-bit version */
289
/* Bind the generic SRE_* names to the sre_* identifiers used for the
   `unsigned char` variant of the engine. */
#define SRE_CHAR unsigned char
290
#define SRE_AT sre_at
291
#define SRE_COUNT sre_count
292
#define SRE_CHARSET sre_charset
293
#define SRE_INFO sre_info
294
#define SRE_MATCH sre_match
295
#define SRE_MATCH_CONTEXT sre_match_context
296
#define SRE_SEARCH sre_search
297
#define SRE_LITERAL_TEMPLATE sre_literal_template
299
#if defined(HAVE_UNICODE)
301
/* NOTE(review): presumably SRE_RECURSIVE guards a re-inclusion of the
   engine section for the first set of bindings; the directive that
   performs it is not visible in this copy -- confirm. */
#define SRE_RECURSIVE
305
/* NOTE(review): only two of the nine names bound above are #undef'd
   here before being redefined below; the remaining #undef lines appear
   to be missing from this copy -- confirm against the original. */
#undef SRE_LITERAL_TEMPLATE
308
#undef SRE_MATCH_CONTEXT
315
/* generate 16-bit unicode version */
317
/* Rebind the same names to the sre_u* identifiers used for the
   Py_UNICODE variant. */
#define SRE_CHAR Py_UNICODE
318
#define SRE_AT sre_uat
319
#define SRE_COUNT sre_ucount
320
#define SRE_CHARSET sre_ucharset
321
#define SRE_INFO sre_uinfo
322
#define SRE_MATCH sre_umatch
323
#define SRE_MATCH_CONTEXT sre_umatch_context
324
#define SRE_SEARCH sre_usearch
325
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
328
#endif /* SRE_RECURSIVE */
330
/* -------------------------------------------------------------------- */
331
/* String matching engine */
333
/* the following section is compiled twice, with different character
337
SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
339
/* check if pointer is at given position */
341
Py_ssize_t thisp, thatp;
345
case SRE_AT_BEGINNING:
346
case SRE_AT_BEGINNING_STRING:
347
return ((void*) ptr == state->beginning);
349
case SRE_AT_BEGINNING_LINE:
350
return ((void*) ptr == state->beginning ||
351
SRE_IS_LINEBREAK((int) ptr[-1]));
354
return (((void*) (ptr+1) == state->end &&
355
SRE_IS_LINEBREAK((int) ptr[0])) ||
356
((void*) ptr == state->end));
358
case SRE_AT_END_LINE:
359
return ((void*) ptr == state->end ||
360
SRE_IS_LINEBREAK((int) ptr[0]));
362
case SRE_AT_END_STRING:
363
return ((void*) ptr == state->end);
365
case SRE_AT_BOUNDARY:
366
if (state->beginning == state->end)
368
thatp = ((void*) ptr > state->beginning) ?
369
SRE_IS_WORD((int) ptr[-1]) : 0;
370
thisp = ((void*) ptr < state->end) ?
371
SRE_IS_WORD((int) ptr[0]) : 0;
372
return thisp != thatp;
374
case SRE_AT_NON_BOUNDARY:
375
if (state->beginning == state->end)
377
thatp = ((void*) ptr > state->beginning) ?
378
SRE_IS_WORD((int) ptr[-1]) : 0;
379
thisp = ((void*) ptr < state->end) ?
380
SRE_IS_WORD((int) ptr[0]) : 0;
381
return thisp == thatp;
383
case SRE_AT_LOC_BOUNDARY:
384
if (state->beginning == state->end)
386
thatp = ((void*) ptr > state->beginning) ?
387
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388
thisp = ((void*) ptr < state->end) ?
389
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390
return thisp != thatp;
392
case SRE_AT_LOC_NON_BOUNDARY:
393
if (state->beginning == state->end)
395
thatp = ((void*) ptr > state->beginning) ?
396
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397
thisp = ((void*) ptr < state->end) ?
398
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399
return thisp == thatp;
401
#if defined(HAVE_UNICODE)
402
case SRE_AT_UNI_BOUNDARY:
403
if (state->beginning == state->end)
405
thatp = ((void*) ptr > state->beginning) ?
406
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407
thisp = ((void*) ptr < state->end) ?
408
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409
return thisp != thatp;
411
case SRE_AT_UNI_NON_BOUNDARY:
412
if (state->beginning == state->end)
414
thatp = ((void*) ptr > state->beginning) ?
415
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416
thisp = ((void*) ptr < state->end) ?
417
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418
return thisp == thatp;
427
SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
429
/* check if character is a member of the given set */
440
/* <LITERAL> <code> */
446
case SRE_OP_CATEGORY:
447
/* <CATEGORY> <code> */
448
if (sre_category(set[0], (int) ch))
454
if (sizeof(SRE_CODE) == 2) {
455
/* <CHARSET> <bitmap> (16 bits per code word) */
456
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
461
/* <CHARSET> <bitmap> (32 bits per code word) */
462
if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
469
/* <RANGE> <lower> <upper> */
470
if (set[0] <= ch && ch <= set[1])
479
case SRE_OP_BIGCHARSET:
480
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
482
Py_ssize_t count, block;
485
if (sizeof(SRE_CODE) == 2) {
486
block = ((unsigned char*)set)[ch >> 8];
488
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
493
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494
* warnings when c's type supports only numbers < N+1 */
496
block = ((unsigned char*)set)[ch >> 8];
501
(set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
509
/* internal error -- there's not much we can do about it
510
here, so let's just pretend it didn't match... */
516
LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
519
SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
522
SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523
SRE_CHAR* end = (SRE_CHAR *)state->end;
527
if (maxcount < end - ptr && maxcount != 65535)
528
end = ptr + maxcount;
530
switch (pattern[0]) {
534
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535
while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
540
/* repeated dot wildcard. */
541
TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542
while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
547
/* repeated dot wildcard. skip to the end of the target
548
string, and backtrack from there */
549
TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
554
/* repeated literal */
556
TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557
while (ptr < end && (SRE_CODE) *ptr == chr)
561
case SRE_OP_LITERAL_IGNORE:
562
/* repeated literal */
564
TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565
while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
569
case SRE_OP_NOT_LITERAL:
570
/* repeated non-literal */
572
TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573
while (ptr < end && (SRE_CODE) *ptr != chr)
577
case SRE_OP_NOT_LITERAL_IGNORE:
578
/* repeated non-literal */
580
TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581
while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
586
/* repeated single character pattern */
587
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588
while ((SRE_CHAR*) state->ptr < end) {
589
i = SRE_MATCH(state, pattern);
595
TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
596
(SRE_CHAR*) state->ptr - ptr));
597
return (SRE_CHAR*) state->ptr - ptr;
600
TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
601
return ptr - (SRE_CHAR*) state->ptr;
604
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    Py_ssize_t i;

    /* check minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
632
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). Explaining:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: If the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
660
/* Copy state->lastmark and state->lastindex into the current match
   context (`ctx`) so they can be put back after a failed sub-match. */
#define LASTMARK_SAVE()     \
    do { \
        ctx->lastmark = state->lastmark; \
        ctx->lastindex = state->lastindex; \
    } while (0)

/* Put back the values stored by LASTMARK_SAVE(). */
#define LASTMARK_RESTORE()  \
    do { \
        state->lastmark = ctx->lastmark; \
        state->lastindex = ctx->lastindex; \
    } while (0)
671
/* Exit helpers for the matcher body.  RETURN_ERROR returns directly
   from the enclosing function; RETURN_FAILURE and RETURN_SUCCESS set
   the local `ret` to 0 or 1 and jump to the local `exit` label. */
#define RETURN_ERROR(i) do { return i; } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* Conditional forms, keyed on a sub-match result `i`: negative is an
   error, 0 a failure, positive a success.  `i` is evaluated more than
   once. */
#define RETURN_ON_ERROR(i) \
    do { if (i < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
684
/* Stringify a type name for trace output. */
#ifndef SFY
#define SFY(x) #x
#endif

/* Reserve sizeof(type) bytes on top of the data stack and point `ptr`
   at them; the offset of the new item is left in the local
   `alloc_pos`.  If the stack has to grow it may move, so the local
   `ctx` pointer is recomputed from `ctx_pos` (skipped while `ctx_pos`
   is still -1, i.e. before any context exists).  Returns from the
   enclosing function with the error code if growing fails. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = state->data_stack_base; \
    TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           SFY(type), alloc_pos, (Py_ssize_t) sizeof(type))); \
    if (state->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow(state, sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)(state->data_stack+alloc_pos); \
    state->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the item of the given type stored at byte offset
   `pos` in the data stack. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", \
           SFY(type), (Py_ssize_t) (pos))); \
    ptr = (type*)(state->data_stack+pos); \
} while (0)

/* Copy `size` bytes from `data` onto the top of the data stack,
   growing it first if needed (same `ctx` fix-up and error return as
   DATA_STACK_ALLOC). */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           data, state->data_stack_base, (Py_ssize_t) (size))); \
    if (state->data_stack_size < state->data_stack_base+size) { \
        int j = data_stack_grow(state, size); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy(state->data_stack+state->data_stack_base, data, size); \
    state->data_stack_base += size; \
} while (0)

/* Copy the top `size` bytes of the data stack into `data`.  The bytes
   are removed from the stack only when `discard` is nonzero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           data, (Py_ssize_t) (state->data_stack_base-size), \
           (Py_ssize_t) (size))); \
    memcpy(data, state->data_stack+state->data_stack_base-size, size); \
    if (discard) \
        state->data_stack_base -= size; \
} while (0)

/* Drop the top `size` bytes of the data stack without copying them. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (Py_ssize_t) (state->data_stack_base-size), \
           (Py_ssize_t) (size))); \
    state->data_stack_base -= size; \
} while(0)

/* Shorthands that operate on the local `state` and size the transfer
   from the pointed-to object. */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
746
/* Save and restore state->mark[0..lastmark] on the data stack.  All
   four macros do nothing unless `lastmark` is greater than 0. */

/* Push the marks.  Uses the local `i` as scratch. */
#define MARK_PUSH(lastmark) \
    do if (lastmark > 0) { \
        i = lastmark; /* ctx->lastmark may change if reallocated */ \
        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
    } while (0)

/* Restore the marks and remove them from the stack. */
#define MARK_POP(lastmark) \
    do if (lastmark > 0) { \
        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
    } while (0)

/* Restore the marks but leave the saved copy on the stack. */
#define MARK_POP_KEEP(lastmark) \
    do if (lastmark > 0) { \
        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
    } while (0)

/* Remove the saved copy from the stack without restoring it. */
#define MARK_POP_DISCARD(lastmark) \
    do if (lastmark > 0) { \
        DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
    } while (0)
765
/* Values stored in a match context's `jump` field.  Each one names a
   DO_JUMP call site; JUMP_NONE marks the outermost context, which has
   no call site to go back to. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13

/* Start matching `nextpattern` in a new context without recursing:
   allocate the context on the data stack, record the current context's
   offset and `jumpvalue` in it, make it current and jump to the local
   `entrance` label.  `jumplabel` is defined here as the point where
   execution continues afterwards.  Relies on the locals `ctx`,
   `nextctx`, `ctx_pos` and `alloc_pos`. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = jumpvalue; \
    nextctx->pattern = nextpattern; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
791
Py_ssize_t last_ctx_pos;
797
Py_ssize_t lastindex;
804
/* check if string matches the given pattern. returns <0 for
805
error, 0 for failure, and 1 for success */
807
SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
809
SRE_CHAR* end = (SRE_CHAR *)state->end;
810
Py_ssize_t alloc_pos, ctx_pos = -1;
811
Py_ssize_t i, ret = 0;
813
unsigned int sigcount=0;
815
SRE_MATCH_CONTEXT* ctx;
816
SRE_MATCH_CONTEXT* nextctx;
818
TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
820
DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
821
ctx->last_ctx_pos = -1;
822
ctx->jump = JUMP_NONE;
823
ctx->pattern = pattern;
828
ctx->ptr = (SRE_CHAR *)state->ptr;
830
if (ctx->pattern[0] == SRE_OP_INFO) {
831
/* optimization info block */
832
/* <INFO> <1=skip> <2=flags> <3=min> ... */
833
if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
834
TRACE(("reject (got %d chars, need %d)\n",
835
(end - ctx->ptr), ctx->pattern[3]));
838
ctx->pattern += ctx->pattern[1] + 1;
843
if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
844
RETURN_ERROR(SRE_ERROR_INTERRUPTED);
846
switch (*ctx->pattern++) {
851
TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
852
ctx->ptr, ctx->pattern[0]));
855
state->lastindex = i/2 + 1;
856
if (i > state->lastmark) {
857
/* state->lastmark is the highest valid index in the
858
state->mark array. If it is increased by more than 1,
859
the intervening marks must be set to NULL to signal
860
that these marks have not been encountered. */
861
Py_ssize_t j = state->lastmark + 1;
863
state->mark[j++] = NULL;
866
state->mark[i] = ctx->ptr;
871
/* match literal string */
872
/* <LITERAL> <code> */
873
TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
874
ctx->ptr, *ctx->pattern));
875
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
881
case SRE_OP_NOT_LITERAL:
882
/* match anything that is not literal character */
883
/* <NOT_LITERAL> <code> */
884
TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
885
ctx->ptr, *ctx->pattern));
886
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
894
TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
895
state->ptr = ctx->ptr;
899
/* match at given position */
901
TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
902
if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
907
case SRE_OP_CATEGORY:
908
/* match at given category */
909
/* <CATEGORY> <code> */
910
TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
911
ctx->ptr, *ctx->pattern));
912
if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919
/* match anything (except a newline) */
921
TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
922
if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
930
TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937
/* match set member (or non_member) */
938
/* <IN> <skip> <set> */
939
TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
940
if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
942
ctx->pattern += ctx->pattern[0];
946
case SRE_OP_LITERAL_IGNORE:
947
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
948
ctx->pattern, ctx->ptr, ctx->pattern[0]));
949
if (ctx->ptr >= end ||
950
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
956
case SRE_OP_NOT_LITERAL_IGNORE:
957
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
958
ctx->pattern, ctx->ptr, *ctx->pattern));
959
if (ctx->ptr >= end ||
960
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
966
case SRE_OP_IN_IGNORE:
967
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
969
|| !SRE_CHARSET(ctx->pattern+1,
970
(SRE_CODE)state->lower(*ctx->ptr)))
972
ctx->pattern += ctx->pattern[0];
979
/* <JUMP> <offset> */
980
TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
981
ctx->ptr, ctx->pattern[0]));
982
ctx->pattern += ctx->pattern[0];
987
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
988
TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
990
ctx->u.rep = state->repeat;
992
MARK_PUSH(ctx->lastmark);
993
for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
994
if (ctx->pattern[1] == SRE_OP_LITERAL &&
996
(SRE_CODE) *ctx->ptr != ctx->pattern[2]))
998
if (ctx->pattern[1] == SRE_OP_IN &&
1000
!SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1002
state->ptr = ctx->ptr;
1003
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1006
MARK_POP_DISCARD(ctx->lastmark);
1007
RETURN_ON_ERROR(ret);
1011
MARK_POP_KEEP(ctx->lastmark);
1015
MARK_POP_DISCARD(ctx->lastmark);
1018
case SRE_OP_REPEAT_ONE:
1019
/* match repeated sequence (maximizing regexp) */
1021
/* this operator only works if the repeated item is
1022
exactly one character wide, and we're not already
1023
collecting backtracking points. for other cases,
1024
use the MAX_REPEAT operator */
1026
/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1028
TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1029
ctx->pattern[1], ctx->pattern[2]));
1031
if (ctx->ptr + ctx->pattern[1] > end)
1032
RETURN_FAILURE; /* cannot match */
1034
state->ptr = ctx->ptr;
1036
ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1037
RETURN_ON_ERROR(ret);
1038
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1040
ctx->ptr += ctx->count;
1042
/* when we arrive here, count contains the number of
1043
matches, and ctx->ptr points to the tail of the target
1044
string. check if the rest of the pattern matches,
1045
and backtrack if not. */
1047
if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1050
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1051
/* tail is empty. we're finished */
1052
state->ptr = ctx->ptr;
1058
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1059
/* tail starts with a literal. skip positions where
1060
the rest of the pattern cannot possibly match */
1061
ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1063
while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1064
(ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1068
if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1070
state->ptr = ctx->ptr;
1071
DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1072
ctx->pattern+ctx->pattern[0]);
1074
RETURN_ON_ERROR(ret);
1086
while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1087
state->ptr = ctx->ptr;
1088
DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1089
ctx->pattern+ctx->pattern[0]);
1091
RETURN_ON_ERROR(ret);
1101
case SRE_OP_MIN_REPEAT_ONE:
1102
/* match repeated sequence (minimizing regexp) */
1104
/* this operator only works if the repeated item is
1105
exactly one character wide, and we're not already
1106
collecting backtracking points. for other cases,
1107
use the MIN_REPEAT operator */
1109
/* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1111
TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1112
ctx->pattern[1], ctx->pattern[2]));
1114
if (ctx->ptr + ctx->pattern[1] > end)
1115
RETURN_FAILURE; /* cannot match */
1117
state->ptr = ctx->ptr;
1119
if (ctx->pattern[1] == 0)
1122
/* count using pattern min as the maximum */
1123
ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1124
RETURN_ON_ERROR(ret);
1125
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1126
if (ret < (Py_ssize_t) ctx->pattern[1])
1127
/* didn't match minimum number of times */
1129
/* advance past minimum matches of repeat */
1131
ctx->ptr += ctx->count;
1134
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1135
/* tail is empty. we're finished */
1136
state->ptr = ctx->ptr;
1142
while ((Py_ssize_t)ctx->pattern[2] == 65535
1143
|| ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1144
state->ptr = ctx->ptr;
1145
DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1146
ctx->pattern+ctx->pattern[0]);
1148
RETURN_ON_ERROR(ret);
1151
state->ptr = ctx->ptr;
1152
ret = SRE_COUNT(state, ctx->pattern+3, 1);
1153
RETURN_ON_ERROR(ret);
1154
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1166
/* create repeat context. all the hard work is done
1167
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1168
/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1169
TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1170
ctx->pattern[1], ctx->pattern[2]));
1172
/* install new repeat context */
1173
ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1178
ctx->u.rep->count = -1;
1179
ctx->u.rep->pattern = ctx->pattern;
1180
ctx->u.rep->prev = state->repeat;
1181
ctx->u.rep->last_ptr = NULL;
1182
state->repeat = ctx->u.rep;
1184
state->ptr = ctx->ptr;
1185
DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1186
state->repeat = ctx->u.rep->prev;
1187
PyObject_FREE(ctx->u.rep);
1190
RETURN_ON_ERROR(ret);
1195
case SRE_OP_MAX_UNTIL:
1196
/* maximizing repeat */
1197
/* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1199
/* FIXME: we probably need to deal with zero-width
1200
matches in here... */
1202
ctx->u.rep = state->repeat;
1204
RETURN_ERROR(SRE_ERROR_STATE);
1206
state->ptr = ctx->ptr;
1208
ctx->count = ctx->u.rep->count+1;
1210
TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1211
ctx->ptr, ctx->count));
1213
if (ctx->count < ctx->u.rep->pattern[1]) {
1214
/* not enough matches */
1215
ctx->u.rep->count = ctx->count;
1216
DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1217
ctx->u.rep->pattern+3);
1219
RETURN_ON_ERROR(ret);
1222
ctx->u.rep->count = ctx->count-1;
1223
state->ptr = ctx->ptr;
1227
if ((ctx->count < ctx->u.rep->pattern[2] ||
1228
ctx->u.rep->pattern[2] == 65535) &&
1229
state->ptr != ctx->u.rep->last_ptr) {
1230
/* we may have enough matches, but if we can
1231
match another item, do so */
1232
ctx->u.rep->count = ctx->count;
1234
MARK_PUSH(ctx->lastmark);
1235
/* zero-width match protection */
1236
DATA_PUSH(&ctx->u.rep->last_ptr);
1237
ctx->u.rep->last_ptr = state->ptr;
1238
DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1239
ctx->u.rep->pattern+3);
1240
DATA_POP(&ctx->u.rep->last_ptr);
1242
MARK_POP_DISCARD(ctx->lastmark);
1243
RETURN_ON_ERROR(ret);
1246
MARK_POP(ctx->lastmark);
1248
ctx->u.rep->count = ctx->count-1;
1249
state->ptr = ctx->ptr;
1252
/* cannot match more repeated items here. make sure the tail matches */
1254
state->repeat = ctx->u.rep->prev;
1255
DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1256
RETURN_ON_SUCCESS(ret);
1257
state->repeat = ctx->u.rep;
1258
state->ptr = ctx->ptr;
1261
case SRE_OP_MIN_UNTIL:
1262
/* minimizing repeat */
1263
/* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1265
ctx->u.rep = state->repeat;
1267
RETURN_ERROR(SRE_ERROR_STATE);
1269
state->ptr = ctx->ptr;
1271
ctx->count = ctx->u.rep->count+1;
1273
TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1274
ctx->ptr, ctx->count, ctx->u.rep->pattern));
1276
if (ctx->count < ctx->u.rep->pattern[1]) {
1277
/* not enough matches */
1278
ctx->u.rep->count = ctx->count;
1279
DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1280
ctx->u.rep->pattern+3);
1282
RETURN_ON_ERROR(ret);
1285
ctx->u.rep->count = ctx->count-1;
1286
state->ptr = ctx->ptr;
1292
/* see if the tail matches */
1293
state->repeat = ctx->u.rep->prev;
1294
DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1296
RETURN_ON_ERROR(ret);
1300
state->repeat = ctx->u.rep;
1301
state->ptr = ctx->ptr;
1305
if (ctx->count >= ctx->u.rep->pattern[2]
1306
&& ctx->u.rep->pattern[2] != 65535)
1309
ctx->u.rep->count = ctx->count;
1310
DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1311
ctx->u.rep->pattern+3);
1313
RETURN_ON_ERROR(ret);
1316
ctx->u.rep->count = ctx->count-1;
1317
state->ptr = ctx->ptr;
1320
case SRE_OP_GROUPREF:
1321
/* match backreference */
1322
TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1323
ctx->ptr, ctx->pattern[0]));
1324
i = ctx->pattern[0];
1326
Py_ssize_t groupref = i+i;
1327
if (groupref >= state->lastmark) {
1330
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1331
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1332
if (!p || !e || e < p)
1335
if (ctx->ptr >= end || *ctx->ptr != *p)
1344
case SRE_OP_GROUPREF_IGNORE:
1345
/* match backreference */
1346
TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1347
ctx->ptr, ctx->pattern[0]));
1348
i = ctx->pattern[0];
1350
Py_ssize_t groupref = i+i;
1351
if (groupref >= state->lastmark) {
1354
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1355
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1356
if (!p || !e || e < p)
1359
if (ctx->ptr >= end ||
1360
state->lower(*ctx->ptr) != state->lower(*p))
1369
case SRE_OP_GROUPREF_EXISTS:
1370
TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1371
ctx->ptr, ctx->pattern[0]));
1372
/* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1373
i = ctx->pattern[0];
1375
Py_ssize_t groupref = i+i;
1376
if (groupref >= state->lastmark) {
1377
ctx->pattern += ctx->pattern[1];
1380
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1381
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1382
if (!p || !e || e < p) {
1383
ctx->pattern += ctx->pattern[1];
1392
/* assert subpattern */
1393
/* <ASSERT> <skip> <back> <pattern> */
1394
TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1395
ctx->ptr, ctx->pattern[1]));
1396
state->ptr = ctx->ptr - ctx->pattern[1];
1397
if (state->ptr < state->beginning)
1399
DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1400
RETURN_ON_FAILURE(ret);
1401
ctx->pattern += ctx->pattern[0];
1404
case SRE_OP_ASSERT_NOT:
1405
/* assert not subpattern */
1406
/* <ASSERT_NOT> <skip> <back> <pattern> */
1407
TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1408
ctx->ptr, ctx->pattern[1]));
1409
state->ptr = ctx->ptr - ctx->pattern[1];
1410
if (state->ptr >= state->beginning) {
1411
DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1413
RETURN_ON_ERROR(ret);
1417
ctx->pattern += ctx->pattern[0];
1420
case SRE_OP_FAILURE:
1421
/* immediate failure */
1422
TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1426
TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1428
RETURN_ERROR(SRE_ERROR_ILLEGAL);
1433
ctx_pos = ctx->last_ctx_pos;
1435
DATA_POP_DISCARD(ctx);
1438
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1441
case JUMP_MAX_UNTIL_2:
1442
TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1443
goto jump_max_until_2;
1444
case JUMP_MAX_UNTIL_3:
1445
TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1446
goto jump_max_until_3;
1447
case JUMP_MIN_UNTIL_2:
1448
TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1449
goto jump_min_until_2;
1450
case JUMP_MIN_UNTIL_3:
1451
TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1452
goto jump_min_until_3;
1454
TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1456
case JUMP_MAX_UNTIL_1:
1457
TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1458
goto jump_max_until_1;
1459
case JUMP_MIN_UNTIL_1:
1460
TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1461
goto jump_min_until_1;
1463
TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1465
case JUMP_REPEAT_ONE_1:
1466
TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1467
goto jump_repeat_one_1;
1468
case JUMP_REPEAT_ONE_2:
1469
TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1470
goto jump_repeat_one_2;
1471
case JUMP_MIN_REPEAT_ONE:
1472
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1473
goto jump_min_repeat_one;
1475
TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1477
case JUMP_ASSERT_NOT:
1478
TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1479
goto jump_assert_not;
1481
TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1485
return ret; /* should never get here */
1489
SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1491
SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1492
SRE_CHAR* end = (SRE_CHAR *)state->end;
1493
Py_ssize_t status = 0;
1494
Py_ssize_t prefix_len = 0;
1495
Py_ssize_t prefix_skip = 0;
1496
SRE_CODE* prefix = NULL;
1497
SRE_CODE* charset = NULL;
1498
SRE_CODE* overlap = NULL;
1501
if (pattern[0] == SRE_OP_INFO) {
1502
/* optimization info block */
1503
/* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1507
if (pattern[3] > 1) {
1508
/* adjust end point (but make sure we leave at least one
1509
character in there, so literal search will work) */
1510
end -= pattern[3]-1;
1515
if (flags & SRE_INFO_PREFIX) {
1516
/* pattern starts with a known prefix */
1517
/* <length> <skip> <prefix data> <overlap data> */
1518
prefix_len = pattern[5];
1519
prefix_skip = pattern[6];
1520
prefix = pattern + 7;
1521
overlap = prefix + prefix_len - 1;
1522
} else if (flags & SRE_INFO_CHARSET)
1523
/* pattern starts with a character from a known set */
1525
charset = pattern + 5;
1527
pattern += 1 + pattern[1];
1530
TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1531
TRACE(("charset = %p\n", charset));
1533
#if defined(USE_FAST_SEARCH)
1534
if (prefix_len > 1) {
1535
/* pattern starts with a known prefix. use the overlap
1536
table to skip forward as fast as we possibly can */
1538
end = (SRE_CHAR *)state->end;
1541
if ((SRE_CODE) ptr[0] != prefix[i]) {
1547
if (++i == prefix_len) {
1548
/* found a potential match */
1549
TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1550
state->start = ptr + 1 - prefix_len;
1551
state->ptr = ptr + 1 - prefix_len + prefix_skip;
1552
if (flags & SRE_INFO_LITERAL)
1553
return 1; /* we got all of it */
1554
status = SRE_MATCH(state, pattern + 2*prefix_skip);
1557
/* close but no cigar -- try again */
1569
if (pattern[0] == SRE_OP_LITERAL) {
1570
/* pattern starts with a literal character. this is used
1571
for short prefixes, and if fast search is disabled */
1572
SRE_CODE chr = pattern[1];
1573
end = (SRE_CHAR *)state->end;
1575
while (ptr < end && (SRE_CODE) ptr[0] != chr)
1579
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1582
if (flags & SRE_INFO_LITERAL)
1583
return 1; /* we got all of it */
1584
status = SRE_MATCH(state, pattern + 2);
1588
} else if (charset) {
1589
/* pattern starts with a character from a known set */
1590
end = (SRE_CHAR *)state->end;
1592
while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1596
TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1599
status = SRE_MATCH(state, pattern);
1606
while (ptr <= end) {
1607
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1608
state->start = state->ptr = ptr++;
1609
status = SRE_MATCH(state, pattern);
1618
SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1620
/* check if given string is a literal template (i.e. no escapes) */
1627
#if !defined(SRE_RECURSIVE)
1629
/* -------------------------------------------------------------------- */
1630
/* factories and destructors */
1632
/* see sre.h for object declarations */
1633
static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1634
static PyObject*pattern_scanner(PatternObject*, PyObject*);
1637
/* Return the size in bytes of one SRE_CODE unit as a Python integer.
   Neither argument is used. */
sre_codesize(PyObject* self, PyObject *unused)
1639
/* sizeof yields a size_t, but the "l" format makes Py_BuildValue read a
   C long from the variable argument list; cast so that the argument
   type matches the format unit */
return Py_BuildValue("l", (long) sizeof(SRE_CODE));
1643
/* Lower-case a single character code according to the regex flags.
   args is parsed as two C ints, (character, flags); the result is
   built with the "i" format. */
sre_getlower(PyObject* self, PyObject* args)
1645
int character, flags;
1646
if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1648
/* SRE_FLAG_LOCALE is tested first, so it wins when both flags are set */
if (flags & SRE_FLAG_LOCALE)
1649
return Py_BuildValue("i", sre_lower_locale(character));
1650
if (flags & SRE_FLAG_UNICODE)
1651
#if defined(HAVE_UNICODE)
1652
return Py_BuildValue("i", sre_lower_unicode(character));
1654
/* presumably the branch used when HAVE_UNICODE is not defined (the
   preprocessor directive is not visible here): fall back to the
   locale-based lowering */
return Py_BuildValue("i", sre_lower_locale(character));
1656
/* neither flag set: plain sre_lower() */
return Py_BuildValue("i", sre_lower(character));
1660
/* Reset the per-match bookkeeping of a state object: lastmark and
   lastindex go back to -1, the repeat chain pointer is cleared, and the
   state is handed to data_stack_dealloc(). The mark array itself is not
   cleared (the memset below is commented out). */
state_reset(SRE_STATE* state)
1662
/* FIXME: dynamic! */
1663
/*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1665
/* -1 is the same initial value that state_init() assigns */
state->lastmark = -1;
1666
state->lastindex = -1;
1668
state->repeat = NULL;
1670
data_stack_dealloc(state);
1674
getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1676
/* given a python object, return a data pointer, a length (in
1677
characters), and a character size. return NULL if the object
1678
is not a string (or not compatible) */
1680
PyBufferProcs *buffer;
1681
Py_ssize_t size, bytes;
1685
#if defined(HAVE_UNICODE)
1686
if (PyUnicode_Check(string)) {
1687
/* unicode strings don't always support the buffer interface */
1688
ptr = (void*) PyUnicode_AS_DATA(string);
1689
bytes = PyUnicode_GET_DATA_SIZE(string);
1690
size = PyUnicode_GET_SIZE(string);
1691
charsize = sizeof(Py_UNICODE);
1696
/* get pointer to string buffer */
1697
buffer = Py_TYPE(string)->tp_as_buffer;
1698
if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1699
buffer->bf_getsegcount(string, NULL) != 1) {
1700
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1704
/* determine buffer size */
1705
bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1707
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711
/* determine character size */
1712
#if PY_VERSION_HEX >= 0x01060000
1713
size = PyObject_Size(string);
1715
size = PyObject_Length(string);
1718
if (PyString_Check(string) || bytes == size)
1720
#if defined(HAVE_UNICODE)
1721
else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1722
charsize = sizeof(Py_UNICODE);
1725
PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1729
#if defined(HAVE_UNICODE)
1734
*p_charsize = charsize;
1740
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1741
Py_ssize_t start, Py_ssize_t end)
1743
/* prepare state object */
1749
memset(state, 0, sizeof(SRE_STATE));
1751
state->lastmark = -1;
1752
state->lastindex = -1;
1754
ptr = getstring(string, &length, &charsize);
1758
/* adjust boundaries */
1761
else if (start > length)
1766
else if (end > length)
1769
state->charsize = charsize;
1771
state->beginning = ptr;
1773
state->start = (void*) ((char*) ptr + start * state->charsize);
1774
state->end = (void*) ((char*) ptr + end * state->charsize);
1777
state->string = string;
1779
state->endpos = end;
1781
if (pattern->flags & SRE_FLAG_LOCALE)
1782
state->lower = sre_lower_locale;
1783
else if (pattern->flags & SRE_FLAG_UNICODE)
1784
#if defined(HAVE_UNICODE)
1785
state->lower = sre_lower_unicode;
1787
state->lower = sre_lower_locale;
1790
state->lower = sre_lower;
1796
/* Tear down a state object: drop the reference held in state->string
   (Py_XDECREF tolerates NULL) and hand the state to
   data_stack_dealloc(). */
state_fini(SRE_STATE* state)
1798
Py_XDECREF(state->string);
1799
data_stack_dealloc(state);
1802
/* calculate offset from start of string: the byte distance between
   member and state->beginning, divided by state->charsize, i.e. an
   offset counted in characters rather than bytes */
1803
#define STATE_OFFSET(state, member)\
1804
(((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1807
/* Return the slice of string covered by one group of the current match.
   index is the group number as passed by the callers in this file
   (1 for the first group); the slice bounds come from the pair of marks
   stored for that group. */
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1811
/* convert the group number into the position of the group's first mark;
   the second mark of the pair is at index+1 */
index = (index - 1) * 2;
1813
/* no usable span: no string, the group lies at or beyond lastmark, or
   one of its two marks is unset */
if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1815
/* want empty string */
1822
/* turn the two mark pointers into character offsets */
i = STATE_OFFSET(state, state->mark[index]);
1823
j = STATE_OFFSET(state, state->mark[index+1]);
1826
return PySequence_GetSlice(string, i, j);
1830
pattern_error(int status)
1833
case SRE_ERROR_RECURSION_LIMIT:
1836
"maximum recursion limit exceeded"
1839
case SRE_ERROR_MEMORY:
1842
case SRE_ERROR_INTERRUPTED:
1843
/* An exception has already been raised, so let it fly */
1846
/* other error codes indicate compiler/engine bugs */
1849
"internal error in regular expression engine"
1855
/* Destructor for pattern objects: clear any weak references to self,
   then drop the references held in the pattern, groupindex and
   indexgroup fields (each may be NULL, hence Py_XDECREF). */
pattern_dealloc(PatternObject* self)
1857
/* weak references are cleared before the owned objects are released */
if (self->weakreflist != NULL)
1858
PyObject_ClearWeakRefs((PyObject *) self);
1859
Py_XDECREF(self->pattern);
1860
Py_XDECREF(self->groupindex);
1861
Py_XDECREF(self->indexgroup);
1866
pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1872
Py_ssize_t start = 0;
1873
Py_ssize_t end = PY_SSIZE_T_MAX;
1874
static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1875
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1876
&string, &start, &end))
1879
string = state_init(&state, self, string, start, end);
1883
state.ptr = state.start;
1885
TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1887
if (state.charsize == 1) {
1888
status = sre_match(&state, PatternObject_GetCode(self));
1890
#if defined(HAVE_UNICODE)
1891
status = sre_umatch(&state, PatternObject_GetCode(self));
1895
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1896
if (PyErr_Occurred())
1901
return pattern_new_match(self, &state, status);
1905
pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1911
Py_ssize_t start = 0;
1912
Py_ssize_t end = PY_SSIZE_T_MAX;
1913
static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1914
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1915
&string, &start, &end))
1918
string = state_init(&state, self, string, start, end);
1922
TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1924
if (state.charsize == 1) {
1925
status = sre_search(&state, PatternObject_GetCode(self));
1927
#if defined(HAVE_UNICODE)
1928
status = sre_usearch(&state, PatternObject_GetCode(self));
1932
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1936
if (PyErr_Occurred())
1939
return pattern_new_match(self, &state, status);
1943
call(char* module, char* function, PyObject* args)
1952
name = PyString_FromString(module);
1955
mod = PyImport_Import(name);
1959
func = PyObject_GetAttrString(mod, function);
1963
result = PyObject_CallObject(func, args);
1969
#ifdef USE_BUILTIN_COPY
1971
deepcopy(PyObject** object, PyObject* memo)
1977
PyTuple_Pack(2, *object, memo)
1985
return 1; /* success */
1990
join_list(PyObject* list, PyObject* string)
1992
/* join list elements */
1995
#if PY_VERSION_HEX >= 0x01060000
2001
joiner = PySequence_GetSlice(string, 0, 0);
2005
if (PyList_GET_SIZE(list) == 0) {
2010
#if PY_VERSION_HEX >= 0x01060000
2011
function = PyObject_GetAttrString(joiner, "join");
2016
args = PyTuple_New(1);
2018
Py_DECREF(function);
2022
PyTuple_SET_ITEM(args, 0, list);
2023
result = PyObject_CallObject(function, args);
2024
Py_DECREF(args); /* also removes list */
2025
Py_DECREF(function);
2029
PyTuple_Pack(2, list, joiner)
2038
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2046
Py_ssize_t start = 0;
2047
Py_ssize_t end = PY_SSIZE_T_MAX;
2048
static char* kwlist[] = { "source", "pos", "endpos", NULL };
2049
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2050
&string, &start, &end))
2053
string = state_init(&state, self, string, start, end);
2057
list = PyList_New(0);
2063
while (state.start <= state.end) {
2067
state_reset(&state);
2069
state.ptr = state.start;
2071
if (state.charsize == 1) {
2072
status = sre_search(&state, PatternObject_GetCode(self));
2074
#if defined(HAVE_UNICODE)
2075
status = sre_usearch(&state, PatternObject_GetCode(self));
2079
if (PyErr_Occurred())
2085
pattern_error(status);
2089
/* don't bother to build a match object */
2090
switch (self->groups) {
2092
b = STATE_OFFSET(&state, state.start);
2093
e = STATE_OFFSET(&state, state.ptr);
2094
item = PySequence_GetSlice(string, b, e);
2099
item = state_getslice(&state, 1, string, 1);
2104
item = PyTuple_New(self->groups);
2107
for (i = 0; i < self->groups; i++) {
2108
PyObject* o = state_getslice(&state, i+1, string, 1);
2113
PyTuple_SET_ITEM(item, i, o);
2118
status = PyList_Append(list, item);
2123
if (state.ptr == state.start)
2124
state.start = (void*) ((char*) state.ptr + state.charsize);
2126
state.start = state.ptr;
2140
#if PY_VERSION_HEX >= 0x02020000
2142
pattern_finditer(PatternObject* pattern, PyObject* args)
2148
scanner = pattern_scanner(pattern, args);
2152
search = PyObject_GetAttrString(scanner, "search");
2157
iterator = PyCallIter_New(search, Py_None);
2165
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2176
Py_ssize_t maxsplit = 0;
2177
static char* kwlist[] = { "source", "maxsplit", NULL };
2178
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2179
&string, &maxsplit))
2182
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2186
list = PyList_New(0);
2195
while (!maxsplit || n < maxsplit) {
2197
state_reset(&state);
2199
state.ptr = state.start;
2201
if (state.charsize == 1) {
2202
status = sre_search(&state, PatternObject_GetCode(self));
2204
#if defined(HAVE_UNICODE)
2205
status = sre_usearch(&state, PatternObject_GetCode(self));
2209
if (PyErr_Occurred())
2215
pattern_error(status);
2219
if (state.start == state.ptr) {
2220
if (last == state.end)
2222
/* skip one character */
2223
state.start = (void*) ((char*) state.ptr + state.charsize);
2227
/* get segment before this match */
2228
item = PySequence_GetSlice(
2229
string, STATE_OFFSET(&state, last),
2230
STATE_OFFSET(&state, state.start)
2234
status = PyList_Append(list, item);
2239
/* add groups (if any) */
2240
for (i = 0; i < self->groups; i++) {
2241
item = state_getslice(&state, i+1, string, 0);
2244
status = PyList_Append(list, item);
2252
last = state.start = state.ptr;
2256
/* get segment following last match (even if empty) */
2257
item = PySequence_GetSlice(
2258
string, STATE_OFFSET(&state, last), state.endpos
2262
status = PyList_Append(list, item);
2278
pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2279
Py_ssize_t count, Py_ssize_t subn)
2292
int filter_is_callable;
2294
if (PyCallable_Check(ptemplate)) {
2295
/* sub/subn takes either a function or a template */
2298
filter_is_callable = 1;
2300
/* if not callable, check if it's a literal string */
2302
ptr = getstring(ptemplate, &n, &bint);
2306
literal = sre_literal_template((unsigned char *)ptr, n);
2308
#if defined(HAVE_UNICODE)
2309
literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2319
filter_is_callable = 0;
2321
/* not a literal; hand it over to the template compiler */
2323
SRE_PY_MODULE, "_subx",
2324
PyTuple_Pack(2, self, ptemplate)
2328
filter_is_callable = PyCallable_Check(filter);
2332
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2338
list = PyList_New(0);
2347
while (!count || n < count) {
2349
state_reset(&state);
2351
state.ptr = state.start;
2353
if (state.charsize == 1) {
2354
status = sre_search(&state, PatternObject_GetCode(self));
2356
#if defined(HAVE_UNICODE)
2357
status = sre_usearch(&state, PatternObject_GetCode(self));
2361
if (PyErr_Occurred())
2367
pattern_error(status);
2371
b = STATE_OFFSET(&state, state.start);
2372
e = STATE_OFFSET(&state, state.ptr);
2375
/* get segment before this match */
2376
item = PySequence_GetSlice(string, i, b);
2379
status = PyList_Append(list, item);
2384
} else if (i == b && i == e && n > 0)
2385
/* ignore empty match on latest position */
2388
if (filter_is_callable) {
2389
/* pass match object through filter */
2390
match = pattern_new_match(self, &state, 1);
2393
args = PyTuple_Pack(1, match);
2398
item = PyObject_CallObject(filter, args);
2404
/* filter is literal string */
2410
if (item != Py_None) {
2411
status = PyList_Append(list, item);
2422
if (state.ptr == state.start)
2423
state.start = (void*) ((char*) state.ptr + state.charsize);
2425
state.start = state.ptr;
2429
/* get segment following last match */
2430
if (i < state.endpos) {
2431
item = PySequence_GetSlice(string, i, state.endpos);
2434
status = PyList_Append(list, item);
2444
/* convert list to single string (also removes list) */
2445
item = join_list(list, string);
2451
return Py_BuildValue("Ni", item, n);
2464
/* sub(repl, string[, count]): parse the arguments and delegate to
   pattern_subx() with its last argument (subn) set to 0. */
pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2466
PyObject* ptemplate;
2468
/* count is optional ("|n") and stays 0 when the caller omits it */
Py_ssize_t count = 0;
2469
static char* kwlist[] = { "repl", "string", "count", NULL };
2470
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2471
&ptemplate, &string, &count))
2474
return pattern_subx(self, ptemplate, string, count, 0);
2478
/* subn(repl, string[, count]): same argument handling as sub(), but
   delegates to pattern_subx() with its last argument (subn) set to 1. */
pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2480
PyObject* ptemplate;
2482
/* count is optional ("|n") and stays 0 when the caller omits it */
Py_ssize_t count = 0;
2483
static char* kwlist[] = { "repl", "string", "count", NULL };
2484
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2485
&ptemplate, &string, &count))
2488
return pattern_subx(self, ptemplate, string, count, 1);
2492
pattern_copy(PatternObject* self, PyObject *unused)
2494
#ifdef USE_BUILTIN_COPY
2495
PatternObject* copy;
2498
copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2502
offset = offsetof(PatternObject, groups);
2504
Py_XINCREF(self->groupindex);
2505
Py_XINCREF(self->indexgroup);
2506
Py_XINCREF(self->pattern);
2508
memcpy((char*) copy + offset, (char*) self + offset,
2509
sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2510
copy->weakreflist = NULL;
2512
return (PyObject*) copy;
2514
PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2520
/* __deepcopy__ implementation. With USE_BUILTIN_COPY defined, copy self
   with pattern_copy() and then run deepcopy() on the three object
   references the copy holds. The TypeError at the end is presumably the
   branch used when USE_BUILTIN_COPY is not defined (the preprocessor
   directive is not visible here). */
pattern_deepcopy(PatternObject* self, PyObject* memo)
2522
#ifdef USE_BUILTIN_COPY
2523
PatternObject* copy;
2525
/* pattern_copy() is declared with two parameters (self, unused); pass
   NULL for the one named unused so the call matches the declaration */
copy = (PatternObject*) pattern_copy(self, NULL);
2529
/* deepcopy() takes a PyObject**, so pass the address of each field */
if (!deepcopy(&copy->groupindex, memo) ||
2530
!deepcopy(&copy->indexgroup, memo) ||
2531
!deepcopy(&copy->pattern, memo)) {
2537
PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2542
PyDoc_STRVAR(pattern_match_doc,
2543
"match(string[, pos[, endpos]]) --> match object or None.\n\
2544
Matches zero or more characters at the beginning of the string");
2546
PyDoc_STRVAR(pattern_search_doc,
2547
"search(string[, pos[, endpos]]) --> match object or None.\n\
2548
Scan through string looking for a match, and return a corresponding\n\
2549
MatchObject instance. Return None if no position in the string matches.");
2551
PyDoc_STRVAR(pattern_split_doc,
2552
"split(string[, maxsplit = 0]) --> list.\n\
2553
Split string by the occurrences of pattern.");
2555
PyDoc_STRVAR(pattern_findall_doc,
2556
"findall(string[, pos[, endpos]]) --> list.\n\
2557
Return a list of all non-overlapping matches of pattern in string.");
2559
PyDoc_STRVAR(pattern_finditer_doc,
2560
"finditer(string[, pos[, endpos]]) --> iterator.\n\
2561
Return an iterator over all non-overlapping matches for the \n\
2562
RE pattern in string. For each match, the iterator returns a\n\
2565
PyDoc_STRVAR(pattern_sub_doc,
2566
"sub(repl, string[, count = 0]) --> newstring\n\
2567
Return the string obtained by replacing the leftmost non-overlapping\n\
2568
occurrences of pattern in string by the replacement repl.");
2570
PyDoc_STRVAR(pattern_subn_doc,
2571
"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2572
Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2573
the leftmost non-overlapping occurrences of pattern with the\n\
2574
replacement repl.");
2576
PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2578
static PyMethodDef pattern_methods[] = {
2579
{"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2581
{"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2582
pattern_search_doc},
2583
{"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2585
{"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2587
{"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2589
{"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2590
pattern_findall_doc},
2591
#if PY_VERSION_HEX >= 0x02020000
2592
{"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2593
pattern_finditer_doc},
2595
{"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2596
{"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2597
{"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2602
/* Attribute lookup for pattern objects. The method table is consulted
   first through Py_FindMethod(); the names "pattern", "flags", "groups"
   and "groupindex" are then compared explicitly. The last visible
   statement sets AttributeError with the requested name as its message. */
pattern_getattr(PatternObject* self, char* name)
2606
res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2614
if (!strcmp(name, "pattern")) {
2615
/* the caller receives its own reference */
Py_INCREF(self->pattern);
2616
return self->pattern;
2619
if (!strcmp(name, "flags"))
2620
return Py_BuildValue("i", self->flags);
2622
/* NOTE(review): _compile() parses groups with the "n" format into a
   Py_ssize_t; if the struct field is also Py_ssize_t, the "i" format
   below does not match its type -- confirm against the struct
   declaration */
if (!strcmp(name, "groups"))
2623
return Py_BuildValue("i", self->groups);
2625
/* groupindex is only returned when it is set (non-NULL) */
if (!strcmp(name, "groupindex") && self->groupindex) {
2626
Py_INCREF(self->groupindex);
2627
return self->groupindex;
2630
PyErr_SetString(PyExc_AttributeError, name);
2634
statichere PyTypeObject Pattern_Type = {
2635
PyObject_HEAD_INIT(NULL)
2636
0, "_" SRE_MODULE ".SRE_Pattern",
2637
sizeof(PatternObject), sizeof(SRE_CODE),
2638
(destructor)pattern_dealloc, /*tp_dealloc*/
2640
(getattrfunc)pattern_getattr, /*tp_getattr*/
2644
0, /* tp_as_number */
2645
0, /* tp_as_sequence */
2646
0, /* tp_as_mapping */
2650
0, /* tp_getattro */
2651
0, /* tp_setattro */
2652
0, /* tp_as_buffer */
2653
Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2654
pattern_doc, /* tp_doc */
2655
0, /* tp_traverse */
2657
0, /* tp_richcompare */
2658
offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2661
static int _validate(PatternObject *self); /* Forward */
2664
_compile(PyObject* self_, PyObject* args)
2666
/* "compile" pattern descriptor to pattern object */
2668
PatternObject* self;
2674
Py_ssize_t groups = 0;
2675
PyObject* groupindex = NULL;
2676
PyObject* indexgroup = NULL;
2677
if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2678
&PyList_Type, &code, &groups,
2679
&groupindex, &indexgroup))
2682
n = PyList_GET_SIZE(code);
2683
/* coverity[ampersand_in_size] */
2684
self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2690
for (i = 0; i < n; i++) {
2691
PyObject *o = PyList_GET_ITEM(code, i);
2692
unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2693
: PyLong_AsUnsignedLong(o);
2694
self->code[i] = (SRE_CODE) value;
2695
if ((unsigned long) self->code[i] != value) {
2696
PyErr_SetString(PyExc_OverflowError,
2697
"regular expression code size limit exceeded");
2702
if (PyErr_Occurred()) {
2708
self->pattern = pattern;
2710
self->flags = flags;
2712
self->groups = groups;
2714
Py_XINCREF(groupindex);
2715
self->groupindex = groupindex;
2717
Py_XINCREF(indexgroup);
2718
self->indexgroup = indexgroup;
2720
self->weakreflist = NULL;
2722
if (!_validate(self)) {
2727
return (PyObject*) self;
2730
/* -------------------------------------------------------------------- */
2731
/* Code validation */
2733
/* To learn more about this code, have a look at the _compile() function in
2734
Lib/sre_compile.py. The validation functions below check the code array
2735
for conformance with the code patterns generated there.
2737
The nice thing about the generated code is that it is position-independent:
2738
all jumps are relative jumps forward. Also, jumps don't cross each other:
2739
the target of a later jump is always earlier than the target of an earlier
2740
jump. IOW, this is okay:
2742
J---------J-------T--------T
2744
\______________________/
2748
J---------J-------T--------T
2752
It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2753
bytes wide (the latter if Python is compiled for "wide" unicode support).
2756
/* Defining this one enables tracing of the validator */
2759
/* Trace macro for the validator */
2760
#if defined(VVERBOSE)
2761
#define VTRACE(v) printf v
2766
/* Report failure: pass the source line number of the failing check to
   VTRACE, then return 0 from the enclosing function. Wrapped in
   do { ... } while (0) so it can be used as a single statement. */
2767
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2769
/* Extract opcode, argument, or skip count from code array */
2772
VTRACE(("%p: ", code)); \
2773
if (code >= end) FAIL; \
2775
VTRACE(("%lu (op)\n", (unsigned long)op)); \
2779
VTRACE(("%p= ", code)); \
2780
if (code >= end) FAIL; \
2782
VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2784
#define GET_SKIP_ADJ(adj) \
2786
VTRACE(("%p= ", code)); \
2787
if (code >= end) FAIL; \
2789
VTRACE(("%lu (skip to %p)\n", \
2790
(unsigned long)skip, code+skip)); \
2791
if (code+skip-adj < code || code+skip-adj > end)\
2795
#define GET_SKIP GET_SKIP_ADJ(0)
2798
/* Validate a character-set program occupying [code, end).

   The loop walks the set one opcode at a time.  For opcodes that carry an
   inline payload (a 32-byte bitmap for CHARSET; a 256-byte block table
   followed by `arg` 32-byte bitmaps for BIGCHARSET) the payload size is
   converted to a count of SRE_CODE units and checked against `end`.  The
   `code+offset < code` half of each test catches an offset so large that the
   pointer arithmetic wraps.  A failed check leaves through FAIL, which
   returns 0. */
_validate_charset(SRE_CODE *code, SRE_CODE *end)
2800
/* Some variables are manipulated by the macros above */
2806
while (code < end) {
2813
case SRE_OP_LITERAL:
2822
case SRE_OP_CHARSET:
2823
offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2824
if (code+offset < code || code+offset > end)
2829
case SRE_OP_BIGCHARSET:
2830
GET_ARG; /* Number of blocks */
2831
offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2832
if (code+offset < code || code+offset > end)
2834
/* Make sure that each byte points to a valid block */
2835
for (i = 0; i < 256; i++) {
2836
/* arg is the block count fetched by GET_ARG above, so a table byte
   that is >= arg names a block that does not exist */
if (((unsigned char *)code)[i] >= arg)
2840
offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2841
if (code+offset < code || code+offset > end)
2846
case SRE_OP_CATEGORY:
2849
/* The category codes listed below are the accepted ones.
   NOTE(review): the handling of any other value is not visible in this
   excerpt -- presumably it fails; confirm. */
case SRE_CATEGORY_DIGIT:
2850
case SRE_CATEGORY_NOT_DIGIT:
2851
case SRE_CATEGORY_SPACE:
2852
case SRE_CATEGORY_NOT_SPACE:
2853
case SRE_CATEGORY_WORD:
2854
case SRE_CATEGORY_NOT_WORD:
2855
case SRE_CATEGORY_LINEBREAK:
2856
case SRE_CATEGORY_NOT_LINEBREAK:
2857
case SRE_CATEGORY_LOC_WORD:
2858
case SRE_CATEGORY_LOC_NOT_WORD:
2859
case SRE_CATEGORY_UNI_DIGIT:
2860
case SRE_CATEGORY_UNI_NOT_DIGIT:
2861
case SRE_CATEGORY_UNI_SPACE:
2862
case SRE_CATEGORY_UNI_NOT_SPACE:
2863
case SRE_CATEGORY_UNI_WORD:
2864
case SRE_CATEGORY_UNI_NOT_WORD:
2865
case SRE_CATEGORY_UNI_LINEBREAK:
2866
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2883
/* Validate the opcode sequence in [code, end) for a pattern that declares
   `groups` capture groups.

   The loop consumes one opcode at a time.  Opcodes whose operand is a skip
   count delimit a nested sub-sequence; those sub-sequences are validated by
   recursive calls with a narrowed end pointer (code+skip minus a small
   constant), after which the opcode that must terminate the sub-sequence
   (FAILURE, JUMP, SUCCESS, MAX_UNTIL/MIN_UNTIL) is checked explicitly.
   Character sets are delegated to _validate_charset().  A failed check
   leaves through FAIL, which returns 0. */
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2885
/* Some variables are manipulated by the macros above */
2890
VTRACE(("code=%p, end=%p\n", code, end));
2895
while (code < end) {
2900
/* We don't check whether marks are properly nested; the
2901
sre_match() code is robust even if they don't, and the worst
2902
you can get is nonsensical match results. */
2904
/* NOTE(review): the bound presumably reflects two mark slots (start and
   end) per group -- confirm against the MARK handling in sre_match(). */
if (arg > 2*groups+1) {
2905
VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2910
case SRE_OP_LITERAL:
2911
case SRE_OP_NOT_LITERAL:
2912
case SRE_OP_LITERAL_IGNORE:
2913
case SRE_OP_NOT_LITERAL_IGNORE:
2915
/* The arg is just a character, nothing to check */
2918
case SRE_OP_SUCCESS:
2919
case SRE_OP_FAILURE:
2920
/* Nothing to check; these normally end the matching process */
2926
case SRE_AT_BEGINNING:
2927
case SRE_AT_BEGINNING_STRING:
2928
case SRE_AT_BEGINNING_LINE:
2930
case SRE_AT_END_LINE:
2931
case SRE_AT_END_STRING:
2932
case SRE_AT_BOUNDARY:
2933
case SRE_AT_NON_BOUNDARY:
2934
case SRE_AT_LOC_BOUNDARY:
2935
case SRE_AT_LOC_NON_BOUNDARY:
2936
case SRE_AT_UNI_BOUNDARY:
2937
case SRE_AT_UNI_NON_BOUNDARY:
2945
case SRE_OP_ANY_ALL:
2946
/* These have no operands */
2950
case SRE_OP_IN_IGNORE:
2952
/* Stop 1 before the end; we check the FAILURE below */
2953
if (!_validate_charset(code, code+skip-2))
2955
if (code[skip-2] != SRE_OP_FAILURE)
2962
/* A minimal info field is
2963
<INFO> <1=skip> <2=flags> <3=min> <4=max>;
2964
If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2966
SRE_CODE flags, min, max, i;
2969
newcode = code+skip-1;
2970
GET_ARG; flags = arg;
2973
/* Check that only valid flags are present */
2974
if ((flags & ~(SRE_INFO_PREFIX |
2976
SRE_INFO_CHARSET)) != 0)
2978
/* PREFIX and CHARSET are mutually exclusive */
2979
if ((flags & SRE_INFO_PREFIX) &&
2980
(flags & SRE_INFO_CHARSET))
2982
/* LITERAL implies PREFIX */
2983
if ((flags & SRE_INFO_LITERAL) &&
2984
!(flags & SRE_INFO_PREFIX))
2986
/* Validate the prefix */
2987
if (flags & SRE_INFO_PREFIX) {
2988
SRE_CODE prefix_len, prefix_skip;
2989
GET_ARG; prefix_len = arg;
2990
GET_ARG; prefix_skip = arg;
2991
/* Here comes the prefix string */
2992
if (code+prefix_len < code || code+prefix_len > newcode)
2995
/* And here comes the overlap table */
2996
if (code+prefix_len < code || code+prefix_len > newcode)
2998
/* Each overlap value should be < prefix_len */
2999
for (i = 0; i < prefix_len; i++) {
3000
if (code[i] >= prefix_len)
3005
/* Validate the charset */
3006
if (flags & SRE_INFO_CHARSET) {
3007
if (!_validate_charset(code, newcode-1))
3009
if (newcode[-1] != SRE_OP_FAILURE)
3013
else if (code != newcode) {
3014
VTRACE(("code=%p, newcode=%p\n", code, newcode));
3022
SRE_CODE *target = NULL;
3027
/* Stop 2 before the end; we check the JUMP below */
3028
if (!_validate_inner(code, code+skip-3, groups))
3031
/* Check that it ends with a JUMP, and that each JUMP
3032
has the same target */
3034
if (op != SRE_OP_JUMP)
3038
target = code+skip-1;
3039
else if (code+skip-1 != target)
3045
case SRE_OP_REPEAT_ONE:
3046
case SRE_OP_MIN_REPEAT_ONE:
3054
#ifdef Py_UNICODE_WIDE
3058
if (!_validate_inner(code, code+skip-4, groups))
3062
if (op != SRE_OP_SUCCESS)
3075
#ifdef Py_UNICODE_WIDE
3079
if (!_validate_inner(code, code+skip-3, groups))
3083
if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3088
case SRE_OP_GROUPREF:
3089
case SRE_OP_GROUPREF_IGNORE:
3095
case SRE_OP_GROUPREF_EXISTS:
3096
/* The regex syntax for this is: '(?(group)then|else)', where
3097
'group' is either an integer group number or a group name,
3098
'then' and 'else' are sub-regexes, and 'else' is optional. */
3103
code--; /* The skip is relative to the first arg! */
3104
/* There are two possibilities here: if there is both a 'then'
3105
part and an 'else' part, the generated code looks like:
3113
(<skipyes> jumps here)
3115
(<skipno> jumps here)
3117
If there is only a 'then' part, it looks like:
3125
There is no direct way to decide which it is, and we don't want
3126
to allow arbitrary jumps anywhere in the code; so we just look
3127
for a JUMP opcode preceding our skip target.
3129
if (skip >= 3 && code+skip-3 >= code &&
3130
code[skip-3] == SRE_OP_JUMP)
3132
VTRACE(("both then and else parts present\n"));
3133
if (!_validate_inner(code+1, code+skip-3, groups))
3135
code += skip-2; /* Position after JUMP, at <skipno> */
3137
if (!_validate_inner(code, code+skip-1, groups))
3142
VTRACE(("only a then part present\n"));
3143
if (!_validate_inner(code+1, code+skip-1, groups))
3150
case SRE_OP_ASSERT_NOT:
3152
GET_ARG; /* 0 for lookahead, width for lookbehind */
3153
code--; /* Back up over arg to simplify math below */
3154
if (arg & 0x80000000)
3155
FAIL; /* Width too large */
3156
/* Stop 1 before the end; we check the SUCCESS below */
3157
if (!_validate_inner(code+1, code+skip-2, groups))
3161
if (op != SRE_OP_SUCCESS)
3176
/* Top-level check for a whole compiled pattern in [code, end).

   The first test is the precondition for everything that follows: the group
   count must lie in 0..100, the array must be non-empty, and its last code
   word must be SRE_OP_SUCCESS.  Everything before that final opcode is then
   handed to _validate_inner(), whose result is returned. */
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3178
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3180
/* A declared count of 0 is replaced by the maximum so that group
   references in the code are still accepted by the inner validator. */
if (groups == 0) /* fix for simplejson */
3181
groups = 100; /* 100 groups should always be safe */
3182
return _validate_inner(code, end-1, groups);
3186
/* Validate the code array stored in a pattern object.

   Runs _validate_outer() over self->code[0 .. self->codesize) with the
   pattern's declared group count.  When that check fails, a RuntimeError
   with the message "invalid SRE code" is set. */
_validate(PatternObject *self)
3188
if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3190
PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3194
VTRACE(("Success!\n"));
3198
/* -------------------------------------------------------------------- */
3202
/* Destructor for match objects: drops the references the match holds.
   regs and string are released with Py_XDECREF, so either may be NULL;
   pattern is released with Py_DECREF and must therefore be non-NULL. */
match_dealloc(MatchObject* self)
3204
Py_XDECREF(self->regs);
3205
Py_XDECREF(self->string);
3206
Py_DECREF(self->pattern);
3211
/* Return the text matched by group number `index` as a slice of
   self->string, or `def` when the group did not take part in the match (or
   the match has no string attached).  An index outside 0..self->groups-1 is
   rejected. */
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3213
if (index < 0 || index >= self->groups) {
3214
/* raise IndexError if we were given a bad group number */
3224
/* NOTE(review): other accessors in this file read mark[index*2] and
   mark[index*2+1]; here mark[index] / mark[index+1] are used, so `index`
   is presumably doubled between the range check and this point.  That
   step is not visible in this excerpt -- confirm. */
if (self->string == Py_None || self->mark[index] < 0) {
3225
/* return default value if the string or group is undefined */
3230
return PySequence_GetSlice(
3231
self->string, self->mark[index], self->mark[index+1]
3236
/* Translate a group designator into a numeric group index.

   A Python int is converted directly.  Anything else is looked up in the
   pattern's groupindex mapping (when the pattern has one), and the value
   found there is converted if it is an int or long. */
match_getindex(MatchObject* self, PyObject* index)
3240
if (PyInt_Check(index))
3241
return PyInt_AsSsize_t(index);
3245
if (self->pattern->groupindex) {
3246
/* from here on `index` refers to the looked-up value, not the argument */
index = PyObject_GetItem(self->pattern->groupindex, index);
3248
if (PyInt_Check(index) || PyLong_Check(index))
3249
i = PyInt_AsSsize_t(index);
3259
/* Convenience wrapper: resolve `index` (number or name) with
   match_getindex() and return that group's slice, or `def`, via
   match_getslice_by_index(). */
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3261
return match_getslice_by_index(self, match_getindex(self, index), def);
3265
/* Implementation of the match object's expand(template) method.

   The work is not done in C: the name "_expand" in the Python-level module
   SRE_PY_MODULE is invoked with the 3-tuple (pattern, match, template). */
match_expand(MatchObject* self, PyObject* ptemplate)
3267
/* delegate to Python code */
3269
SRE_PY_MODULE, "_expand",
3270
PyTuple_Pack(3, self->pattern, self, ptemplate)
3275
/* Implementation of the match object's group(...) method.

   The visible code handles three shapes of call, keyed on the number of
   positional arguments: none (group 0), exactly one (that group), and
   several (a tuple with one slice per argument).  Undefined groups yield
   None in every case. */
match_group(MatchObject* self, PyObject* args)
3280
size = PyTuple_GET_SIZE(args);
3284
/* Py_False is used as a ready-made integer zero, i.e. group 0 */
result = match_getslice(self, Py_False, Py_None);
3287
result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3290
/* fetch multiple items */
3291
result = PyTuple_New(size);
3294
for (i = 0; i < size; i++) {
3295
PyObject* item = match_getslice(
3296
self, PyTuple_GET_ITEM(args, i), Py_None
3302
PyTuple_SET_ITEM(result, i, item);
3310
/* Implementation of the match object's groups(default=None) method.

   Builds a tuple holding the slices of groups 1 .. self->groups-1 (group 0
   is left out).  Groups that did not participate are represented by the
   `default` argument. */
match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3315
PyObject* def = Py_None;
3316
static char* kwlist[] = { "default", NULL };
3317
if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3320
result = PyTuple_New(self->groups-1);
3324
for (index = 1; index < self->groups; index++) {
3326
item = match_getslice_by_index(self, index, def);
3331
/* slot index-1 because group 0 is not part of the result */
PyTuple_SET_ITEM(result, index-1, item);
3338
/* Implementation of the match object's groupdict(default=None) method.

   Creates a new dict and, for every key of the pattern's groupindex
   mapping, stores the corresponding group slice (or `default` for a group
   that did not participate) under that key. */
match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3344
PyObject* def = Py_None;
3345
static char* kwlist[] = { "default", NULL };
3346
if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3349
result = PyDict_New();
3350
/* nothing more to do when allocation failed or the pattern has no
   groupindex mapping */
if (!result || !self->pattern->groupindex)
3353
keys = PyMapping_Keys(self->pattern->groupindex);
3357
for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3361
key = PyList_GET_ITEM(keys, index);
3364
value = match_getslice(self, key, def);
3369
status = PyDict_SetItem(result, key, value);
3386
/* Implementation of the match object's start([group]) method.

   Accepts zero or one argument; with none, group 0 is used.  The designator
   is resolved with match_getindex(), range-checked against self->groups, and
   the start mark of that group is returned as a Python integer. */
match_start(MatchObject* self, PyObject* args)
3390
PyObject* index_ = Py_False; /* zero */
3391
if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3394
index = match_getindex(self, index_);
3396
if (index < 0 || index >= self->groups) {
3404
/* mark is -1 if group is undefined */
3405
/* NOTE(review): "i" makes Py_BuildValue read a C int, but mark[] entries
   are passed to _pair(Py_ssize_t, Py_ssize_t) elsewhere in this file.  If
   mark[] is Py_ssize_t, the "n" format (or PyInt_FromSsize_t) is the
   matching conversion -- confirm against the MatchObject declaration. */
return Py_BuildValue("i", self->mark[index*2]);
3409
/* Implementation of the match object's end([group]) method.

   Accepts zero or one argument; with none, group 0 is used.  The designator
   is resolved with match_getindex(), range-checked against self->groups, and
   the end mark of that group is returned as a Python integer. */
match_end(MatchObject* self, PyObject* args)
3413
PyObject* index_ = Py_False; /* zero */
3414
if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3417
index = match_getindex(self, index_);
3419
if (index < 0 || index >= self->groups) {
3427
/* mark is -1 if group is undefined */
3428
/* NOTE(review): "i" makes Py_BuildValue read a C int, but mark[] entries
   are passed to _pair(Py_ssize_t, Py_ssize_t) elsewhere in this file.  If
   mark[] is Py_ssize_t, the "n" format (or PyInt_FromSsize_t) is the
   matching conversion -- confirm against the MatchObject declaration. */
return Py_BuildValue("i", self->mark[index*2+1]);
3432
/* Build the 2-tuple (i1, i2): a new tuple is allocated and each value is
   converted with PyInt_FromSsize_t and stored in slot 0 and 1
   respectively. */
_pair(Py_ssize_t i1, Py_ssize_t i2)
3437
pair = PyTuple_New(2);
3441
item = PyInt_FromSsize_t(i1);
3444
PyTuple_SET_ITEM(pair, 0, item);
3446
item = PyInt_FromSsize_t(i2);
3449
PyTuple_SET_ITEM(pair, 1, item);
3459
/* Implementation of the match object's span([group]) method.

   Accepts zero or one argument; with none, group 0 is used.  After the
   designator is resolved and range-checked, the group's (start, end) marks
   are returned as a 2-tuple built by _pair(). */
match_span(MatchObject* self, PyObject* args)
3463
PyObject* index_ = Py_False; /* zero */
3464
if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3467
index = match_getindex(self, index_);
3469
if (index < 0 || index >= self->groups) {
3477
/* marks are -1 if group is undefined */
3478
return _pair(self->mark[index*2], self->mark[index*2+1]);
3482
/* Build the value of the match object's "regs" attribute: a tuple with one
   (start, end) pair per group, group 0 included, in group order. */
match_regs(MatchObject* self)
3488
regs = PyTuple_New(self->groups);
3492
for (index = 0; index < self->groups; index++) {
3493
/* mark[] stores start and end of group k at positions 2k and 2k+1 */
item = _pair(self->mark[index*2], self->mark[index*2+1]);
3498
PyTuple_SET_ITEM(regs, index, item);
3508
/* Implementation of the match object's __copy__ method.

   With USE_BUILTIN_COPY defined, a new MatchObject with the same number of
   mark slots is allocated, the references held by the original are
   incremented, and everything from the `string` member to the end of the
   variable-size object is copied byte-for-byte.  Without it, the visible
   code raises TypeError("cannot copy this match object"). */
match_copy(MatchObject* self, PyObject *unused)
3510
#ifdef USE_BUILTIN_COPY
3512
Py_ssize_t slots, offset;
3514
/* two marks (start, end) per group, plus the pair for group 0 */
slots = 2 * (self->pattern->groups+1);
3516
copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3520
/* this value is a constant, but any compiler should be able to
3521
figure that out all by itself */
3522
offset = offsetof(MatchObject, string);
3524
Py_XINCREF(self->pattern);
3525
Py_XINCREF(self->string);
3526
Py_XINCREF(self->regs);
3528
/* copy the fixed members from `string` onward together with the trailing
   mark array in one go; the object header before `offset` is left as
   initialised by the allocator */
memcpy((char*) copy + offset, (char*) self + offset,
3529
sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3531
return (PyObject*) copy;
3533
PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3539
/* Implementation of the match object's __deepcopy__(memo) method.

   With USE_BUILTIN_COPY defined, a shallow copy is made first and its
   pattern, string and regs members are then deep-copied in place using
   `memo`.  Without it, the visible code raises
   TypeError("cannot deepcopy this match object"). */
match_deepcopy(MatchObject* self, PyObject* memo)
3541
#ifdef USE_BUILTIN_COPY
3544
/* NOTE(review): match_copy() is defined in this file with two parameters
   (self, unused) but is called here with one; this would not compile if
   USE_BUILTIN_COPY were defined. */
copy = (MatchObject*) match_copy(self);
3548
/* NOTE(review): the "©" characters below look like a mis-decoded
   "&copy" (address-of the local `copy`); restore the source text before
   enabling this branch. */
if (!deepcopy((PyObject**) ©->pattern, memo) ||
3549
!deepcopy(©->string, memo) ||
3550
!deepcopy(©->regs, memo)) {
3556
PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3561
/* Method table for match objects; match_getattr() passes it to
   Py_FindMethod().  The calling convention flag of each entry must agree
   with the C signature of the function it names: METH_VARARGS functions
   receive an argument tuple, METH_O functions a single object, and
   METH_NOARGS functions no argument. */
static PyMethodDef match_methods[] = {
3562
{"group", (PyCFunction) match_group, METH_VARARGS},
3563
{"start", (PyCFunction) match_start, METH_VARARGS},
3564
{"end", (PyCFunction) match_end, METH_VARARGS},
3565
{"span", (PyCFunction) match_span, METH_VARARGS},
3566
{"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3567
{"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3568
{"expand", (PyCFunction) match_expand, METH_O},
3569
{"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3570
{"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3575
/* tp_getattr handler for match objects.

   Method names are tried first through Py_FindMethod().  The remaining
   names are compared one by one: "lastindex", "lastgroup", "string",
   "regs", "re", "pos" and "endpos".  A name that matches none of them ends
   at the AttributeError at the bottom. */
match_getattr(MatchObject* self, char* name)
3579
res = Py_FindMethod(match_methods, (PyObject*) self, name);
3585
if (!strcmp(name, "lastindex")) {
3586
/* a negative lastindex means there is no last group to report */
if (self->lastindex >= 0)
3587
/* NOTE(review): "i" reads a C int; if lastindex (and pos/endpos below)
   are Py_ssize_t like the mark array, "n" would be the matching
   format -- confirm against the MatchObject declaration. */
return Py_BuildValue("i", self->lastindex);
3592
if (!strcmp(name, "lastgroup")) {
3593
/* the group name is looked up by number in the pattern's indexgroup
   sequence, when the pattern has one */
if (self->pattern->indexgroup && self->lastindex >= 0) {
3594
PyObject* result = PySequence_GetItem(
3595
self->pattern->indexgroup, self->lastindex
3605
if (!strcmp(name, "string")) {
3607
Py_INCREF(self->string);
3608
return self->string;
3615
if (!strcmp(name, "regs")) {
3617
Py_INCREF(self->regs);
3620
return match_regs(self);
3623
if (!strcmp(name, "re")) {
3624
Py_INCREF(self->pattern);
3625
return (PyObject*) self->pattern;
3628
if (!strcmp(name, "pos"))
3629
return Py_BuildValue("i", self->pos);
3631
if (!strcmp(name, "endpos"))
3632
return Py_BuildValue("i", self->endpos);
3634
/* unknown attribute: the exception value is the attribute name itself */
PyErr_SetString(PyExc_AttributeError, name);
3638
/* FIXME: implement setattr("string", None) as a special case (to
3639
detach the associated string, if any) */
3641
/* Type object for match objects, named "_<SRE_MODULE>.SRE_Match".

   It is a variable-size type: the basic size is sizeof(MatchObject) and
   each additional item is one Py_ssize_t, which is how the per-group mark
   slots requested through PyObject_NEW_VAR are accounted for.  The type
   slot is left NULL here and patched in init_sre(). */
statichere PyTypeObject Match_Type = {
3642
PyObject_HEAD_INIT(NULL)
3643
0, "_" SRE_MODULE ".SRE_Match",
3644
sizeof(MatchObject), sizeof(Py_ssize_t),
3645
(destructor)match_dealloc, /*tp_dealloc*/
3647
(getattrfunc)match_getattr /*tp_getattr*/
3651
/* Turn the outcome of a match/search run into a Python-level result.

   `status` is the engine's return value and `state` the engine state after
   the run.  In the successful branch a MatchObject with room for
   2*(groups+1) marks is allocated and filled from `state`.  A separate
   branch handles status == 0, and any remaining status is reported through
   pattern_error(). */
pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3653
/* create match object (from state object) */
3662
/* create match object (with room for extra group marks) */
3663
/* coverity[ampersand_in_size] */
3664
match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3665
2*(pattern->groups+1));
3670
match->pattern = pattern;
3672
Py_INCREF(state->string);
3673
match->string = state->string;
3676
/* groups counts group 0 (the whole match) as well */
match->groups = pattern->groups+1;
3678
/* fill in group slices */
3680
/* marks are stored as character offsets: pointer distance from the
   beginning of the subject, divided by the character size */
base = (char*) state->beginning;
3681
n = state->charsize;
3683
match->mark[0] = ((char*) state->start - base) / n;
3684
match->mark[1] = ((char*) state->ptr - base) / n;
3686
/* i counts groups, j indexes the engine's mark array (two per group);
   the match object's array is shifted by 2 to make room for group 0 */
for (i = j = 0; i < pattern->groups; i++, j+=2)
3687
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3688
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3689
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3691
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3693
match->pos = state->pos;
3694
match->endpos = state->endpos;
3696
match->lastindex = state->lastindex;
3698
return (PyObject*) match;
3700
} else if (status == 0) {
3708
/* internal error */
3709
pattern_error(status);
3714
/* -------------------------------------------------------------------- */
3715
/* scanner methods (experimental) */
3718
/* Destructor for scanner objects: tears down the embedded engine state
   with state_fini() and drops the reference to the pattern. */
scanner_dealloc(ScannerObject* self)
3720
state_fini(&self->state);
3721
Py_DECREF(self->pattern);
3726
/* Implementation of the scanner object's match() method.

   Runs an anchored match at the scanner's current start position, using
   the byte engine (sre_match) when the character size is 1 and the unicode
   engine (sre_umatch) in the HAVE_UNICODE branch, wraps the outcome with
   pattern_new_match(), and then advances the start position for the next
   call. */
scanner_match(ScannerObject* self, PyObject *unused)
3728
SRE_STATE* state = &self->state;
3734
state->ptr = state->start;
3736
if (state->charsize == 1) {
3737
status = sre_match(state, PatternObject_GetCode(self->pattern));
3739
#if defined(HAVE_UNICODE)
3740
status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3743
if (PyErr_Occurred())
3746
match = pattern_new_match((PatternObject*) self->pattern,
3749
/* on no match, or a match that consumed nothing, step one character
   past ptr so the next call makes progress */
if (status == 0 || state->ptr == state->start)
3750
state->start = (void*) ((char*) state->ptr + state->charsize);
3752
state->start = state->ptr;
3759
/* Implementation of the scanner object's search() method.

   Same shape as scanner_match(), but uses the searching entry points
   (sre_search / sre_usearch): run from the current start position, wrap the
   outcome with pattern_new_match(), then advance the start position for the
   next call. */
scanner_search(ScannerObject* self, PyObject *unused)
3761
SRE_STATE* state = &self->state;
3767
state->ptr = state->start;
3769
if (state->charsize == 1) {
3770
status = sre_search(state, PatternObject_GetCode(self->pattern));
3772
#if defined(HAVE_UNICODE)
3773
status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3776
if (PyErr_Occurred())
3779
match = pattern_new_match((PatternObject*) self->pattern,
3782
/* on no match, or a match that consumed nothing, step one character
   past ptr so the next call makes progress */
if (status == 0 || state->ptr == state->start)
3783
state->start = (void*) ((char*) state->ptr + state->charsize);
3785
state->start = state->ptr;
3790
/* Method table for scanner objects; scanner_getattr() passes it to
   Py_FindMethod().  Both methods take no arguments. */
static PyMethodDef scanner_methods[] = {
3791
{"match", (PyCFunction) scanner_match, METH_NOARGS},
3792
{"search", (PyCFunction) scanner_search, METH_NOARGS},
3797
/* tp_getattr handler for scanner objects: method names are tried first
   through Py_FindMethod(), then the single data attribute "pattern"
   (returned as a new reference); anything else ends at the AttributeError
   at the bottom. */
scanner_getattr(ScannerObject* self, char* name)
3801
res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3808
if (!strcmp(name, "pattern")) {
3809
Py_INCREF(self->pattern);
3810
return self->pattern;
3813
PyErr_SetString(PyExc_AttributeError, name);
3817
/* Type object for scanner objects, named "_<SRE_MODULE>.SRE_Scanner".
   Fixed-size (item size 0).  The type slot is left NULL here and patched
   in init_sre(). */
statichere PyTypeObject Scanner_Type = {
3818
PyObject_HEAD_INIT(NULL)
3819
0, "_" SRE_MODULE ".SRE_Scanner",
3820
sizeof(ScannerObject), 0,
3821
(destructor)scanner_dealloc, /*tp_dealloc*/
3823
(getattrfunc)scanner_getattr, /*tp_getattr*/
3827
/* Implementation of the pattern object's scanner(string[, pos[, endpos]])
   method.

   Parses the subject string and the optional start/end positions
   (defaulting to 0 and PY_SSIZE_T_MAX), allocates a ScannerObject,
   initialises its embedded engine state with state_init(), and records the
   pattern on the scanner. */
pattern_scanner(PatternObject* pattern, PyObject* args)
3829
/* create search state object */
3831
ScannerObject* self;
3834
Py_ssize_t start = 0;
3835
Py_ssize_t end = PY_SSIZE_T_MAX;
3836
if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3839
/* create scanner object */
3840
self = PyObject_NEW(ScannerObject, &Scanner_Type);
3844
string = state_init(&self->state, pattern, string, start, end);
3851
self->pattern = (PyObject*) pattern;
3853
return (PyObject*) self;
3856
/* Module-level function table, registered by init_sre() through
   Py_InitModule(): compile, getcodesize and getlower. */
static PyMethodDef _functions[] = {
3857
{"compile", _compile, METH_VARARGS},
3858
{"getcodesize", sre_codesize, METH_NOARGS},
3859
{"getlower", sre_getlower, METH_VARARGS},
3863
/* Module initialisation.

   The entry point is declared with DL_EXPORT(void) for interpreters older
   than 2.3 and with PyMODINIT_FUNC otherwise.  It completes the three
   statically defined type objects, creates the module "_<SRE_MODULE>" with
   the _functions table, and publishes three constants in the module dict:
   MAGIC, CODESIZE (sizeof(SRE_CODE)) and copyright. */
#if PY_VERSION_HEX < 0x02030000
3864
DL_EXPORT(void) init_sre(void)
3866
PyMODINIT_FUNC init_sre(void)
3873
/* Patch object types */
3874
Pattern_Type.ob_type = Match_Type.ob_type =
3875
Scanner_Type.ob_type = &PyType_Type;
3877
m = Py_InitModule("_" SRE_MODULE, _functions);
3880
d = PyModule_GetDict(m);
3882
x = PyInt_FromLong(SRE_MAGIC);
3884
PyDict_SetItemString(d, "MAGIC", x);
3888
x = PyInt_FromLong(sizeof(SRE_CODE));
3890
PyDict_SetItemString(d, "CODESIZE", x);
3894
x = PyString_FromString(copyright);
3896
PyDict_SetItemString(d, "copyright", x);
3901
#endif /* !defined(SRE_RECURSIVE) */