2
* Secret Labs' Regular Expression Engine
4
* regular expression matching engine
7
* 1999-10-24 fl created (based on existing template matcher code)
8
* 2000-03-06 fl first alpha, sort of
9
* 2000-08-01 fl fixes for 1.6b1
10
* 2000-08-07 fl use PyOS_CheckStack() if available
11
* 2000-09-20 fl added expand method
12
* 2001-03-20 fl lots of fixes for 2.1b2
13
* 2001-04-15 fl export copyright as Python attribute, not global
14
* 2001-04-28 fl added __copy__ methods (work in progress)
15
* 2001-05-14 fl fixes for 1.5.2 compatibility
16
* 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17
* 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18
* 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19
* 2001-10-21 fl added sub/subn primitive
20
* 2001-10-24 fl added finditer primitive (for 2.2 only)
21
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22
* 2002-11-09 fl fixed empty sub/subn return type
23
* 2003-04-18 mvl fully support 4-byte codes
24
* 2003-10-17 gn implemented non recursive scheme
26
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28
* This version of the SRE library can be redistributed under CNRI's
29
* Python 1.6 license. For any other use, please contact Secret Labs
30
* AB (info@pythonware.com).
32
* Portions of this engine have been developed in cooperation with
33
* CNRI. Hewlett-Packard provided funding for 1.6 integration and
34
* other compatibility work.
39
static char copyright[] =
40
" SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42
#define PY_SSIZE_T_CLEAN
45
#include "structmember.h" /* offsetof */
51
/* name of this module, minus the leading underscore */
52
#if !defined(SRE_MODULE)
53
#define SRE_MODULE "sre"
56
#define SRE_PY_MODULE "re"
58
/* defining this one enables tracing */
61
/* defining this enables unicode support (default under 1.6a1 and later) */
64
/* -------------------------------------------------------------------- */
65
/* optional features */
67
/* enables fast searching */
68
#define USE_FAST_SEARCH
70
/* enables aggressive inlining (always on for Visual C) */
73
/* enables copy/deepcopy handling (work in progress) */
74
#undef USE_BUILTIN_COPY
76
#if PY_VERSION_HEX < 0x01060000
77
#define PyObject_DEL(op) PyMem_DEL((op))
80
/* -------------------------------------------------------------------- */
83
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85
/* fastest possible local call under MSVC */
86
#define LOCAL(type) static __inline type __fastcall
87
#elif defined(USE_INLINE)
88
#define LOCAL(type) static inline type
90
#define LOCAL(type) static type
94
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95
#define SRE_ERROR_STATE -2 /* illegal state */
96
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97
#define SRE_ERROR_MEMORY -9 /* out of memory */
98
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
101
#define TRACE(v) printf v
106
/* -------------------------------------------------------------------- */
107
/* search engine state */
109
/* default character predicates (run sre_chars.py to regenerate tables) */
111
#define SRE_DIGIT_MASK 1
112
#define SRE_SPACE_MASK 2
113
#define SRE_LINEBREAK_MASK 4
114
#define SRE_ALNUM_MASK 8
115
#define SRE_WORD_MASK 16
117
/* FIXME: this assumes ASCII. create tables in init_sre() instead */
119
static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
120
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
121
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
122
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
123
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
124
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
125
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
127
static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
128
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
129
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
130
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
131
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135
120, 121, 122, 123, 124, 125, 126, 127 };
137
#define SRE_IS_DIGIT(ch)\
138
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139
#define SRE_IS_SPACE(ch)\
140
((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141
#define SRE_IS_LINEBREAK(ch)\
142
((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143
#define SRE_IS_ALNUM(ch)\
144
((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145
#define SRE_IS_WORD(ch)\
146
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
148
static unsigned int sre_lower(unsigned int ch)
150
return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
153
/* locale-specific character predicates */
154
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155
* warnings when c's type supports only numbers < N+1 */
156
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
158
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
159
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
160
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
162
static unsigned int sre_lower_locale(unsigned int ch)
164
return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
167
/* unicode-specific character predicates */
169
#if defined(HAVE_UNICODE)
171
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
174
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
175
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
177
static unsigned int sre_lower_unicode(unsigned int ch)
179
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
185
sre_category(SRE_CODE category, unsigned int ch)
189
case SRE_CATEGORY_DIGIT:
190
return SRE_IS_DIGIT(ch);
191
case SRE_CATEGORY_NOT_DIGIT:
192
return !SRE_IS_DIGIT(ch);
193
case SRE_CATEGORY_SPACE:
194
return SRE_IS_SPACE(ch);
195
case SRE_CATEGORY_NOT_SPACE:
196
return !SRE_IS_SPACE(ch);
197
case SRE_CATEGORY_WORD:
198
return SRE_IS_WORD(ch);
199
case SRE_CATEGORY_NOT_WORD:
200
return !SRE_IS_WORD(ch);
201
case SRE_CATEGORY_LINEBREAK:
202
return SRE_IS_LINEBREAK(ch);
203
case SRE_CATEGORY_NOT_LINEBREAK:
204
return !SRE_IS_LINEBREAK(ch);
206
case SRE_CATEGORY_LOC_WORD:
207
return SRE_LOC_IS_WORD(ch);
208
case SRE_CATEGORY_LOC_NOT_WORD:
209
return !SRE_LOC_IS_WORD(ch);
211
#if defined(HAVE_UNICODE)
212
case SRE_CATEGORY_UNI_DIGIT:
213
return SRE_UNI_IS_DIGIT(ch);
214
case SRE_CATEGORY_UNI_NOT_DIGIT:
215
return !SRE_UNI_IS_DIGIT(ch);
216
case SRE_CATEGORY_UNI_SPACE:
217
return SRE_UNI_IS_SPACE(ch);
218
case SRE_CATEGORY_UNI_NOT_SPACE:
219
return !SRE_UNI_IS_SPACE(ch);
220
case SRE_CATEGORY_UNI_WORD:
221
return SRE_UNI_IS_WORD(ch);
222
case SRE_CATEGORY_UNI_NOT_WORD:
223
return !SRE_UNI_IS_WORD(ch);
224
case SRE_CATEGORY_UNI_LINEBREAK:
225
return SRE_UNI_IS_LINEBREAK(ch);
226
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227
return !SRE_UNI_IS_LINEBREAK(ch);
229
case SRE_CATEGORY_UNI_DIGIT:
230
return SRE_IS_DIGIT(ch);
231
case SRE_CATEGORY_UNI_NOT_DIGIT:
232
return !SRE_IS_DIGIT(ch);
233
case SRE_CATEGORY_UNI_SPACE:
234
return SRE_IS_SPACE(ch);
235
case SRE_CATEGORY_UNI_NOT_SPACE:
236
return !SRE_IS_SPACE(ch);
237
case SRE_CATEGORY_UNI_WORD:
238
return SRE_LOC_IS_WORD(ch);
239
case SRE_CATEGORY_UNI_NOT_WORD:
240
return !SRE_LOC_IS_WORD(ch);
241
case SRE_CATEGORY_UNI_LINEBREAK:
242
return SRE_IS_LINEBREAK(ch);
243
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244
return !SRE_IS_LINEBREAK(ch);
253
data_stack_dealloc(SRE_STATE* state)
255
if (state->data_stack) {
256
PyMem_FREE(state->data_stack);
257
state->data_stack = NULL;
259
state->data_stack_size = state->data_stack_base = 0;
263
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
265
Py_ssize_t minsize, cursize;
266
minsize = state->data_stack_base+size;
267
cursize = state->data_stack_size;
268
if (cursize < minsize) {
270
cursize = minsize+minsize/4+1024;
271
TRACE(("allocate/grow stack %d\n", cursize));
272
stack = PyMem_REALLOC(state->data_stack, cursize);
274
data_stack_dealloc(state);
275
return SRE_ERROR_MEMORY;
277
state->data_stack = (char *)stack;
278
state->data_stack_size = cursize;
283
/* generate 8-bit version */
285
#define SRE_CHAR unsigned char
286
#define SRE_AT sre_at
287
#define SRE_COUNT sre_count
288
#define SRE_CHARSET sre_charset
289
#define SRE_INFO sre_info
290
#define SRE_MATCH sre_match
291
#define SRE_MATCH_CONTEXT sre_match_context
292
#define SRE_SEARCH sre_search
293
#define SRE_LITERAL_TEMPLATE sre_literal_template
295
#if defined(HAVE_UNICODE)
297
#define SRE_RECURSIVE
301
#undef SRE_LITERAL_TEMPLATE
304
#undef SRE_MATCH_CONTEXT
311
/* generate 16-bit unicode version */
313
#define SRE_CHAR Py_UNICODE
314
#define SRE_AT sre_uat
315
#define SRE_COUNT sre_ucount
316
#define SRE_CHARSET sre_ucharset
317
#define SRE_INFO sre_uinfo
318
#define SRE_MATCH sre_umatch
319
#define SRE_MATCH_CONTEXT sre_umatch_context
320
#define SRE_SEARCH sre_usearch
321
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
324
#endif /* SRE_RECURSIVE */
326
/* -------------------------------------------------------------------- */
327
/* String matching engine */
329
/* the following section is compiled twice, with different character
333
SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
335
/* check if pointer is at given position */
337
Py_ssize_t thisp, thatp;
341
case SRE_AT_BEGINNING:
342
case SRE_AT_BEGINNING_STRING:
343
return ((void*) ptr == state->beginning);
345
case SRE_AT_BEGINNING_LINE:
346
return ((void*) ptr == state->beginning ||
347
SRE_IS_LINEBREAK((int) ptr[-1]));
350
return (((void*) (ptr+1) == state->end &&
351
SRE_IS_LINEBREAK((int) ptr[0])) ||
352
((void*) ptr == state->end));
354
case SRE_AT_END_LINE:
355
return ((void*) ptr == state->end ||
356
SRE_IS_LINEBREAK((int) ptr[0]));
358
case SRE_AT_END_STRING:
359
return ((void*) ptr == state->end);
361
case SRE_AT_BOUNDARY:
362
if (state->beginning == state->end)
364
thatp = ((void*) ptr > state->beginning) ?
365
SRE_IS_WORD((int) ptr[-1]) : 0;
366
thisp = ((void*) ptr < state->end) ?
367
SRE_IS_WORD((int) ptr[0]) : 0;
368
return thisp != thatp;
370
case SRE_AT_NON_BOUNDARY:
371
if (state->beginning == state->end)
373
thatp = ((void*) ptr > state->beginning) ?
374
SRE_IS_WORD((int) ptr[-1]) : 0;
375
thisp = ((void*) ptr < state->end) ?
376
SRE_IS_WORD((int) ptr[0]) : 0;
377
return thisp == thatp;
379
case SRE_AT_LOC_BOUNDARY:
380
if (state->beginning == state->end)
382
thatp = ((void*) ptr > state->beginning) ?
383
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
384
thisp = ((void*) ptr < state->end) ?
385
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
386
return thisp != thatp;
388
case SRE_AT_LOC_NON_BOUNDARY:
389
if (state->beginning == state->end)
391
thatp = ((void*) ptr > state->beginning) ?
392
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
393
thisp = ((void*) ptr < state->end) ?
394
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
395
return thisp == thatp;
397
#if defined(HAVE_UNICODE)
398
case SRE_AT_UNI_BOUNDARY:
399
if (state->beginning == state->end)
401
thatp = ((void*) ptr > state->beginning) ?
402
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
403
thisp = ((void*) ptr < state->end) ?
404
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
405
return thisp != thatp;
407
case SRE_AT_UNI_NON_BOUNDARY:
408
if (state->beginning == state->end)
410
thatp = ((void*) ptr > state->beginning) ?
411
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
412
thisp = ((void*) ptr < state->end) ?
413
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
414
return thisp == thatp;
423
SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
425
/* check if character is a member of the given set */
436
/* <LITERAL> <code> */
442
case SRE_OP_CATEGORY:
443
/* <CATEGORY> <code> */
444
if (sre_category(set[0], (int) ch))
450
if (sizeof(SRE_CODE) == 2) {
451
/* <CHARSET> <bitmap> (16 bits per code word) */
452
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457
/* <CHARSET> <bitmap> (32 bits per code word) */
458
if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
465
/* <RANGE> <lower> <upper> */
466
if (set[0] <= ch && ch <= set[1])
475
case SRE_OP_BIGCHARSET:
476
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478
Py_ssize_t count, block;
481
if (sizeof(SRE_CODE) == 2) {
482
block = ((unsigned char*)set)[ch >> 8];
484
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490
* warnings when c's type supports only numbers < N+1 */
492
block = ((unsigned char*)set)[ch >> 8];
497
(set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
505
/* internal error -- there's not much we can do about it
506
here, so let's just pretend it didn't match... */
512
LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
515
SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
518
SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519
SRE_CHAR* end = (SRE_CHAR *)state->end;
523
if (maxcount < end - ptr && maxcount != 65535)
524
end = ptr + maxcount;
526
switch (pattern[0]) {
530
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531
while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536
/* repeated dot wildcard. */
537
TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
538
while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543
/* repeated dot wildcard. skip to the end of the target
544
string, and backtrack from there */
545
TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550
/* repeated literal */
552
TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
553
while (ptr < end && (SRE_CODE) *ptr == chr)
557
case SRE_OP_LITERAL_IGNORE:
558
/* repeated literal */
560
TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
561
while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
565
case SRE_OP_NOT_LITERAL:
566
/* repeated non-literal */
568
TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
569
while (ptr < end && (SRE_CODE) *ptr != chr)
573
case SRE_OP_NOT_LITERAL_IGNORE:
574
/* repeated non-literal */
576
TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
577
while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582
/* repeated single character pattern */
583
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
584
while ((SRE_CHAR*) state->ptr < end) {
585
i = SRE_MATCH(state, pattern);
591
TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592
(SRE_CHAR*) state->ptr - ptr));
593
return (SRE_CHAR*) state->ptr - ptr;
596
TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
597
return ptr - (SRE_CHAR*) state->ptr;
600
#if 0 /* not used in this release */
602
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
604
/* check if an SRE_OP_INFO block matches at the current position.
605
returns the number of SRE_CODE objects to skip if successful, 0
608
SRE_CHAR* end = state->end;
609
SRE_CHAR* ptr = state->ptr;
612
/* check minimal length */
613
if (pattern[3] && (end - ptr) < pattern[3])
616
/* check known prefix */
617
if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618
/* <length> <skip> <prefix data> <overlap data> */
619
for (i = 0; i < pattern[5]; i++)
620
if ((SRE_CODE) ptr[i] != pattern[7 + i])
622
return pattern[0] + 2 * pattern[6];
628
/* The macros below should be used to protect recursive SRE_MATCH()
629
* calls that *failed* and do *not* return immediately (IOW, those
630
* that will backtrack). Explaining:
632
* - Recursive SRE_MATCH() returned true: that's usually a success
633
* (besides atypical cases like ASSERT_NOT), therefore there's no
634
* reason to restore lastmark;
636
* - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637
* is returning to the caller: If the current SRE_MATCH() is the
638
* top function of the recursion, returning false will be a matching
639
* failure, and it doesn't matter where lastmark is pointing to.
640
* If it's *not* the top function, it will be a recursive SRE_MATCH()
641
* failure by itself, and the calling SRE_MATCH() will have to deal
642
* with the failure by the same rules explained here (it will restore
643
* lastmark by itself if necessary);
645
* - Recursive SRE_MATCH() returned false, and will continue the
646
* outside 'for' loop: must be protected when breaking, since the next
647
* OP could potentially depend on lastmark;
649
* - Recursive SRE_MATCH() returned false, and will be called again
650
* inside a local for/while loop: must be protected between each
651
* loop iteration, since the recursive SRE_MATCH() could do anything,
652
* and could potentially depend on lastmark.
654
* For more information, check the discussion at SF patch #712900.
656
#define LASTMARK_SAVE() \
658
ctx->lastmark = state->lastmark; \
659
ctx->lastindex = state->lastindex; \
661
#define LASTMARK_RESTORE() \
663
state->lastmark = ctx->lastmark; \
664
state->lastindex = ctx->lastindex; \
667
#define RETURN_ERROR(i) do { return i; } while(0)
668
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
671
#define RETURN_ON_ERROR(i) \
672
do { if (i < 0) RETURN_ERROR(i); } while (0)
673
#define RETURN_ON_SUCCESS(i) \
674
do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675
#define RETURN_ON_FAILURE(i) \
676
do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
680
#define DATA_STACK_ALLOC(state, type, ptr) \
682
alloc_pos = state->data_stack_base; \
683
TRACE(("allocating %s in %d (%d)\n", \
684
SFY(type), alloc_pos, sizeof(type))); \
685
if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686
int j = data_stack_grow(state, sizeof(type)); \
687
if (j < 0) return j; \
689
DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
691
ptr = (type*)(state->data_stack+alloc_pos); \
692
state->data_stack_base += sizeof(type); \
695
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
697
TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698
ptr = (type*)(state->data_stack+pos); \
701
#define DATA_STACK_PUSH(state, data, size) \
703
TRACE(("copy data in %p to %d (%d)\n", \
704
data, state->data_stack_base, size)); \
705
if (state->data_stack_size < state->data_stack_base+size) { \
706
int j = data_stack_grow(state, size); \
707
if (j < 0) return j; \
709
DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
711
memcpy(state->data_stack+state->data_stack_base, data, size); \
712
state->data_stack_base += size; \
715
#define DATA_STACK_POP(state, data, size, discard) \
717
TRACE(("copy data to %p from %d (%d)\n", \
718
data, state->data_stack_base-size, size)); \
719
memcpy(data, state->data_stack+state->data_stack_base-size, size); \
721
state->data_stack_base -= size; \
724
#define DATA_STACK_POP_DISCARD(state, size) \
726
TRACE(("discard data from %d (%d)\n", \
727
state->data_stack_base-size, size)); \
728
state->data_stack_base -= size; \
731
#define DATA_PUSH(x) \
732
DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733
#define DATA_POP(x) \
734
DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
735
#define DATA_POP_DISCARD(x) \
736
DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737
#define DATA_ALLOC(t,p) \
738
DATA_STACK_ALLOC(state, t, p)
739
#define DATA_LOOKUP_AT(t,p,pos) \
740
DATA_STACK_LOOKUP_AT(state,t,p,pos)
742
#define MARK_PUSH(lastmark) \
743
do if (lastmark > 0) { \
744
i = lastmark; /* ctx->lastmark may change if reallocated */ \
745
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
747
#define MARK_POP(lastmark) \
748
do if (lastmark > 0) { \
749
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
751
#define MARK_POP_KEEP(lastmark) \
752
do if (lastmark > 0) { \
753
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
755
#define MARK_POP_DISCARD(lastmark) \
756
do if (lastmark > 0) { \
757
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761
#define JUMP_MAX_UNTIL_1 1
762
#define JUMP_MAX_UNTIL_2 2
763
#define JUMP_MAX_UNTIL_3 3
764
#define JUMP_MIN_UNTIL_1 4
765
#define JUMP_MIN_UNTIL_2 5
766
#define JUMP_MIN_UNTIL_3 6
767
#define JUMP_REPEAT 7
768
#define JUMP_REPEAT_ONE_1 8
769
#define JUMP_REPEAT_ONE_2 9
770
#define JUMP_MIN_REPEAT_ONE 10
771
#define JUMP_BRANCH 11
772
#define JUMP_ASSERT 12
773
#define JUMP_ASSERT_NOT 13
775
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776
DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777
nextctx->last_ctx_pos = ctx_pos; \
778
nextctx->jump = jumpvalue; \
779
nextctx->pattern = nextpattern; \
780
ctx_pos = alloc_pos; \
784
while (0) /* gcc doesn't like labels at end of scopes */ \
787
Py_ssize_t last_ctx_pos;
793
Py_ssize_t lastindex;
800
/* check if string matches the given pattern. returns <0 for
801
error, 0 for failure, and 1 for success */
803
SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
805
SRE_CHAR* end = (SRE_CHAR *)state->end;
806
Py_ssize_t alloc_pos, ctx_pos = -1;
807
Py_ssize_t i, ret = 0;
809
unsigned int sigcount=0;
811
SRE_MATCH_CONTEXT* ctx;
812
SRE_MATCH_CONTEXT* nextctx;
814
TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
816
DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817
ctx->last_ctx_pos = -1;
818
ctx->jump = JUMP_NONE;
819
ctx->pattern = pattern;
824
ctx->ptr = (SRE_CHAR *)state->ptr;
826
if (ctx->pattern[0] == SRE_OP_INFO) {
827
/* optimization info block */
828
/* <INFO> <1=skip> <2=flags> <3=min> ... */
829
if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
830
TRACE(("reject (got %d chars, need %d)\n",
831
(end - ctx->ptr), ctx->pattern[3]));
834
ctx->pattern += ctx->pattern[1] + 1;
839
if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840
RETURN_ERROR(SRE_ERROR_INTERRUPTED);
842
switch (*ctx->pattern++) {
847
TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848
ctx->ptr, ctx->pattern[0]));
851
state->lastindex = i/2 + 1;
852
if (i > state->lastmark) {
853
/* state->lastmark is the highest valid index in the
854
state->mark array. If it is increased by more than 1,
855
the intervening marks must be set to NULL to signal
856
that these marks have not been encountered. */
857
Py_ssize_t j = state->lastmark + 1;
859
state->mark[j++] = NULL;
862
state->mark[i] = ctx->ptr;
867
/* match literal string */
868
/* <LITERAL> <code> */
869
TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870
ctx->ptr, *ctx->pattern));
871
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
877
case SRE_OP_NOT_LITERAL:
878
/* match anything that is not literal character */
879
/* <NOT_LITERAL> <code> */
880
TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881
ctx->ptr, *ctx->pattern));
882
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
890
TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891
state->ptr = ctx->ptr;
895
/* match at given position */
897
TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898
if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
903
case SRE_OP_CATEGORY:
904
/* match at given category */
905
/* <CATEGORY> <code> */
906
TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907
ctx->ptr, *ctx->pattern));
908
if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
915
/* match anything (except a newline) */
917
TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918
if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
926
TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
933
/* match set member (or non_member) */
934
/* <IN> <skip> <set> */
935
TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936
if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
938
ctx->pattern += ctx->pattern[0];
942
case SRE_OP_LITERAL_IGNORE:
943
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944
ctx->pattern, ctx->ptr, ctx->pattern[0]));
945
if (ctx->ptr >= end ||
946
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
952
case SRE_OP_NOT_LITERAL_IGNORE:
953
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954
ctx->pattern, ctx->ptr, *ctx->pattern));
955
if (ctx->ptr >= end ||
956
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
962
case SRE_OP_IN_IGNORE:
963
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
965
|| !SRE_CHARSET(ctx->pattern+1,
966
(SRE_CODE)state->lower(*ctx->ptr)))
968
ctx->pattern += ctx->pattern[0];
975
/* <JUMP> <offset> */
976
TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977
ctx->ptr, ctx->pattern[0]));
978
ctx->pattern += ctx->pattern[0];
983
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
984
TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
986
ctx->u.rep = state->repeat;
988
MARK_PUSH(ctx->lastmark);
989
for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990
if (ctx->pattern[1] == SRE_OP_LITERAL &&
992
(SRE_CODE) *ctx->ptr != ctx->pattern[2]))
994
if (ctx->pattern[1] == SRE_OP_IN &&
996
!SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
998
state->ptr = ctx->ptr;
999
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1002
MARK_POP_DISCARD(ctx->lastmark);
1003
RETURN_ON_ERROR(ret);
1007
MARK_POP_KEEP(ctx->lastmark);
1011
MARK_POP_DISCARD(ctx->lastmark);
1014
case SRE_OP_REPEAT_ONE:
1015
/* match repeated sequence (maximizing regexp) */
1017
/* this operator only works if the repeated item is
1018
exactly one character wide, and we're not already
1019
collecting backtracking points. for other cases,
1020
use the MAX_REPEAT operator */
1022
/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1024
TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025
ctx->pattern[1], ctx->pattern[2]));
1027
if (ctx->ptr + ctx->pattern[1] > end)
1028
RETURN_FAILURE; /* cannot match */
1030
state->ptr = ctx->ptr;
1032
ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033
RETURN_ON_ERROR(ret);
1034
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1036
ctx->ptr += ctx->count;
1038
/* when we arrive here, count contains the number of
1039
matches, and ctx->ptr points to the tail of the target
1040
string. check if the rest of the pattern matches,
1041
and backtrack if not. */
1043
if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1046
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1047
/* tail is empty. we're finished */
1048
state->ptr = ctx->ptr;
1054
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1055
/* tail starts with a literal. skip positions where
1056
the rest of the pattern cannot possibly match */
1057
ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1059
while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1060
(ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1064
if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1066
state->ptr = ctx->ptr;
1067
DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068
ctx->pattern+ctx->pattern[0]);
1070
RETURN_ON_ERROR(ret);
1082
while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1083
state->ptr = ctx->ptr;
1084
DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085
ctx->pattern+ctx->pattern[0]);
1087
RETURN_ON_ERROR(ret);
1097
case SRE_OP_MIN_REPEAT_ONE:
1098
/* match repeated sequence (minimizing regexp) */
1100
/* this operator only works if the repeated item is
1101
exactly one character wide, and we're not already
1102
collecting backtracking points. for other cases,
1103
use the MIN_REPEAT operator */
1105
/* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1107
TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108
ctx->pattern[1], ctx->pattern[2]));
1110
if (ctx->ptr + ctx->pattern[1] > end)
1111
RETURN_FAILURE; /* cannot match */
1113
state->ptr = ctx->ptr;
1115
if (ctx->pattern[1] == 0)
1118
/* count using pattern min as the maximum */
1119
ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120
RETURN_ON_ERROR(ret);
1121
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1122
if (ret < (Py_ssize_t) ctx->pattern[1])
1123
/* didn't match minimum number of times */
1125
/* advance past minimum matches of repeat */
1127
ctx->ptr += ctx->count;
1130
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1131
/* tail is empty. we're finished */
1132
state->ptr = ctx->ptr;
1138
while ((Py_ssize_t)ctx->pattern[2] == 65535
1139
|| ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1140
state->ptr = ctx->ptr;
1141
DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142
ctx->pattern+ctx->pattern[0]);
1144
RETURN_ON_ERROR(ret);
1147
state->ptr = ctx->ptr;
1148
ret = SRE_COUNT(state, ctx->pattern+3, 1);
1149
RETURN_ON_ERROR(ret);
1150
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1162
/* create repeat context. all the hard work is done
1163
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1164
/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1165
TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166
ctx->pattern[1], ctx->pattern[2]));
1168
/* install new repeat context */
1169
ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1174
ctx->u.rep->count = -1;
1175
ctx->u.rep->pattern = ctx->pattern;
1176
ctx->u.rep->prev = state->repeat;
1177
ctx->u.rep->last_ptr = NULL;
1178
state->repeat = ctx->u.rep;
1180
state->ptr = ctx->ptr;
1181
DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1182
state->repeat = ctx->u.rep->prev;
1183
PyObject_FREE(ctx->u.rep);
1186
RETURN_ON_ERROR(ret);
1191
case SRE_OP_MAX_UNTIL:
1192
/* maximizing repeat */
1193
/* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1195
/* FIXME: we probably need to deal with zero-width
1196
matches in here... */
1198
ctx->u.rep = state->repeat;
1200
RETURN_ERROR(SRE_ERROR_STATE);
1202
state->ptr = ctx->ptr;
1204
ctx->count = ctx->u.rep->count+1;
1206
TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207
ctx->ptr, ctx->count));
1209
if (ctx->count < ctx->u.rep->pattern[1]) {
1210
/* not enough matches */
1211
ctx->u.rep->count = ctx->count;
1212
DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213
ctx->u.rep->pattern+3);
1215
RETURN_ON_ERROR(ret);
1218
ctx->u.rep->count = ctx->count-1;
1219
state->ptr = ctx->ptr;
1223
if ((ctx->count < ctx->u.rep->pattern[2] ||
1224
ctx->u.rep->pattern[2] == 65535) &&
1225
state->ptr != ctx->u.rep->last_ptr) {
1226
/* we may have enough matches, but if we can
1227
match another item, do so */
1228
ctx->u.rep->count = ctx->count;
1230
MARK_PUSH(ctx->lastmark);
1231
/* zero-width match protection */
1232
DATA_PUSH(&ctx->u.rep->last_ptr);
1233
ctx->u.rep->last_ptr = state->ptr;
1234
DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235
ctx->u.rep->pattern+3);
1236
DATA_POP(&ctx->u.rep->last_ptr);
1238
MARK_POP_DISCARD(ctx->lastmark);
1239
RETURN_ON_ERROR(ret);
1242
MARK_POP(ctx->lastmark);
1244
ctx->u.rep->count = ctx->count-1;
1245
state->ptr = ctx->ptr;
1248
/* cannot match more repeated items here. make sure the
1250
state->repeat = ctx->u.rep->prev;
1251
DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1252
RETURN_ON_SUCCESS(ret);
1253
state->repeat = ctx->u.rep;
1254
state->ptr = ctx->ptr;
1257
case SRE_OP_MIN_UNTIL:
1258
/* minimizing repeat */
1259
/* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1261
ctx->u.rep = state->repeat;
1263
RETURN_ERROR(SRE_ERROR_STATE);
1265
state->ptr = ctx->ptr;
1267
ctx->count = ctx->u.rep->count+1;
1269
TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270
ctx->ptr, ctx->count, ctx->u.rep->pattern));
1272
if (ctx->count < ctx->u.rep->pattern[1]) {
1273
/* not enough matches */
1274
ctx->u.rep->count = ctx->count;
1275
DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276
ctx->u.rep->pattern+3);
1278
RETURN_ON_ERROR(ret);
1281
ctx->u.rep->count = ctx->count-1;
1282
state->ptr = ctx->ptr;
1288
/* see if the tail matches */
1289
state->repeat = ctx->u.rep->prev;
1290
DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1292
RETURN_ON_ERROR(ret);
1296
state->repeat = ctx->u.rep;
1297
state->ptr = ctx->ptr;
1301
if (ctx->count >= ctx->u.rep->pattern[2]
1302
&& ctx->u.rep->pattern[2] != 65535)
1305
ctx->u.rep->count = ctx->count;
1306
DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307
ctx->u.rep->pattern+3);
1309
RETURN_ON_ERROR(ret);
1312
ctx->u.rep->count = ctx->count-1;
1313
state->ptr = ctx->ptr;
1316
case SRE_OP_GROUPREF:
1317
/* match backreference */
1318
TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319
ctx->ptr, ctx->pattern[0]));
1320
i = ctx->pattern[0];
1322
Py_ssize_t groupref = i+i;
1323
if (groupref >= state->lastmark) {
1326
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328
if (!p || !e || e < p)
1331
if (ctx->ptr >= end || *ctx->ptr != *p)
1340
case SRE_OP_GROUPREF_IGNORE:
1341
/* match backreference */
1342
TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343
ctx->ptr, ctx->pattern[0]));
1344
i = ctx->pattern[0];
1346
Py_ssize_t groupref = i+i;
1347
if (groupref >= state->lastmark) {
1350
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352
if (!p || !e || e < p)
1355
if (ctx->ptr >= end ||
1356
state->lower(*ctx->ptr) != state->lower(*p))
1365
case SRE_OP_GROUPREF_EXISTS:
1366
TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367
ctx->ptr, ctx->pattern[0]));
1368
/* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369
i = ctx->pattern[0];
1371
Py_ssize_t groupref = i+i;
1372
if (groupref >= state->lastmark) {
1373
ctx->pattern += ctx->pattern[1];
1376
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378
if (!p || !e || e < p) {
1379
ctx->pattern += ctx->pattern[1];
1388
/* assert subpattern */
1389
/* <ASSERT> <skip> <back> <pattern> */
1390
TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391
ctx->ptr, ctx->pattern[1]));
1392
state->ptr = ctx->ptr - ctx->pattern[1];
1393
if (state->ptr < state->beginning)
1395
DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1396
RETURN_ON_FAILURE(ret);
1397
ctx->pattern += ctx->pattern[0];
1400
case SRE_OP_ASSERT_NOT:
1401
/* assert not subpattern */
1402
/* <ASSERT_NOT> <skip> <back> <pattern> */
1403
TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404
ctx->ptr, ctx->pattern[1]));
1405
state->ptr = ctx->ptr - ctx->pattern[1];
1406
if (state->ptr >= state->beginning) {
1407
DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1409
RETURN_ON_ERROR(ret);
1413
ctx->pattern += ctx->pattern[0];
1416
case SRE_OP_FAILURE:
1417
/* immediate failure */
1418
TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1422
TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1424
RETURN_ERROR(SRE_ERROR_ILLEGAL);
1429
ctx_pos = ctx->last_ctx_pos;
1431
DATA_POP_DISCARD(ctx);
1434
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1437
case JUMP_MAX_UNTIL_2:
1438
TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439
goto jump_max_until_2;
1440
case JUMP_MAX_UNTIL_3:
1441
TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442
goto jump_max_until_3;
1443
case JUMP_MIN_UNTIL_2:
1444
TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445
goto jump_min_until_2;
1446
case JUMP_MIN_UNTIL_3:
1447
TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448
goto jump_min_until_3;
1450
TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1452
case JUMP_MAX_UNTIL_1:
1453
TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454
goto jump_max_until_1;
1455
case JUMP_MIN_UNTIL_1:
1456
TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457
goto jump_min_until_1;
1459
TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1461
case JUMP_REPEAT_ONE_1:
1462
TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463
goto jump_repeat_one_1;
1464
case JUMP_REPEAT_ONE_2:
1465
TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466
goto jump_repeat_one_2;
1467
case JUMP_MIN_REPEAT_ONE:
1468
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469
goto jump_min_repeat_one;
1471
TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1473
case JUMP_ASSERT_NOT:
1474
TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475
goto jump_assert_not;
1477
TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1481
return ret; /* should never get here */
1485
SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1487
SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488
SRE_CHAR* end = (SRE_CHAR *)state->end;
1489
Py_ssize_t status = 0;
1490
Py_ssize_t prefix_len = 0;
1491
Py_ssize_t prefix_skip = 0;
1492
SRE_CODE* prefix = NULL;
1493
SRE_CODE* charset = NULL;
1494
SRE_CODE* overlap = NULL;
1497
if (pattern[0] == SRE_OP_INFO) {
1498
/* optimization info block */
1499
/* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1503
if (pattern[3] > 1) {
1504
/* adjust end point (but make sure we leave at least one
1505
character in there, so literal search will work) */
1506
end -= pattern[3]-1;
1511
if (flags & SRE_INFO_PREFIX) {
1512
/* pattern starts with a known prefix */
1513
/* <length> <skip> <prefix data> <overlap data> */
1514
prefix_len = pattern[5];
1515
prefix_skip = pattern[6];
1516
prefix = pattern + 7;
1517
overlap = prefix + prefix_len - 1;
1518
} else if (flags & SRE_INFO_CHARSET)
1519
/* pattern starts with a character from a known set */
1521
charset = pattern + 5;
1523
pattern += 1 + pattern[1];
1526
TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527
TRACE(("charset = %p\n", charset));
1529
#if defined(USE_FAST_SEARCH)
1530
if (prefix_len > 1) {
1531
/* pattern starts with a known prefix. use the overlap
1532
table to skip forward as fast as we possibly can */
1534
end = (SRE_CHAR *)state->end;
1537
if ((SRE_CODE) ptr[0] != prefix[i]) {
1543
if (++i == prefix_len) {
1544
/* found a potential match */
1545
TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546
state->start = ptr + 1 - prefix_len;
1547
state->ptr = ptr + 1 - prefix_len + prefix_skip;
1548
if (flags & SRE_INFO_LITERAL)
1549
return 1; /* we got all of it */
1550
status = SRE_MATCH(state, pattern + 2*prefix_skip);
1553
/* close but no cigar -- try again */
1565
if (pattern[0] == SRE_OP_LITERAL) {
1566
/* pattern starts with a literal character. this is used
1567
for short prefixes, and if fast search is disabled */
1568
SRE_CODE chr = pattern[1];
1569
end = (SRE_CHAR *)state->end;
1571
while (ptr < end && (SRE_CODE) ptr[0] != chr)
1575
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1578
if (flags & SRE_INFO_LITERAL)
1579
return 1; /* we got all of it */
1580
status = SRE_MATCH(state, pattern + 2);
1584
} else if (charset) {
1585
/* pattern starts with a character from a known set */
1586
end = (SRE_CHAR *)state->end;
1588
while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1592
TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1595
status = SRE_MATCH(state, pattern);
1602
while (ptr <= end) {
1603
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1604
state->start = state->ptr = ptr++;
1605
status = SRE_MATCH(state, pattern);
1614
SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1616
/* check if given string is a literal template (i.e. no escapes) */
1623
#if !defined(SRE_RECURSIVE)
1625
/* -------------------------------------------------------------------- */
1626
/* factories and destructors */
1628
/* see sre.h for object declarations */
1629
static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630
static PyObject*pattern_scanner(PatternObject*, PyObject*);
1633
sre_codesize(PyObject* self, PyObject *unused)
1635
return Py_BuildValue("l", sizeof(SRE_CODE));
1639
sre_getlower(PyObject* self, PyObject* args)
1641
int character, flags;
1642
if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1644
if (flags & SRE_FLAG_LOCALE)
1645
return Py_BuildValue("i", sre_lower_locale(character));
1646
if (flags & SRE_FLAG_UNICODE)
1647
#if defined(HAVE_UNICODE)
1648
return Py_BuildValue("i", sre_lower_unicode(character));
1650
return Py_BuildValue("i", sre_lower_locale(character));
1652
return Py_BuildValue("i", sre_lower(character));
1656
state_reset(SRE_STATE* state)
1658
/* FIXME: dynamic! */
1659
/*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1661
state->lastmark = -1;
1662
state->lastindex = -1;
1664
state->repeat = NULL;
1666
data_stack_dealloc(state);
1670
getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1672
/* given a python object, return a data pointer, a length (in
1673
characters), and a character size. return NULL if the object
1674
is not a string (or not compatible) */
1676
PyBufferProcs *buffer;
1677
Py_ssize_t size, bytes;
1682
/* Unicode objects do not support the buffer API. So, get the data
1683
directly instead. */
1684
if (PyUnicode_Check(string)) {
1685
ptr = (void *)PyUnicode_AS_DATA(string);
1686
*p_length = PyUnicode_GET_SIZE(string);
1687
*p_charsize = sizeof(Py_UNICODE);
1691
/* get pointer to string buffer */
1693
buffer = Py_TYPE(string)->tp_as_buffer;
1694
if (!buffer || !buffer->bf_getbuffer ||
1695
(*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1700
/* determine buffer size */
1704
/* Release the buffer immediately --- possibly dangerous
1705
but doing something else would require some re-factoring
1707
PyBuffer_Release(&view);
1710
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1714
/* determine character size */
1715
size = PyObject_Size(string);
1717
if (PyBytes_Check(string) || bytes == size)
1719
#if defined(HAVE_UNICODE)
1720
else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1721
charsize = sizeof(Py_UNICODE);
1724
PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1729
*p_charsize = charsize;
1732
PyErr_SetString(PyExc_ValueError,
1739
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1740
Py_ssize_t start, Py_ssize_t end)
1742
/* prepare state object */
1748
memset(state, 0, sizeof(SRE_STATE));
1750
state->lastmark = -1;
1751
state->lastindex = -1;
1753
ptr = getstring(string, &length, &charsize);
1757
if (charsize == 1 && pattern->charsize > 1) {
1758
PyErr_SetString(PyExc_TypeError,
1759
"can't use a string pattern on a bytes-like object");
1762
if (charsize > 1 && pattern->charsize == 1) {
1763
PyErr_SetString(PyExc_TypeError,
1764
"can't use a bytes pattern on a string-like object");
1768
/* adjust boundaries */
1771
else if (start > length)
1776
else if (end > length)
1779
state->charsize = charsize;
1781
state->beginning = ptr;
1783
state->start = (void*) ((char*) ptr + start * state->charsize);
1784
state->end = (void*) ((char*) ptr + end * state->charsize);
1787
state->string = string;
1789
state->endpos = end;
1791
if (pattern->flags & SRE_FLAG_LOCALE)
1792
state->lower = sre_lower_locale;
1793
else if (pattern->flags & SRE_FLAG_UNICODE)
1794
#if defined(HAVE_UNICODE)
1795
state->lower = sre_lower_unicode;
1797
state->lower = sre_lower_locale;
1800
state->lower = sre_lower;
1806
state_fini(SRE_STATE* state)
1808
Py_XDECREF(state->string);
1809
data_stack_dealloc(state);
1812
/* calculate offset from start of string */
1813
#define STATE_OFFSET(state, member)\
1814
(((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1817
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1821
index = (index - 1) * 2;
1823
if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1825
/* want empty string */
1832
i = STATE_OFFSET(state, state->mark[index]);
1833
j = STATE_OFFSET(state, state->mark[index+1]);
1836
return PySequence_GetSlice(string, i, j);
1840
pattern_error(int status)
1843
case SRE_ERROR_RECURSION_LIMIT:
1846
"maximum recursion limit exceeded"
1849
case SRE_ERROR_MEMORY:
1852
case SRE_ERROR_INTERRUPTED:
1853
/* An exception has already been raised, so let it fly */
1856
/* other error codes indicate compiler/engine bugs */
1859
"internal error in regular expression engine"
1865
pattern_dealloc(PatternObject* self)
1867
if (self->weakreflist != NULL)
1868
PyObject_ClearWeakRefs((PyObject *) self);
1869
Py_XDECREF(self->pattern);
1870
Py_XDECREF(self->groupindex);
1871
Py_XDECREF(self->indexgroup);
1876
pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1882
Py_ssize_t start = 0;
1883
Py_ssize_t end = PY_SSIZE_T_MAX;
1884
static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1885
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1886
&string, &start, &end))
1889
string = state_init(&state, self, string, start, end);
1893
state.ptr = state.start;
1895
TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1897
if (state.charsize == 1) {
1898
status = sre_match(&state, PatternObject_GetCode(self));
1900
#if defined(HAVE_UNICODE)
1901
status = sre_umatch(&state, PatternObject_GetCode(self));
1905
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1906
if (PyErr_Occurred())
1911
return pattern_new_match(self, &state, status);
1915
pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1921
Py_ssize_t start = 0;
1922
Py_ssize_t end = PY_SSIZE_T_MAX;
1923
static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1924
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1925
&string, &start, &end))
1928
string = state_init(&state, self, string, start, end);
1932
TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1934
if (state.charsize == 1) {
1935
status = sre_search(&state, PatternObject_GetCode(self));
1937
#if defined(HAVE_UNICODE)
1938
status = sre_usearch(&state, PatternObject_GetCode(self));
1942
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1946
if (PyErr_Occurred())
1949
return pattern_new_match(self, &state, status);
1953
call(char* module, char* function, PyObject* args)
1962
name = PyUnicode_FromString(module);
1965
mod = PyImport_Import(name);
1969
func = PyObject_GetAttrString(mod, function);
1973
result = PyObject_CallObject(func, args);
1979
#ifdef USE_BUILTIN_COPY
1981
deepcopy(PyObject** object, PyObject* memo)
1987
PyTuple_Pack(2, *object, memo)
1995
return 1; /* success */
2000
join_list(PyObject* list, PyObject* string)
2002
/* join list elements */
2005
#if PY_VERSION_HEX >= 0x01060000
2011
joiner = PySequence_GetSlice(string, 0, 0);
2015
if (PyList_GET_SIZE(list) == 0) {
2020
#if PY_VERSION_HEX >= 0x01060000
2021
function = PyObject_GetAttrString(joiner, "join");
2026
args = PyTuple_New(1);
2028
Py_DECREF(function);
2032
PyTuple_SET_ITEM(args, 0, list);
2033
result = PyObject_CallObject(function, args);
2034
Py_DECREF(args); /* also removes list */
2035
Py_DECREF(function);
2039
PyTuple_Pack(2, list, joiner)
2048
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2056
Py_ssize_t start = 0;
2057
Py_ssize_t end = PY_SSIZE_T_MAX;
2058
static char* kwlist[] = { "source", "pos", "endpos", NULL };
2059
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2060
&string, &start, &end))
2063
string = state_init(&state, self, string, start, end);
2067
list = PyList_New(0);
2073
while (state.start <= state.end) {
2077
state_reset(&state);
2079
state.ptr = state.start;
2081
if (state.charsize == 1) {
2082
status = sre_search(&state, PatternObject_GetCode(self));
2084
#if defined(HAVE_UNICODE)
2085
status = sre_usearch(&state, PatternObject_GetCode(self));
2089
if (PyErr_Occurred())
2095
pattern_error(status);
2099
/* don't bother to build a match object */
2100
switch (self->groups) {
2102
b = STATE_OFFSET(&state, state.start);
2103
e = STATE_OFFSET(&state, state.ptr);
2104
item = PySequence_GetSlice(string, b, e);
2109
item = state_getslice(&state, 1, string, 1);
2114
item = PyTuple_New(self->groups);
2117
for (i = 0; i < self->groups; i++) {
2118
PyObject* o = state_getslice(&state, i+1, string, 1);
2123
PyTuple_SET_ITEM(item, i, o);
2128
status = PyList_Append(list, item);
2133
if (state.ptr == state.start)
2134
state.start = (void*) ((char*) state.ptr + state.charsize);
2136
state.start = state.ptr;
2150
#if PY_VERSION_HEX >= 0x02020000
2152
pattern_finditer(PatternObject* pattern, PyObject* args)
2158
scanner = pattern_scanner(pattern, args);
2162
search = PyObject_GetAttrString(scanner, "search");
2167
iterator = PyCallIter_New(search, Py_None);
2175
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2186
Py_ssize_t maxsplit = 0;
2187
static char* kwlist[] = { "source", "maxsplit", NULL };
2188
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2189
&string, &maxsplit))
2192
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2196
list = PyList_New(0);
2205
while (!maxsplit || n < maxsplit) {
2207
state_reset(&state);
2209
state.ptr = state.start;
2211
if (state.charsize == 1) {
2212
status = sre_search(&state, PatternObject_GetCode(self));
2214
#if defined(HAVE_UNICODE)
2215
status = sre_usearch(&state, PatternObject_GetCode(self));
2219
if (PyErr_Occurred())
2225
pattern_error(status);
2229
if (state.start == state.ptr) {
2230
if (last == state.end)
2232
/* skip one character */
2233
state.start = (void*) ((char*) state.ptr + state.charsize);
2237
/* get segment before this match */
2238
item = PySequence_GetSlice(
2239
string, STATE_OFFSET(&state, last),
2240
STATE_OFFSET(&state, state.start)
2244
status = PyList_Append(list, item);
2249
/* add groups (if any) */
2250
for (i = 0; i < self->groups; i++) {
2251
item = state_getslice(&state, i+1, string, 0);
2254
status = PyList_Append(list, item);
2262
last = state.start = state.ptr;
2266
/* get segment following last match (even if empty) */
2267
item = PySequence_GetSlice(
2268
string, STATE_OFFSET(&state, last), state.endpos
2272
status = PyList_Append(list, item);
2288
pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2289
Py_ssize_t count, Py_ssize_t subn)
2302
int filter_is_callable;
2304
if (PyCallable_Check(ptemplate)) {
2305
/* sub/subn takes either a function or a template */
2308
filter_is_callable = 1;
2310
/* if not callable, check if it's a literal string */
2312
ptr = getstring(ptemplate, &n, &bint);
2316
literal = sre_literal_template((unsigned char *)ptr, n);
2318
#if defined(HAVE_UNICODE)
2319
literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2329
filter_is_callable = 0;
2331
/* not a literal; hand it over to the template compiler */
2333
SRE_PY_MODULE, "_subx",
2334
PyTuple_Pack(2, self, ptemplate)
2338
filter_is_callable = PyCallable_Check(filter);
2342
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2348
list = PyList_New(0);
2357
while (!count || n < count) {
2359
state_reset(&state);
2361
state.ptr = state.start;
2363
if (state.charsize == 1) {
2364
status = sre_search(&state, PatternObject_GetCode(self));
2366
#if defined(HAVE_UNICODE)
2367
status = sre_usearch(&state, PatternObject_GetCode(self));
2371
if (PyErr_Occurred())
2377
pattern_error(status);
2381
b = STATE_OFFSET(&state, state.start);
2382
e = STATE_OFFSET(&state, state.ptr);
2385
/* get segment before this match */
2386
item = PySequence_GetSlice(string, i, b);
2389
status = PyList_Append(list, item);
2394
} else if (i == b && i == e && n > 0)
2395
/* ignore empty match on latest position */
2398
if (filter_is_callable) {
2399
/* pass match object through filter */
2400
match = pattern_new_match(self, &state, 1);
2403
args = PyTuple_Pack(1, match);
2408
item = PyObject_CallObject(filter, args);
2414
/* filter is literal string */
2420
if (item != Py_None) {
2421
status = PyList_Append(list, item);
2432
if (state.ptr == state.start)
2433
state.start = (void*) ((char*) state.ptr + state.charsize);
2435
state.start = state.ptr;
2439
/* get segment following last match */
2440
if (i < state.endpos) {
2441
item = PySequence_GetSlice(string, i, state.endpos);
2444
status = PyList_Append(list, item);
2454
/* convert list to single string (also removes list) */
2455
item = join_list(list, string);
2461
return Py_BuildValue("Ni", item, n);
2474
pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2476
PyObject* ptemplate;
2478
Py_ssize_t count = 0;
2479
static char* kwlist[] = { "repl", "string", "count", NULL };
2480
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2481
&ptemplate, &string, &count))
2484
return pattern_subx(self, ptemplate, string, count, 0);
2488
pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2490
PyObject* ptemplate;
2492
Py_ssize_t count = 0;
2493
static char* kwlist[] = { "repl", "string", "count", NULL };
2494
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2495
&ptemplate, &string, &count))
2498
return pattern_subx(self, ptemplate, string, count, 1);
2502
pattern_copy(PatternObject* self, PyObject *unused)
2504
#ifdef USE_BUILTIN_COPY
2505
PatternObject* copy;
2508
copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2512
offset = offsetof(PatternObject, groups);
2514
Py_XINCREF(self->groupindex);
2515
Py_XINCREF(self->indexgroup);
2516
Py_XINCREF(self->pattern);
2518
memcpy((char*) copy + offset, (char*) self + offset,
2519
sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2520
copy->weakreflist = NULL;
2522
return (PyObject*) copy;
2524
PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2530
pattern_deepcopy(PatternObject* self, PyObject* memo)
2532
#ifdef USE_BUILTIN_COPY
2533
PatternObject* copy;
2535
copy = (PatternObject*) pattern_copy(self);
2539
if (!deepcopy(©->groupindex, memo) ||
2540
!deepcopy(©->indexgroup, memo) ||
2541
!deepcopy(©->pattern, memo)) {
2547
PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2552
PyDoc_STRVAR(pattern_match_doc,
2553
"match(string[, pos[, endpos]]) --> match object or None.\n\
2554
Matches zero or more characters at the beginning of the string");
2556
PyDoc_STRVAR(pattern_search_doc,
2557
"search(string[, pos[, endpos]]) --> match object or None.\n\
2558
Scan through string looking for a match, and return a corresponding\n\
2559
MatchObject instance. Return None if no position in the string matches.");
2561
PyDoc_STRVAR(pattern_split_doc,
2562
"split(string[, maxsplit = 0]) --> list.\n\
2563
Split string by the occurrences of pattern.");
2565
PyDoc_STRVAR(pattern_findall_doc,
2566
"findall(string[, pos[, endpos]]) --> list.\n\
2567
Return a list of all non-overlapping matches of pattern in string.");
2569
PyDoc_STRVAR(pattern_finditer_doc,
2570
"finditer(string[, pos[, endpos]]) --> iterator.\n\
2571
Return an iterator over all non-overlapping matches for the \n\
2572
RE pattern in string. For each match, the iterator returns a\n\
2575
PyDoc_STRVAR(pattern_sub_doc,
2576
"sub(repl, string[, count = 0]) --> newstring\n\
2577
Return the string obtained by replacing the leftmost non-overlapping\n\
2578
occurrences of pattern in string by the replacement repl.");
2580
PyDoc_STRVAR(pattern_subn_doc,
2581
"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2582
Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2583
the leftmost non-overlapping occurrences of pattern with the\n\
2584
replacement repl.");
2586
PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2588
static PyMethodDef pattern_methods[] = {
2589
{"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2591
{"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2592
pattern_search_doc},
2593
{"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2595
{"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2597
{"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2599
{"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2600
pattern_findall_doc},
2601
#if PY_VERSION_HEX >= 0x02020000
2602
{"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2603
pattern_finditer_doc},
2605
{"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2606
{"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2607
{"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2611
#define PAT_OFF(x) offsetof(PatternObject, x)
2612
static PyMemberDef pattern_members[] = {
2613
{"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2614
{"flags", T_INT, PAT_OFF(flags), READONLY},
2615
{"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2616
{"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2617
{NULL} /* Sentinel */
2620
static PyTypeObject Pattern_Type = {
2621
PyVarObject_HEAD_INIT(NULL, 0)
2622
"_" SRE_MODULE ".SRE_Pattern",
2623
sizeof(PatternObject), sizeof(SRE_CODE),
2624
(destructor)pattern_dealloc, /* tp_dealloc */
2628
0, /* tp_reserved */
2630
0, /* tp_as_number */
2631
0, /* tp_as_sequence */
2632
0, /* tp_as_mapping */
2636
0, /* tp_getattro */
2637
0, /* tp_setattro */
2638
0, /* tp_as_buffer */
2639
Py_TPFLAGS_DEFAULT, /* tp_flags */
2640
pattern_doc, /* tp_doc */
2641
0, /* tp_traverse */
2643
0, /* tp_richcompare */
2644
offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2646
0, /* tp_iternext */
2647
pattern_methods, /* tp_methods */
2648
pattern_members, /* tp_members */
2651
static int _validate(PatternObject *self); /* Forward */
2654
_compile(PyObject* self_, PyObject* args)
2656
/* "compile" pattern descriptor to pattern object */
2658
PatternObject* self;
2664
Py_ssize_t groups = 0;
2665
PyObject* groupindex = NULL;
2666
PyObject* indexgroup = NULL;
2667
if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2668
&PyList_Type, &code, &groups,
2669
&groupindex, &indexgroup))
2672
n = PyList_GET_SIZE(code);
2673
/* coverity[ampersand_in_size] */
2674
self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2680
for (i = 0; i < n; i++) {
2681
PyObject *o = PyList_GET_ITEM(code, i);
2682
unsigned long value = PyLong_AsUnsignedLong(o);
2683
self->code[i] = (SRE_CODE) value;
2684
if ((unsigned long) self->code[i] != value) {
2685
PyErr_SetString(PyExc_OverflowError,
2686
"regular expression code size limit exceeded");
2691
if (PyErr_Occurred()) {
2696
if (pattern == Py_None)
2697
self->charsize = -1;
2699
Py_ssize_t p_length;
2700
if (!getstring(pattern, &p_length, &self->charsize)) {
2707
self->pattern = pattern;
2709
self->flags = flags;
2711
self->groups = groups;
2713
Py_XINCREF(groupindex);
2714
self->groupindex = groupindex;
2716
Py_XINCREF(indexgroup);
2717
self->indexgroup = indexgroup;
2719
self->weakreflist = NULL;
2721
if (!_validate(self)) {
2726
return (PyObject*) self;
2729
/* -------------------------------------------------------------------- */
2730
/* Code validation */
2732
/* To learn more about this code, have a look at the _compile() function in
2733
Lib/sre_compile.py. The validation functions below checks the code array
2734
for conformance with the code patterns generated there.
2736
The nice thing about the generated code is that it is position-independent:
2737
all jumps are relative jumps forward. Also, jumps don't cross each other:
2738
the target of a later jump is always earlier than the target of an earlier
2739
jump. IOW, this is okay:
2741
J---------J-------T--------T
2743
\______________________/
2747
J---------J-------T--------T
2751
It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2752
bytes wide (the latter if Python is compiled for "wide" unicode support).
2755
/* Defining this one enables tracing of the validator */
2758
/* Trace macro for the validator */
2759
#if defined(VVERBOSE)
2760
#define VTRACE(v) printf v
2765
/* Report failure */
2766
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2768
/* Extract opcode, argument, or skip count from code array */
2771
VTRACE(("%p: ", code)); \
2772
if (code >= end) FAIL; \
2774
VTRACE(("%lu (op)\n", (unsigned long)op)); \
2778
VTRACE(("%p= ", code)); \
2779
if (code >= end) FAIL; \
2781
VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2783
#define GET_SKIP_ADJ(adj) \
2785
VTRACE(("%p= ", code)); \
2786
if (code >= end) FAIL; \
2788
VTRACE(("%lu (skip to %p)\n", \
2789
(unsigned long)skip, code+skip)); \
2790
if (code+skip-adj < code || code+skip-adj > end)\
2794
#define GET_SKIP GET_SKIP_ADJ(0)
2797
_validate_charset(SRE_CODE *code, SRE_CODE *end)
2799
/* Some variables are manipulated by the macros above */
2805
while (code < end) {
2812
case SRE_OP_LITERAL:
2821
case SRE_OP_CHARSET:
2822
offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2823
if (code+offset < code || code+offset > end)
2828
case SRE_OP_BIGCHARSET:
2829
GET_ARG; /* Number of blocks */
2830
offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2831
if (code+offset < code || code+offset > end)
2833
/* Make sure that each byte points to a valid block */
2834
for (i = 0; i < 256; i++) {
2835
if (((unsigned char *)code)[i] >= arg)
2839
offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2840
if (code+offset < code || code+offset > end)
2845
case SRE_OP_CATEGORY:
2848
case SRE_CATEGORY_DIGIT:
2849
case SRE_CATEGORY_NOT_DIGIT:
2850
case SRE_CATEGORY_SPACE:
2851
case SRE_CATEGORY_NOT_SPACE:
2852
case SRE_CATEGORY_WORD:
2853
case SRE_CATEGORY_NOT_WORD:
2854
case SRE_CATEGORY_LINEBREAK:
2855
case SRE_CATEGORY_NOT_LINEBREAK:
2856
case SRE_CATEGORY_LOC_WORD:
2857
case SRE_CATEGORY_LOC_NOT_WORD:
2858
case SRE_CATEGORY_UNI_DIGIT:
2859
case SRE_CATEGORY_UNI_NOT_DIGIT:
2860
case SRE_CATEGORY_UNI_SPACE:
2861
case SRE_CATEGORY_UNI_NOT_SPACE:
2862
case SRE_CATEGORY_UNI_WORD:
2863
case SRE_CATEGORY_UNI_NOT_WORD:
2864
case SRE_CATEGORY_UNI_LINEBREAK:
2865
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2882
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2884
/* Some variables are manipulated by the macros above */
2889
VTRACE(("code=%p, end=%p\n", code, end));
2894
while (code < end) {
2899
/* We don't check whether marks are properly nested; the
2900
sre_match() code is robust even if they don't, and the worst
2901
you can get is nonsensical match results. */
2903
if (arg > 2*groups+1) {
2904
VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2909
case SRE_OP_LITERAL:
2910
case SRE_OP_NOT_LITERAL:
2911
case SRE_OP_LITERAL_IGNORE:
2912
case SRE_OP_NOT_LITERAL_IGNORE:
2914
/* The arg is just a character, nothing to check */
2917
case SRE_OP_SUCCESS:
2918
case SRE_OP_FAILURE:
2919
/* Nothing to check; these normally end the matching process */
2925
case SRE_AT_BEGINNING:
2926
case SRE_AT_BEGINNING_STRING:
2927
case SRE_AT_BEGINNING_LINE:
2929
case SRE_AT_END_LINE:
2930
case SRE_AT_END_STRING:
2931
case SRE_AT_BOUNDARY:
2932
case SRE_AT_NON_BOUNDARY:
2933
case SRE_AT_LOC_BOUNDARY:
2934
case SRE_AT_LOC_NON_BOUNDARY:
2935
case SRE_AT_UNI_BOUNDARY:
2936
case SRE_AT_UNI_NON_BOUNDARY:
2944
case SRE_OP_ANY_ALL:
2945
/* These have no operands */
2949
case SRE_OP_IN_IGNORE:
2951
/* Stop 1 before the end; we check the FAILURE below */
2952
if (!_validate_charset(code, code+skip-2))
2954
if (code[skip-2] != SRE_OP_FAILURE)
2961
/* A minimal info field is
2962
<INFO> <1=skip> <2=flags> <3=min> <4=max>;
2963
If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2965
SRE_CODE flags, min, max, i;
2968
newcode = code+skip-1;
2969
GET_ARG; flags = arg;
2972
/* Check that only valid flags are present */
2973
if ((flags & ~(SRE_INFO_PREFIX |
2975
SRE_INFO_CHARSET)) != 0)
2977
/* PREFIX and CHARSET are mutually exclusive */
2978
if ((flags & SRE_INFO_PREFIX) &&
2979
(flags & SRE_INFO_CHARSET))
2981
/* LITERAL implies PREFIX */
2982
if ((flags & SRE_INFO_LITERAL) &&
2983
!(flags & SRE_INFO_PREFIX))
2985
/* Validate the prefix */
2986
if (flags & SRE_INFO_PREFIX) {
2987
SRE_CODE prefix_len, prefix_skip;
2988
GET_ARG; prefix_len = arg;
2989
GET_ARG; prefix_skip = arg;
2990
/* Here comes the prefix string */
2991
if (code+prefix_len < code || code+prefix_len > newcode)
2994
/* And here comes the overlap table */
2995
if (code+prefix_len < code || code+prefix_len > newcode)
2997
/* Each overlap value should be < prefix_len */
2998
for (i = 0; i < prefix_len; i++) {
2999
if (code[i] >= prefix_len)
3004
/* Validate the charset */
3005
if (flags & SRE_INFO_CHARSET) {
3006
if (!_validate_charset(code, newcode-1))
3008
if (newcode[-1] != SRE_OP_FAILURE)
3012
else if (code != newcode) {
3013
VTRACE(("code=%p, newcode=%p\n", code, newcode));
3021
SRE_CODE *target = NULL;
3026
/* Stop 2 before the end; we check the JUMP below */
3027
if (!_validate_inner(code, code+skip-3, groups))
3030
/* Check that it ends with a JUMP, and that each JUMP
3031
has the same target */
3033
if (op != SRE_OP_JUMP)
3037
target = code+skip-1;
3038
else if (code+skip-1 != target)
3044
case SRE_OP_REPEAT_ONE:
3045
case SRE_OP_MIN_REPEAT_ONE:
3053
#ifdef Py_UNICODE_WIDE
3057
if (!_validate_inner(code, code+skip-4, groups))
3061
if (op != SRE_OP_SUCCESS)
3074
#ifdef Py_UNICODE_WIDE
3078
if (!_validate_inner(code, code+skip-3, groups))
3082
if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3087
case SRE_OP_GROUPREF:
3088
case SRE_OP_GROUPREF_IGNORE:
3094
case SRE_OP_GROUPREF_EXISTS:
3095
/* The regex syntax for this is: '(?(group)then|else)', where
3096
'group' is either an integer group number or a group name,
3097
'then' and 'else' are sub-regexes, and 'else' is optional. */
3102
code--; /* The skip is relative to the first arg! */
3103
/* There are two possibilities here: if there is both a 'then'
3104
part and an 'else' part, the generated code looks like:
3112
(<skipyes> jumps here)
3114
(<skipno> jumps here)
3116
If there is only a 'then' part, it looks like:
3124
There is no direct way to decide which it is, and we don't want
3125
to allow arbitrary jumps anywhere in the code; so we just look
3126
for a JUMP opcode preceding our skip target.
3128
if (skip >= 3 && code+skip-3 >= code &&
3129
code[skip-3] == SRE_OP_JUMP)
3131
VTRACE(("both then and else parts present\n"));
3132
if (!_validate_inner(code+1, code+skip-3, groups))
3134
code += skip-2; /* Position after JUMP, at <skipno> */
3136
if (!_validate_inner(code, code+skip-1, groups))
3141
VTRACE(("only a then part present\n"));
3142
if (!_validate_inner(code+1, code+skip-1, groups))
3149
case SRE_OP_ASSERT_NOT:
3151
GET_ARG; /* 0 for lookahead, width for lookbehind */
3152
code--; /* Back up over arg to simplify math below */
3153
if (arg & 0x80000000)
3154
FAIL; /* Width too large */
3155
/* Stop 1 before the end; we check the SUCCESS below */
3156
if (!_validate_inner(code+1, code+skip-2, groups))
3160
if (op != SRE_OP_SUCCESS)
3175
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3177
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3179
if (groups == 0) /* fix for simplejson */
3180
groups = 100; /* 100 groups should always be safe */
3181
return _validate_inner(code, end-1, groups);
3185
_validate(PatternObject *self)
3187
if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3189
PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3193
VTRACE(("Success!\n"));
3197
/* -------------------------------------------------------------------- */
3201
match_dealloc(MatchObject* self)
3203
Py_XDECREF(self->regs);
3204
Py_XDECREF(self->string);
3205
Py_DECREF(self->pattern);
3210
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3212
if (index < 0 || index >= self->groups) {
3213
/* raise IndexError if we were given a bad group number */
3223
if (self->string == Py_None || self->mark[index] < 0) {
3224
/* return default value if the string or group is undefined */
3229
return PySequence_GetSlice(
3230
self->string, self->mark[index], self->mark[index+1]
3235
match_getindex(MatchObject* self, PyObject* index)
3243
if (PyLong_Check(index))
3244
return PyLong_AsSsize_t(index);
3248
if (self->pattern->groupindex) {
3249
index = PyObject_GetItem(self->pattern->groupindex, index);
3251
if (PyLong_Check(index))
3252
i = PyLong_AsSsize_t(index);
3262
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3264
return match_getslice_by_index(self, match_getindex(self, index), def);
3268
match_expand(MatchObject* self, PyObject* ptemplate)
3270
/* delegate to Python code */
3272
SRE_PY_MODULE, "_expand",
3273
PyTuple_Pack(3, self->pattern, self, ptemplate)
3278
match_group(MatchObject* self, PyObject* args)
3283
size = PyTuple_GET_SIZE(args);
3287
result = match_getslice(self, Py_False, Py_None);
3290
result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3293
/* fetch multiple items */
3294
result = PyTuple_New(size);
3297
for (i = 0; i < size; i++) {
3298
PyObject* item = match_getslice(
3299
self, PyTuple_GET_ITEM(args, i), Py_None
3305
PyTuple_SET_ITEM(result, i, item);
3313
match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3318
PyObject* def = Py_None;
3319
static char* kwlist[] = { "default", NULL };
3320
if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3323
result = PyTuple_New(self->groups-1);
3327
for (index = 1; index < self->groups; index++) {
3329
item = match_getslice_by_index(self, index, def);
3334
PyTuple_SET_ITEM(result, index-1, item);
3341
match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3347
PyObject* def = Py_None;
3348
static char* kwlist[] = { "default", NULL };
3349
if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3352
result = PyDict_New();
3353
if (!result || !self->pattern->groupindex)
3356
keys = PyMapping_Keys(self->pattern->groupindex);
3360
for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3364
key = PyList_GET_ITEM(keys, index);
3367
value = match_getslice(self, key, def);
3372
status = PyDict_SetItem(result, key, value);
3389
match_start(MatchObject* self, PyObject* args)
3393
PyObject* index_ = NULL;
3394
if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3397
index = match_getindex(self, index_);
3399
if (index < 0 || index >= self->groups) {
3407
/* mark is -1 if group is undefined */
3408
return Py_BuildValue("i", self->mark[index*2]);
3412
match_end(MatchObject* self, PyObject* args)
3416
PyObject* index_ = NULL;
3417
if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3420
index = match_getindex(self, index_);
3422
if (index < 0 || index >= self->groups) {
3430
/* mark is -1 if group is undefined */
3431
return Py_BuildValue("i", self->mark[index*2+1]);
3435
_pair(Py_ssize_t i1, Py_ssize_t i2)
3440
pair = PyTuple_New(2);
3444
item = PyLong_FromSsize_t(i1);
3447
PyTuple_SET_ITEM(pair, 0, item);
3449
item = PyLong_FromSsize_t(i2);
3452
PyTuple_SET_ITEM(pair, 1, item);
3462
match_span(MatchObject* self, PyObject* args)
3466
PyObject* index_ = NULL;
3467
if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3470
index = match_getindex(self, index_);
3472
if (index < 0 || index >= self->groups) {
3480
/* marks are -1 if group is undefined */
3481
return _pair(self->mark[index*2], self->mark[index*2+1]);
3485
match_regs(MatchObject* self)
3491
regs = PyTuple_New(self->groups);
3495
for (index = 0; index < self->groups; index++) {
3496
item = _pair(self->mark[index*2], self->mark[index*2+1]);
3501
PyTuple_SET_ITEM(regs, index, item);
3511
match_copy(MatchObject* self, PyObject *unused)
3513
#ifdef USE_BUILTIN_COPY
3515
Py_ssize_t slots, offset;
3517
slots = 2 * (self->pattern->groups+1);
3519
copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3523
/* this value a constant, but any compiler should be able to
3524
figure that out all by itself */
3525
offset = offsetof(MatchObject, string);
3527
Py_XINCREF(self->pattern);
3528
Py_XINCREF(self->string);
3529
Py_XINCREF(self->regs);
3531
memcpy((char*) copy + offset, (char*) self + offset,
3532
sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3534
return (PyObject*) copy;
3536
PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3542
match_deepcopy(MatchObject* self, PyObject* memo)
3544
#ifdef USE_BUILTIN_COPY
3547
copy = (MatchObject*) match_copy(self);
3551
if (!deepcopy((PyObject**) ©->pattern, memo) ||
3552
!deepcopy(©->string, memo) ||
3553
!deepcopy(©->regs, memo)) {
3559
PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3564
static PyMethodDef match_methods[] = {
3565
{"group", (PyCFunction) match_group, METH_VARARGS},
3566
{"start", (PyCFunction) match_start, METH_VARARGS},
3567
{"end", (PyCFunction) match_end, METH_VARARGS},
3568
{"span", (PyCFunction) match_span, METH_VARARGS},
3569
{"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3570
{"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3571
{"expand", (PyCFunction) match_expand, METH_O},
3572
{"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3573
{"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3578
match_lastindex_get(MatchObject *self)
3580
if (self->lastindex >= 0)
3581
return Py_BuildValue("i", self->lastindex);
3587
match_lastgroup_get(MatchObject *self)
3589
if (self->pattern->indexgroup && self->lastindex >= 0) {
3590
PyObject* result = PySequence_GetItem(
3591
self->pattern->indexgroup, self->lastindex
3602
match_regs_get(MatchObject *self)
3605
Py_INCREF(self->regs);
3608
return match_regs(self);
3611
static PyGetSetDef match_getset[] = {
3612
{"lastindex", (getter)match_lastindex_get, (setter)NULL},
3613
{"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3614
{"regs", (getter)match_regs_get, (setter)NULL},
3618
#define MATCH_OFF(x) offsetof(MatchObject, x)
3619
static PyMemberDef match_members[] = {
3620
{"string", T_OBJECT, MATCH_OFF(string), READONLY},
3621
{"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3622
{"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3623
{"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3627
/* FIXME: implement setattr("string", None) as a special case (to
3628
detach the associated string, if any */
3630
static PyTypeObject Match_Type = {
3631
PyVarObject_HEAD_INIT(NULL,0)
3632
"_" SRE_MODULE ".SRE_Match",
3633
sizeof(MatchObject), sizeof(Py_ssize_t),
3634
(destructor)match_dealloc, /* tp_dealloc */
3638
0, /* tp_reserved */
3640
0, /* tp_as_number */
3641
0, /* tp_as_sequence */
3642
0, /* tp_as_mapping */
3646
0, /* tp_getattro */
3647
0, /* tp_setattro */
3648
0, /* tp_as_buffer */
3649
Py_TPFLAGS_DEFAULT, /* tp_flags */
3651
0, /* tp_traverse */
3653
0, /* tp_richcompare */
3654
0, /* tp_weaklistoffset */
3656
0, /* tp_iternext */
3657
match_methods, /* tp_methods */
3658
match_members, /* tp_members */
3659
match_getset, /* tp_getset */
3663
pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3665
/* create match object (from state object) */
3674
/* create match object (with room for extra group marks) */
3675
/* coverity[ampersand_in_size] */
3676
match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3677
2*(pattern->groups+1));
3682
match->pattern = pattern;
3684
Py_INCREF(state->string);
3685
match->string = state->string;
3688
match->groups = pattern->groups+1;
3690
/* fill in group slices */
3692
base = (char*) state->beginning;
3693
n = state->charsize;
3695
match->mark[0] = ((char*) state->start - base) / n;
3696
match->mark[1] = ((char*) state->ptr - base) / n;
3698
for (i = j = 0; i < pattern->groups; i++, j+=2)
3699
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3700
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3701
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3703
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3705
match->pos = state->pos;
3706
match->endpos = state->endpos;
3708
match->lastindex = state->lastindex;
3710
return (PyObject*) match;
3712
} else if (status == 0) {
3720
/* internal error */
3721
pattern_error(status);
3726
/* -------------------------------------------------------------------- */
3727
/* scanner methods (experimental) */
3730
scanner_dealloc(ScannerObject* self)
3732
state_fini(&self->state);
3733
Py_DECREF(self->pattern);
3738
scanner_match(ScannerObject* self, PyObject *unused)
3740
SRE_STATE* state = &self->state;
3746
state->ptr = state->start;
3748
if (state->charsize == 1) {
3749
status = sre_match(state, PatternObject_GetCode(self->pattern));
3751
#if defined(HAVE_UNICODE)
3752
status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3755
if (PyErr_Occurred())
3758
match = pattern_new_match((PatternObject*) self->pattern,
3761
if (status == 0 || state->ptr == state->start)
3762
state->start = (void*) ((char*) state->ptr + state->charsize);
3764
state->start = state->ptr;
3771
scanner_search(ScannerObject* self, PyObject *unused)
3773
SRE_STATE* state = &self->state;
3779
state->ptr = state->start;
3781
if (state->charsize == 1) {
3782
status = sre_search(state, PatternObject_GetCode(self->pattern));
3784
#if defined(HAVE_UNICODE)
3785
status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3788
if (PyErr_Occurred())
3791
match = pattern_new_match((PatternObject*) self->pattern,
3794
if (status == 0 || state->ptr == state->start)
3795
state->start = (void*) ((char*) state->ptr + state->charsize);
3797
state->start = state->ptr;
3802
static PyMethodDef scanner_methods[] = {
3803
{"match", (PyCFunction) scanner_match, METH_NOARGS},
3804
{"search", (PyCFunction) scanner_search, METH_NOARGS},
3808
#define SCAN_OFF(x) offsetof(ScannerObject, x)
3809
static PyMemberDef scanner_members[] = {
3810
{"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3811
{NULL} /* Sentinel */
3814
static PyTypeObject Scanner_Type = {
3815
PyVarObject_HEAD_INIT(NULL, 0)
3816
"_" SRE_MODULE ".SRE_Scanner",
3817
sizeof(ScannerObject), 0,
3818
(destructor)scanner_dealloc,/* tp_dealloc */
3822
0, /* tp_reserved */
3824
0, /* tp_as_number */
3825
0, /* tp_as_sequence */
3826
0, /* tp_as_mapping */
3830
0, /* tp_getattro */
3831
0, /* tp_setattro */
3832
0, /* tp_as_buffer */
3833
Py_TPFLAGS_DEFAULT, /* tp_flags */
3835
0, /* tp_traverse */
3837
0, /* tp_richcompare */
3838
0, /* tp_weaklistoffset */
3840
0, /* tp_iternext */
3841
scanner_methods, /* tp_methods */
3842
scanner_members, /* tp_members */
3847
pattern_scanner(PatternObject* pattern, PyObject* args)
3849
/* create search state object */
3851
ScannerObject* self;
3854
Py_ssize_t start = 0;
3855
Py_ssize_t end = PY_SSIZE_T_MAX;
3856
if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3859
/* create scanner object */
3860
self = PyObject_NEW(ScannerObject, &Scanner_Type);
3864
string = state_init(&self->state, pattern, string, start, end);
3871
self->pattern = (PyObject*) pattern;
3873
return (PyObject*) self;
3876
static PyMethodDef _functions[] = {
3877
{"compile", _compile, METH_VARARGS},
3878
{"getcodesize", sre_codesize, METH_NOARGS},
3879
{"getlower", sre_getlower, METH_VARARGS},
3883
static struct PyModuleDef sremodule = {
3884
PyModuleDef_HEAD_INIT,
3895
PyMODINIT_FUNC PyInit__sre(void)
3901
/* Initialize object types */
3902
if (PyType_Ready(&Pattern_Type) < 0)
3904
if (PyType_Ready(&Match_Type) < 0)
3906
if (PyType_Ready(&Scanner_Type) < 0)
3909
m = PyModule_Create(&sremodule);
3912
d = PyModule_GetDict(m);
3914
x = PyLong_FromLong(SRE_MAGIC);
3916
PyDict_SetItemString(d, "MAGIC", x);
3920
x = PyLong_FromLong(sizeof(SRE_CODE));
3922
PyDict_SetItemString(d, "CODESIZE", x);
3926
x = PyUnicode_FromString(copyright);
3928
PyDict_SetItemString(d, "copyright", x);
3934
#endif /* !defined(SRE_RECURSIVE) */