1
//** Unit to process RegEx commands
3
{ TRegExpr class library
4
Delphi Regular Expressions
6
Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
8
You may use this software in any kind of development,
9
including comercial, redistribute, and modify it freely,
10
under the following restrictions :
11
1. This software is provided as it is, without any kind of
12
warranty given. Use it at Your own risk.The author is not
13
responsible for any consequences of use of this software.
14
2. The origin of this software may not be mispresented, You
15
must not claim that You wrote the original software. If
16
You use this software in any kind of product, it would be
17
appreciated that there in a information box, or in the
18
documentation would be an acknowledgement like
20
Partial Copyright (c) 2004 Andrey V. Sorokin
21
http://RegExpStudio.com
24
3. You may not have any income from distributing this source
25
(or altered version of it) to other developers. When You
26
use this product in a comercial package, the source may
27
not be charged seperatly.
28
4. Altered versions must be plainly marked as such, and must
29
not be misrepresented as being the original software.
30
5. RegExp Studio application and all the visual components as
31
well as documentation is not part of the TRegExpr library
32
and is not free for usage.
35
http://RegExpStudio.com
39
// ======== Determine compiler
40
{$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
41
{$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
42
{$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
43
{$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
44
{$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
45
{$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
46
{$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
47
{$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
48
{$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
50
// ======== Define base compiler options
56
{$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
59
{$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
60
{$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
61
{$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
64
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
67
// ======== Define options for TRegExpr engine
68
{.$DEFINE UniCode} // Unicode support
69
{$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
70
{$IFNDEF FPC} // the option is not supported in FreePascal
71
{$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
73
{$DEFINE ComplexBraces} // support braces in complex cases
74
{$IFNDEF UniCode} // the option applicable only for non-UniCode mode
75
{$DEFINE UseSetOfChar} // Significant optimization by using set of char
78
{$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
81
// ======== Define Pascal-language options
82
// Define 'UseAsserts' option (do not edit this definitions).
83
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
84
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
85
{$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
86
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
88
// Define 'use subroutine parameters default values' option (do not edit this definition).
89
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
91
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
92
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
93
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
96
Classes, // TStrings in Split method
97
SysUtils; // Exception
101
PRegExprChar = PWideChar;
102
RegExprString = WideString;
105
PRegExprChar = PChar;
106
RegExprString = AnsiString; //###0.952 was string
109
TREOp = REChar; // internal p-code type //###0.933
111
TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
112
PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
113
TREBracesArg = integer; // type of {m,n} arguments
114
PREBracesArg = ^TREBracesArg;
117
REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
118
RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
119
REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
122
TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
126
EscChar = '\'; // 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
127
RegExprModifierI : boolean = False; // default value for ModifierI
128
RegExprModifierR : boolean = True; // default value for ModifierR
129
RegExprModifierS : boolean = True; // default value for ModifierS
130
RegExprModifierG : boolean = True; // default value for ModifierG
131
RegExprModifierM : boolean = False; // default value for ModifierM
132
RegExprModifierX : boolean = False; // default value for ModifierX
133
RegExprSpaceChars : RegExprString = // default value for SpaceChars
135
RegExprWordChars : RegExprString = // default value for WordChars
136
'0123456789' //###0.940
137
+ 'abcdefghijklmnopqrstuvwxyz'
138
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
139
RegExprLineSeparators : RegExprString =// default value for LineSeparators
140
#$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
141
RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
143
{ if You need Unix-styled line separators (only \n), then use:
144
RegExprLineSeparators = #$a;
145
RegExprLinePairedSeparator = '';
150
NSUBEXP = 15; // max number of subexpression //###0.929
151
// Cannot be more than NSUBEXPMAX
152
// Be carefull - don't use values which overflow CLOSE opcode
153
// (in this case you'll get compiler erorr).
154
// Big NSUBEXP will cause more slow work and more stack required
155
NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
156
// Don't change it! It's defined by internal TRegExpr design.
158
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
160
{$IFDEF ComplexBraces}
161
LoopStackMax = 10; // max depth of loops stack //###0.925
165
// if range includes more then TinySetLen chars, //###0.934
166
// then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
167
// !!! Attension ! If you change TinySetLen, you must
168
// change code marked as "//!!!TinySet"
173
{$IFDEF UseSetOfChar}
174
PSetOfREChar = ^TSetOfREChar;
175
TSetOfREChar = set of REChar;
180
TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
185
startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
186
endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
188
{$IFDEF ComplexBraces}
189
LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
190
LoopStackIdx : integer; // 0 - out of all loops
193
// The "internal use only" fields to pass info from compile
194
// to execute that permits the execute phase to run lots faster on
196
regstart : REChar; // char that must begin a match; '\0' if none obvious
197
reganch : REChar; // is the match anchored (at beginning-of-line only)?
198
regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
199
regmlen : integer; // length of regmust string
200
// Regstart and reganch permit very fast decisions on suitable starting points
201
// for a match, cutting down the work a lot. Regmust permits fast rejection
202
// of lines that cannot possibly match. The regmust tests are costly enough
203
// that regcomp() supplies a regmust only if the r.e. contains something
204
// potentially expensive (at present, the only such thing detected is * or +
205
// at the start of the r.e., which can involve a lot of backup). Regmlen is
206
// supplied because the test in regexec() needs it and regcomp() is computing
208
{$IFDEF UseFirstCharSet} //###0.929
209
FirstCharSet : TSetOfREChar;
212
// work variables for Exec's routins - save stack in recursion}
213
reginput : PRegExprChar; // String-input pointer.
214
fInputStart : PRegExprChar; // Pointer to first char of input string.
215
fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
217
// work variables for compiler's routines
218
regparse : PRegExprChar; // Input-scan pointer.
219
regnpar : integer; // count.
221
regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
222
regsize : integer; // Code size.
224
regexpbeg : PRegExprChar; // only for error handling. Contains
225
// pointer to beginning of r.e. while compiling
226
fExprIsCompiled : boolean; // true if r.e. successfully compiled
228
// programm is essentially a linear encoding
229
// of a nondeterministic finite-state machine (aka syntax charts or
230
// "railroad normal form" in parsing technology). Each node is an opcode
231
// plus a "next" pointer, possibly plus an operand. "Next" pointers of
232
// all nodes except BRANCH implement concatenation; a "next" pointer with
233
// a BRANCH on both ends of it is connecting two alternatives. (Here we
234
// have one of the subtle syntax dependencies: an individual BRANCH (as
235
// opposed to a collection of them) is never concatenated with anything
236
// because of operator precedence.) The operand of some types of node is
237
// a literal string; for others, it is a node leading into a sub-FSM. In
238
// particular, the operand of a BRANCH node is the first node of the branch.
239
// (NB this is *not* a tree structure: the tail of the branch connects
240
// to the thing following the set of BRANCHes.) The opcodes are:
241
programm : PRegExprChar; // Unwarranted chumminess with compiler.
243
fExpression : PRegExprChar; // source of compiled r.e.
244
fInputString : PRegExprChar; // input string
246
fLastError : integer; // see Error, LastError
248
fModifiers : integer; // modifiers
249
fCompModifiers : integer; // compiler's copy of modifiers
250
fProgModifiers : integer; // modifiers values from last programm compilation
252
fSpaceChars : RegExprString; //###0.927
253
fWordChars : RegExprString; //###0.929
254
fInvertCase : TRegExprInvertCaseFunction; //###0.927
256
fLineSeparators : RegExprString; //###0.941
257
fLinePairedSeparatorAssigned : boolean;
258
fLinePairedSeparatorHead,
259
fLinePairedSeparatorTail : REChar;
261
fLineSeparatorsSet : set of REChar;
264
procedure InvalidateProgramm;
265
// Mark programm as have to be [re]compiled
267
function IsProgrammOk : boolean; //###0.941
268
// Check if we can use precompiled r.e. or
269
// [re]compile it if something changed
271
function GetExpression : RegExprString;
272
procedure SetExpression (const s : RegExprString);
274
function GetModifierStr : RegExprString;
275
class function ParseModifiersStr (const AModifiers : RegExprString;
276
var AModifiersInt : integer) : boolean; //###0.941 class function now
277
// Parse AModifiers string and return true and set AModifiersInt
278
// if it's in format 'ismxrg-ismxrg'.
279
procedure SetModifierStr (const AModifiers : RegExprString);
281
function GetModifier (AIndex : integer) : boolean;
282
procedure SetModifier (AIndex : integer; ASet : boolean);
284
procedure Error (AErrorID : integer); virtual; // error handler.
285
// Default handler raise exception ERegExpr with
286
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
287
// and CompilerErrorPos = value of property CompilerErrorPos.
290
{==================== Compiler section ===================}
291
function CompileRegExpr (exp : PRegExprChar) : boolean;
292
// compile a regular expression into internal code
294
procedure Tail (p : PRegExprChar; val : PRegExprChar);
295
// set the next-pointer at the end of a node chain
297
procedure OpTail (p : PRegExprChar; val : PRegExprChar);
298
// regoptail - regtail on operand of first argument; nop if operandless
300
function EmitNode (op : TREOp) : PRegExprChar;
301
// regnode - emit a node, return location
303
procedure EmitC (b : REChar);
304
// emit (if appropriate) a byte of code
306
procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
307
// insert an operator in front of already-emitted operand
308
// Means relocating the operand.
310
function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
311
// regular expression, i.e. main body or parenthesized thing
313
function ParseBranch (var flagp : integer) : PRegExprChar;
314
// one alternative of an | operator
316
function ParsePiece (var flagp : integer) : PRegExprChar;
317
// something followed by possible [*+?]
319
function ParseAtom (var flagp : integer) : PRegExprChar;
322
function GetCompilerErrorPos : integer;
323
// current pos in r.e. - for error hanling
325
{$IFDEF UseFirstCharSet} //###0.929
326
procedure FillFirstCharSet (prog : PRegExprChar);
329
{===================== Mathing section ===================}
330
function regrepeat (p : PRegExprChar; AMax : integer) : integer;
331
// repeatedly match something simple, report how many
333
function regnext (p : PRegExprChar) : PRegExprChar;
334
// dig the "next" pointer out of a node
336
function MatchPrim (prog : PRegExprChar) : boolean;
337
// recursively matching routine
339
function ExecPrim (AOffset: integer) : boolean;
340
// Exec for stored InputString
342
{$IFDEF RegExpPCodeDump}
343
function DumpOp (op : REChar) : RegExprString;
346
function GetSubExprMatchCount : integer;
347
function GetMatchPos (Idx : integer) : integer;
348
function GetMatchLen (Idx : integer) : integer;
349
function GetMatch (Idx : integer) : RegExprString;
351
function GetInputString : RegExprString;
352
procedure SetInputString (const AInputString : RegExprString);
354
{$IFNDEF UseSetOfChar}
355
function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
358
procedure SetLineSeparators (const AStr : RegExprString);
359
procedure SetLinePairedSeparator (const AStr : RegExprString);
360
function GetLinePairedSeparator : RegExprString;
364
destructor Destroy; override;
366
class function VersionMajor : integer; //###0.944
367
class function VersionMinor : integer; //###0.944
369
property Expression : RegExprString read GetExpression write SetExpression;
370
// Regular expression.
371
// For optimization, TRegExpr will automatically compiles it into 'P-code'
372
// (You can see it with help of Dump method) and stores in internal
373
// structures. Real [re]compilation occures only when it really needed -
374
// while calling Exec[Next], Substitute, Dump, etc
375
// and only if Expression or other P-code affected properties was changed
376
// after last [re]compilation.
377
// If any errors while [re]compilation occures, Error method is called
378
// (by default Error raises exception - see below)
380
property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
381
// Set/get default values of r.e.syntax modifiers. Modifiers in
382
// r.e. (?ismx-ismx) will replace this default values.
383
// If you try to set unsupported modifier, Error will be called
384
// (by defaul Error raises exception ERegExpr).
386
property ModifierI : boolean index 1 read GetModifier write SetModifier;
387
// Modifier /i - caseinsensitive, initialized from RegExprModifierI
389
property ModifierR : boolean index 2 read GetModifier write SetModifier;
390
// Modifier /r - use r.e.syntax extended for russian,
391
// (was property ExtSyntaxEnabled in previous versions)
392
// If true, then �-� additional include russian letter '�',
393
// �-� additional include '�', and �-� include all russian symbols.
394
// You have to turn it off if it may interfere with you national alphabet.
395
// , initialized from RegExprModifierR
397
property ModifierS : boolean index 3 read GetModifier write SetModifier;
398
// Modifier /s - '.' works as any char (else as [^\n]),
399
// , initialized from RegExprModifierS
401
property ModifierG : boolean index 4 read GetModifier write SetModifier;
402
// Switching off modifier /g switchs all operators in
403
// non-greedy style, so if ModifierG = False, then
404
// all '*' works as '*?', all '+' as '+?' and so on.
405
// , initialized from RegExprModifierG
407
property ModifierM : boolean index 5 read GetModifier write SetModifier;
408
// Treat string as multiple lines. That is, change `^' and `$' from
409
// matching at only the very start or end of the string to the start
410
// or end of any line anywhere within the string.
411
// , initialized from RegExprModifierM
413
property ModifierX : boolean index 6 read GetModifier write SetModifier;
414
// Modifier /x - eXtended syntax, allow r.e. text formatting,
415
// see description in the help. Initialized from RegExprModifierX
417
function Exec (const AInputString : RegExprString) : boolean; {$IFDEF OverMeth} overload;
418
{$IFNDEF FPC} // I do not know why FreePascal cannot overload methods with empty param list
419
function Exec : boolean; overload; //###0.949
421
function Exec (AOffset: integer) : boolean; overload; //###0.949
423
// match a programm against a string AInputString
424
// !!! Exec store AInputString into InputString property
425
// For Delphi 5 and higher available overloaded versions - first without
426
// parameter (uses already assigned to InputString property value)
427
// and second that has integer parameter and is same as ExecPos
429
function ExecNext : boolean;
433
// if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
434
// else ExecPos (MatchPos [0] + MatchLen [0]);
435
// but it's more simpler !
436
// Raises exception if used without preceeding SUCCESSFUL call to
437
// Exec* (Exec, ExecPos, ExecNext). So You always must use something like
438
// if Exec (InputString) then repeat { proceed results} until not ExecNext;
440
function ExecPos (AOffset: integer {$IFDEF DefParam}= 1{$ENDIF}) : boolean;
441
// find match for InputString starting from AOffset position
442
// (AOffset=1 - first char of InputString)
444
property InputString : RegExprString read GetInputString write SetInputString;
445
// returns current input string (from last Exec call or last assign
446
// to this property).
447
// Any assignment to this property clear Match* properties !
449
function Substitute (const ATemplate : RegExprString) : RegExprString;
450
// Returns ATemplate with '$&' or '$0' replaced by whole r.e.
451
// occurence and '$n' replaced by occurence of subexpression #n.
452
// Since v.0.929 '$' used instead of '\' (for future extensions
453
// and for more Perl-compatibility) and accept more then one digit.
454
// If you want place into template raw '$' or '\', use prefix '\'
455
// Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
456
// If you want to place raw digit after '$n' you must delimit
457
// n with curly braces '{}'.
458
// Example: 'a$12bc' -> 'a<Match[12]>bc'
459
// 'a${1}2bc' -> 'a<Match[1]>2bc'.
461
procedure Split (AInputStr : RegExprString; APieces : TStrings);
462
// Split AInputStr into APieces by r.e. occurencies
463
// Internally calls Exec[Next]
465
function Replace (AInputStr : RegExprString;
466
const AReplaceStr : RegExprString;
467
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) //###0.946
468
: RegExprString; {$IFDEF OverMeth} overload;
469
function Replace (AInputStr : RegExprString;
470
AReplaceFunc : TRegExprReplaceFunction)
471
: RegExprString; overload;
473
function ReplaceEx (AInputStr : RegExprString;
474
AReplaceFunc : TRegExprReplaceFunction)
476
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr
477
// If AUseSubstitution is true, then AReplaceStr will be used
478
// as template for Substitution methods.
480
// Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
481
// Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
482
// will return: def 'BLOCK' value 'test1'
483
// Replace ('BLOCK( test1)', 'def "$1" value "$2"')
484
// will return: def "$1" value "$2"
485
// Internally calls Exec[Next]
486
// Overloaded version and ReplaceEx operate with call-back function,
487
// so You can implement really complex functionality.
489
property SubExprMatchCount : integer read GetSubExprMatchCount;
490
// Number of subexpressions has been found in last Exec* call.
491
// If there are no subexpr. but whole expr was found (Exec* returned True),
492
// then SubExprMatchCount=0, if no subexpressions nor whole
493
// r.e. found (Exec* returned false) then SubExprMatchCount=-1.
494
// Note, that some subexpr. may be not found and for such
495
// subexpr. MathPos=MatchLen=-1 and Match=''.
496
// For example: Expression := '(1)?2(3)?';
497
// Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
498
// Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
499
// Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
500
// Exec ('2'): SubExprMatchCount=0, Match[0]='2'
501
// Exec ('7') - return False: SubExprMatchCount=-1
503
property MatchPos [Idx : integer] : integer read GetMatchPos;
504
// pos of entrance subexpr. #Idx into tested in last Exec*
505
// string. First subexpr. have Idx=1, last - MatchCount,
506
// whole r.e. have Idx=0.
507
// Returns -1 if in r.e. no such subexpr. or this subexpr.
508
// not found in input string.
510
property MatchLen [Idx : integer] : integer read GetMatchLen;
511
// len of entrance subexpr. #Idx r.e. into tested in last Exec*
512
// string. First subexpr. have Idx=1, last - MatchCount,
513
// whole r.e. have Idx=0.
514
// Returns -1 if in r.e. no such subexpr. or this subexpr.
515
// not found in input string.
516
// Remember - MatchLen may be 0 (if r.e. match empty string) !
518
property Match [Idx : integer] : RegExprString read GetMatch;
519
// == copy (InputString, MatchPos [Idx], MatchLen [Idx])
520
// Returns '' if in r.e. no such subexpr. or this subexpr.
521
// not found in input string.
523
function LastError : integer;
524
// Returns ID of last error, 0 if no errors (unusable if
525
// Error method raises exception) and clear internal status
526
// into 0 (no errors).
528
function ErrorMsg (AErrorID : integer) : RegExprString; virtual;
529
// Returns Error message for error with ID = AErrorID.
531
property CompilerErrorPos : integer read GetCompilerErrorPos;
532
// Returns pos in r.e. there compiler stopped.
533
// Usefull for error diagnostics
535
property SpaceChars : RegExprString read fSpaceChars write fSpaceChars; //###0.927
536
// Contains chars, treated as /s (initially filled with RegExprSpaceChars
539
property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
540
// Contains chars, treated as /w (initially filled with RegExprWordChars
543
property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
544
// line separators (like \n in Unix)
546
property LinePairedSeparator : RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; //###0.941
547
// paired line separator (like \r\n in DOS and Windows).
548
// must contain exactly two chars or no chars at all
550
class function InvertCaseFunction (const Ch : REChar) : REChar;
551
// Converts Ch into upper case if it in lower case or in lower
552
// if it in upper (uses current system local setings)
554
property InvertCase : TRegExprInvertCaseFunction read fInvertCase write fInvertCase; //##0.935
555
// Set this property if you want to override case-insensitive functionality.
556
// Create set it to RegExprInvertCaseFunction (InvertCaseFunction by default)
558
procedure Compile; //###0.941
559
// [Re]compile r.e. Usefull for example for GUI r.e. editors (to check
560
// all properties validity).
562
{$IFDEF RegExpPCodeDump}
563
function Dump : RegExprString;
564
// dump a compiled regexp in vaguely comprehensible form
568
ERegExpr = class (Exception)
571
CompilerErrorPos : integer;
575
RegExprInvertCaseFunction : TRegExprInvertCaseFunction = {$IFDEF FPC} nil {$ELSE} TRegExpr.InvertCaseFunction{$ENDIF};
576
// defaul for InvertCase property
578
function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
579
// true if string AInputString match regular expression ARegExpr
580
// ! will raise exeption if syntax errors in ARegExpr
582
procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
583
// Split AInputStr into APieces by r.e. ARegExpr occurencies
585
function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
586
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString; //###0.947
587
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr
588
// If AUseSubstitution is true, then AReplaceStr will be used
589
// as template for Substitution methods.
591
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
592
// 'BLOCK( test1)', 'def "$1" value "$2"', True)
593
// will return: def 'BLOCK' value 'test1'
594
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
595
// 'BLOCK( test1)', 'def "$1" value "$2"')
596
// will return: def "$1" value "$2"
598
function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
599
// Replace all metachars with its safe representation,
600
// for example 'abc$cd.(' converts into 'abc\$cd\.\('
601
// This function usefull for r.e. autogeneration from
604
function RegExprSubExpressions (const ARegExpr : string;
605
ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF DefParam}= False{$ENDIF}) : integer;
606
// Makes list of subexpressions found in ARegExpr r.e.
607
// In ASubExps every item represent subexpression,
608
// from first to last, in format:
609
// String - subexpression text (without '()')
610
// low word of Object - starting position in ARegExpr, including '('
611
// if exists! (first position is 1)
612
// high word of Object - length, including starting '(' and ending ')'
614
// AExtendedSyntax - must be True if modifier /m will be On while
616
// Usefull for GUI editors of r.e. etc (You can find example of using
617
// in TestRExp.dpr project)
619
// 0 Success. No unbalanced brackets was found;
620
// -1 There are not enough closing brackets ')';
621
// -(n+1) At position n was found opening '[' without //###0.942
622
// corresponding closing ']';
623
// n At position n was found closing bracket ')' without
624
// corresponding opening '('.
625
// If Result <> 0, then ASubExpr can contain empty items or illegal ones
631
TRegExprVersionMajor : integer = 0;
632
TRegExprVersionMinor : integer = 952;
633
// TRegExpr.VersionMajor/Minor return values of this constants
635
MaskModI = 1; // modifier /i bit in fModifiers
636
MaskModR = 2; // -"- /r
637
MaskModS = 4; // -"- /s
638
MaskModG = 8; // -"- /g
639
MaskModM = 16; // -"- /m
640
MaskModX = 32; // -"- /x
643
XIgnoredChars = ' '#9#$d#$a;
645
XIgnoredChars = [' ', #9, #$d, #$a];
648
{=============================================================}
649
{=================== WideString functions ====================}
650
{=============================================================}
654
function StrPCopy (Dest: PRegExprChar; const Source: RegExprString): PRegExprChar;
658
Len := length (Source); //###0.932
660
Dest [i - 1] := Source [i];
663
end; { of function StrPCopy
664
--------------------------------------------------------------}
666
function StrLCopy (Dest, Source: PRegExprChar; MaxLen: Cardinal): PRegExprChar;
669
for i := 0 to MaxLen - 1 do
670
Dest [i] := Source [i];
672
end; { of function StrLCopy
673
--------------------------------------------------------------}
675
function StrLen (Str: PRegExprChar): Cardinal;
678
while Str [result] <> #0
680
end; { of function StrLen
681
--------------------------------------------------------------}
683
function StrPos (Str1, Str2: PRegExprChar): PRegExprChar;
687
n := Pos (RegExprString (Str2), RegExprString (Str1));
690
Result := Str1 + n - 1;
691
end; { of function StrPos
692
--------------------------------------------------------------}
694
function StrLComp (Str1, Str2: PRegExprChar; MaxLen: Cardinal): Integer;
695
var S1, S2: RegExprString;
699
if Copy (S1, 1, MaxLen) > Copy (S2, 1, MaxLen)
702
if Copy (S1, 1, MaxLen) < Copy (S2, 1, MaxLen)
705
end; { function StrLComp
706
--------------------------------------------------------------}
708
function StrScan (Str: PRegExprChar; Chr: WideChar): PRegExprChar;
711
while (Str^ <> #0) and (Str^ <> Chr)
715
end; { of function StrScan
716
--------------------------------------------------------------}
721
{=============================================================}
722
{===================== Global functions ======================}
723
{=============================================================}
725
function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
728
r := TRegExpr.Create;
730
r.Expression := ARegExpr;
731
Result := r.Exec (AInputStr);
734
end; { of function ExecRegExpr
735
--------------------------------------------------------------}
737
procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
741
r := TRegExpr.Create;
743
r.Expression := ARegExpr;
744
r.Split (AInputStr, APieces);
747
end; { of procedure SplitRegExpr
748
--------------------------------------------------------------}
750
function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
751
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString;
753
with TRegExpr.Create do try
754
Expression := ARegExpr;
755
Result := Replace (AInputStr, AReplaceStr, AUseSubstitution);
758
end; { of function ReplaceRegExpr
759
--------------------------------------------------------------}
761
function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
763
RegExprMetaSet : RegExprString = '^$.[()|?+*'+EscChar+'{'
764
+ ']}'; // - this last are additional to META.
765
// Very similar to META array, but slighly changed.
766
// !Any changes in META array must be synchronized with this set.
768
i, i0, Len : integer;
771
Len := length (AStr);
774
while i <= Len do begin
775
if Pos (AStr [i], RegExprMetaSet) > 0 then begin
776
Result := Result + System.Copy (AStr, i0, i - i0)
777
+ EscChar + AStr [i];
782
Result := Result + System.Copy (AStr, i0, MaxInt); // Tail
783
end; { of function QuoteRegExprMetaChars
784
--------------------------------------------------------------}
786
function RegExprSubExpressions (const ARegExpr : string;
787
ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF DefParam}= False{$ENDIF}) : integer;
789
TStackItemRec = record //###0.945
790
SubExprIdx : integer;
793
TStackArray = packed array [0 .. NSUBEXPMAX - 1] of TStackItemRec;
795
Len, SubExprLen : integer;
798
Stack : ^TStackArray; //###0.945
799
StackIdx, StackSz : integer;
801
Result := 0; // no unbalanced brackets found at this very moment
803
ASubExprs.Clear; // I don't think that adding to non empty list
804
// can be usefull, so I simplified algorithm to work only with empty list
806
Len := length (ARegExpr); // some optimization tricks
808
// first we have to calculate number of subexpression to reserve
809
// space in Stack array (may be we'll reserve more then need, but
810
// it's faster then memory reallocation during parsing)
811
StackSz := 1; // add 1 for entire r.e.
813
if ARegExpr [i] = '('
815
// SetLength (Stack, StackSz); //###0.945
816
GetMem (Stack, SizeOf (TStackItemRec) * StackSz);
821
while (i <= Len) do begin
824
if (i < Len) and (ARegExpr [i + 1] = '?') then begin
825
// this is not subexpression, but comment or other
826
// Perl extension. We must check is it (?ismxrg-ismxrg)
827
// and change AExtendedSyntax if /x is changed.
828
inc (i, 2); // skip '(?'
830
while (i <= Len) and (ARegExpr [i] <> ')')
833
then Result := -1 // unbalansed '('
835
if TRegExpr.ParseModifiersStr (System.Copy (ARegExpr, i, i - i0), Modif)
836
then AExtendedSyntax := (Modif and MaskModX) <> 0;
838
else begin // subexpression starts
839
ASubExprs.Add (''); // just reserve space
840
with Stack [StackIdx] do begin
841
SubExprIdx := ASubExprs.Count - 1;
849
then Result := i // unbalanced ')'
852
with Stack [StackIdx] do begin
853
SubExprLen := i - StartPos + 1;
854
ASubExprs.Objects [SubExprIdx] :=
855
TObject (StartPos or (SubExprLen ShL 16));
856
ASubExprs [SubExprIdx] := System.Copy (
857
ARegExpr, StartPos + 1, SubExprLen - 2); // add without brackets
861
EscChar: inc (i); // skip quoted symbol
863
// we have to skip character ranges at once, because they can
864
// contain '#', and '#' in it must NOT be recognized as eXtended
865
// comment beginning!
868
if ARegExpr [i] = ']' // cannot be 'emty' ranges - this interpretes
869
then inc (i); // as ']' by itself
870
while (i <= Len) and (ARegExpr [i] <> ']') do
871
if ARegExpr [i] = EscChar //###0.942
872
then inc (i, 2) // skip 'escaped' char to prevent stopping at '\]'
874
if (i > Len) or (ARegExpr [i] <> ']') //###0.942
875
then Result := - (i0 + 1); // unbalansed '[' //###0.942
877
'#': if AExtendedSyntax then begin
878
// skip eXtended comments
879
while (i <= Len) and (ARegExpr [i] <> #$d) and (ARegExpr [i] <> #$a)
880
// do not use [#$d, #$a] due to UniCode compatibility
882
while (i + 1 <= Len) and ((ARegExpr [i + 1] = #$d) or (ARegExpr [i + 1] = #$a))
883
do inc (i); // attempt to work with different kinds of line separators
884
// now we are at the line separator that must be skipped.
886
// here is no 'else' clause - we simply skip ordinary chars
888
inc (i); // skip scanned char
889
// ! can move after Len due to skipping quoted symbol
892
// check brackets balance
894
then Result := -1; // unbalansed '('
896
// check if entire r.e. added
897
if (ASubExprs.Count = 0)
898
or ((integer (ASubExprs.Objects [0]) and $FFFF) <> 1)
899
or (((integer (ASubExprs.Objects [0]) ShR 16) and $FFFF) <> Len)
900
// whole r.e. wasn't added because it isn't bracketed
901
// well, we add it now:
902
then ASubExprs.InsertObject (0, ARegExpr, TObject ((Len ShL 16) or 1));
904
finally FreeMem (Stack);
906
end; { of function RegExprSubExpressions
907
--------------------------------------------------------------}
912
MAGIC = TREOp (216);// programm signature
914
// name opcode opnd? meaning
915
EEND = TREOp (0); // - End of program
916
BOL = TREOp (1); // - Match "" at beginning of line
917
EOL = TREOp (2); // - Match "" at end of line
918
ANY = TREOp (3); // - Match any one character
919
ANYOF = TREOp (4); // Str Match any character in string Str
920
ANYBUT = TREOp (5); // Str Match any char. not in string Str
921
BRANCH = TREOp (6); // Node Match this alternative, or the next
922
BACK = TREOp (7); // - Jump backward (Next < 0)
923
EXACTLY = TREOp (8); // Str Match string Str
924
NOTHING = TREOp (9); // - Match empty string
925
STAR = TREOp (10); // Node Match this (simple) thing 0 or more times
926
PLUS = TREOp (11); // Node Match this (simple) thing 1 or more times
927
ANYDIGIT = TREOp (12); // - Match any digit (equiv [0-9])
928
NOTDIGIT = TREOp (13); // - Match not digit (equiv [0-9])
929
ANYLETTER = TREOp (14); // - Match any letter from property WordChars
930
NOTLETTER = TREOp (15); // - Match not letter from property WordChars
931
ANYSPACE = TREOp (16); // - Match any space char (see property SpaceChars)
932
NOTSPACE = TREOp (17); // - Match not space char (see property SpaceChars)
933
BRACES = TREOp (18); // Node,Min,Max Match this (simple) thing from Min to Max times.
934
// Min and Max are TREBracesArg
935
COMMENT = TREOp (19); // - Comment ;)
936
EXACTLYCI = TREOp (20); // Str Match string Str case insensitive
937
ANYOFCI = TREOp (21); // Str Match any character in string Str, case insensitive
938
ANYBUTCI = TREOp (22); // Str Match any char. not in string Str, case insensitive
939
LOOPENTRY = TREOp (23); // Node Start of loop (Node - LOOP for this loop)
940
LOOP = TREOp (24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
941
// Min and Max are TREBracesArg
942
// Node - next node in sequence,
943
// LoopEntryJmp - associated LOOPENTRY node addr
944
ANYOFTINYSET= TREOp (25); // Chrs Match any one char from Chrs (exactly TinySetLen chars)
945
ANYBUTTINYSET=TREOp (26); // Chrs Match any one char not in Chrs (exactly TinySetLen chars)
946
ANYOFFULLSET= TREOp (27); // Set Match any one char from set of char
947
// - very fast (one CPU instruction !) but takes 32 bytes of p-code
948
BSUBEXP = TREOp (28); // Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
949
BSUBEXPCI = TREOp (29); // Idx -"- in case-insensitive mode
951
// Non-Greedy Style Ops //###0.940
952
STARNG = TREOp (30); // Same as START but in non-greedy mode
953
PLUSNG = TREOp (31); // Same as PLUS but in non-greedy mode
954
BRACESNG = TREOp (32); // Same as BRACES but in non-greedy mode
955
LOOPNG = TREOp (33); // Same as LOOP but in non-greedy mode
958
BOLML = TREOp (34); // - Match "" at beginning of line
959
EOLML = TREOp (35); // - Match "" at end of line
960
ANYML = TREOp (36); // - Match any one character
963
BOUND = TREOp (37); // Match "" between words //###0.943
964
NOTBOUND = TREOp (38); // Match "" not between words //###0.943
966
// !!! Change OPEN value if you add new opcodes !!!
968
OPEN = TREOp (39); // - Mark this point in input as start of \n
969
// OPEN + 1 is \1, etc.
970
CLOSE = TREOp (ord (OPEN) + NSUBEXP);
971
// - Analogous to OPEN.
973
// !!! Don't add new OpCodes after CLOSE !!!
975
// We work with p-code thru pointers, compatible with PRegExprChar.
976
// Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
977
// must have lengths that can be divided by SizeOf (REChar) !
978
// A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
979
// The Next is a offset from the opcode of the node containing it.
980
// An operand, if any, simply follows the node. (Note that much of
981
// the code generation knows about this implicit relationship!)
982
// Using TRENextOff=integer speed up p-code processing.
984
// Opcodes description:
986
// BRANCH The set of branches constituting a single choice are hooked
987
// together with their "next" pointers, since precedence prevents
988
// anything being concatenated to any individual branch. The
989
// "next" pointer of the last BRANCH in a choice points to the
990
// thing following the whole choice. This is also where the
991
// final "next" pointer of each individual branch points; each
992
// branch starts with the operand node of a BRANCH node.
993
// BACK Normal "next" pointers all implicitly point forward; BACK
994
// exists to make loop structures possible.
995
// STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
996
// circular BRANCH structures using BACK. Complex '{min,max}'
997
// - as pair LOOPENTRY-LOOP (see below). Simple cases (one
998
// character per match) are implemented with STAR, PLUS and
999
// BRACES for speed and to minimize recursive plunges.
1000
// LOOPENTRY,LOOP {min,max} are implemented as special pair
1001
// LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
1003
// OPEN,CLOSE are numbered at compile time.
1006
{=============================================================}
1007
{================== Error handling section ===================}
1008
{=============================================================}
1012
reeCompNullArgument = 100;
1013
reeCompRegexpTooBig = 101;
1014
reeCompParseRegTooManyBrackets = 102;
1015
reeCompParseRegUnmatchedBrackets = 103;
1016
reeCompParseRegUnmatchedBrackets2 = 104;
1017
reeCompParseRegJunkOnEnd = 105;
1018
reePlusStarOperandCouldBeEmpty = 106;
1020
reeBadHexDigit = 108;
1021
reeInvalidRange = 109;
1022
reeParseAtomTrailingBackSlash = 110;
1023
reeNoHexCodeAfterBSlashX = 111;
1024
reeHexCodeAfterBSlashXTooBig = 112;
1025
reeUnmatchedSqBrackets = 113;
1026
reeInternalUrp = 114;
1027
reeQPSBFollowsNothing = 115;
1028
reeTrailingBackSlash = 116;
1029
reeRarseAtomInternalDisaster = 119;
1030
reeBRACESArgTooBig = 122;
1031
reeBracesMinParamGreaterMax = 124;
1032
reeUnclosedComment = 125;
1033
reeComplexBracesNotImplemented = 126;
1034
reeUrecognizedModifier = 127;
1035
reeBadLinePairedSeparator = 128;
1036
reeRegRepeatCalledInappropriately = 1000;
1037
reeMatchPrimMemoryCorruption = 1001;
1038
reeMatchPrimCorruptedPointers = 1002;
1039
reeNoExpression = 1003;
1040
reeCorruptedProgram = 1004;
1041
reeNoInpitStringSpecified = 1005;
1042
reeOffsetMustBeGreaterThen0 = 1006;
1043
reeExecNextWithoutExec = 1007;
1044
reeGetInputStringWithoutInputString = 1008;
1045
reeDumpCorruptedOpcode = 1011;
1046
reeModifierUnsupported = 1013;
1047
reeLoopStackExceeded = 1014;
1048
reeLoopWithoutEntry = 1015;
1049
reeBadPCodeImported = 2000;
1051
function TRegExpr.ErrorMsg (AErrorID : integer) : RegExprString;
1054
reeOk: Result := 'No errors';
1055
reeCompNullArgument: Result := 'TRegExpr(comp): Null Argument';
1056
reeCompRegexpTooBig: Result := 'TRegExpr(comp): Regexp Too Big';
1057
reeCompParseRegTooManyBrackets: Result := 'TRegExpr(comp): ParseReg Too Many ()';
1058
reeCompParseRegUnmatchedBrackets: Result := 'TRegExpr(comp): ParseReg Unmatched ()';
1059
reeCompParseRegUnmatchedBrackets2: Result := 'TRegExpr(comp): ParseReg Unmatched ()';
1060
reeCompParseRegJunkOnEnd: Result := 'TRegExpr(comp): ParseReg Junk On End';
1061
reePlusStarOperandCouldBeEmpty: Result := 'TRegExpr(comp): *+ Operand Could Be Empty';
1062
reeNestedSQP: Result := 'TRegExpr(comp): Nested *?+';
1063
reeBadHexDigit: Result := 'TRegExpr(comp): Bad Hex Digit';
1064
reeInvalidRange: Result := 'TRegExpr(comp): Invalid [] Range';
1065
reeParseAtomTrailingBackSlash: Result := 'TRegExpr(comp): Parse Atom Trailing \';
1066
reeNoHexCodeAfterBSlashX: Result := 'TRegExpr(comp): No Hex Code After \x';
1067
reeHexCodeAfterBSlashXTooBig: Result := 'TRegExpr(comp): Hex Code After \x Is Too Big';
1068
reeUnmatchedSqBrackets: Result := 'TRegExpr(comp): Unmatched []';
1069
reeInternalUrp: Result := 'TRegExpr(comp): Internal Urp';
1070
reeQPSBFollowsNothing: Result := 'TRegExpr(comp): ?+*{ Follows Nothing';
1071
reeTrailingBackSlash: Result := 'TRegExpr(comp): Trailing \';
1072
reeRarseAtomInternalDisaster: Result := 'TRegExpr(comp): RarseAtom Internal Disaster';
1073
reeBRACESArgTooBig: Result := 'TRegExpr(comp): BRACES Argument Too Big';
1074
reeBracesMinParamGreaterMax: Result := 'TRegExpr(comp): BRACE Min Param Greater then Max';
1075
reeUnclosedComment: Result := 'TRegExpr(comp): Unclosed (?#Comment)';
1076
reeComplexBracesNotImplemented: Result := 'TRegExpr(comp): If you want take part in beta-testing BRACES ''{min,max}'' and non-greedy ops ''*?'', ''+?'', ''??'' for complex cases - remove ''.'' from {.$DEFINE ComplexBraces}';
1077
reeUrecognizedModifier: Result := 'TRegExpr(comp): Urecognized Modifier';
1078
reeBadLinePairedSeparator: Result := 'TRegExpr(comp): LinePairedSeparator must countain two different chars or no chars at all';
1080
reeRegRepeatCalledInappropriately: Result := 'TRegExpr(exec): RegRepeat Called Inappropriately';
1081
reeMatchPrimMemoryCorruption: Result := 'TRegExpr(exec): MatchPrim Memory Corruption';
1082
reeMatchPrimCorruptedPointers: Result := 'TRegExpr(exec): MatchPrim Corrupted Pointers';
1083
reeNoExpression: Result := 'TRegExpr(exec): Not Assigned Expression Property';
1084
reeCorruptedProgram: Result := 'TRegExpr(exec): Corrupted Program';
1085
reeNoInpitStringSpecified: Result := 'TRegExpr(exec): No Input String Specified';
1086
reeOffsetMustBeGreaterThen0: Result := 'TRegExpr(exec): Offset Must Be Greater Then 0';
1087
reeExecNextWithoutExec: Result := 'TRegExpr(exec): ExecNext Without Exec[Pos]';
1088
reeGetInputStringWithoutInputString: Result := 'TRegExpr(exec): GetInputString Without InputString';
1089
reeDumpCorruptedOpcode: Result := 'TRegExpr(dump): Corrupted Opcode';
1090
reeLoopStackExceeded: Result := 'TRegExpr(exec): Loop Stack Exceeded';
1091
reeLoopWithoutEntry: Result := 'TRegExpr(exec): Loop Without LoopEntry !';
1093
reeBadPCodeImported: Result := 'TRegExpr(misc): Bad p-code imported';
1094
else Result := 'Unknown error';
1096
end; { of procedure TRegExpr.Error
1097
--------------------------------------------------------------}
1099
function TRegExpr.LastError : integer;
1101
Result := fLastError;
1102
fLastError := reeOk;
1103
end; { of function TRegExpr.LastError
1104
--------------------------------------------------------------}
1107
{=============================================================}
1108
{===================== Common section ========================}
1109
{=============================================================}
1111
class function TRegExpr.VersionMajor : integer; //###0.944
1113
Result := TRegExprVersionMajor;
1114
end; { of class function TRegExpr.VersionMajor
1115
--------------------------------------------------------------}
1117
class function TRegExpr.VersionMinor : integer; //###0.944
1119
Result := TRegExprVersionMinor;
1120
end; { of class function TRegExpr.VersionMinor
1121
--------------------------------------------------------------}
1123
constructor TRegExpr.Create;
1128
fInputString := nil;
1131
fExprIsCompiled := false;
1133
ModifierI := RegExprModifierI;
1134
ModifierR := RegExprModifierR;
1135
ModifierS := RegExprModifierS;
1136
ModifierG := RegExprModifierG;
1137
ModifierM := RegExprModifierM; //###0.940
1139
SpaceChars := RegExprSpaceChars; //###0.927
1140
WordChars := RegExprWordChars; //###0.929
1141
fInvertCase := RegExprInvertCaseFunction; //###0.927
1143
fLineSeparators := RegExprLineSeparators; //###0.941
1144
LinePairedSeparator := RegExprLinePairedSeparator; //###0.941
1145
end; { of constructor TRegExpr.Create
1146
--------------------------------------------------------------}
1148
destructor TRegExpr.Destroy;
1151
then FreeMem (programm);
1152
if fExpression <> nil
1153
then FreeMem (fExpression);
1154
if fInputString <> nil
1155
then FreeMem (fInputString);
1156
end; { of destructor TRegExpr.Destroy
1157
--------------------------------------------------------------}
1159
class function TRegExpr.InvertCaseFunction (const Ch : REChar) : REChar;
1167
Result := {$IFDEF FPC}AnsiUpperCase (Ch) [1]{$ELSE} REChar (CharUpper (PChar (Ch))){$ENDIF};
1169
then Result := {$IFDEF FPC}AnsiLowerCase (Ch) [1]{$ELSE} REChar (CharLower (PChar (Ch))){$ENDIF};
1171
end; { of function TRegExpr.InvertCaseFunction
1172
--------------------------------------------------------------}
1174
function TRegExpr.GetExpression : RegExprString;
1176
if fExpression <> nil
1177
then Result := fExpression
1179
end; { of function TRegExpr.GetExpression
1180
--------------------------------------------------------------}
1182
procedure TRegExpr.SetExpression (const s : RegExprString);
1184
Len : integer; //###0.950
1186
if (s <> fExpression) or not fExprIsCompiled then begin
1187
fExprIsCompiled := false;
1188
if fExpression <> nil then begin
1189
FreeMem (fExpression);
1192
if s <> '' then begin
1193
Len := length (s); //###0.950
1194
GetMem (fExpression, (Len + 1) * SizeOf (REChar));
1195
// StrPCopy (fExpression, s); //###0.950 replaced due to StrPCopy limitation of 255 chars
1197
StrPCopy (fExpression, Copy (s, 1, Len)); //###0.950
1199
StrLCopy (fExpression, PRegExprChar (s), Len); //###0.950
1202
InvalidateProgramm; //###0.941
1205
end; { of procedure TRegExpr.SetExpression
1206
--------------------------------------------------------------}
1208
function TRegExpr.GetSubExprMatchCount : integer;
1210
if Assigned (fInputString) then begin
1211
Result := NSUBEXP - 1;
1212
while (Result > 0) and ((startp [Result] = nil)
1213
or (endp [Result] = nil))
1217
end; { of function TRegExpr.GetSubExprMatchCount
1218
--------------------------------------------------------------}
1220
function TRegExpr.GetMatchPos (Idx : integer) : integer;
1222
if (Idx >= 0) and (Idx < NSUBEXP) and Assigned (fInputString)
1223
and Assigned (startp [Idx]) and Assigned (endp [Idx]) then begin
1224
Result := (startp [Idx] - fInputString) + 1;
1227
end; { of function TRegExpr.GetMatchPos
1228
--------------------------------------------------------------}
1230
function TRegExpr.GetMatchLen (Idx : integer) : integer;
1232
if (Idx >= 0) and (Idx < NSUBEXP) and Assigned (fInputString)
1233
and Assigned (startp [Idx]) and Assigned (endp [Idx]) then begin
1234
Result := endp [Idx] - startp [Idx];
1237
end; { of function TRegExpr.GetMatchLen
1238
--------------------------------------------------------------}
1240
function TRegExpr.GetMatch (Idx : integer) : RegExprString;
1242
if (Idx >= 0) and (Idx < NSUBEXP) and Assigned (fInputString)
1243
and Assigned (startp [Idx]) and Assigned (endp [Idx])
1244
//then Result := copy (fInputString, MatchPos [Idx], MatchLen [Idx]) //###0.929
1245
then SetString (Result, startp [idx], endp [idx] - startp [idx])
1247
end; { of function TRegExpr.GetMatch
1248
--------------------------------------------------------------}
1250
function TRegExpr.GetModifierStr : RegExprString;
1255
then Result := 'i' + Result
1256
else Result := Result + 'i';
1258
then Result := 'r' + Result
1259
else Result := Result + 'r';
1261
then Result := 's' + Result
1262
else Result := Result + 's';
1264
then Result := 'g' + Result
1265
else Result := Result + 'g';
1267
then Result := 'm' + Result
1268
else Result := Result + 'm';
1270
then Result := 'x' + Result
1271
else Result := Result + 'x';
1273
if Result [length (Result)] = '-' // remove '-' if all modifiers are 'On'
1274
then System.Delete (Result, length (Result), 1);
1275
end; { of function TRegExpr.GetModifierStr
1276
--------------------------------------------------------------}
1278
class function TRegExpr.ParseModifiersStr (const AModifiers : RegExprString;
1279
var AModifiersInt : integer) : boolean;
1280
// !!! Be carefull - this is class function and must not use object instance fields
1288
Mask := 0; // prevent compiler warning
1289
for i := 1 to length (AModifiers) do
1290
if AModifiers [i] = '-'
1293
if Pos (AModifiers [i], 'iI') > 0
1294
then Mask := MaskModI
1295
else if Pos (AModifiers [i], 'rR') > 0
1296
then Mask := MaskModR
1297
else if Pos (AModifiers [i], 'sS') > 0
1298
then Mask := MaskModS
1299
else if Pos (AModifiers [i], 'gG') > 0
1300
then Mask := MaskModG
1301
else if Pos (AModifiers [i], 'mM') > 0
1302
then Mask := MaskModM
1303
else if Pos (AModifiers [i], 'xX') > 0
1304
then Mask := MaskModX
1310
then AModifiersInt := AModifiersInt or Mask
1311
else AModifiersInt := AModifiersInt and not Mask;
1313
end; { of function TRegExpr.ParseModifiersStr
1314
--------------------------------------------------------------}
1316
procedure TRegExpr.SetModifierStr (const AModifiers : RegExprString);
1318
if not ParseModifiersStr (AModifiers, fModifiers)
1319
then Error (reeModifierUnsupported);
1320
end; { of procedure TRegExpr.SetModifierStr
1321
--------------------------------------------------------------}
1323
function TRegExpr.GetModifier (AIndex : integer) : boolean;
1329
1: Mask := MaskModI;
1330
2: Mask := MaskModR;
1331
3: Mask := MaskModS;
1332
4: Mask := MaskModG;
1333
5: Mask := MaskModM;
1334
6: Mask := MaskModX;
1336
Error (reeModifierUnsupported);
1340
Result := (fModifiers and Mask) <> 0;
1341
end; { of function TRegExpr.GetModifier
1342
--------------------------------------------------------------}
1344
procedure TRegExpr.SetModifier (AIndex : integer; ASet : boolean);
1349
1: Mask := MaskModI;
1350
2: Mask := MaskModR;
1351
3: Mask := MaskModS;
1352
4: Mask := MaskModG;
1353
5: Mask := MaskModM;
1354
6: Mask := MaskModX;
1356
Error (reeModifierUnsupported);
1361
then fModifiers := fModifiers or Mask
1362
else fModifiers := fModifiers and not Mask;
1363
end; { of procedure TRegExpr.SetModifier
1364
--------------------------------------------------------------}
1367
{=============================================================}
1368
{==================== Compiler section =======================}
1369
{=============================================================}
1371
procedure TRegExpr.InvalidateProgramm;
1373
if programm <> nil then begin
1377
end; { of procedure TRegExpr.InvalidateProgramm
1378
--------------------------------------------------------------}
1380
procedure TRegExpr.Compile; //###0.941
1382
if fExpression = nil then begin // No Expression assigned
1383
Error (reeNoExpression);
1386
CompileRegExpr (fExpression);
1387
end; { of procedure TRegExpr.Compile
1388
--------------------------------------------------------------}
1390
function TRegExpr.IsProgrammOk : boolean;
1399
if fModifiers <> fProgModifiers //###0.941
1400
then InvalidateProgramm;
1402
// can we optimize line separators by using sets?
1404
fLineSeparatorsSet := [];
1405
for i := 1 to length (fLineSeparators)
1406
do System.Include (fLineSeparatorsSet, fLineSeparators [i]);
1409
// [Re]compile if needed
1411
then Compile; //###0.941
1413
// check [re]compiled programm
1415
then EXIT // error was set/raised by Compile (was reeExecAfterCompErr)
1416
else if programm [0] <> MAGIC // Program corrupted.
1417
then Error (reeCorruptedProgram)
1418
else Result := true;
1419
end; { of function TRegExpr.IsProgrammOk
1420
--------------------------------------------------------------}
1422
procedure TRegExpr.Tail (p : PRegExprChar; val : PRegExprChar);
1423
// set the next-pointer at the end of a node chain
1425
scan : PRegExprChar;
1426
temp : PRegExprChar;
1434
temp := regnext (scan);
1439
// Set Next 'pointer'
1441
then PRENextOff (scan + REOpSz)^ := - (scan - val) //###0.948
1442
// work around PWideChar subtraction bug (Delphi uses
1443
// shr after subtraction to calculate widechar distance %-( )
1444
// so, if difference is negative we have .. the "feature" :(
1445
// I could wrap it in $IFDEF UniCode, but I didn't because
1446
// "P � Q computes the difference between the address given
1447
// by P (the higher address) and the address given by Q (the
1448
// lower address)" - Delphi help quotation.
1449
else PRENextOff (scan + REOpSz)^ := val - scan; //###0.933
1450
end; { of procedure TRegExpr.Tail
1451
--------------------------------------------------------------}
1453
procedure TRegExpr.OpTail (p : PRegExprChar; val : PRegExprChar);
1454
// regtail on operand of first argument; nop if operandless
1456
// "Operandless" and "op != BRANCH" are synonymous in practice.
1457
if (p = nil) or (p = @regdummy) or (PREOp (p)^ <> BRANCH)
1459
Tail (p + REOpSz + RENextOffSz, val); //###0.933
1460
end; { of procedure TRegExpr.OpTail
1461
--------------------------------------------------------------}
1463
function TRegExpr.EmitNode (op : TREOp) : PRegExprChar; //###0.933
1464
// emit a node, return location
1467
if Result <> @regdummy then begin
1468
PREOp (regcode)^ := op;
1469
inc (regcode, REOpSz);
1470
PRENextOff (regcode)^ := 0; // Next "pointer" := nil
1471
inc (regcode, RENextOffSz);
1473
else inc (regsize, REOpSz + RENextOffSz); // compute code size without code generation
1474
end; { of function TRegExpr.EmitNode
1475
--------------------------------------------------------------}
1477
procedure TRegExpr.EmitC (b : REChar);
1478
// emit a byte to code
1480
if regcode <> @regdummy then begin
1484
else inc (regsize); // Type of p-code pointer always is ^REChar
1485
end; { of procedure TRegExpr.EmitC
1486
--------------------------------------------------------------}
1488
procedure TRegExpr.InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer);
1489
// insert an operator in front of already-emitted operand
1490
// Means relocating the operand.
1492
src, dst, place : PRegExprChar;
1495
if regcode = @regdummy then begin
1502
while src > opnd do begin
1507
place := opnd; // Op node, where operand used to be.
1508
PREOp (place)^ := op;
1509
inc (place, REOpSz);
1510
for i := 1 + REOpSz to sz do begin
1514
end; { of procedure TRegExpr.InsertOperator
1515
--------------------------------------------------------------}
1517
function strcspn (s1 : PRegExprChar; s2 : PRegExprChar) : integer;
1518
// find length of initial segment of s1 consisting
1519
// entirely of characters not from s2
1520
var scan1, scan2 : PRegExprChar;
1524
while scan1^ <> #0 do begin
1526
while scan2^ <> #0 do
1533
end; { of function strcspn
1534
--------------------------------------------------------------}
1537
// Flags to be passed up and down.
1538
HASWIDTH = 01; // Known never to match nil string.
1539
SIMPLE = 02; // Simple enough to be STAR/PLUS/BRACES operand.
1540
SPSTART = 04; // Starts with * or +.
1541
WORST = 0; // Worst case.
1542
META : array [0 .. 12] of REChar = (
1543
'^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{', #0);
1544
// Any modification must be synchronized with QuoteRegExprMetaChars !!!
1547
RusRangeLo : array [0 .. 33] of REChar =
1548
(#$430,#$431,#$432,#$433,#$434,#$435,#$451,#$436,#$437,
1549
#$438,#$439,#$43A,#$43B,#$43C,#$43D,#$43E,#$43F,
1550
#$440,#$441,#$442,#$443,#$444,#$445,#$446,#$447,
1551
#$448,#$449,#$44A,#$44B,#$44C,#$44D,#$44E,#$44F,#0);
1552
RusRangeHi : array [0 .. 33] of REChar =
1553
(#$410,#$411,#$412,#$413,#$414,#$415,#$401,#$416,#$417,
1554
#$418,#$419,#$41A,#$41B,#$41C,#$41D,#$41E,#$41F,
1555
#$420,#$421,#$422,#$423,#$424,#$425,#$426,#$427,
1556
#$428,#$429,#$42A,#$42B,#$42C,#$42D,#$42E,#$42F,#0);
1557
RusRangeLoLow = #$430{'�'};
1558
RusRangeLoHigh = #$44F{'�'};
1559
RusRangeHiLow = #$410{'�'};
1560
RusRangeHiHigh = #$42F{'�'};
1562
RusRangeLo = '���������������������������������';
1563
RusRangeHi = '�����Ũ��������������������������';
1564
RusRangeLoLow = '�';
1565
RusRangeLoHigh = '�';
1566
RusRangeHiLow = '�';
1567
RusRangeHiHigh = '�';
1570
function TRegExpr.CompileRegExpr (exp : PRegExprChar) : boolean;
1571
// compile a regular expression into internal code
1572
// We can't allocate space until we know how big the compiled form will be,
1573
// but we can't compile it (and thus know how big it is) until we've got a
1574
// place to put the code. So we cheat: we compile it twice, once with code
1575
// generation turned off and size counting turned on, and once "for real".
1576
// This also means that we don't allocate space until we are sure that the
1577
// thing really will compile successfully, and we never have to move the
1578
// code and thus invalidate pointers into it. (Note that it has to be in
1579
// one piece because free() must be able to free it all.)
1580
// Beware that the optimization-preparation code in here knows about some
1581
// of the structure of the compiled regexp.
1583
scan, longest : PRegExprChar;
1587
Result := false; // life too dark
1589
regparse := nil; // for correct error handling
1593
if programm <> nil then begin
1598
if exp = nil then begin
1599
Error (reeCompNullArgument);
1603
fProgModifiers := fModifiers;
1604
// well, may it's paranoia. I'll check it later... !!!!!!!!
1606
// First pass: determine size, legality.
1607
fCompModifiers := fModifiers;
1611
regcode := @regdummy;
1613
if ParseReg (0, flags) = nil
1616
// Small enough for 2-bytes programm pointers ?
1617
// ###0.933 no real p-code length limits now :)))
1618
// if regsize >= 64 * 1024 then begin
1619
// Error (reeCompRegexpTooBig);
1624
GetMem (programm, regsize * SizeOf (REChar));
1626
// Second pass: emit code.
1627
fCompModifiers := fModifiers;
1630
regcode := programm;
1632
if ParseReg (0, flags) = nil
1635
// Dig out information for optimizations.
1636
{$IFDEF UseFirstCharSet} //###0.929
1638
FillFirstCharSet (programm + REOpSz);
1640
regstart := #0; // Worst-case defaults.
1644
scan := programm + REOpSz; // First BRANCH.
1645
if PREOp (regnext (scan))^ = EEND then begin // Only one top-level choice.
1646
scan := scan + REOpSz + RENextOffSz;
1648
// Starting-point info.
1649
if PREOp (scan)^ = EXACTLY
1650
then regstart := (scan + REOpSz + RENextOffSz)^
1651
else if PREOp (scan)^ = BOL
1654
// If there's something expensive in the r.e., find the longest
1655
// literal string that must appear and make it the regmust. Resolve
1656
// ties in favor of later strings, since the regstart check works
1657
// with the beginning of the r.e. and avoiding duplication
1658
// strengthens checking. Not a strong reason, but sufficient in the
1659
// absence of others.
1660
if (flags and SPSTART) <> 0 then begin
1663
while scan <> nil do begin
1664
if (PREOp (scan)^ = EXACTLY)
1665
and (strlen (scan + REOpSz + RENextOffSz) >= len) then begin
1666
longest := scan + REOpSz + RENextOffSz;
1667
len := strlen (longest);
1669
scan := regnext (scan);
1680
then InvalidateProgramm;
1682
fExprIsCompiled := Result; //###0.944
1686
end; { of function TRegExpr.CompileRegExpr
1687
--------------------------------------------------------------}
1689
function TRegExpr.ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
1690
// regular expression, i.e. main body or parenthesized thing
1691
// Caller must absorb opening parenthesis.
1692
// Combining parenthesis handling with the base level of regular expression
1693
// is a trifle forced, but the need to tie the tails of the branches to what
1694
// follows makes it hard to avoid.
1696
ret, br, ender : PRegExprChar;
1699
SavedModifiers : integer;
1702
flagp := HASWIDTH; // Tentatively.
1703
parno := 0; // eliminate compiler stupid warning
1704
SavedModifiers := fCompModifiers;
1706
// Make an OPEN node, if parenthesized.
1707
if paren <> 0 then begin
1708
if regnpar >= NSUBEXP then begin
1709
Error (reeCompParseRegTooManyBrackets);
1714
ret := EmitNode (TREOp (ord (OPEN) + parno));
1718
// Pick up the branches, linking them together.
1719
br := ParseBranch (flags);
1720
if br = nil then begin
1725
then Tail (ret, br) // OPEN -> first.
1727
if (flags and HASWIDTH) = 0
1728
then flagp := flagp and not HASWIDTH;
1729
flagp := flagp or flags and SPSTART;
1730
while (regparse^ = '|') do begin
1732
br := ParseBranch (flags);
1733
if br = nil then begin
1737
Tail (ret, br); // BRANCH -> BRANCH.
1738
if (flags and HASWIDTH) = 0
1739
then flagp := flagp and not HASWIDTH;
1740
flagp := flagp or flags and SPSTART;
1743
// Make a closing node, and hook it on the end.
1745
then ender := EmitNode (TREOp (ord (CLOSE) + parno))
1746
else ender := EmitNode (EEND);
1749
// Hook the tails of the branches to the closing node.
1751
while br <> nil do begin
1756
// Check for proper termination.
1758
if regparse^ <> ')' then begin
1759
Error (reeCompParseRegUnmatchedBrackets);
1762
else inc (regparse); // skip trailing ')'
1763
if (paren = 0) and (regparse^ <> #0) then begin
1765
then Error (reeCompParseRegUnmatchedBrackets2)
1766
else Error (reeCompParseRegJunkOnEnd);
1769
fCompModifiers := SavedModifiers; // restore modifiers of parent
1771
end; { of function TRegExpr.ParseReg
1772
--------------------------------------------------------------}
1774
function TRegExpr.ParseBranch (var flagp : integer) : PRegExprChar;
1775
// one alternative of an | operator
1776
// Implements the concatenation operator.
1778
ret, chain, latest : PRegExprChar;
1781
flagp := WORST; // Tentatively.
1783
ret := EmitNode (BRANCH);
1785
while (regparse^ <> #0) and (regparse^ <> '|')
1786
and (regparse^ <> ')') do begin
1787
latest := ParsePiece (flags);
1788
if latest = nil then begin
1792
flagp := flagp or flags and HASWIDTH;
1793
if chain = nil // First piece.
1794
then flagp := flagp or flags and SPSTART
1795
else Tail (chain, latest);
1798
if chain = nil // Loop ran zero times.
1799
then EmitNode (NOTHING);
1801
end; { of function TRegExpr.ParseBranch
1802
--------------------------------------------------------------}
1804
function TRegExpr.ParsePiece (var flagp : integer) : PRegExprChar;
1805
// something followed by possible [*+?{]
1806
// Note that the branching code sequences used for ? and the general cases
1807
// of * and + and { are somewhat optimized: they use the same NOTHING node as
1808
// both the endmarker for their branch list and the body of the last branch.
1809
// It might seem that this node could be dispensed with entirely, but the
1810
// endmarker role is not redundant.
1811
function parsenum (AStart, AEnd : PRegExprChar) : TREBracesArg;
1814
if AEnd - AStart + 1 > 8 then begin // prevent stupid scanning
1815
Error (reeBRACESArgTooBig);
1818
while AStart <= AEnd do begin
1819
Result := Result * 10 + (ord (AStart^) - ord ('0'));
1822
if (Result > MaxBracesArg) or (Result < 0) then begin
1823
Error (reeBRACESArgTooBig);
1830
NonGreedyOp, NonGreedyCh : boolean; //###0.940
1831
TheOp : TREOp; //###0.940
1832
NextNode : PRegExprChar;
1834
BracesMin, Bracesmax : TREBracesArg;
1835
p, savedparse : PRegExprChar;
1837
procedure EmitComplexBraces (ABracesMin, ABracesMax : TREBracesArg;
1838
ANonGreedyOp : boolean); //###0.940
1839
{$IFDEF ComplexBraces}
1844
{$IFNDEF ComplexBraces}
1845
Error (reeComplexBracesNotImplemented);
1848
then TheOp := LOOPNG
1850
InsertOperator (LOOPENTRY, Result, REOpSz + RENextOffSz);
1851
NextNode := EmitNode (TheOp);
1852
if regcode <> @regdummy then begin
1853
off := (Result + REOpSz + RENextOffSz)
1854
- (regcode - REOpSz - RENextOffSz); // back to Atom after LOOPENTRY
1855
PREBracesArg (regcode)^ := ABracesMin;
1856
inc (regcode, REBracesArgSz);
1857
PREBracesArg (regcode)^ := ABracesMax;
1858
inc (regcode, REBracesArgSz);
1859
PRENextOff (regcode)^ := off;
1860
inc (regcode, RENextOffSz);
1862
else inc (regsize, REBracesArgSz * 2 + RENextOffSz);
1863
Tail (Result, NextNode); // LOOPENTRY -> LOOP
1864
if regcode <> @regdummy then
1865
Tail (Result + REOpSz + RENextOffSz, NextNode); // Atom -> LOOP
1869
procedure EmitSimpleBraces (ABracesMin, ABracesMax : TREBracesArg;
1870
ANonGreedyOp : boolean); //###0.940
1872
if ANonGreedyOp //###0.940
1873
then TheOp := BRACESNG
1874
else TheOp := BRACES;
1875
InsertOperator (TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
1876
if regcode <> @regdummy then begin
1877
PREBracesArg (Result + REOpSz + RENextOffSz)^ := ABracesMin;
1878
PREBracesArg (Result + REOpSz + RENextOffSz + REBracesArgSz)^ := ABracesMax;
1883
Result := ParseAtom (flags);
1888
if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then begin
1892
if ((flags and HASWIDTH) = 0) and (op <> '?') then begin
1893
Error (reePlusStarOperandCouldBeEmpty);
1899
flagp := WORST or SPSTART;
1900
NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
1901
NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
1902
if (flags and SIMPLE) = 0 then begin
1903
if NonGreedyOp //###0.940
1904
then EmitComplexBraces (0, MaxBracesArg, NonGreedyOp)
1905
else begin // Emit x* as (x&|), where & means "self".
1906
InsertOperator (BRANCH, Result, REOpSz + RENextOffSz); // Either x
1907
OpTail (Result, EmitNode (BACK)); // and loop
1908
OpTail (Result, Result); // back
1909
Tail (Result, EmitNode (BRANCH)); // or
1910
Tail (Result, EmitNode (NOTHING)); // nil.
1913
else begin // Simple
1914
if NonGreedyOp //###0.940
1915
then TheOp := STARNG
1917
InsertOperator (TheOp, Result, REOpSz + RENextOffSz);
1919
if NonGreedyCh //###0.940
1920
then inc (regparse); // Skip extra char ('?')
1923
flagp := WORST or SPSTART or HASWIDTH;
1924
NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
1925
NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
1926
if (flags and SIMPLE) = 0 then begin
1927
if NonGreedyOp //###0.940
1928
then EmitComplexBraces (1, MaxBracesArg, NonGreedyOp)
1929
else begin // Emit x+ as x(&|), where & means "self".
1930
NextNode := EmitNode (BRANCH); // Either
1931
Tail (Result, NextNode);
1932
Tail (EmitNode (BACK), Result); // loop back
1933
Tail (NextNode, EmitNode (BRANCH)); // or
1934
Tail (Result, EmitNode (NOTHING)); // nil.
1937
else begin // Simple
1938
if NonGreedyOp //###0.940
1939
then TheOp := PLUSNG
1941
InsertOperator (TheOp, Result, REOpSz + RENextOffSz);
1943
if NonGreedyCh //###0.940
1944
then inc (regparse); // Skip extra char ('?')
1948
NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
1949
NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
1950
if NonGreedyOp then begin //###0.940 // We emit x?? as x{0,1}?
1951
if (flags and SIMPLE) = 0
1952
then EmitComplexBraces (0, 1, NonGreedyOp)
1953
else EmitSimpleBraces (0, 1, NonGreedyOp);
1955
else begin // greedy '?'
1956
InsertOperator (BRANCH, Result, REOpSz + RENextOffSz); // Either x
1957
Tail (Result, EmitNode (BRANCH)); // or
1958
NextNode := EmitNode (NOTHING); // nil.
1959
Tail (Result, NextNode);
1960
OpTail (Result, NextNode);
1962
if NonGreedyCh //###0.940
1963
then inc (regparse); // Skip extra char ('?')
1966
savedparse := regparse;
1968
// Filip Jirsak's note - what will happen, when we are at the end of regparse?
1971
while Pos (regparse^, '0123456789') > 0 // <min> MUST appear
1973
if (regparse^ <> '}') and (regparse^ <> ',') or (p = regparse) then begin
1974
regparse := savedparse;
1978
BracesMin := parsenum (p, regparse - 1);
1979
if regparse^ = ',' then begin
1982
while Pos (regparse^, '0123456789') > 0
1984
if regparse^ <> '}' then begin
1985
regparse := savedparse;
1989
then BracesMax := MaxBracesArg
1990
else BracesMax := parsenum (p, regparse - 1);
1992
else BracesMax := BracesMin; // {n} == {n,n}
1993
if BracesMin > BracesMax then begin
1994
Error (reeBracesMinParamGreaterMax);
1998
then flagp := WORST;
2000
then flagp := flagp or HASWIDTH or SPSTART;
2002
NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
2003
NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
2004
if (flags and SIMPLE) <> 0
2005
then EmitSimpleBraces (BracesMin, BracesMax, NonGreedyOp)
2006
else EmitComplexBraces (BracesMin, BracesMax, NonGreedyOp);
2007
if NonGreedyCh //###0.940
2008
then inc (regparse); // Skip extra char '?'
2010
// else // here we can't be
2014
if (regparse^ = '*') or (regparse^ = '+') or (regparse^ = '?') or (regparse^ = '{') then begin
2015
Error (reeNestedSQP);
2018
end; { of function TRegExpr.ParsePiece
2019
--------------------------------------------------------------}
2021
function TRegExpr.ParseAtom (var flagp : integer) : PRegExprChar;
2023
// Optimization: gobbles an entire sequence of ordinary characters so that
2024
// it can turn them into a single node, which is smaller to store and
2025
// faster to run. Backslashed characters are exceptions, each becoming a
2026
// separate node; the code is simpler that way and it's not worth fixing.
2030
RangeBeg, RangeEnd : REChar;
2031
CanBeRange : boolean;
2034
begmodfs : PRegExprChar;
2036
{$IFDEF UseSetOfChar} //###0.930
2037
RangePCodeBeg : PRegExprChar;
2038
RangePCodeIdx : integer;
2039
RangeIsCI : boolean;
2040
RangeSet : TSetOfREChar;
2042
RangeChMin, RangeChMax : REChar;
2045
procedure EmitExactly (ch : REChar);
2047
if (fCompModifiers and MaskModI) <> 0
2048
then ret := EmitNode (EXACTLYCI)
2049
else ret := EmitNode (EXACTLY);
2052
flagp := flagp or HASWIDTH or SIMPLE;
2055
procedure EmitStr (const s : RegExprString);
2058
for i := 1 to length (s)
2062
function HexDig (ch : REChar) : integer;
2065
if (ch >= 'a') and (ch <= 'f')
2066
then ch := REChar (ord (ch) - (ord ('a') - ord ('A')));
2067
if (ch < '0') or (ch > 'F') or ((ch > '9') and (ch < 'A')) then begin
2068
Error (reeBadHexDigit);
2071
Result := ord (ch) - ord ('0');
2073
then Result := Result - (ord ('A') - ord ('9') - 1);
2076
function EmitRange (AOpCode : REChar) : PRegExprChar;
2078
{$IFDEF UseSetOfChar}
2081
Result := EmitNode (ANYBUTTINYSET);
2082
else // ANYOFCI, ANYOF
2083
Result := EmitNode (ANYOFTINYSET);
2088
else // ANYBUT, ANYOF
2091
RangePCodeBeg := regcode;
2092
RangePCodeIdx := regsize;
2098
Result := EmitNode (AOpCode);
2100
// !!!!!!!!!!!!! Implement ANYOF[BUT]TINYSET generation for UniCode !!!!!!!!!!
2104
{$IFDEF UseSetOfChar}
2105
procedure EmitRangeCPrim (b : REChar); //###0.930
2111
then RangeChMin := b;
2113
then RangeChMax := b;
2114
Include (RangeSet, b);
2118
procedure EmitRangeC (b : REChar);
2119
{$IFDEF UseSetOfChar}
2124
CanBeRange := false;
2125
{$IFDEF UseSetOfChar}
2126
if b <> #0 then begin
2127
EmitRangeCPrim (b); //###0.930
2129
then EmitRangeCPrim (InvertCase (b)); //###0.930
2133
Assert (RangeLen > 0, 'TRegExpr.ParseAtom(subroutine EmitRangeC): empty range'); // impossible, but who knows..
2134
Assert (RangeChMin <= RangeChMax, 'TRegExpr.ParseAtom(subroutine EmitRangeC): RangeChMin > RangeChMax'); // impossible, but who knows..
2136
if RangeLen <= TinySetLen then begin // emit "tiny set"
2137
if regcode = @regdummy then begin
2138
regsize := RangePCodeIdx + TinySetLen; // RangeChMin/Max !!!
2141
regcode := RangePCodeBeg;
2142
for Ch := RangeChMin to RangeChMax do //###0.930
2143
if Ch in RangeSet then begin
2148
while regcode < RangePCodeBeg + TinySetLen do begin
2149
regcode^ := RangeChMax;
2154
if regcode = @regdummy then begin
2155
regsize := RangePCodeIdx + SizeOf (TSetOfREChar);
2158
if (RangePCodeBeg - REOpSz - RENextOffSz)^ = ANYBUTTINYSET
2159
then RangeSet := [#0 .. #255] - RangeSet;
2160
PREOp (RangePCodeBeg - REOpSz - RENextOffSz)^ := ANYOFFULLSET;
2161
regcode := RangePCodeBeg;
2162
Move (RangeSet, regcode^, SizeOf (TSetOfREChar));
2163
inc (regcode, SizeOf (TSetOfREChar));
2171
procedure EmitSimpleRangeC (b : REChar);
2178
procedure EmitRangeStr (const s : RegExprString);
2181
for i := 1 to length (s)
2182
do EmitRangeC (s [i]);
2185
function UnQuoteChar (var APtr : PRegExprChar) : REChar; //###0.934
2188
't': Result := #$9; // tab (HT/TAB)
2189
'n': Result := #$a; // newline (NL)
2190
'r': Result := #$d; // car.return (CR)
2191
'f': Result := #$c; // form feed (FF)
2192
'a': Result := #$7; // alarm (bell) (BEL)
2193
'e': Result := #$1b; // escape (ESC)
2194
'x': begin // hex char
2197
if APtr^ = #0 then begin
2198
Error (reeNoHexCodeAfterBSlashX);
2201
if APtr^ = '{' then begin // \x{nnnn} //###0.936
2204
if APtr^ = #0 then begin
2205
Error (reeNoHexCodeAfterBSlashX);
2208
if APtr^ <> '}' then begin
2210
ShR (SizeOf (REChar) * 8 - 4)) and $F <> 0 then begin
2211
Error (reeHexCodeAfterBSlashXTooBig);
2214
Result := REChar ((Ord (Result) ShL 4) or HexDig (APtr^));
2215
// HexDig will cause Error if bad hex digit found
2221
Result := REChar (HexDig (APtr^));
2222
// HexDig will cause Error if bad hex digit found
2224
if APtr^ = #0 then begin
2225
Error (reeNoHexCodeAfterBSlashX);
2228
Result := REChar ((Ord (Result) ShL 4) or HexDig (APtr^));
2229
// HexDig will cause Error if bad hex digit found
2232
else Result := APtr^;
2238
flagp := WORST; // Tentatively.
2241
case (regparse - 1)^ of
2242
'^': if ((fCompModifiers and MaskModM) = 0)
2243
or ((fLineSeparators = '') and not fLinePairedSeparatorAssigned)
2244
then ret := EmitNode (BOL)
2245
else ret := EmitNode (BOLML);
2246
'$': if ((fCompModifiers and MaskModM) = 0)
2247
or ((fLineSeparators = '') and not fLinePairedSeparatorAssigned)
2248
then ret := EmitNode (EOL)
2249
else ret := EmitNode (EOLML);
2251
if (fCompModifiers and MaskModS) <> 0 then begin
2252
ret := EmitNode (ANY);
2253
flagp := flagp or HASWIDTH or SIMPLE;
2255
else begin // not /s, so emit [^:LineSeparators:]
2256
ret := EmitNode (ANYML);
2257
flagp := flagp or HASWIDTH; // not so simple ;)
2258
// ret := EmitRange (ANYBUT);
2259
// EmitRangeStr (LineSeparators); //###0.941
2260
// EmitRangeStr (LinePairedSeparator); // !!! isn't correct if have to accept only paired
2262
// flagp := flagp or HASWIDTH or SIMPLE;
2265
if regparse^ = '^' then begin // Complement of range.
2266
if (fCompModifiers and MaskModI) <> 0
2267
then ret := EmitRange (ANYBUTCI)
2268
else ret := EmitRange (ANYBUT);
2272
if (fCompModifiers and MaskModI) <> 0
2273
then ret := EmitRange (ANYOFCI)
2274
else ret := EmitRange (ANYOF);
2276
CanBeRange := false;
2278
if (regparse^ = ']') then begin
2279
EmitSimpleRangeC (regparse^); // []-a] -> ']' .. 'a'
2283
while (regparse^ <> #0) and (regparse^ <> ']') do begin
2284
if (regparse^ = '-')
2285
and ((regparse + 1)^ <> #0) and ((regparse + 1)^ <> ']')
2286
and CanBeRange then begin
2288
RangeEnd := regparse^;
2289
if RangeEnd = EscChar then begin
2290
{$IFDEF UniCode} //###0.935
2291
if (ord ((regparse + 1)^) < 256)
2292
and (char ((regparse + 1)^)
2293
in ['d', 'D', 's', 'S', 'w', 'W']) then begin
2295
if (regparse + 1)^ in ['d', 'D', 's', 'S', 'w', 'W'] then begin
2297
EmitRangeC ('-'); // or treat as error ?!!
2301
RangeEnd := UnQuoteChar (regparse);
2304
// r.e.ranges extension for russian
2305
if ((fCompModifiers and MaskModR) <> 0)
2306
and (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeLoHigh) then begin
2307
EmitRangeStr (RusRangeLo);
2309
else if ((fCompModifiers and MaskModR) <> 0)
2310
and (RangeBeg = RusRangeHiLow) and (RangeEnd = RusRangeHiHigh) then begin
2311
EmitRangeStr (RusRangeHi);
2313
else if ((fCompModifiers and MaskModR) <> 0)
2314
and (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then begin
2315
EmitRangeStr (RusRangeLo);
2316
EmitRangeStr (RusRangeHi);
2318
else begin // standard r.e. handling
2319
if RangeBeg > RangeEnd then begin
2320
Error (reeInvalidRange);
2324
EmitRangeC (RangeEnd); // prevent infinite loop if RangeEnd=$ff
2325
while RangeBeg < RangeEnd do begin //###0.929
2326
EmitRangeC (RangeBeg);
2333
if regparse^ = EscChar then begin
2335
if regparse^ = #0 then begin
2336
Error (reeParseAtomTrailingBackSlash);
2339
case regparse^ of // r.e.extensions
2340
'd': EmitRangeStr ('0123456789');
2341
'w': EmitRangeStr (WordChars);
2342
's': EmitRangeStr (SpaceChars);
2343
else EmitSimpleRangeC (UnQuoteChar (regparse));
2346
else EmitSimpleRangeC (regparse^);
2351
if regparse^ <> ']' then begin
2352
Error (reeUnmatchedSqBrackets);
2356
flagp := flagp or HASWIDTH or SIMPLE;
2359
if regparse^ = '?' then begin
2360
// check for extended Perl syntax : (?..)
2361
if (regparse + 1)^ = '#' then begin // (?#comment)
2362
inc (regparse, 2); // find closing ')'
2363
while (regparse^ <> #0) and (regparse^ <> ')')
2365
if regparse^ <> ')' then begin
2366
Error (reeUnclosedComment);
2369
inc (regparse); // skip ')'
2370
ret := EmitNode (COMMENT); // comment
2372
else begin // modifiers ?
2373
inc (regparse); // skip '?'
2374
begmodfs := regparse;
2375
while (regparse^ <> #0) and (regparse^ <> ')')
2377
if (regparse^ <> ')')
2378
or not ParseModifiersStr (copy (begmodfs, 1, (regparse - begmodfs)), fCompModifiers) then begin
2379
Error (reeUrecognizedModifier);
2382
inc (regparse); // skip ')'
2383
ret := EmitNode (COMMENT); // comment
2384
// Error (reeQPSBFollowsNothing);
2389
ret := ParseReg (1, flags);
2390
if ret = nil then begin
2394
flagp := flagp or flags and (HASWIDTH or SPSTART);
2397
#0, '|', ')': begin // Supposed to be caught earlier.
2398
Error (reeInternalUrp);
2401
'?', '+', '*': begin
2402
Error (reeQPSBFollowsNothing);
2406
if regparse^ = #0 then begin
2407
Error (reeTrailingBackSlash);
2410
case regparse^ of // r.e.extensions
2411
'b': ret := EmitNode (BOUND); //###0.943
2412
'B': ret := EmitNode (NOTBOUND); //###0.943
2413
'A': ret := EmitNode (BOL); //###0.941
2414
'Z': ret := EmitNode (EOL); //###0.941
2415
'd': begin // r.e.extension - any digit ('0' .. '9')
2416
ret := EmitNode (ANYDIGIT);
2417
flagp := flagp or HASWIDTH or SIMPLE;
2419
'D': begin // r.e.extension - not digit ('0' .. '9')
2420
ret := EmitNode (NOTDIGIT);
2421
flagp := flagp or HASWIDTH or SIMPLE;
2423
's': begin // r.e.extension - any space char
2424
{$IFDEF UseSetOfChar}
2425
ret := EmitRange (ANYOF);
2426
EmitRangeStr (SpaceChars);
2429
ret := EmitNode (ANYSPACE);
2431
flagp := flagp or HASWIDTH or SIMPLE;
2433
'S': begin // r.e.extension - not space char
2434
{$IFDEF UseSetOfChar}
2435
ret := EmitRange (ANYBUT);
2436
EmitRangeStr (SpaceChars);
2439
ret := EmitNode (NOTSPACE);
2441
flagp := flagp or HASWIDTH or SIMPLE;
2443
'w': begin // r.e.extension - any english char / digit / '_'
2444
{$IFDEF UseSetOfChar}
2445
ret := EmitRange (ANYOF);
2446
EmitRangeStr (WordChars);
2449
ret := EmitNode (ANYLETTER);
2451
flagp := flagp or HASWIDTH or SIMPLE;
2453
'W': begin // r.e.extension - not english char / digit / '_'
2454
{$IFDEF UseSetOfChar}
2455
ret := EmitRange (ANYBUT);
2456
EmitRangeStr (WordChars);
2459
ret := EmitNode (NOTLETTER);
2461
flagp := flagp or HASWIDTH or SIMPLE;
2463
'1' .. '9': begin //###0.936
2464
if (fCompModifiers and MaskModI) <> 0
2465
then ret := EmitNode (BSUBEXPCI)
2466
else ret := EmitNode (BSUBEXP);
2467
EmitC (REChar (ord (regparse^) - ord ('0')));
2468
flagp := flagp or HASWIDTH or SIMPLE;
2470
else EmitExactly (UnQuoteChar (regparse));
2476
if ((fCompModifiers and MaskModX) <> 0) and // check for eXtended syntax
2478
or ({$IFDEF UniCode}StrScan (XIgnoredChars, regparse^) <> nil //###0.947
2479
{$ELSE}regparse^ in XIgnoredChars{$ENDIF})) then begin //###0.941 \x
2480
if regparse^ = '#' then begin // Skip eXtended comment
2481
// find comment terminator (group of \n and/or \r)
2482
while (regparse^ <> #0) and (regparse^ <> #$d) and (regparse^ <> #$a)
2484
while (regparse^ = #$d) or (regparse^ = #$a) // skip comment terminator
2485
do inc (regparse); // attempt to support different type of line separators
2487
else begin // Skip the blanks!
2488
while {$IFDEF UniCode}StrScan (XIgnoredChars, regparse^) <> nil //###0.947
2489
{$ELSE}regparse^ in XIgnoredChars{$ENDIF}
2492
ret := EmitNode (COMMENT); // comment
2495
len := strcspn (regparse, META);
2497
if regparse^ <> '{' then begin
2498
Error (reeRarseAtomInternalDisaster);
2501
else len := strcspn (regparse + 1, META) + 1; // bad {n,m} - compile as EXATLY
2502
ender := (regparse + len)^;
2504
and ((ender = '*') or (ender = '+') or (ender = '?') or (ender = '{'))
2505
then dec (len); // Back off clear of ?+*{ operand.
2506
flagp := flagp or HASWIDTH;
2508
then flagp := flagp or SIMPLE;
2509
if (fCompModifiers and MaskModI) <> 0
2510
then ret := EmitNode (EXACTLYCI)
2511
else ret := EmitNode (EXACTLY);
2513
and (((fCompModifiers and MaskModX) = 0) or (regparse^ <> '#')) do begin
2514
if ((fCompModifiers and MaskModX) = 0) or not ( //###0.941
2515
{$IFDEF UniCode}StrScan (XIgnoredChars, regparse^) <> nil //###0.947
2516
{$ELSE}regparse^ in XIgnoredChars{$ENDIF} )
2517
then EmitC (regparse^);
2522
end; { of if not comment}
2523
end; { of case else}
2527
end; { of function TRegExpr.ParseAtom
2528
--------------------------------------------------------------}
2530
function TRegExpr.GetCompilerErrorPos : integer;
2533
if (regexpbeg = nil) or (regparse = nil)
2534
then EXIT; // not in compiling mode ?
2535
Result := regparse - regexpbeg;
2536
end; { of function TRegExpr.GetCompilerErrorPos
2537
--------------------------------------------------------------}
2540
{=============================================================}
2541
{===================== Matching section ======================}
2542
{=============================================================}
2544
{$IFNDEF UseSetOfChar}
2545
function TRegExpr.StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928 - now method of TRegExpr
2547
while (s^ <> #0) and (s^ <> ch) and (s^ <> InvertCase (ch))
2552
end; { of function TRegExpr.StrScanCI
2553
--------------------------------------------------------------}
2556
function TRegExpr.regrepeat (p : PRegExprChar; AMax : integer) : integer;
2557
// repeatedly match something simple, report how many
2559
scan : PRegExprChar;
2560
opnd : PRegExprChar;
2562
{Ch,} InvCh : REChar; //###0.931
2563
sestart, seend : PRegExprChar; //###0.936
2567
opnd := p + REOpSz + RENextOffSz; //OPERAND
2568
TheMax := fInputEnd - scan;
2570
then TheMax := AMax;
2573
// note - ANYML cannot be proceeded in regrepeat because can skip
2574
// more than one char at once
2578
EXACTLY: begin // in opnd can be only ONE char !!!
2579
// Ch := opnd^; // store in register //###0.931
2580
while (Result < TheMax) and (opnd^ = scan^) do begin
2585
EXACTLYCI: begin // in opnd can be only ONE char !!!
2586
// Ch := opnd^; // store in register //###0.931
2587
while (Result < TheMax) and (opnd^ = scan^) do begin // prevent unneeded InvertCase //###0.931
2591
if Result < TheMax then begin //###0.931
2592
InvCh := InvertCase (opnd^); // store in register
2593
while (Result < TheMax) and
2594
((opnd^ = scan^) or (InvCh = scan^)) do begin
2600
BSUBEXP: begin //###0.936
2601
sestart := startp [ord (opnd^)];
2604
seend := endp [ord (opnd^)];
2609
while opnd < seend do begin
2610
if (scan >= fInputEnd) or (scan^ <> opnd^)
2617
UNTIL Result >= AMax;
2619
BSUBEXPCI: begin //###0.936
2620
sestart := startp [ord (opnd^)];
2623
seend := endp [ord (opnd^)];
2628
while opnd < seend do begin
2629
if (scan >= fInputEnd) or
2630
((scan^ <> opnd^) and (scan^ <> InvertCase (opnd^)))
2637
UNTIL Result >= AMax;
2640
while (Result < TheMax) and
2641
(scan^ >= '0') and (scan^ <= '9') do begin
2646
while (Result < TheMax) and
2647
((scan^ < '0') or (scan^ > '9')) do begin
2651
{$IFNDEF UseSetOfChar} //###0.929
2653
while (Result < TheMax) and
2654
(Pos (scan^, fWordChars) > 0) //###0.940
2655
{ ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
2656
or (scan^ >= 'A') and (scan^ <= 'Z') or (scan^ = '_'))} do begin
2661
while (Result < TheMax) and
2662
(Pos (scan^, fWordChars) <= 0) //###0.940
2663
{ not ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
2664
or (scan^ >= 'A') and (scan^ <= 'Z')
2665
or (scan^ = '_'))} do begin
2670
while (Result < TheMax) and
2671
(Pos (scan^, fSpaceChars) > 0) do begin
2676
while (Result < TheMax) and
2677
(Pos (scan^, fSpaceChars) <= 0) do begin
2683
while (Result < TheMax) and //!!!TinySet
2684
((scan^ = opnd^) or (scan^ = (opnd + 1)^)
2685
or (scan^ = (opnd + 2)^)) do begin
2690
ANYBUTTINYSET: begin
2691
while (Result < TheMax) and //!!!TinySet
2692
(scan^ <> opnd^) and (scan^ <> (opnd + 1)^)
2693
and (scan^ <> (opnd + 2)^) do begin
2698
{$IFDEF UseSetOfChar} //###0.929
2700
while (Result < TheMax) and
2701
(scan^ in PSetOfREChar (opnd)^) do begin
2708
while (Result < TheMax) and
2709
(StrScan (opnd, scan^) <> nil) do begin
2714
while (Result < TheMax) and
2715
(StrScan (opnd, scan^) = nil) do begin
2720
while (Result < TheMax) and (StrScanCI (opnd, scan^) <> nil) do begin
2725
while (Result < TheMax) and (StrScanCI (opnd, scan^) = nil) do begin
2730
else begin // Oh dear. Called inappropriately.
2731
Result := 0; // Best compromise.
2732
Error (reeRegRepeatCalledInappropriately);
2737
end; { of function TRegExpr.regrepeat
2738
--------------------------------------------------------------}
2740
function TRegExpr.regnext (p : PRegExprChar) : PRegExprChar;
2741
// dig the "next" pointer out of a node
2742
var offset : TRENextOff;
2744
if p = @regdummy then begin
2748
offset := PRENextOff (p + REOpSz)^; //###0.933 inlined NEXT
2751
else Result := p + offset;
2752
end; { of function TRegExpr.regnext
2753
--------------------------------------------------------------}
2755
function TRegExpr.MatchPrim (prog : PRegExprChar) : boolean;
2756
// recursively matching routine
2757
// Conceptually the strategy is simple: check to see whether the current
2758
// node matches, call self recursively to see whether the rest matches,
2759
// and then act accordingly. In practice we make some effort to avoid
2760
// recursion, in particular by going through "ordinary" nodes (that don't
2761
// need to know whether the rest of the match failed) by a loop instead of
2764
scan : PRegExprChar; // Current node.
2765
next : PRegExprChar; // Next node.
2767
opnd : PRegExprChar;
2769
save : PRegExprChar;
2771
BracesMin, BracesMax : integer; // we use integer instead of TREBracesArg for better support */+
2772
{$IFDEF ComplexBraces}
2773
SavedLoopStack : array [1 .. LoopStackMax] of integer; // :(( very bad for recursion
2774
SavedLoopStackIdx : integer; //###0.925
2780
while scan <> nil do begin
2781
len := PRENextOff (scan + 1)^; //###0.932 inlined regnext
2784
else next := scan + len;
2787
NOTBOUND, //###0.943 //!!! think about UseSetOfChar !!!
2791
((reginput = fInputStart) or (Pos ((reginput - 1)^, fWordChars) <= 0))
2792
and (reginput^ <> #0) and (Pos (reginput^, fWordChars) > 0)
2794
(reginput <> fInputStart) and (Pos ((reginput - 1)^, fWordChars) > 0)
2795
and ((reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0)))
2798
BOL: if reginput <> fInputStart
2800
EOL: if reginput^ <> #0
2802
BOLML: if reginput > fInputStart then begin
2803
nextch := (reginput - 1)^;
2804
if (nextch <> fLinePairedSeparatorTail)
2805
or ((reginput - 1) <= fInputStart)
2806
or ((reginput - 2)^ <> fLinePairedSeparatorHead)
2808
if (nextch = fLinePairedSeparatorHead)
2809
and (reginput^ = fLinePairedSeparatorTail)
2810
then EXIT; // don't stop between paired separator
2813
not (nextch in fLineSeparatorsSet)
2815
(pos (nextch, fLineSeparators) <= 0)
2820
EOLML: if reginput^ <> #0 then begin
2821
nextch := reginput^;
2822
if (nextch <> fLinePairedSeparatorHead)
2823
or ((reginput + 1)^ <> fLinePairedSeparatorTail)
2825
if (nextch = fLinePairedSeparatorTail)
2826
and (reginput > fInputStart)
2827
and ((reginput - 1)^ = fLinePairedSeparatorHead)
2828
then EXIT; // don't stop between paired separator
2831
not (nextch in fLineSeparatorsSet)
2833
(pos (nextch, fLineSeparators) <= 0)
2843
ANYML: begin //###0.941
2845
or ((reginput^ = fLinePairedSeparatorHead)
2846
and ((reginput + 1)^ = fLinePairedSeparatorTail))
2847
or {$IFNDEF UniCode} (reginput^ in fLineSeparatorsSet)
2848
{$ELSE} (pos (reginput^, fLineSeparators) > 0) {$ENDIF}
2853
if (reginput^ = #0) or (reginput^ < '0') or (reginput^ > '9')
2858
if (reginput^ = #0) or ((reginput^ >= '0') and (reginput^ <= '9'))
2862
{$IFNDEF UseSetOfChar} //###0.929
2864
if (reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0) //###0.943
2869
if (reginput^ = #0) or (Pos (reginput^, fWordChars) > 0) //###0.943
2874
if (reginput^ = #0) or not (Pos (reginput^, fSpaceChars) > 0) //###0.943
2879
if (reginput^ = #0) or (Pos (reginput^, fSpaceChars) > 0) //###0.943
2885
opnd := scan + REOpSz + RENextOffSz; // OPERAND
2886
// Inline the first character, for speed.
2887
if (opnd^ <> reginput^)
2888
and (InvertCase (opnd^) <> reginput^)
2890
len := strlen (opnd);
2894
while no > 1 do begin
2898
and (InvertCase (opnd^) <> save^)
2903
inc (reginput, len);
2906
opnd := scan + REOpSz + RENextOffSz; // OPERAND
2907
// Inline the first character, for speed.
2908
if opnd^ <> reginput^
2910
len := strlen (opnd);
2914
while no > 1 do begin
2922
inc (reginput, len);
2924
BSUBEXP: begin //###0.936
2925
no := ord ((scan + REOpSz + RENextOffSz)^);
2926
if startp [no] = nil
2931
opnd := startp [no];
2932
while opnd < endp [no] do begin
2933
if (save >= fInputEnd) or (save^ <> opnd^)
2940
BSUBEXPCI: begin //###0.936
2941
no := ord ((scan + REOpSz + RENextOffSz)^);
2942
if startp [no] = nil
2947
opnd := startp [no];
2948
while opnd < endp [no] do begin
2949
if (save >= fInputEnd) or
2950
((save^ <> opnd^) and (save^ <> InvertCase (opnd^)))
2958
if (reginput^ = #0) or //!!!TinySet
2959
((reginput^ <> (scan + REOpSz + RENextOffSz)^)
2960
and (reginput^ <> (scan + REOpSz + RENextOffSz + 1)^)
2961
and (reginput^ <> (scan + REOpSz + RENextOffSz + 2)^))
2965
ANYBUTTINYSET: begin
2966
if (reginput^ = #0) or //!!!TinySet
2967
(reginput^ = (scan + REOpSz + RENextOffSz)^)
2968
or (reginput^ = (scan + REOpSz + RENextOffSz + 1)^)
2969
or (reginput^ = (scan + REOpSz + RENextOffSz + 2)^)
2973
{$IFDEF UseSetOfChar} //###0.929
2976
or not (reginput^ in PSetOfREChar (scan + REOpSz + RENextOffSz)^)
2982
if (reginput^ = #0) or (StrScan (scan + REOpSz + RENextOffSz, reginput^) = nil)
2987
if (reginput^ = #0) or (StrScan (scan + REOpSz + RENextOffSz, reginput^) <> nil)
2992
if (reginput^ = #0) or (StrScanCI (scan + REOpSz + RENextOffSz, reginput^) = nil)
2997
if (reginput^ = #0) or (StrScanCI (scan + REOpSz + RENextOffSz, reginput^) <> nil)
3005
Succ (OPEN) .. TREOp (Ord (OPEN) + NSUBEXP - 1) : begin //###0.929
3006
no := ord (scan^) - ord (OPEN);
3007
// save := reginput;
3008
save := startp [no]; //###0.936
3009
startp [no] := reginput; //###0.936
3010
Result := MatchPrim (next);
3011
if not Result //###0.936
3012
then startp [no] := save;
3013
// if Result and (startp [no] = nil)
3014
// then startp [no] := save;
3015
// Don't set startp if some later invocation of the same
3016
// parentheses already has.
3019
Succ (CLOSE) .. TREOp (Ord (CLOSE) + NSUBEXP - 1): begin //###0.929
3020
no := ord (scan^) - ord (CLOSE);
3021
// save := reginput;
3022
save := endp [no]; //###0.936
3023
endp [no] := reginput; //###0.936
3024
Result := MatchPrim (next);
3025
if not Result //###0.936
3026
then endp [no] := save;
3027
// if Result and (endp [no] = nil)
3028
// then endp [no] := save;
3029
// Don't set endp if some later invocation of the same
3030
// parentheses already has.
3034
if (next^ <> BRANCH) // No choice.
3035
then next := scan + REOpSz + RENextOffSz // Avoid recursion
3039
Result := MatchPrim (scan + REOpSz + RENextOffSz);
3043
scan := regnext (scan);
3044
UNTIL (scan = nil) or (scan^ <> BRANCH);
3048
{$IFDEF ComplexBraces}
3049
LOOPENTRY: begin //###0.925
3052
if LoopStackIdx > LoopStackMax then begin
3053
Error (reeLoopStackExceeded);
3057
LoopStack [LoopStackIdx] := 0; // init loop counter
3058
Result := MatchPrim (next); // execute LOOP
3059
LoopStackIdx := no; // cleanup
3065
LOOP, LOOPNG: begin //###0.940
3066
if LoopStackIdx <= 0 then begin
3067
Error (reeLoopWithoutEntry);
3070
opnd := scan + PRENextOff (scan + REOpSz + RENextOffSz + 2 * REBracesArgSz)^;
3071
BracesMin := PREBracesArg (scan + REOpSz + RENextOffSz)^;
3072
BracesMax := PREBracesArg (scan + REOpSz + RENextOffSz + REBracesArgSz)^;
3074
if LoopStack [LoopStackIdx] >= BracesMin then begin // Min alredy matched - we can work
3075
if scan^ = LOOP then begin
3076
// greedy way - first try to max deep of greed ;)
3077
if LoopStack [LoopStackIdx] < BracesMax then begin
3078
inc (LoopStack [LoopStackIdx]);
3080
Result := MatchPrim (opnd);
3086
dec (LoopStackIdx); // Fail. May be we are too greedy? ;)
3087
Result := MatchPrim (next);
3089
then reginput := save;
3093
// non-greedy - try just now
3094
Result := MatchPrim (next);
3097
else reginput := save; // failed - move next and try again
3098
if LoopStack [LoopStackIdx] < BracesMax then begin
3099
inc (LoopStack [LoopStackIdx]);
3101
Result := MatchPrim (opnd);
3107
dec (LoopStackIdx); // Failed - back up
3111
else begin // first match a min_cnt times
3112
inc (LoopStack [LoopStackIdx]);
3114
Result := MatchPrim (opnd);
3118
dec (LoopStack [LoopStackIdx]);
3124
STAR, PLUS, BRACES, STARNG, PLUSNG, BRACESNG: begin
3125
// Lookahead to avoid useless match attempts when we know
3126
// what character comes next.
3129
then nextch := (next + REOpSz + RENextOffSz)^;
3130
BracesMax := MaxInt; // infinite loop for * and + //###0.92
3131
if (scan^ = STAR) or (scan^ = STARNG)
3132
then BracesMin := 0 // STAR
3133
else if (scan^ = PLUS) or (scan^ = PLUSNG)
3134
then BracesMin := 1 // PLUS
3135
else begin // BRACES
3136
BracesMin := PREBracesArg (scan + REOpSz + RENextOffSz)^;
3137
BracesMax := PREBracesArg (scan + REOpSz + RENextOffSz + REBracesArgSz)^;
3140
opnd := scan + REOpSz + RENextOffSz;
3141
if (scan^ = BRACES) or (scan^ = BRACESNG)
3142
then inc (opnd, 2 * REBracesArgSz);
3144
if (scan^ = PLUSNG) or (scan^ = STARNG) or (scan^ = BRACESNG) then begin
3146
BracesMax := regrepeat (opnd, BracesMax); // don't repeat more than BracesMax
3147
// Now we know real Max limit to move forward (for recursion 'back up')
3148
// In some cases it can be faster to check only Min positions first,
3149
// but after that we have to check every position separtely instead
3150
// of fast scannig in loop.
3152
while no <= BracesMax do begin
3153
reginput := save + no;
3154
// If it could work, try it.
3155
if (nextch = #0) or (reginput^ = nextch) then begin
3156
{$IFDEF ComplexBraces}
3157
System.Move (LoopStack, SavedLoopStack, SizeOf (LoopStack)); //###0.925
3158
SavedLoopStackIdx := LoopStackIdx;
3160
if MatchPrim (next) then begin
3164
{$IFDEF ComplexBraces}
3165
System.Move (SavedLoopStack, LoopStack, SizeOf (LoopStack));
3166
LoopStackIdx := SavedLoopStackIdx;
3169
inc (no); // Couldn't or didn't - move forward.
3173
else begin // greedy mode
3174
no := regrepeat (opnd, BracesMax); // don't repeat more than max_cnt
3175
while no >= BracesMin do begin
3176
// If it could work, try it.
3177
if (nextch = #0) or (reginput^ = nextch) then begin
3178
{$IFDEF ComplexBraces}
3179
System.Move (LoopStack, SavedLoopStack, SizeOf (LoopStack)); //###0.925
3180
SavedLoopStackIdx := LoopStackIdx;
3182
if MatchPrim (next) then begin
3186
{$IFDEF ComplexBraces}
3187
System.Move (SavedLoopStack, LoopStack, SizeOf (LoopStack));
3188
LoopStackIdx := SavedLoopStackIdx;
3191
dec (no); // Couldn't or didn't - back up.
3192
reginput := save + no;
3198
Result := true; // Success!
3202
Error (reeMatchPrimMemoryCorruption);
3205
end; { of case scan^}
3207
end; { of while scan <> nil}
3209
// We get here only if there's trouble -- normally "case EEND" is the
3210
// terminating point.
3211
Error (reeMatchPrimCorruptedPointers);
3212
end; { of function TRegExpr.MatchPrim
3213
--------------------------------------------------------------}
3215
{$IFDEF UseFirstCharSet} //###0.929
3216
procedure TRegExpr.FillFirstCharSet (prog : PRegExprChar);
3218
scan : PRegExprChar; // Current node.
3219
next : PRegExprChar; // Next node.
3220
opnd : PRegExprChar;
3224
while scan <> nil do begin
3225
next := regnext (scan);
3226
case PREOp (scan)^ of
3227
BSUBEXP, BSUBEXPCI: begin //###0.938
3228
FirstCharSet := [#0 .. #255]; // :((( we cannot
3229
// optimize r.e. if it starts with back reference
3232
BOL, BOLML: ; // EXIT; //###0.937
3233
EOL, EOLML: begin //###0.948 was empty in 0.947, was EXIT in 0.937
3234
Include (FirstCharSet, #0);
3237
opnd := PRegExprChar (LineSeparators);
3238
while opnd^ <> #0 do begin
3239
Include (FirstCharSet, opnd^);
3245
BOUND, NOTBOUND: ; //###0.943 ?!!
3246
ANY, ANYML: begin // we can better define ANYML !!!
3247
FirstCharSet := [#0 .. #255]; //###0.930
3251
FirstCharSet := FirstCharSet + ['0' .. '9'];
3255
FirstCharSet := FirstCharSet + ([#0 .. #255] - ['0' .. '9']); //###0.948 FirstCharSet was forgotten
3259
Include (FirstCharSet, (scan + REOpSz + RENextOffSz)^);
3260
Include (FirstCharSet, InvertCase ((scan + REOpSz + RENextOffSz)^));
3264
Include (FirstCharSet, (scan + REOpSz + RENextOffSz)^);
3268
FirstCharSet := FirstCharSet + PSetOfREChar (scan + REOpSz + RENextOffSz)^;
3273
Include (FirstCharSet, (scan + REOpSz + RENextOffSz)^);
3274
Include (FirstCharSet, (scan + REOpSz + RENextOffSz + 1)^);
3275
Include (FirstCharSet, (scan + REOpSz + RENextOffSz + 2)^);
3276
// ... // up to TinySetLen
3279
ANYBUTTINYSET: begin
3281
FirstCharSet := FirstCharSet + ([#0 .. #255] - [ //###0.948 FirstCharSet was forgotten
3282
(scan + REOpSz + RENextOffSz)^,
3283
(scan + REOpSz + RENextOffSz + 1)^,
3284
(scan + REOpSz + RENextOffSz + 2)^]);
3285
// ... // up to TinySetLen
3291
Succ (OPEN) .. TREOp (Ord (OPEN) + NSUBEXP - 1) : begin //###0.929
3292
FillFirstCharSet (next);
3295
Succ (CLOSE) .. TREOp (Ord (CLOSE) + NSUBEXP - 1): begin //###0.929
3296
FillFirstCharSet (next);
3300
if (PREOp (next)^ <> BRANCH) // No choice.
3301
then next := scan + REOpSz + RENextOffSz // Avoid recursion.
3304
FillFirstCharSet (scan + REOpSz + RENextOffSz);
3305
scan := regnext (scan);
3306
UNTIL (scan = nil) or (PREOp (scan)^ <> BRANCH);
3310
{$IFDEF ComplexBraces}
3311
LOOPENTRY: begin //###0.925
3312
// LoopStack [LoopStackIdx] := 0; //###0.940 line removed
3313
FillFirstCharSet (next); // execute LOOP
3316
LOOP, LOOPNG: begin //###0.940
3317
opnd := scan + PRENextOff (scan + REOpSz + RENextOffSz + REBracesArgSz * 2)^;
3318
min_cnt := PREBracesArg (scan + REOpSz + RENextOffSz)^;
3319
FillFirstCharSet (opnd);
3321
then FillFirstCharSet (next);
3325
STAR, STARNG: //###0.940
3326
FillFirstCharSet (scan + REOpSz + RENextOffSz);
3327
PLUS, PLUSNG: begin //###0.940
3328
FillFirstCharSet (scan + REOpSz + RENextOffSz);
3331
BRACES, BRACESNG: begin //###0.940
3332
opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
3333
min_cnt := PREBracesArg (scan + REOpSz + RENextOffSz)^; // BRACES
3334
FillFirstCharSet (opnd);
3339
FirstCharSet := [#0 .. #255]; //###0.948
3343
Error (reeMatchPrimMemoryCorruption);
3346
end; { of case scan^}
3348
end; { of while scan <> nil}
3349
end; { of procedure FillFirstCharSet
3350
--------------------------------------------------------------}
3353
function TRegExpr.Exec (const AInputString : RegExprString) : boolean;
3355
InputString := AInputString;
3356
Result := ExecPrim (1);
3357
end; { of function TRegExpr.Exec
3358
--------------------------------------------------------------}
3362
function TRegExpr.Exec : boolean;
3364
Result := ExecPrim (1);
3365
end; { of function TRegExpr.Exec
3366
--------------------------------------------------------------}
3368
function TRegExpr.Exec (AOffset: integer) : boolean;
3370
Result := ExecPrim (AOffset);
3371
end; { of function TRegExpr.Exec
3372
--------------------------------------------------------------}
3375
function TRegExpr.ExecPos (AOffset: integer {$IFDEF DefParam}= 1{$ENDIF}) : boolean;
3377
Result := ExecPrim (AOffset);
3378
end; { of function TRegExpr.ExecPos
3379
--------------------------------------------------------------}
3381
function TRegExpr.ExecPrim (AOffset: integer) : boolean;
3382
procedure ClearMatchs;
3383
// Clears matchs array
3386
for i := 0 to NSUBEXP - 1 do begin
3390
end; { of procedure ClearMatchs;
3391
..............................................................}
3392
function RegMatch (str : PRegExprChar) : boolean;
3393
// try match at specific point
3395
//###0.949 removed clearing of start\endp
3397
Result := MatchPrim (programm + REOpSz);
3398
if Result then begin
3400
endp [0] := reginput;
3402
end; { of function RegMatch
3403
..............................................................}
3406
StartPtr: PRegExprChar;
3409
Result := false; // Be paranoid...
3411
ClearMatchs; //###0.949
3412
// ensure that Match cleared either if optimization tricks or some error
3413
// will lead to leaving ExecPrim without actual search. That is
3414
// importent for ExecNext logic and so on.
3416
if not IsProgrammOk //###0.929
3419
// Check InputString presence
3420
if not Assigned (fInputString) then begin
3421
Error (reeNoInpitStringSpecified);
3425
InputLen := length (fInputString);
3427
//Check that the start position is not negative
3428
if AOffset < 1 then begin
3429
Error (reeOffsetMustBeGreaterThen0);
3432
// Check that the start position is not longer than the line
3433
// If so then exit with nothing found
3434
if AOffset > (InputLen + 1) // for matching empty string after last char.
3437
StartPtr := fInputString + AOffset - 1;
3439
// If there is a "must appear" string, look for it.
3440
if regmust <> nil then begin
3443
s := StrScan (s, regmust [0]);
3444
if s <> nil then begin
3445
if StrLComp (s, regmust, regmlen) = 0
3446
then BREAK; // Found it.
3450
if s = nil // Not present.
3454
// Mark beginning of line for ^ .
3455
fInputStart := fInputString;
3457
// Pointer to end of input stream - for
3458
// pascal-style string processing (may include #0)
3459
fInputEnd := fInputString + InputLen;
3461
{$IFDEF ComplexBraces}
3463
LoopStackIdx := 0; //###0.925
3466
// Simplest case: anchored match need be tried only once.
3467
if reganch <> #0 then begin
3468
Result := RegMatch (StartPtr);
3472
// Messy cases: unanchored match.
3474
if regstart <> #0 then // We know what char it must start with.
3476
s := StrScan (s, regstart);
3477
if s <> nil then begin
3478
Result := RegMatch (s);
3481
else ClearMatchs; //###0.949
3485
else begin // We don't - general case.
3487
{$IFDEF UseFirstCharSet}
3488
if s^ in FirstCharSet
3489
then Result := RegMatch (s);
3491
Result := RegMatch (s);
3493
if Result or (s^ = #0) // Exit on a match or after testing the end-of-string.
3495
else ClearMatchs; //###0.949
3498
(* optimized and fixed by Martin Fuller - empty strings
3499
were not allowed to pass thru in UseFirstCharSet mode
3500
{$IFDEF UseFirstCharSet} //###0.929
3501
while s^ <> #0 do begin
3502
if s^ in FirstCharSet
3503
then Result := RegMatch (s);
3510
Result := RegMatch (s);
3519
end; { of function TRegExpr.ExecPrim
3520
--------------------------------------------------------------}
3522
function TRegExpr.ExecNext : boolean;
3523
var offset : integer;
3526
if not Assigned (startp[0]) or not Assigned (endp[0]) then begin
3527
Error (reeExecNextWithoutExec);
3530
// Offset := MatchPos [0] + MatchLen [0];
3531
// if MatchLen [0] = 0
3532
Offset := endp [0] - fInputString + 1; //###0.929
3533
if endp [0] = startp [0] //###0.929
3534
then inc (Offset); // prevent infinite looping if empty string match r.e.
3535
Result := ExecPrim (Offset);
3536
end; { of function TRegExpr.ExecNext
3537
--------------------------------------------------------------}
3539
function TRegExpr.GetInputString : RegExprString;
3541
if not Assigned (fInputString) then begin
3542
Error (reeGetInputStringWithoutInputString);
3545
Result := fInputString;
3546
end; { of function TRegExpr.GetInputString
3547
--------------------------------------------------------------}
3549
procedure TRegExpr.SetInputString (const AInputString : RegExprString);
3554
// clear Match* - before next Exec* call it's undefined
3555
for i := 0 to NSUBEXP - 1 do begin
3560
// need reallocation of input string buffer ?
3561
Len := length (AInputString);
3562
if Assigned (fInputString) and (Length (fInputString) <> Len) then begin
3563
FreeMem (fInputString);
3564
fInputString := nil;
3566
// buffer [re]allocation
3567
if not Assigned (fInputString)
3568
then GetMem (fInputString, (Len + 1) * SizeOf (REChar));
3570
// copy input string into buffer
3572
StrPCopy (fInputString, Copy (AInputString, 1, Len)); //###0.927
3574
StrLCopy (fInputString, PRegExprChar (AInputString), Len);
3578
fInputString : string;
3579
fInputStart, fInputEnd : PRegExprChar;
3582
fInputString := AInputString;
3583
UniqueString (fInputString);
3584
fInputStart := PChar (fInputString);
3585
Len := length (fInputString);
3586
fInputEnd := PRegExprChar (integer (fInputStart) + Len); ??
3587
!! startp/endp ��� ����� ����� ������ ������������ ?
3589
end; { of procedure TRegExpr.SetInputString
3590
--------------------------------------------------------------}
3592
procedure TRegExpr.SetLineSeparators (const AStr : RegExprString);
3594
if AStr <> fLineSeparators then begin
3595
fLineSeparators := AStr;
3598
end; { of procedure TRegExpr.SetLineSeparators
3599
--------------------------------------------------------------}
3601
procedure TRegExpr.SetLinePairedSeparator (const AStr : RegExprString);
3603
if length (AStr) = 2 then begin
3604
if AStr [1] = AStr [2] then begin
3605
// it's impossible for our 'one-point' checking to support
3606
// two chars separator for identical chars
3607
Error (reeBadLinePairedSeparator);
3610
if not fLinePairedSeparatorAssigned
3611
or (AStr [1] <> fLinePairedSeparatorHead)
3612
or (AStr [2] <> fLinePairedSeparatorTail) then begin
3613
fLinePairedSeparatorAssigned := true;
3614
fLinePairedSeparatorHead := AStr [1];
3615
fLinePairedSeparatorTail := AStr [2];
3619
else if length (AStr) = 0 then begin
3620
if fLinePairedSeparatorAssigned then begin
3621
fLinePairedSeparatorAssigned := false;
3625
else Error (reeBadLinePairedSeparator);
3626
end; { of procedure TRegExpr.SetLinePairedSeparator
3627
--------------------------------------------------------------}
3629
function TRegExpr.GetLinePairedSeparator : RegExprString;
3631
if fLinePairedSeparatorAssigned then begin
3633
// Here is some UniCode 'magic'
3634
// If You do know better decision to concatenate
3635
// two WideChars, please, let me know!
3636
Result := fLinePairedSeparatorHead; //###0.947
3637
Result := Result + fLinePairedSeparatorTail;
3639
Result := fLinePairedSeparatorHead + fLinePairedSeparatorTail;
3643
end; { of function TRegExpr.GetLinePairedSeparator
3644
--------------------------------------------------------------}
3646
function TRegExpr.Substitute (const ATemplate : RegExprString) : RegExprString;
3647
// perform substitutions after a regexp match
3648
// completely rewritten in 0.929
3650
TemplateLen : integer;
3651
TemplateBeg, TemplateEnd : PRegExprChar;
3652
p, p0, ResultPtr : PRegExprChar;
3653
ResultLen : integer;
3656
function ParseVarName (var APtr : PRegExprChar) : integer;
3657
// extract name of variable (digits, may be enclosed with
3658
// curly braces) from APtr^, uses TemplateEnd !!!
3660
Digits = ['0' .. '9'];
3663
Delimited : boolean;
3667
Delimited := (p < TemplateEnd) and (p^ = '{');
3669
then inc (p); // skip left curly brace
3670
if (p < TemplateEnd) and (p^ = '&')
3671
then inc (p) // this is '$&' or '${&}'
3673
while (p < TemplateEnd) and
3674
{$IFDEF UniCode} //###0.935
3675
(ord (p^) < 256) and (char (p^) in Digits)
3680
Result := Result * 10 + (ord (p^) - ord ('0')); //###0.939
3684
if (p < TemplateEnd) and (p^ = '}')
3685
then inc (p) // skip right curly brace
3686
else p := APtr; // isn't properly terminated
3688
then Result := -1; // no valid digits found or no right curly brace
3692
// Check programm and input string
3695
if not Assigned (fInputString) then begin
3696
Error (reeNoInpitStringSpecified);
3699
// Prepare for working
3700
TemplateLen := length (ATemplate);
3701
if TemplateLen = 0 then begin // prevent nil pointers
3705
TemplateBeg := pointer (ATemplate);
3706
TemplateEnd := TemplateBeg + TemplateLen;
3707
// Count result length for speed optimization.
3710
while p < TemplateEnd do begin
3714
then n := ParseVarName (p)
3716
if n >= 0 then begin
3717
if (n < NSUBEXP) and Assigned (startp [n]) and Assigned (endp [n])
3718
then inc (ResultLen, endp [n] - startp [n]);
3721
if (Ch = EscChar) and (p < TemplateEnd)
3722
then inc (p); // quoted or special char followed
3726
// Get memory. We do it once and it significant speed up work !
3727
if ResultLen = 0 then begin
3731
SetString (Result, nil, ResultLen);
3733
ResultPtr := pointer (Result);
3735
while p < TemplateEnd do begin
3739
then n := ParseVarName (p)
3741
if n >= 0 then begin
3743
if (n < NSUBEXP) and Assigned (p0) and Assigned (endp [n]) then
3744
while p0 < endp [n] do begin
3751
if (Ch = EscChar) and (p < TemplateEnd) then begin // quoted or special char followed
3759
end; { of function TRegExpr.Substitute
3760
--------------------------------------------------------------}
3762
procedure TRegExpr.Split (AInputStr : RegExprString; APieces : TStrings);
3763
var PrevPos : integer;
3766
if Exec (AInputStr) then
3768
APieces.Add (System.Copy (AInputStr, PrevPos, MatchPos [0] - PrevPos));
3769
PrevPos := MatchPos [0] + MatchLen [0];
3771
APieces.Add (System.Copy (AInputStr, PrevPos, MaxInt)); // Tail
3772
end; { of procedure TRegExpr.Split
3773
--------------------------------------------------------------}
3775
function TRegExpr.Replace (AInputStr : RegExprString; const AReplaceStr : RegExprString;
3776
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString;
3782
if Exec (AInputStr) then
3784
Result := Result + System.Copy (AInputStr, PrevPos,
3785
MatchPos [0] - PrevPos);
3786
if AUseSubstitution //###0.946
3787
then Result := Result + Substitute (AReplaceStr)
3788
else Result := Result + AReplaceStr;
3789
PrevPos := MatchPos [0] + MatchLen [0];
3791
Result := Result + System.Copy (AInputStr, PrevPos, MaxInt); // Tail
3792
end; { of function TRegExpr.Replace
3793
--------------------------------------------------------------}
3795
function TRegExpr.ReplaceEx (AInputStr : RegExprString;
3796
AReplaceFunc : TRegExprReplaceFunction)
3803
if Exec (AInputStr) then
3805
Result := Result + System.Copy (AInputStr, PrevPos,
3806
MatchPos [0] - PrevPos)
3807
+ AReplaceFunc (Self);
3808
PrevPos := MatchPos [0] + MatchLen [0];
3810
Result := Result + System.Copy (AInputStr, PrevPos, MaxInt); // Tail
3811
end; { of function TRegExpr.ReplaceEx
3812
--------------------------------------------------------------}
3816
function TRegExpr.Replace (AInputStr : RegExprString;
3817
AReplaceFunc : TRegExprReplaceFunction)
3820
ReplaceEx (AInputStr, AReplaceFunc);
3821
end; { of function TRegExpr.Replace
3822
--------------------------------------------------------------}
3825
{=============================================================}
3826
{====================== Debug section ========================}
3827
{=============================================================}
3829
{$IFDEF RegExpPCodeDump}
3830
function TRegExpr.DumpOp (op : TREOp) : RegExprString;
3831
// printable representation of opcode
3834
BOL: Result := 'BOL';
3835
EOL: Result := 'EOL';
3836
BOLML: Result := 'BOLML';
3837
EOLML: Result := 'EOLML';
3838
BOUND: Result := 'BOUND'; //###0.943
3839
NOTBOUND: Result := 'NOTBOUND'; //###0.943
3840
ANY: Result := 'ANY';
3841
ANYML: Result := 'ANYML'; //###0.941
3842
ANYLETTER: Result := 'ANYLETTER';
3843
NOTLETTER: Result := 'NOTLETTER';
3844
ANYDIGIT: Result := 'ANYDIGIT';
3845
NOTDIGIT: Result := 'NOTDIGIT';
3846
ANYSPACE: Result := 'ANYSPACE';
3847
NOTSPACE: Result := 'NOTSPACE';
3848
ANYOF: Result := 'ANYOF';
3849
ANYBUT: Result := 'ANYBUT';
3850
ANYOFCI: Result := 'ANYOF/CI';
3851
ANYBUTCI: Result := 'ANYBUT/CI';
3852
BRANCH: Result := 'BRANCH';
3853
EXACTLY: Result := 'EXACTLY';
3854
EXACTLYCI: Result := 'EXACTLY/CI';
3855
NOTHING: Result := 'NOTHING';
3856
COMMENT: Result := 'COMMENT';
3857
BACK: Result := 'BACK';
3858
EEND: Result := 'END';
3859
BSUBEXP: Result := 'BSUBEXP';
3860
BSUBEXPCI: Result := 'BSUBEXP/CI';
3861
Succ (OPEN) .. TREOp (Ord (OPEN) + NSUBEXP - 1): //###0.929
3862
Result := Format ('OPEN[%d]', [ord (op) - ord (OPEN)]);
3863
Succ (CLOSE) .. TREOp (Ord (CLOSE) + NSUBEXP - 1): //###0.929
3864
Result := Format ('CLOSE[%d]', [ord (op) - ord (CLOSE)]);
3865
STAR: Result := 'STAR';
3866
PLUS: Result := 'PLUS';
3867
BRACES: Result := 'BRACES';
3868
{$IFDEF ComplexBraces}
3869
LOOPENTRY: Result := 'LOOPENTRY'; //###0.925
3870
LOOP: Result := 'LOOP'; //###0.925
3871
LOOPNG: Result := 'LOOPNG'; //###0.940
3873
ANYOFTINYSET: Result:= 'ANYOFTINYSET';
3874
ANYBUTTINYSET:Result:= 'ANYBUTTINYSET';
3875
{$IFDEF UseSetOfChar} //###0.929
3876
ANYOFFULLSET: Result:= 'ANYOFFULLSET';
3878
STARNG: Result := 'STARNG'; //###0.940
3879
PLUSNG: Result := 'PLUSNG'; //###0.940
3880
BRACESNG: Result := 'BRACESNG'; //###0.940
3881
else Error (reeDumpCorruptedOpcode);
3883
Result := ':' + Result;
3884
end; { of function TRegExpr.DumpOp
3885
--------------------------------------------------------------}
3887
function TRegExpr.Dump : RegExprString;
3888
// dump a regexp in vaguely comprehensible form
3891
op : TREOp; // Arbitrary non-END op.
3892
next : PRegExprChar;
3895
{$IFDEF UseSetOfChar} //###0.929
3899
if not IsProgrammOk //###0.929
3904
s := programm + REOpSz;
3905
while op <> EEND do begin // While that wasn't END last time...
3907
Result := Result + Format ('%2d%s', [s - programm, DumpOp (s^)]); // Where, what.
3908
next := regnext (s);
3909
if next = nil // Next ptr.
3910
then Result := Result + ' (0)'
3912
if next > s //###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
3913
then Diff := next - s
3914
else Diff := - (s - next);
3915
Result := Result + Format (' (%d) ', [(s - programm) + Diff]);
3917
inc (s, REOpSz + RENextOffSz);
3918
if (op = ANYOF) or (op = ANYOFCI) or (op = ANYBUT) or (op = ANYBUTCI)
3919
or (op = EXACTLY) or (op = EXACTLYCI) then begin
3920
// Literal string, where present.
3921
while s^ <> #0 do begin
3922
Result := Result + s^;
3927
if (op = ANYOFTINYSET) or (op = ANYBUTTINYSET) then begin
3928
for i := 1 to TinySetLen do begin
3929
Result := Result + s^;
3933
if (op = BSUBEXP) or (op = BSUBEXPCI) then begin
3934
Result := Result + ' \' + IntToStr (Ord (s^));
3937
{$IFDEF UseSetOfChar} //###0.929
3938
if op = ANYOFFULLSET then begin
3939
for Ch := #0 to #255 do
3940
if Ch in PSetOfREChar (s)^ then
3942
then Result := Result + '#' + IntToStr (Ord (Ch)) //###0.936
3943
else Result := Result + Ch;
3944
inc (s, SizeOf (TSetOfREChar));
3947
if (op = BRACES) or (op = BRACESNG) then begin //###0.941
3948
// show min/max argument of BRACES operator
3949
Result := Result + Format ('{%d,%d}', [PREBracesArg (s)^, PREBracesArg (s + REBracesArgSz)^]);
3950
inc (s, REBracesArgSz * 2);
3952
{$IFDEF ComplexBraces}
3953
if (op = LOOP) or (op = LOOPNG) then begin //###0.940
3954
Result := Result + Format (' -> (%d) {%d,%d}', [
3955
(s - programm - (REOpSz + RENextOffSz)) + PRENextOff (s + 2 * REBracesArgSz)^,
3956
PREBracesArg (s)^, PREBracesArg (s + REBracesArgSz)^]);
3957
inc (s, 2 * REBracesArgSz + RENextOffSz);
3960
Result := Result + #$d#$a;
3963
// Header fields of interest.
3966
then Result := Result + 'start ' + regstart;
3968
then Result := Result + 'anchored ';
3970
then Result := Result + 'must have ' + regmust;
3971
{$IFDEF UseFirstCharSet} //###0.929
3972
Result := Result + #$d#$a'FirstCharSet:';
3973
for Ch := #0 to #255 do
3974
if Ch in FirstCharSet
3977
then Result := Result + '#' + IntToStr(Ord(Ch)) //###0.948
3978
else Result := Result + Ch;
3981
Result := Result + #$d#$a;
3982
end; { of function TRegExpr.Dump
3983
--------------------------------------------------------------}
3986
{$IFDEF reRealExceptionAddr}
3988
// ReturnAddr works correctly only if compiler optimization is ON
3989
// I placed this method at very end of unit because there are no
3990
// way to restore compiler optimization flag ...
3992
procedure TRegExpr.Error (AErrorID : integer);
3993
{$IFDEF reRealExceptionAddr}
3994
function ReturnAddr : pointer; //###0.938
4002
fLastError := AErrorID; // dummy stub - useless because will raise exception
4003
if AErrorID < 1000 // compilation error ?
4004
then e := ERegExpr.Create (ErrorMsg (AErrorID) // yes - show error pos
4005
+ ' (pos ' + IntToStr (CompilerErrorPos) + ')')
4006
else e := ERegExpr.Create (ErrorMsg (AErrorID));
4007
e.ErrorCode := AErrorID;
4008
e.CompilerErrorPos := CompilerErrorPos;
4010
{$IFDEF reRealExceptionAddr}
4011
At ReturnAddr; //###0.938
4013
end; { of procedure TRegExpr.Error
4014
--------------------------------------------------------------}
4020
regstart // -> programm
4021
reganch // -> programm
4022
regmust, regmlen // -> programm
4026
// be carefull - placed here code will be always compiled with
4027
// compiler optimization flag
4031
RegExprInvertCaseFunction := TRegExpr.InvertCaseFunction;