1
// Copyright 2011 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
15
// An Error describes a failure to parse a regular expression
16
// and gives the offending expression.
22
func (e *Error) String() string {
23
return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
26
// An ErrorCode describes a failure to parse a regular expression.
31
ErrInternalError ErrorCode = "regexp/syntax: internal error"
34
ErrInvalidCharClass ErrorCode = "invalid character class"
35
ErrInvalidCharRange ErrorCode = "invalid character class range"
36
ErrInvalidEscape ErrorCode = "invalid escape sequence"
37
ErrInvalidNamedCapture ErrorCode = "invalid named capture"
38
ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
39
ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
40
ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
41
ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
42
ErrMissingBracket ErrorCode = "missing closing ]"
43
ErrMissingParen ErrorCode = "missing closing )"
44
ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
45
ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
48
func (e ErrorCode) String() string {
52
// Flags control the behavior of the parser and record information about regexp context.
56
FoldCase Flags = 1 << iota // case-insensitive match
57
Literal // treat pattern as literal string
58
ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
59
DotNL // allow . to match newline
60
OneLine // treat ^ and $ as only matching at beginning and end of text
61
NonGreedy // make repetition operators default to non-greedy
62
PerlX // allow Perl extensions
63
UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
64
WasDollar // regexp OpEndText was $, not \z
65
Simple // regexp contains no counted repetition
67
MatchNL = ClassNL | DotNL
69
Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
70
POSIX Flags = 0 // POSIX syntax
73
// Pseudo-ops for parsing stack.
75
opLeftParen = opPseudo + iota
80
flags Flags // parse mode flags
81
stack []*Regexp // stack of parsed expressions
83
numCap int // number of capturing groups seen
85
tmpClass []int // temporary char class work space
88
func (p *parser) newRegexp(op Op) *Regexp {
100
func (p *parser) reuse(re *Regexp) {
105
// Parse stack manipulation.
107
// push pushes the regexp re onto the parse stack and returns the regexp.
108
func (p *parser) push(re *Regexp) *Regexp {
109
if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
111
if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
115
re.Rune = re.Rune[:1]
116
re.Flags = p.flags &^ FoldCase
117
} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
118
re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
119
unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
120
unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
121
re.Op == OpCharClass && len(re.Rune) == 2 &&
122
re.Rune[0]+1 == re.Rune[1] &&
123
unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
124
unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
125
// Case-insensitive rune like [Aa] or [Δδ].
126
if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
130
// Rewrite as (case-insensitive) literal.
132
re.Rune = re.Rune[:1]
133
re.Flags = p.flags | FoldCase
135
// Incremental concatenation.
139
p.stack = append(p.stack, re)
143
// maybeConcat implements incremental concatenation
144
// of literal runes into string nodes. The parser calls this
145
// before each push, so only the top fragment of the stack
146
// might need processing. Since this is called before a push,
147
// the topmost literal is no longer subject to operators like *
148
// (Otherwise ab* would turn into (ab)*.)
149
// If r >= 0 and there's a node left over, maybeConcat uses it
150
// to push r with the given flags.
151
// maybeConcat reports whether r was pushed.
152
func (p *parser) maybeConcat(r int, flags Flags) bool {
160
if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
164
// Push re1 into re2.
165
re2.Rune = append(re2.Rune, re1.Rune...)
167
// Reuse re1 if possible.
169
re1.Rune = re1.Rune0[:1]
175
p.stack = p.stack[:n-1]
177
return false // did not push r
180
// newLiteral returns a new OpLiteral Regexp with the given flags
181
func (p *parser) newLiteral(r int, flags Flags) *Regexp {
182
re := p.newRegexp(OpLiteral)
185
re.Rune = re.Rune0[:1]
189
// literal pushes a literal regexp for the rune r on the stack
190
// and returns that regexp.
191
func (p *parser) literal(r int) {
192
p.push(p.newLiteral(r, p.flags))
195
// op pushes a regexp with the given op onto the stack
196
// and returns that regexp.
197
func (p *parser) op(op Op) *Regexp {
198
re := p.newRegexp(op)
203
// repeat replaces the top stack element with itself repeated
205
func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (string, os.Error) {
207
if p.flags&PerlX != 0 {
208
if len(t) > 0 && t[0] == '?' {
212
if lastRepeat != "" {
213
// In Perl it is not allowed to stack repetition operators:
214
// a** is a syntax error, not a doubled star, and a++ means
215
// something else entirely, which we don't support!
216
return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(t)]}
221
return "", &Error{ErrMissingRepeatArgument, opstr}
224
re := p.newRegexp(op)
234
// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
235
func (p *parser) concat() *Regexp {
238
// Scan down to find pseudo-operator | or (.
240
for i > 0 && p.stack[i-1].Op < opPseudo {
244
p.stack = p.stack[:i]
246
// Empty concatenation is special case.
248
return p.push(p.newRegexp(OpEmptyMatch))
251
return p.push(p.collapse(subs, OpConcat))
254
// alternate replaces the top of the stack (above the topmost '(') with its alternation.
255
func (p *parser) alternate() *Regexp {
256
// Scan down to find pseudo-operator (.
257
// There are no | above (.
259
for i > 0 && p.stack[i-1].Op < opPseudo {
263
p.stack = p.stack[:i]
265
// Make sure top class is clean.
266
// All the others already are (see swapVerticalBar).
268
cleanAlt(subs[len(subs)-1])
271
// Empty alternate is special case
272
// (shouldn't happen but easy to handle).
274
return p.push(p.newRegexp(OpNoMatch))
277
return p.push(p.collapse(subs, OpAlternate))
280
// cleanAlt cleans re for eventual inclusion in an alternation.
281
func cleanAlt(re *Regexp) {
284
re.Rune = cleanClass(&re.Rune)
285
if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
290
if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
292
re.Op = OpAnyCharNotNL
295
if cap(re.Rune)-len(re.Rune) > 100 {
296
// re.Rune will not grow any more.
297
// Make a copy or inline to reclaim storage.
298
re.Rune = append(re.Rune0[:0], re.Rune...)
303
// collapse returns the result of applying op to sub.
304
// If sub contains op nodes, they all get hoisted up
305
// so that there is never a concat of a concat or an
306
// alternate of an alternate.
307
func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
311
re := p.newRegexp(op)
313
for _, sub := range subs {
315
re.Sub = append(re.Sub, sub.Sub...)
318
re.Sub = append(re.Sub, sub)
321
if op == OpAlternate {
322
re.Sub = p.factor(re.Sub, re.Flags)
323
if len(re.Sub) == 1 {
332
// factor factors common prefixes from the alternation list sub.
333
// It returns a replacement list that reuses the same storage and
334
// frees (passes to p.reuse) any removed *Regexps.
337
// ABC|ABD|AEF|BCX|BCY
338
// simplifies by literal prefix extraction to
339
// A(B(C|D)|EF)|BC(X|Y)
340
// which simplifies by character class introduction to
341
// A(B[CD]|EF)|BC[XY]
343
func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
348
// Round 1: Factor out common literal prefixes.
353
for i := 0; i <= len(sub); i++ {
354
// Invariant: the Regexps that were in sub[0:start] have been
355
// used or marked for reuse, and the slice space has been reused
356
// for out (len(out) <= start).
358
// Invariant: sub[start:i] consists of regexps that all begin
359
// with str as modified by strflags.
363
istr, iflags = p.leadingString(sub[i])
364
if iflags == strflags {
366
for same < len(str) && same < len(istr) && str[same] == istr[same] {
370
// Matches at least one rune in current range.
371
// Keep going around.
378
// Found end of a run with common leading literal string:
379
// sub[start:i] all begin with str[0:len(str)], but sub[i]
380
// does not even begin with str[0].
382
// Factor out common string and append factored expression to out.
384
// Nothing to do - run of length 0.
385
} else if i == start+1 {
386
// Just one: don't bother factoring.
387
out = append(out, sub[start])
389
// Construct factored form: prefix(suffix1|suffix2|...)
390
prefix := p.newRegexp(OpLiteral)
391
prefix.Flags = strflags
392
prefix.Rune = append(prefix.Rune[:0], str...)
394
for j := start; j < i; j++ {
395
sub[j] = p.removeLeadingString(sub[j], len(str))
397
suffix := p.collapse(sub[start:i], OpAlternate) // recurse
399
re := p.newRegexp(OpConcat)
400
re.Sub = append(re.Sub[:0], prefix, suffix)
401
out = append(out, re)
404
// Prepare for next iteration.
411
// Round 2: Factor out common complex prefixes,
412
// just the first piece of each concatenation,
413
// whatever it is. This is good enough a lot of the time.
417
for i := 0; i <= len(sub); i++ {
418
// Invariant: the Regexps that were in sub[0:start] have been
419
// used or marked for reuse, and the slice space has been reused
420
// for out (len(out) <= start).
422
// Invariant: sub[start:i] consists of regexps that all begin
423
// with str as modified by strflags.
426
ifirst = p.leadingRegexp(sub[i])
427
if first != nil && first.Equal(ifirst) {
432
// Found end of a run with common leading regexp:
433
// sub[start:i] all begin with first but sub[i] does not.
435
// Factor out common regexp and append factored expression to out.
437
// Nothing to do - run of length 0.
438
} else if i == start+1 {
439
// Just one: don't bother factoring.
440
out = append(out, sub[start])
442
// Construct factored form: prefix(suffix1|suffix2|...)
445
for j := start; j < i; j++ {
446
reuse := j != start // prefix came from sub[start]
447
sub[j] = p.removeLeadingRegexp(sub[j], reuse)
449
suffix := p.collapse(sub[start:i], OpAlternate) // recurse
451
re := p.newRegexp(OpConcat)
452
re.Sub = append(re.Sub[:0], prefix, suffix)
453
out = append(out, re)
456
// Prepare for next iteration.
462
// Round 3: Collapse runs of single literals into character classes.
465
for i := 0; i <= len(sub); i++ {
466
// Invariant: the Regexps that were in sub[0:start] have been
467
// used or marked for reuse, and the slice space has been reused
468
// for out (len(out) <= start).
470
// Invariant: sub[start:i] consists of regexps that are either
471
// literal runes or character classes.
472
if i < len(sub) && isCharClass(sub[i]) {
476
// sub[i] is not a char or char class;
477
// emit char class for sub[start:i]...
479
// Nothing to do - run of length 0.
480
} else if i == start+1 {
481
out = append(out, sub[start])
483
// Make new char class.
484
// Start with most complex regexp in sub[start].
486
for j := start + 1; j < i; j++ {
487
if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
491
sub[start], sub[max] = sub[max], sub[start]
493
for j := start + 1; j < i; j++ {
494
mergeCharClass(sub[start], sub[j])
498
out = append(out, sub[start])
501
// ... and then emit sub[i].
503
out = append(out, sub[i])
509
// Round 4: Collapse runs of empty matches into a single empty match.
513
if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
516
out = append(out, sub[i])
523
// leadingString returns the leading literal string that re begins with.
524
// The string refers to storage in re or its children.
525
func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
526
if re.Op == OpConcat && len(re.Sub) > 0 {
529
if re.Op != OpLiteral {
532
return re.Rune, re.Flags & FoldCase
535
// removeLeadingString removes the first n leading runes
536
// from the beginning of re. It returns the replacement for re.
537
func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
538
if re.Op == OpConcat && len(re.Sub) > 0 {
539
// Removing a leading string in a concatenation
540
// might simplify the concatenation.
542
sub = p.removeLeadingString(sub, n)
544
if sub.Op == OpEmptyMatch {
548
// Impossible but handle.
556
copy(re.Sub, re.Sub[1:])
557
re.Sub = re.Sub[:len(re.Sub)-1]
563
if re.Op == OpLiteral {
564
re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
565
if len(re.Rune) == 0 {
572
// leadingRegexp returns the leading regexp that re begins with.
573
// The regexp refers to storage in re or its children.
574
func (p *parser) leadingRegexp(re *Regexp) *Regexp {
575
if re.Op == OpEmptyMatch {
578
if re.Op == OpConcat && len(re.Sub) > 0 {
580
if sub.Op == OpEmptyMatch {
588
// removeLeadingRegexp removes the leading regexp in re.
589
// It returns the replacement for re.
590
// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
591
func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
592
if re.Op == OpConcat && len(re.Sub) > 0 {
596
re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
612
func literalRegexp(s string, flags Flags) *Regexp {
613
re := &Regexp{Op: OpLiteral}
615
re.Rune = re.Rune0[:0] // use local storage for small strings
616
for _, c := range s {
617
if len(re.Rune) >= cap(re.Rune) {
618
// string is too long to fit in Rune0. let Go handle it
622
re.Rune = append(re.Rune, c)
629
func Parse(s string, flags Flags) (*Regexp, os.Error) {
630
if flags&Literal != 0 {
631
// Trivial parser for literal string.
632
if err := checkUTF8(s); err != nil {
635
return literalRegexp(s, flags), nil
638
// Otherwise, must do real work.
655
if c, t, err = nextRune(t); err != nil {
661
if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
662
// Flag changes and non-capturing groups.
663
if t, err = p.parsePerlFlags(t); err != nil {
669
p.op(opLeftParen).Cap = p.numCap
672
if err = p.parseVerticalBar(); err != nil {
677
if err = p.parseRightParen(); err != nil {
682
if p.flags&OneLine != 0 {
689
if p.flags&OneLine != 0 {
690
p.op(OpEndText).Flags |= WasDollar
696
if p.flags&DotNL != 0 {
703
if t, err = p.parseClass(t); err != nil {
715
if t, err = p.repeat(op, min, max, t[:1], t[1:], lastRepeat); err != nil {
720
min, max, tt, ok := p.parseRepeat(t)
722
// If the repeat cannot be parsed, { is a literal.
727
if t, err = p.repeat(op, min, max, t[:len(t)-len(tt)], tt, lastRepeat); err != nil {
731
if p.flags&PerlX != 0 && len(t) >= 2 {
742
p.op(OpNoWordBoundary)
746
// any byte; not supported
747
return nil, &Error{ErrInvalidEscape, t[:2]}
749
// \Q ... \E: the ... is always literals
751
if i := strings.Index(t, `\E`); i < 0 {
758
p.push(literalRegexp(lit, p.flags))
767
re := p.newRegexp(OpCharClass)
770
// Look for Unicode character group like \p{Han}
771
if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
772
r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
784
// Perl character class escape.
785
if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
793
// Ordinary single-character escape.
794
if c, t, err = p.parseEscape(t); err != nil {
803
if p.swapVerticalBar() {
805
p.stack = p.stack[:len(p.stack)-1]
811
return nil, &Error{ErrMissingParen, s}
813
return p.stack[0], nil
816
// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
817
// If s is not of that form, it returns ok == false.
818
func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
819
if s == "" || s[0] != '{' {
823
if min, s, ok = p.parseInt(s); !ok {
838
} else if max, s, ok = p.parseInt(s); !ok {
842
if s == "" || s[0] != '}' {
850
// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
851
// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
852
// The caller must have ensured that s begins with "(?".
853
func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) {
856
// Check for named captures, first introduced in Python's regexp library.
857
// As usual, there are three slightly different syntaxes:
859
// (?P<name>expr) the original, introduced by Python
860
// (?<name>expr) the .NET alteration, adopted by Perl 5.10
861
// (?'name'expr) another .NET alteration, adopted by Perl 5.10
863
// Perl 5.10 gave in and implemented the Python version too,
864
// but they claim that the last two are the preferred forms.
865
// PCRE and languages based on it (specifically, PHP and Ruby)
866
// support all three as well. EcmaScript 4 uses only the Python form.
868
// In both the open source world (via Code Search) and the
869
// Google source tree, (?P<expr>name) is the dominant form,
870
// so that's the one we implement. One is enough.
871
if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
873
end := strings.IndexRune(t, '>')
875
if err = checkUTF8(t); err != nil {
878
return "", &Error{ErrInvalidNamedCapture, s}
881
capture := t[:end+1] // "(?P<name>"
882
name := t[4:end] // "name"
883
if err = checkUTF8(name); err != nil {
886
if !isValidCaptureName(name) {
887
return "", &Error{ErrInvalidNamedCapture, capture}
890
// Like ordinary capture, but named.
892
re := p.op(opLeftParen)
895
return t[end+1:], nil
898
// Non-capturing group. Might also twiddle Perl flags.
906
if c, t, err = nextRune(t); err != nil {
927
// Switch to negation.
933
// Invert flags so that | above turn into &^ and vice versa.
934
// We'll invert flags again before using it below.
938
// End of flags, starting group or not.
955
return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
958
// isValidCaptureName reports whether name
959
// is a valid capture name: [A-Za-z0-9_]+.
960
// PCRE limits names to 32 bytes.
961
// Python rejects names starting with digits.
962
// We don't enforce either of those.
963
func isValidCaptureName(name string) bool {
967
for _, c := range name {
968
if c != '_' && !isalnum(c) {
975
// parseInt parses a decimal integer.
976
func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
977
if s == "" || s[0] < '0' || '9' < s[0] {
980
// Disallow leading zeros.
981
if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
984
for s != "" && '0' <= s[0] && s[0] <= '9' {
989
n = n*10 + int(s[0]) - '0'
997
// can this be represented as a character class?
998
// single-rune literal string, char class, ., and .|\n.
999
func isCharClass(re *Regexp) bool {
1000
return re.Op == OpLiteral && len(re.Rune) == 1 ||
1001
re.Op == OpCharClass ||
1002
re.Op == OpAnyCharNotNL ||
1007
func matchRune(re *Regexp, r int) bool {
1010
return len(re.Rune) == 1 && re.Rune[0] == r
1012
for i := 0; i < len(re.Rune); i += 2 {
1013
if re.Rune[i] <= r && r <= re.Rune[i+1] {
1018
case OpAnyCharNotNL:
1026
// parseVerticalBar handles a | in the input.
1027
func (p *parser) parseVerticalBar() os.Error {
1030
// The concatenation we just parsed is on top of the stack.
1031
// If it sits above an opVerticalBar, swap it below
1032
// (things below an opVerticalBar become an alternation).
1033
// Otherwise, push a new vertical bar.
1034
if !p.swapVerticalBar() {
1041
// mergeCharClass makes dst = dst|src.
1042
// The caller must ensure that dst.Op >= src.Op,
1043
// to reduce the amount of copying.
1044
func mergeCharClass(dst, src *Regexp) {
1047
// src doesn't add anything.
1048
case OpAnyCharNotNL:
1050
if matchRune(src, '\n') {
1054
// src is simpler, so either literal or char class
1055
if src.Op == OpLiteral {
1056
dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
1058
dst.Rune = appendClass(dst.Rune, src.Rune)
1062
if src.Rune[0] == dst.Rune[0] {
1065
dst.Op = OpCharClass
1066
dst.Rune = append(dst.Rune, dst.Rune[0])
1067
dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
1071
// If the top of the stack is an element followed by an opVerticalBar
1072
// swapVerticalBar swaps the two and returns true.
1073
// Otherwise it returns false.
1074
func (p *parser) swapVerticalBar() bool {
1075
// If above and below vertical bar are literal or char class,
1076
// can merge into a single char class.
1078
if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1081
// Make re3 the more complex of the two.
1082
if re1.Op > re3.Op {
1086
mergeCharClass(re3, re1)
1088
p.stack = p.stack[:n-1]
1095
if re2.Op == opVerticalBar {
1097
// Now out of reach.
1098
// Clean opportunistically.
1099
cleanAlt(p.stack[n-3])
1109
// parseRightParen handles a ) in the input.
1110
func (p *parser) parseRightParen() os.Error {
1112
if p.swapVerticalBar() {
1114
p.stack = p.stack[:len(p.stack)-1]
1120
return &Error{ErrInternalError, ""}
1124
p.stack = p.stack[:n-2]
1125
if re2.Op != opLeftParen {
1126
return &Error{ErrMissingParen, p.wholeRegexp}
1129
// Just for grouping.
1133
re2.Sub = re2.Sub0[:1]
1140
// parseEscape parses an escape sequence at the beginning of s
1141
// and returns the rune.
1142
func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) {
1145
return 0, "", &Error{ErrTrailingBackslash, ""}
1147
c, t, err := nextRune(t)
1155
if c < utf8.RuneSelf && !isalnum(c) {
1156
// Escaped non-word characters are always themselves.
1157
// PCRE is not quite so rigorous: it accepts things like
1158
// \q, but we don't. We once rejected \_, but too many
1159
// programs and people insist on using it, so allow \_.
1164
case '1', '2', '3', '4', '5', '6', '7':
1165
// Single non-zero digit is a backreference; not supported
1166
if t == "" || t[0] < '0' || t[0] > '7' {
1171
// Consume up to three octal digits; already have one.
1173
for i := 1; i < 3; i++ {
1174
if t == "" || t[0] < '0' || t[0] > '7' {
1177
r = r*8 + int(t[0]) - '0'
1182
// Hexadecimal escapes.
1187
if c, t, err = nextRune(t); err != nil {
1191
// Any number of digits in braces.
1192
// Perl accepts any text at all; it ignores all text
1193
// after the first non-hex digit. We require only hex digits,
1194
// and at least one.
1201
if c, t, err = nextRune(t); err != nil {
1212
if r > unicode.MaxRune {
1223
// Easy case: two hex digits.
1225
if c, t, err = nextRune(t); err != nil {
1232
return x*16 + y, t, nil
1234
// C escapes. There is no case 'b', to avoid misparsing
1235
// the Perl word-boundary \b as the C backspace \b
1236
// when in POSIX mode. In Perl, /\b/ means word-boundary
1237
// but /[\b]/ means backspace. We don't support that.
1238
// If you want a backspace, embed a literal backspace
1239
// character or use \x08.
1253
return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1256
// parseClassChar parses a character class character at the beginning of s
1258
func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) {
1260
return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1263
// Allow regular escape sequences even though
1264
// many need not be escaped in this context.
1266
return p.parseEscape(s)
1272
type charGroup struct {
1277
// parsePerlClassEscape parses a leading Perl character class escape like \d
1278
// from the beginning of s. If one is present, it appends the characters to r
1279
// and returns the new slice r and the remainder of the string.
1280
func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) {
1281
if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1284
g := perlGroup[s[0:2]]
1288
return p.appendGroup(r, g), s[2:]
1291
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
1292
// from the beginning of s. If one is present, it appends the characters to r
1293
// and returns the new slice r and the remainder of the string.
1294
func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) {
1295
if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1299
i := strings.Index(s[2:], ":]")
1304
name, s := s[0:i+2], s[i+2:]
1305
g := posixGroup[name]
1307
return nil, "", &Error{ErrInvalidCharRange, name}
1309
return p.appendGroup(r, g), s, nil
1312
func (p *parser) appendGroup(r []int, g charGroup) []int {
1313
if p.flags&FoldCase == 0 {
1315
r = appendNegatedClass(r, g.class)
1317
r = appendClass(r, g.class)
1320
tmp := p.tmpClass[:0]
1321
tmp = appendFoldedClass(tmp, g.class)
1323
tmp = cleanClass(&p.tmpClass)
1325
r = appendNegatedClass(r, tmp)
1327
r = appendClass(r, tmp)
1333
// unicodeTable returns the unicode.RangeTable identified by name
1334
// and the table of additional fold-equivalent code points.
1335
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1336
if t := unicode.Categories[name]; t != nil {
1337
return t, unicode.FoldCategory[name]
1339
if t := unicode.Scripts[name]; t != nil {
1340
return t, unicode.FoldScript[name]
1345
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
1346
// from the beginning of s. If one is present, it appends the characters to r
1347
// and returns the new slice r and the remainder of the string.
1348
func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) {
1349
if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1353
// Committed to parse or return error.
1359
c, t, err := nextRune(t)
1363
var seq, name string
1365
// Single-letter name.
1366
seq = s[:len(s)-len(t)]
1369
// Name is in braces.
1370
end := strings.IndexRune(s, '}')
1372
if err = checkUTF8(s); err != nil {
1375
return nil, "", &Error{ErrInvalidCharRange, s}
1377
seq, t = s[:end+1], s[end+1:]
1379
if err = checkUTF8(name); err != nil {
1384
// Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1385
if name != "" && name[0] == '^' {
1390
tab, fold := unicodeTable(name)
1392
return nil, "", &Error{ErrInvalidCharRange, seq}
1395
if p.flags&FoldCase == 0 || fold == nil {
1397
r = appendTable(r, tab)
1399
r = appendNegatedTable(r, tab)
1402
// Merge and clean tab and fold in a temporary buffer.
1403
// This is necessary for the negative case and just tidy
1404
// for the positive case.
1405
tmp := p.tmpClass[:0]
1406
tmp = appendTable(tmp, tab)
1407
tmp = appendTable(tmp, fold)
1409
tmp = cleanClass(&p.tmpClass)
1411
r = appendClass(r, tmp)
1413
r = appendNegatedClass(r, tmp)
1419
// parseClass parses a character class at the beginning of s
1420
// and pushes it onto the parse stack.
1421
func (p *parser) parseClass(s string) (rest string, err os.Error) {
1422
t := s[1:] // chop [
1423
re := p.newRegexp(OpCharClass)
1425
re.Rune = re.Rune0[:0]
1428
if t != "" && t[0] == '^' {
1432
// If character class does not match \n, add it here,
1433
// so that negation later will do the right thing.
1434
if p.flags&ClassNL == 0 {
1435
re.Rune = append(re.Rune, '\n', '\n')
1440
first := true // ] and - are okay as first char in class
1441
for t == "" || t[0] != ']' || first {
1442
// POSIX: - is only okay unescaped as first or last in class.
1443
// Perl: - is okay anywhere.
1444
if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1445
_, size := utf8.DecodeRuneInString(t[1:])
1446
return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1450
// Look for POSIX [:alnum:] etc.
1451
if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1452
nclass, nt, err := p.parseNamedClass(t, class)
1457
class, t = nclass, nt
1462
// Look for Unicode character group like \p{Han}.
1463
nclass, nt, err := p.parseUnicodeClass(t, class)
1468
class, t = nclass, nt
1472
// Look for Perl character class symbols (extension).
1473
if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1474
class, t = nclass, nt
1478
// Single character or simple range.
1481
if lo, t, err = p.parseClassChar(t, s); err != nil {
1485
// [a-] means (a|-) so check for final ].
1486
if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1488
if hi, t, err = p.parseClassChar(t, s); err != nil {
1492
rng = rng[:len(rng)-len(t)]
1493
return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1496
if p.flags&FoldCase == 0 {
1497
class = appendRange(class, lo, hi)
1499
class = appendFoldedRange(class, lo, hi)
1504
// Use &re.Rune instead of &class to avoid allocation.
1506
class = cleanClass(&re.Rune)
1508
class = negateClass(class)
1515
// cleanClass sorts the ranges (pairs of elements of r),
1516
// merges them, and eliminates duplicates.
1517
func cleanClass(rp *[]int) []int {
1519
// Sort by lo increasing, hi decreasing to break ties.
1520
sort.Sort(ranges{rp})
1527
// Merge abutting, overlapping.
1528
w := 2 // write index
1529
for i := 2; i < len(r); i += 2 {
1530
lo, hi := r[i], r[i+1]
1532
// merge with previous range
1538
// new disjoint range
1547
// appendRange returns the result of appending the range lo-hi to the class r.
1548
func appendRange(r []int, lo, hi int) []int {
1549
// Expand last range or next to last range if it overlaps or abuts.
1550
// Checking two ranges helps when appending case-folded
1551
// alphabets, so that one range can be expanding A-Z and the
1552
// other expanding a-z.
1554
for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1556
rlo, rhi := r[n-i], r[n-i+1]
1557
if lo <= rhi+1 && rlo <= hi+1 {
1569
return append(r, lo, hi)
1573
// minimum and maximum runes involved in folding.
1574
// checked during test.
1579
// appendFoldedRange returns the result of appending the range lo-hi
1580
// and its case folding-equivalent runes to the class r.
1581
func appendFoldedRange(r []int, lo, hi int) []int {
1583
if lo <= minFold && hi >= maxFold {
1584
// Range is full: folding can't add more.
1585
return appendRange(r, lo, hi)
1587
if hi < minFold || lo > maxFold {
1588
// Range is outside folding possibilities.
1589
return appendRange(r, lo, hi)
1592
// [lo, minFold-1] needs no folding.
1593
r = appendRange(r, lo, minFold-1)
1597
// [maxFold+1, hi] needs no folding.
1598
r = appendRange(r, maxFold+1, hi)
1602
// Brute force. Depend on appendRange to coalesce ranges on the fly.
1603
for c := lo; c <= hi; c++ {
1604
r = appendRange(r, c, c)
1605
f := unicode.SimpleFold(c)
1607
r = appendRange(r, f, f)
1608
f = unicode.SimpleFold(f)
1614
// appendClass returns the result of appending the class x to the class r.
1615
// It assume x is clean.
1616
func appendClass(r []int, x []int) []int {
1617
for i := 0; i < len(x); i += 2 {
1618
r = appendRange(r, x[i], x[i+1])
1623
// appendFolded returns the result of appending the case folding of the class x to the class r.
1624
func appendFoldedClass(r []int, x []int) []int {
1625
for i := 0; i < len(x); i += 2 {
1626
r = appendFoldedRange(r, x[i], x[i+1])
1631
// appendNegatedClass returns the result of appending the negation of the class x to the class r.
1632
// It assumes x is clean.
1633
func appendNegatedClass(r []int, x []int) []int {
1635
for i := 0; i < len(x); i += 2 {
1636
lo, hi := x[i], x[i+1]
1638
r = appendRange(r, nextLo, lo-1)
1642
if nextLo <= unicode.MaxRune {
1643
r = appendRange(r, nextLo, unicode.MaxRune)
1648
// appendTable returns the result of appending x to the class r.
1649
func appendTable(r []int, x *unicode.RangeTable) []int {
1650
for _, xr := range x.R16 {
1651
lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1653
r = appendRange(r, lo, hi)
1656
for c := lo; c <= hi; c += stride {
1657
r = appendRange(r, c, c)
1660
for _, xr := range x.R32 {
1661
lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1663
r = appendRange(r, lo, hi)
1666
for c := lo; c <= hi; c += stride {
1667
r = appendRange(r, c, c)
1673
// appendNegatedTable returns the result of appending the negation of x to the class r.
1674
func appendNegatedTable(r []int, x *unicode.RangeTable) []int {
1675
nextLo := 0 // lo end of next class to add
1676
for _, xr := range x.R16 {
1677
lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1680
r = appendRange(r, nextLo, lo-1)
1685
for c := lo; c <= hi; c += stride {
1687
r = appendRange(r, nextLo, c-1)
1692
for _, xr := range x.R32 {
1693
lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1696
r = appendRange(r, nextLo, lo-1)
1701
for c := lo; c <= hi; c += stride {
1703
r = appendRange(r, nextLo, c-1)
1708
if nextLo <= unicode.MaxRune {
1709
r = appendRange(r, nextLo, unicode.MaxRune)
1714
// negateClass overwrites r and returns r's negation.
1715
// It assumes the class r is already clean.
1716
func negateClass(r []int) []int {
1717
nextLo := 0 // lo end of next class to add
1718
w := 0 // write index
1719
for i := 0; i < len(r); i += 2 {
1720
lo, hi := r[i], r[i+1]
1729
if nextLo <= unicode.MaxRune {
1730
// It's possible for the negation to have one more
1731
// range - this one - than the original class, so use append.
1732
r = append(r, nextLo, unicode.MaxRune)
1737
// ranges implements sort.Interface on a []rune.
1738
// The choice of receiver type definition is strange
1739
// but avoids an allocation since we already have
1741
type ranges struct {
1745
func (ra ranges) Less(i, j int) bool {
1749
return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1752
func (ra ranges) Len() int {
1753
return len(*ra.p) / 2
1756
func (ra ranges) Swap(i, j int) {
1760
p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1764
func checkUTF8(s string) os.Error {
1766
rune, size := utf8.DecodeRuneInString(s)
1767
if rune == utf8.RuneError && size == 1 {
1768
return &Error{Code: ErrInvalidUTF8, Expr: s}
1775
func nextRune(s string) (c int, t string, err os.Error) {
1776
c, size := utf8.DecodeRuneInString(s)
1777
if c == utf8.RuneError && size == 1 {
1778
return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1780
return c, s[size:], nil
1783
func isalnum(c int) bool {
1784
return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1787
func unhex(c int) int {
1788
if '0' <= c && c <= '9' {
1791
if 'a' <= c && c <= 'f' {
1794
if 'A' <= c && c <= 'F' {