1
// Copyright 2011 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
5
// Package syntax parses regular expressions into syntax trees.
9
// Note to implementers:
10
// In this package, re is always a *Regexp and r is always a rune.
19
// A Regexp is a node in a regular expression syntax tree.
23
Sub []*Regexp // subexpressions, if any
24
Sub0 [1]*Regexp // storage for short Sub
25
Rune []int // matched runes, for OpLiteral, OpCharClass
26
Rune0 [2]int // storage for short Rune
27
Min, Max int // min, max for OpRepeat
28
Cap int // capturing index, for OpCapture
29
Name string // capturing name, for OpCapture
32
// An Op is a single regular expression operator.
35
// Operators are listed in precedence order, tightest binding to weakest.
36
// Character class operators are listed simplest to most complex
37
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
40
OpNoMatch Op = 1 + iota // matches no strings
41
OpEmptyMatch // matches empty string
42
OpLiteral // matches Runes sequence
43
OpCharClass // matches Runes interpreted as range pair list
44
OpAnyCharNotNL // matches any character
45
OpAnyChar // matches any character
46
OpBeginLine // matches empty string at beginning of line
47
OpEndLine // matches empty string at end of line
48
OpBeginText // matches empty string at beginning of text
49
OpEndText // matches empty string at end of text
50
OpWordBoundary // matches word boundary `\b`
51
OpNoWordBoundary // matches word non-boundary `\B`
52
OpCapture // capturing subexpression with index Cap, optional name Name
53
OpStar // matches Sub[0] zero or more times
54
OpPlus // matches Sub[0] one or more times
55
OpQuest // matches Sub[0] zero or one times
56
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
57
OpConcat // matches concatenation of Subs
58
OpAlternate // matches alternation of Subs
61
const opPseudo Op = 128 // where pseudo-ops start
63
// Equal returns true if x and y have identical structure.
64
func (x *Regexp) Equal(y *Regexp) bool {
65
if x == nil || y == nil {
73
// The parse flags remember whether this is \z or \Z.
74
if x.Flags&WasDollar != y.Flags&WasDollar {
78
case OpLiteral, OpCharClass:
79
if len(x.Rune) != len(y.Rune) {
82
for i, r := range x.Rune {
88
case OpAlternate, OpConcat:
89
if len(x.Sub) != len(y.Sub) {
92
for i, sub := range x.Sub {
93
if !sub.Equal(y.Sub[i]) {
98
case OpStar, OpPlus, OpQuest:
99
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
104
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
109
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
116
// writeRegexp writes the Perl syntax for the regular expression re to b.
117
func writeRegexp(b *bytes.Buffer, re *Regexp) {
120
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
122
b.WriteString(`[^\x00-\x{10FFFF}]`)
124
b.WriteString(`(?:)`)
126
if re.Flags&FoldCase != 0 {
127
b.WriteString(`(?i:`)
129
for _, r := range re.Rune {
132
if re.Flags&FoldCase != 0 {
136
if len(re.Rune)%2 != 0 {
137
b.WriteString(`[invalid char class]`)
141
if len(re.Rune) == 0 {
142
b.WriteString(`^\x00-\x{10FFFF}`)
143
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
144
// Contains 0 and MaxRune. Probably a negated class.
147
for i := 1; i < len(re.Rune)-1; i += 2 {
148
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
149
escape(b, lo, lo == '-')
152
escape(b, hi, hi == '-')
156
for i := 0; i < len(re.Rune); i += 2 {
157
lo, hi := re.Rune[i], re.Rune[i+1]
158
escape(b, lo, lo == '-')
161
escape(b, hi, hi == '-')
167
b.WriteString(`[^\n]`)
180
case OpNoWordBoundary:
184
b.WriteString(`(?P<`)
185
b.WriteString(re.Name)
190
if re.Sub[0].Op != OpEmptyMatch {
191
writeRegexp(b, re.Sub[0])
194
case OpStar, OpPlus, OpQuest, OpRepeat:
195
if sub := re.Sub[0]; sub.Op > OpCapture {
211
b.WriteString(strconv.Itoa(re.Min))
212
if re.Max != re.Min {
215
b.WriteString(strconv.Itoa(re.Max))
221
for _, sub := range re.Sub {
222
if sub.Op == OpAlternate {
231
for i, sub := range re.Sub {
240
func (re *Regexp) String() string {
246
const meta = `\.+*?()|[]{}^$`
248
func escape(b *bytes.Buffer, r int, force bool) {
249
if unicode.IsPrint(r) {
250
if strings.IndexRune(meta, r) >= 0 || force {
273
s := strconv.Itob(r, 16)
281
b.WriteString(strconv.Itob(r, 16))