1
2
3
4
5 package syntax
6
7
8
9
10 import (
11 "strconv"
12 "strings"
13 "unicode"
14 )
15
16
17 type Regexp struct {
18 Op Op
19 Flags Flags
20 Sub []*Regexp
21 Sub0 [1]*Regexp
22 Rune []rune
23 Rune0 [2]rune
24 Min, Max int
25 Cap int
26 Name string
27 }
28
29
30
31
32 type Op uint8
33
34
35
36
37
38 const (
39 OpNoMatch Op = 1 + iota
40 OpEmptyMatch
41 OpLiteral
42 OpCharClass
43 OpAnyCharNotNL
44 OpAnyChar
45 OpBeginLine
46 OpEndLine
47 OpBeginText
48 OpEndText
49 OpWordBoundary
50 OpNoWordBoundary
51 OpCapture
52 OpStar
53 OpPlus
54 OpQuest
55 OpRepeat
56 OpConcat
57 OpAlternate
58 )
59
60 const opPseudo Op = 128
61
62
63 func (x *Regexp) Equal(y *Regexp) bool {
64 if x == nil || y == nil {
65 return x == y
66 }
67 if x.Op != y.Op {
68 return false
69 }
70 switch x.Op {
71 case OpEndText:
72
73 if x.Flags&WasDollar != y.Flags&WasDollar {
74 return false
75 }
76
77 case OpLiteral, OpCharClass:
78 if len(x.Rune) != len(y.Rune) {
79 return false
80 }
81 for i, r := range x.Rune {
82 if r != y.Rune[i] {
83 return false
84 }
85 }
86
87 case OpAlternate, OpConcat:
88 if len(x.Sub) != len(y.Sub) {
89 return false
90 }
91 for i, sub := range x.Sub {
92 if !sub.Equal(y.Sub[i]) {
93 return false
94 }
95 }
96
97 case OpStar, OpPlus, OpQuest:
98 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
99 return false
100 }
101
102 case OpRepeat:
103 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
104 return false
105 }
106
107 case OpCapture:
108 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
109 return false
110 }
111 }
112 return true
113 }
114
115
116 func writeRegexp(b *strings.Builder, re *Regexp) {
117 switch re.Op {
118 default:
119 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
120 case OpNoMatch:
121 b.WriteString(`[^\x00-\x{10FFFF}]`)
122 case OpEmptyMatch:
123 b.WriteString(`(?:)`)
124 case OpLiteral:
125 if re.Flags&FoldCase != 0 {
126 b.WriteString(`(?i:`)
127 }
128 for _, r := range re.Rune {
129 escape(b, r, false)
130 }
131 if re.Flags&FoldCase != 0 {
132 b.WriteString(`)`)
133 }
134 case OpCharClass:
135 if len(re.Rune)%2 != 0 {
136 b.WriteString(`[invalid char class]`)
137 break
138 }
139 b.WriteRune('[')
140 if len(re.Rune) == 0 {
141 b.WriteString(`^\x00-\x{10FFFF}`)
142 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
143
144
145 b.WriteRune('^')
146 for i := 1; i < len(re.Rune)-1; i += 2 {
147 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
148 escape(b, lo, lo == '-')
149 if lo != hi {
150 b.WriteRune('-')
151 escape(b, hi, hi == '-')
152 }
153 }
154 } else {
155 for i := 0; i < len(re.Rune); i += 2 {
156 lo, hi := re.Rune[i], re.Rune[i+1]
157 escape(b, lo, lo == '-')
158 if lo != hi {
159 b.WriteRune('-')
160 escape(b, hi, hi == '-')
161 }
162 }
163 }
164 b.WriteRune(']')
165 case OpAnyCharNotNL:
166 b.WriteString(`(?-s:.)`)
167 case OpAnyChar:
168 b.WriteString(`(?s:.)`)
169 case OpBeginLine:
170 b.WriteString(`(?m:^)`)
171 case OpEndLine:
172 b.WriteString(`(?m:$)`)
173 case OpBeginText:
174 b.WriteString(`\A`)
175 case OpEndText:
176 if re.Flags&WasDollar != 0 {
177 b.WriteString(`(?-m:$)`)
178 } else {
179 b.WriteString(`\z`)
180 }
181 case OpWordBoundary:
182 b.WriteString(`\b`)
183 case OpNoWordBoundary:
184 b.WriteString(`\B`)
185 case OpCapture:
186 if re.Name != "" {
187 b.WriteString(`(?P<`)
188 b.WriteString(re.Name)
189 b.WriteRune('>')
190 } else {
191 b.WriteRune('(')
192 }
193 if re.Sub[0].Op != OpEmptyMatch {
194 writeRegexp(b, re.Sub[0])
195 }
196 b.WriteRune(')')
197 case OpStar, OpPlus, OpQuest, OpRepeat:
198 if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
199 b.WriteString(`(?:`)
200 writeRegexp(b, sub)
201 b.WriteString(`)`)
202 } else {
203 writeRegexp(b, sub)
204 }
205 switch re.Op {
206 case OpStar:
207 b.WriteRune('*')
208 case OpPlus:
209 b.WriteRune('+')
210 case OpQuest:
211 b.WriteRune('?')
212 case OpRepeat:
213 b.WriteRune('{')
214 b.WriteString(strconv.Itoa(re.Min))
215 if re.Max != re.Min {
216 b.WriteRune(',')
217 if re.Max >= 0 {
218 b.WriteString(strconv.Itoa(re.Max))
219 }
220 }
221 b.WriteRune('}')
222 }
223 if re.Flags&NonGreedy != 0 {
224 b.WriteRune('?')
225 }
226 case OpConcat:
227 for _, sub := range re.Sub {
228 if sub.Op == OpAlternate {
229 b.WriteString(`(?:`)
230 writeRegexp(b, sub)
231 b.WriteString(`)`)
232 } else {
233 writeRegexp(b, sub)
234 }
235 }
236 case OpAlternate:
237 for i, sub := range re.Sub {
238 if i > 0 {
239 b.WriteRune('|')
240 }
241 writeRegexp(b, sub)
242 }
243 }
244 }
245
246 func (re *Regexp) String() string {
247 var b strings.Builder
248 writeRegexp(&b, re)
249 return b.String()
250 }
251
252 const meta = `\.+*?()|[]{}^$`
253
254 func escape(b *strings.Builder, r rune, force bool) {
255 if unicode.IsPrint(r) {
256 if strings.ContainsRune(meta, r) || force {
257 b.WriteRune('\\')
258 }
259 b.WriteRune(r)
260 return
261 }
262
263 switch r {
264 case '\a':
265 b.WriteString(`\a`)
266 case '\f':
267 b.WriteString(`\f`)
268 case '\n':
269 b.WriteString(`\n`)
270 case '\r':
271 b.WriteString(`\r`)
272 case '\t':
273 b.WriteString(`\t`)
274 case '\v':
275 b.WriteString(`\v`)
276 default:
277 if r < 0x100 {
278 b.WriteString(`\x`)
279 s := strconv.FormatInt(int64(r), 16)
280 if len(s) == 1 {
281 b.WriteRune('0')
282 }
283 b.WriteString(s)
284 break
285 }
286 b.WriteString(`\x{`)
287 b.WriteString(strconv.FormatInt(int64(r), 16))
288 b.WriteString(`}`)
289 }
290 }
291
292
293 func (re *Regexp) MaxCap() int {
294 m := 0
295 if re.Op == OpCapture {
296 m = re.Cap
297 }
298 for _, sub := range re.Sub {
299 if n := sub.MaxCap(); m < n {
300 m = n
301 }
302 }
303 return m
304 }
305
306
307 func (re *Regexp) CapNames() []string {
308 names := make([]string, re.MaxCap()+1)
309 re.capNames(names)
310 return names
311 }
312
313 func (re *Regexp) capNames(names []string) {
314 if re.Op == OpCapture {
315 names[re.Cap] = re.Name
316 }
317 for _, sub := range re.Sub {
318 sub.capNames(names)
319 }
320 }
321
View as plain text