1
2
3
4
5
6
7
// Package utf8 provides constants and functions for working with text
// encoded as UTF-8, including translation between runes and UTF-8 byte
// sequences.
package utf8
9
10
11
12
13
14
// Values fundamental to the encoding.
const (
	// RuneError is the rune returned by the decoding functions for invalid
	// or truncated input, and the rune the encoding functions substitute
	// for runes that cannot be encoded.
	RuneError = '\uFFFD'
	// RuneSelf is the bound below which a byte stands for itself as a
	// complete one-byte rune.
	RuneSelf = 0x80
	// MaxRune is the largest rune value treated as valid by this package.
	MaxRune = '\U0010FFFF'
	// UTFMax is the largest number of bytes used to encode a single rune.
	UTFMax = 4
)
21
22
// Inclusive bounds of the surrogate range. Runes inside this range are
// rejected by ValidRune and RuneLen and are replaced by RuneError when
// encoded.
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)
27
const (
	// Tag bits. The encoders OR tx into every continuation byte and t2, t3
	// or t4 into the first byte of a 2-, 3- or 4-byte sequence.
	t1 = 0b00000000
	tx = 0b10000000
	t2 = 0b11000000
	t3 = 0b11100000
	t4 = 0b11110000
	t5 = 0b11111000

	// Payload masks. maskx selects the six data bits of a continuation
	// byte; mask2, mask3 and mask4 select the data bits of the first byte
	// of a 2-, 3- or 4-byte sequence.
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111
	mask4 = 0b00000111

	// Largest rune values that are encoded in 1, 2 and 3 bytes.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// Codes stored in the first table. The low three bits of a code (x&7)
	// give the sequence size in bytes and the high four bits (x>>4) index
	// acceptRanges, which bounds the second byte. The decoders test for xx
	// and as before using either field.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
63
64
// first maps every possible first byte of a UTF-8 sequence to one of the
// codes as, xx or s1..s7, which describe how the sequence must continue.
var first = [256]uint8{
	// Bytes 0x00-0x7F: ASCII.
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	// Bytes 0x80-0xFF: continuation bytes and multi-byte leaders.
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
85
86
87
// acceptRange is the inclusive range of values permitted for the second
// byte of a multi-byte UTF-8 sequence.
type acceptRange struct {
	lo, hi uint8 // lowest and highest permitted value of the second byte
}
92
93
94 var acceptRanges = [16]acceptRange{
95 0: {locb, hicb},
96 1: {0xA0, hicb},
97 2: {locb, 0x9F},
98 3: {0x90, hicb},
99 4: {locb, 0x8F},
100 }
101
102
103
104 func FullRune(p []byte) bool {
105 n := len(p)
106 if n == 0 {
107 return false
108 }
109 x := first[p[0]]
110 if n >= int(x&7) {
111 return true
112 }
113
114 accept := acceptRanges[x>>4]
115 if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
116 return true
117 } else if n > 2 && (p[2] < locb || hicb < p[2]) {
118 return true
119 }
120 return false
121 }
122
123
124 func FullRuneInString(s string) bool {
125 n := len(s)
126 if n == 0 {
127 return false
128 }
129 x := first[s[0]]
130 if n >= int(x&7) {
131 return true
132 }
133
134 accept := acceptRanges[x>>4]
135 if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
136 return true
137 } else if n > 2 && (s[2] < locb || hicb < s[2]) {
138 return true
139 }
140 return false
141 }
142
143
144
145
146
147
148
149
150
151 func DecodeRune(p []byte) (r rune, size int) {
152 n := len(p)
153 if n < 1 {
154 return RuneError, 0
155 }
156 p0 := p[0]
157 x := first[p0]
158 if x >= as {
159
160
161
162 mask := rune(x) << 31 >> 31
163 return rune(p[0])&^mask | RuneError&mask, 1
164 }
165 sz := int(x & 7)
166 accept := acceptRanges[x>>4]
167 if n < sz {
168 return RuneError, 1
169 }
170 b1 := p[1]
171 if b1 < accept.lo || accept.hi < b1 {
172 return RuneError, 1
173 }
174 if sz <= 2 {
175 return rune(p0&mask2)<<6 | rune(b1&maskx), 2
176 }
177 b2 := p[2]
178 if b2 < locb || hicb < b2 {
179 return RuneError, 1
180 }
181 if sz <= 3 {
182 return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
183 }
184 b3 := p[3]
185 if b3 < locb || hicb < b3 {
186 return RuneError, 1
187 }
188 return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
189 }
190
191
192
193
194
195
196
197
198
199 func DecodeRuneInString(s string) (r rune, size int) {
200 n := len(s)
201 if n < 1 {
202 return RuneError, 0
203 }
204 s0 := s[0]
205 x := first[s0]
206 if x >= as {
207
208
209
210 mask := rune(x) << 31 >> 31
211 return rune(s[0])&^mask | RuneError&mask, 1
212 }
213 sz := int(x & 7)
214 accept := acceptRanges[x>>4]
215 if n < sz {
216 return RuneError, 1
217 }
218 s1 := s[1]
219 if s1 < accept.lo || accept.hi < s1 {
220 return RuneError, 1
221 }
222 if sz <= 2 {
223 return rune(s0&mask2)<<6 | rune(s1&maskx), 2
224 }
225 s2 := s[2]
226 if s2 < locb || hicb < s2 {
227 return RuneError, 1
228 }
229 if sz <= 3 {
230 return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
231 }
232 s3 := s[3]
233 if s3 < locb || hicb < s3 {
234 return RuneError, 1
235 }
236 return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
237 }
238
239
240
241
242
243
244
245
246
247 func DecodeLastRune(p []byte) (r rune, size int) {
248 end := len(p)
249 if end == 0 {
250 return RuneError, 0
251 }
252 start := end - 1
253 r = rune(p[start])
254 if r < RuneSelf {
255 return r, 1
256 }
257
258
259
260 lim := end - UTFMax
261 if lim < 0 {
262 lim = 0
263 }
264 for start--; start >= lim; start-- {
265 if RuneStart(p[start]) {
266 break
267 }
268 }
269 if start < 0 {
270 start = 0
271 }
272 r, size = DecodeRune(p[start:end])
273 if start+size != end {
274 return RuneError, 1
275 }
276 return r, size
277 }
278
279
280
281
282
283
284
285
286
287 func DecodeLastRuneInString(s string) (r rune, size int) {
288 end := len(s)
289 if end == 0 {
290 return RuneError, 0
291 }
292 start := end - 1
293 r = rune(s[start])
294 if r < RuneSelf {
295 return r, 1
296 }
297
298
299
300 lim := end - UTFMax
301 if lim < 0 {
302 lim = 0
303 }
304 for start--; start >= lim; start-- {
305 if RuneStart(s[start]) {
306 break
307 }
308 }
309 if start < 0 {
310 start = 0
311 }
312 r, size = DecodeRuneInString(s[start:end])
313 if start+size != end {
314 return RuneError, 1
315 }
316 return r, size
317 }
318
319
320
321 func RuneLen(r rune) int {
322 switch {
323 case r < 0:
324 return -1
325 case r <= rune1Max:
326 return 1
327 case r <= rune2Max:
328 return 2
329 case surrogateMin <= r && r <= surrogateMax:
330 return -1
331 case r <= rune3Max:
332 return 3
333 case r <= MaxRune:
334 return 4
335 }
336 return -1
337 }
338
339
340
341
342 func EncodeRune(p []byte, r rune) int {
343
344 switch i := uint32(r); {
345 case i <= rune1Max:
346 p[0] = byte(r)
347 return 1
348 case i <= rune2Max:
349 _ = p[1]
350 p[0] = t2 | byte(r>>6)
351 p[1] = tx | byte(r)&maskx
352 return 2
353 case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
354 r = RuneError
355 fallthrough
356 case i <= rune3Max:
357 _ = p[2]
358 p[0] = t3 | byte(r>>12)
359 p[1] = tx | byte(r>>6)&maskx
360 p[2] = tx | byte(r)&maskx
361 return 3
362 default:
363 _ = p[3]
364 p[0] = t4 | byte(r>>18)
365 p[1] = tx | byte(r>>12)&maskx
366 p[2] = tx | byte(r>>6)&maskx
367 p[3] = tx | byte(r)&maskx
368 return 4
369 }
370 }
371
372
373
374
375 func AppendRune(p []byte, r rune) []byte {
376
377 if uint32(r) <= rune1Max {
378 return append(p, byte(r))
379 }
380 return appendRuneNonASCII(p, r)
381 }
382
383 func appendRuneNonASCII(p []byte, r rune) []byte {
384
385 switch i := uint32(r); {
386 case i <= rune2Max:
387 return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
388 case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
389 r = RuneError
390 fallthrough
391 case i <= rune3Max:
392 return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
393 default:
394 return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
395 }
396 }
397
398
399
400 func RuneCount(p []byte) int {
401 np := len(p)
402 var n int
403 for i := 0; i < np; {
404 n++
405 c := p[i]
406 if c < RuneSelf {
407
408 i++
409 continue
410 }
411 x := first[c]
412 if x == xx {
413 i++
414 continue
415 }
416 size := int(x & 7)
417 if i+size > np {
418 i++
419 continue
420 }
421 accept := acceptRanges[x>>4]
422 if c := p[i+1]; c < accept.lo || accept.hi < c {
423 size = 1
424 } else if size == 2 {
425 } else if c := p[i+2]; c < locb || hicb < c {
426 size = 1
427 } else if size == 3 {
428 } else if c := p[i+3]; c < locb || hicb < c {
429 size = 1
430 }
431 i += size
432 }
433 return n
434 }
435
436
437 func RuneCountInString(s string) (n int) {
438 ns := len(s)
439 for i := 0; i < ns; n++ {
440 c := s[i]
441 if c < RuneSelf {
442
443 i++
444 continue
445 }
446 x := first[c]
447 if x == xx {
448 i++
449 continue
450 }
451 size := int(x & 7)
452 if i+size > ns {
453 i++
454 continue
455 }
456 accept := acceptRanges[x>>4]
457 if c := s[i+1]; c < accept.lo || accept.hi < c {
458 size = 1
459 } else if size == 2 {
460 } else if c := s[i+2]; c < locb || hicb < c {
461 size = 1
462 } else if size == 3 {
463 } else if c := s[i+3]; c < locb || hicb < c {
464 size = 1
465 }
466 i += size
467 }
468 return n
469 }
470
471
472
473
// RuneStart reports whether b could be the first byte of an encoded rune,
// that is, whether b is anything other than a continuation byte. A
// continuation byte has its two most significant bits set to 10.
func RuneStart(b byte) bool {
	return b>>6 != 0b10
}
475
476
477 func Valid(p []byte) bool {
478
479 for len(p) >= 8 {
480
481
482
483
484 first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
485 second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
486 if (first32|second32)&0x80808080 != 0 {
487
488 break
489 }
490 p = p[8:]
491 }
492 n := len(p)
493 for i := 0; i < n; {
494 pi := p[i]
495 if pi < RuneSelf {
496 i++
497 continue
498 }
499 x := first[pi]
500 if x == xx {
501 return false
502 }
503 size := int(x & 7)
504 if i+size > n {
505 return false
506 }
507 accept := acceptRanges[x>>4]
508 if c := p[i+1]; c < accept.lo || accept.hi < c {
509 return false
510 } else if size == 2 {
511 } else if c := p[i+2]; c < locb || hicb < c {
512 return false
513 } else if size == 3 {
514 } else if c := p[i+3]; c < locb || hicb < c {
515 return false
516 }
517 i += size
518 }
519 return true
520 }
521
522
523 func ValidString(s string) bool {
524
525 for len(s) >= 8 {
526
527
528
529
530 first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
531 second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
532 if (first32|second32)&0x80808080 != 0 {
533
534 break
535 }
536 s = s[8:]
537 }
538 n := len(s)
539 for i := 0; i < n; {
540 si := s[i]
541 if si < RuneSelf {
542 i++
543 continue
544 }
545 x := first[si]
546 if x == xx {
547 return false
548 }
549 size := int(x & 7)
550 if i+size > n {
551 return false
552 }
553 accept := acceptRanges[x>>4]
554 if c := s[i+1]; c < accept.lo || accept.hi < c {
555 return false
556 } else if size == 2 {
557 } else if c := s[i+2]; c < locb || hicb < c {
558 return false
559 } else if size == 3 {
560 } else if c := s[i+3]; c < locb || hicb < c {
561 return false
562 }
563 i += size
564 }
565 return true
566 }
567
568
569
570 func ValidRune(r rune) bool {
571 switch {
572 case 0 <= r && r < surrogateMin:
573 return true
574 case surrogateMax < r && r <= MaxRune:
575 return true
576 }
577 return false
578 }
579
View as plain text