1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package scanner
16
17 import (
18 "bytes"
19 "fmt"
20 "io"
21 "os"
22 "unicode"
23 "unicode/utf8"
24 )
25
26
27
// Position describes a location in the scanned source.
// A Position is valid if its Line is greater than zero.
type Position struct {
	Filename string // file name, if any; the scanner never sets it itself
	Offset   int    // byte offset, starting at 0
	Line     int    // line number, starting at 1
	Column   int    // column number, starting at 1 (character count per line)
}

// IsValid reports whether the position is valid, that is, whether Line > 0.
//
// It takes a value receiver, like String, so that it can also be called on
// non-addressable Position values (for example directly on a function
// result). Callers holding a *Position are unaffected, because the method
// set of *Position includes all value-receiver methods.
func (pos Position) IsValid() bool { return pos.Line > 0 }

// String returns the position as "filename:line:column". An empty Filename
// is rendered as "<input>", and the ":line:column" suffix is left out when
// the position is not valid.
func (pos Position) String() string {
	name := pos.Filename
	if name == "" {
		name = "<input>"
	}
	if !pos.IsValid() {
		return name
	}
	return fmt.Sprintf("%s:%d:%d", name, pos.Line, pos.Column)
}
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Predefined mode bits that control which tokens Scan recognizes. Each bit
// is 1 shifted left by the negated (and therefore positive) value of the
// corresponding token constant. GoTokens is the mode installed by Init.
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // Scan also scans integers when this bit is set
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // only consulted when ScanComments is set: comments are skipped instead of returned
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)
75
76
// The result of Scan is one of these tokens or a Unicode character.
// All token values are negative (EOF is -1, Ident is -2, and so on), which
// keeps them distinct from valid Unicode code points.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment

	// skipComment is never returned as a token; it only provides the bit
	// position for the SkipComments mode flag.
	skipComment
)
90
// tokenString maps each named token to the name TokenString prints for it.
// skipComment has no entry because it is not a real token.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}
101
102
103 func TokenString(tok rune) string {
104 if s, found := tokenString[tok]; found {
105 return s
106 }
107 return fmt.Sprintf("%q", string(tok))
108 }
109
110
111
// GoWhitespace is the value Init assigns to a Scanner's Whitespace field:
// one bit each for tab, newline, carriage return and space.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

// bufLen bounds how far each Read call may fill the source buffer; the
// buffer array itself holds one extra byte for the sentinel.
const bufLen = 1024
115
116
// A Scanner reads Unicode characters and tokens from an io.Reader.
// Call Init before using it.
type Scanner struct {
	// Input
	src io.Reader

	// Source buffer
	srcBuf [bufLen + 1]byte // +1 for the sentinel byte stored at srcBuf[srcEnd]
	srcPos int              // index of the next byte to read in srcBuf
	srcEnd int              // end of the valid data in srcBuf

	// Source position
	srcBufOffset int // byte offset of srcBuf[0] in the source
	line         int // line count, starting at 1
	column       int // character count on the current line
	lastLineLen  int // length of the previous line in characters (for column reporting after a newline)
	lastCharLen  int // length of the last character read, in bytes

	// Token text buffer.
	// The token text is normally srcBuf[tokPos:tokEnd]. When the buffer is
	// refilled in the middle of a token, the part read so far is saved in
	// tokBuf and srcBuf[tokPos:tokEnd] holds only the tail.
	tokBuf bytes.Buffer
	tokPos int // start of the token text tail in srcBuf; negative means no text is being collected
	tokEnd int // end of the token text tail in srcBuf

	// One character look-ahead
	ch rune // character before the current srcPos; -2 means nothing has been read yet

	// Error is called for each error encountered. If it is nil, the error
	// message is written to os.Stderr instead.
	Error func(s *Scanner, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// Mode controls which tokens are recognized; see the Scan* mode bits.
	// Init sets it to GoTokens.
	Mode uint

	// Whitespace controls which characters Scan skips as white space:
	// character ch is skipped when bit ch is set. Init sets it to
	// GoWhitespace.
	Whitespace uint64

	// IsIdentRune, if non-nil, decides whether ch is accepted as the i'th
	// rune of an identifier (i == 0 for the first rune). When it is nil,
	// '_' and letters are accepted everywhere and digits only for i > 0.
	IsIdentRune func(ch rune, i int) bool

	// Position holds the start of the most recently scanned token. Scan
	// sets it; Init and Next invalidate it by setting Line to 0. The
	// scanner never writes Filename.
	Position
}
178
179
180
181
182 func (s *Scanner) Init(src io.Reader) *Scanner {
183 s.src = src
184
185
186
187 s.srcBuf[0] = utf8.RuneSelf
188 s.srcPos = 0
189 s.srcEnd = 0
190
191
192 s.srcBufOffset = 0
193 s.line = 1
194 s.column = 0
195 s.lastLineLen = 0
196 s.lastCharLen = 0
197
198
199
200 s.tokPos = -1
201
202
203 s.ch = -2
204
205
206 s.Error = nil
207 s.ErrorCount = 0
208 s.Mode = GoTokens
209 s.Whitespace = GoWhitespace
210 s.Line = 0
211
212 return s
213 }
214
215
216
217
218
// next reads and returns the next Unicode character from the source,
// refilling the buffer from s.src when it does not hold a complete rune.
// It advances srcPos, records the character's byte width in lastCharLen and
// keeps the line and column counters up to date. It returns EOF once the
// source is exhausted, and utf8.RuneError (after reporting an error) for an
// invalid UTF-8 encoding. next does not touch s.ch; callers store the
// result themselves.
func (s *Scanner) next() rune {
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// Uncommon case: not ASCII, or the sentinel at srcBuf[srcEnd] was
		// hit. Refill until a full rune is available.
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// Not enough bytes: before reading more, save the token text
			// collected so far, since the buffer contents are about to move.
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set later by Scan (or error).
			}
			// Move the unread bytes to the beginning of the buffer.
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// Read more bytes after the ones just moved. A reader that keeps
			// returning n == 0 with a nil error makes this loop spin.
			i := s.srcEnd - s.srcPos
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if err != io.EOF {
					s.error(err.Error())
				}
				if s.srcEnd == 0 {
					if s.lastCharLen > 0 {
						// The previous character was not EOF: step the
						// column past it exactly once.
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				// No more bytes can be expected (EOF) or the state of the
				// reader is unknown (other error); decode what is buffered
				// rather than looping again.
				break
			}
		}
		// At least one byte is available here.
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// Uncommon case: not ASCII.
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// Advance first so the error is reported at the right position.
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("invalid UTF-8 encoding")
				return ch
			}
		}
	}

	// Advance past the character.
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// Special characters.
	switch ch {
	case 0:
		// NUL is reported as an error but still returned.
		s.error("invalid character NUL")
	case '\n':
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}
298
299
300
301
302
303
304
305 func (s *Scanner) Next() rune {
306 s.tokPos = -1
307 s.Line = 0
308 ch := s.Peek()
309 if ch != EOF {
310 s.ch = s.next()
311 }
312 return ch
313 }
314
315
316
317
318 func (s *Scanner) Peek() rune {
319 if s.ch == -2 {
320
321 s.ch = s.next()
322 if s.ch == '\uFEFF' {
323 s.ch = s.next()
324 }
325 }
326 return s.ch
327 }
328
329 func (s *Scanner) error(msg string) {
330 s.tokEnd = s.srcPos - s.lastCharLen
331 s.ErrorCount++
332 if s.Error != nil {
333 s.Error(s, msg)
334 return
335 }
336 pos := s.Position
337 if !pos.IsValid() {
338 pos = s.Pos()
339 }
340 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
341 }
342
343 func (s *Scanner) errorf(format string, args ...any) {
344 s.error(fmt.Sprintf(format, args...))
345 }
346
347 func (s *Scanner) isIdentRune(ch rune, i int) bool {
348 if s.IsIdentRune != nil {
349 return s.IsIdentRune(ch, i)
350 }
351 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
352 }
353
354 func (s *Scanner) scanIdentifier() rune {
355
356 ch := s.next()
357 for i := 1; s.isIdentRune(ch, i); i++ {
358 ch = s.next()
359 }
360 return ch
361 }
362
// lower returns ch with the 0x20 bit ('a' - 'A') set. For ASCII letters
// this yields the lower-case letter; other characters may be changed as
// well, so the result is only meaningful when compared against letters.
func lower(ch rune) rune {
	return ch | ('a' - 'A')
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	l := lower(ch)
	return l >= 'a' && l <= 'f'
}
366
367
368
369
370
371
372
373 func (s *Scanner) digits(ch0 rune, base int, invalid *rune) (ch rune, digsep int) {
374 ch = ch0
375 if base <= 10 {
376 max := rune('0' + base)
377 for isDecimal(ch) || ch == '_' {
378 ds := 1
379 if ch == '_' {
380 ds = 2
381 } else if ch >= max && *invalid == 0 {
382 *invalid = ch
383 }
384 digsep |= ds
385 ch = s.next()
386 }
387 } else {
388 for isHex(ch) || ch == '_' {
389 ds := 1
390 if ch == '_' {
391 ds = 2
392 }
393 digsep |= ds
394 ch = s.next()
395 }
396 }
397 return
398 }
399
// scanNumber scans a numeric literal whose first character is ch and
// returns the token (Int or Float) together with the first character after
// the literal. seenDot reports that a '.' directly before ch has already
// been consumed, in which case ch starts the fractional part. Malformed
// literals are reported through s.error but still scanned to their end.
func (s *Scanner) scanNumber(ch rune, seenDot bool) (rune, rune) {
	base := 10         // number base
	prefix := rune(0)  // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0        // bit 0: digit present, bit 1: '_' present
	invalid := rune(0) // first invalid digit in the literal, or 0

	// integer part
	var tok rune
	var ds int
	if !seenDot {
		tok = Int
		if ch == '0' {
			ch = s.next()
			switch lower(ch) {
			case 'x':
				ch = s.next()
				base, prefix = 16, 'x'
			case 'o':
				ch = s.next()
				base, prefix = 8, 'o'
			case 'b':
				ch = s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // the leading 0 counts as a digit
			}
		}
		ch, ds = s.digits(ch, base, &invalid)
		digsep |= ds
		// A '.' only continues the literal when floats are enabled.
		if ch == '.' && s.Mode&ScanFloats != 0 {
			ch = s.next()
			seenDot = true
		}
	}

	// fractional part
	if seenDot {
		tok = Float
		if prefix == 'o' || prefix == 'b' {
			s.error("invalid radix point in " + litname(prefix))
		}
		ch, ds = s.digits(ch, base, &invalid)
		digsep |= ds
	}

	if digsep&1 == 0 {
		s.error(litname(prefix) + " has no digits")
	}

	// exponent
	if e := lower(ch); (e == 'e' || e == 'p') && s.Mode&ScanFloats != 0 {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf("%q exponent requires decimal mantissa", ch)
		case e == 'p' && prefix != 'x':
			s.errorf("%q exponent requires hexadecimal mantissa", ch)
		}
		ch = s.next()
		tok = Float
		if ch == '+' || ch == '-' {
			ch = s.next()
		}
		// Exponent digits are always decimal; with base 10 no digit can be
		// out of range, so digits never dereferences the nil pointer.
		ch, ds = s.digits(ch, 10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error("exponent has no digits")
		}
	} else if prefix == 'x' && tok == Float {
		s.error("hexadecimal mantissa requires a 'p' exponent")
	}

	// Out-of-range digits are only an error for integers: in a float the
	// leading-0 form is a decimal mantissa.
	if tok == Int && invalid != 0 {
		s.errorf("invalid digit %q in %s", invalid, litname(prefix))
	}

	// Separator placement is validated on the literal's text, and only when
	// a '_' was actually seen.
	if digsep&2 != 0 {
		s.tokEnd = s.srcPos - s.lastCharLen // make sure the token text is terminated
		if i := invalidSep(s.TokenText()); i >= 0 {
			s.error("'_' must separate successive digits")
		}
	}

	return tok, ch
}
485
// litname returns the descriptive name used in error messages for a
// numeric literal with the given prefix: 'x' is hexadecimal, 'o' and '0'
// are octal, 'b' is binary, and anything else is decimal.
func litname(prefix rune) string {
	switch prefix {
	case 'x':
		return "hexadecimal literal"
	case 'o', '0':
		return "octal literal"
	case 'b':
		return "binary literal"
	}
	return "decimal literal"
}
498
499
500 func invalidSep(x string) int {
501 x1 := ' '
502 d := '.'
503 i := 0
504
505
506 if len(x) >= 2 && x[0] == '0' {
507 x1 = lower(rune(x[1]))
508 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
509 d = '0'
510 i = 2
511 }
512 }
513
514
515 for ; i < len(x); i++ {
516 p := d
517 d = rune(x[i])
518 switch {
519 case d == '_':
520 if p != '0' {
521 return i
522 }
523 case isDecimal(d) || x1 == 'x' && isHex(d):
524 d = '0'
525 default:
526 if p == '_' {
527 return i - 1
528 }
529 d = '.'
530 }
531 }
532 if d == '_' {
533 return len(x) - 1
534 }
535
536 return -1
537 }
538
539 func digitVal(ch rune) int {
540 switch {
541 case '0' <= ch && ch <= '9':
542 return int(ch - '0')
543 case 'a' <= lower(ch) && lower(ch) <= 'f':
544 return int(lower(ch) - 'a' + 10)
545 }
546 return 16
547 }
548
549 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
550 for n > 0 && digitVal(ch) < base {
551 ch = s.next()
552 n--
553 }
554 if n > 0 {
555 s.error("invalid char escape")
556 }
557 return ch
558 }
559
560 func (s *Scanner) scanEscape(quote rune) rune {
561 ch := s.next()
562 switch ch {
563 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
564
565 ch = s.next()
566 case '0', '1', '2', '3', '4', '5', '6', '7':
567 ch = s.scanDigits(ch, 8, 3)
568 case 'x':
569 ch = s.scanDigits(s.next(), 16, 2)
570 case 'u':
571 ch = s.scanDigits(s.next(), 16, 4)
572 case 'U':
573 ch = s.scanDigits(s.next(), 16, 8)
574 default:
575 s.error("invalid char escape")
576 }
577 return ch
578 }
579
580 func (s *Scanner) scanString(quote rune) (n int) {
581 ch := s.next()
582 for ch != quote {
583 if ch == '\n' || ch < 0 {
584 s.error("literal not terminated")
585 return
586 }
587 if ch == '\\' {
588 ch = s.scanEscape(quote)
589 } else {
590 ch = s.next()
591 }
592 n++
593 }
594 return
595 }
596
597 func (s *Scanner) scanRawString() {
598 ch := s.next()
599 for ch != '`' {
600 if ch < 0 {
601 s.error("literal not terminated")
602 return
603 }
604 ch = s.next()
605 }
606 }
607
608 func (s *Scanner) scanChar() {
609 if s.scanString('\'') != 1 {
610 s.error("invalid char literal")
611 }
612 }
613
614 func (s *Scanner) scanComment(ch rune) rune {
615
616 if ch == '/' {
617
618 ch = s.next()
619 for ch != '\n' && ch >= 0 {
620 ch = s.next()
621 }
622 return ch
623 }
624
625
626 ch = s.next()
627 for {
628 if ch < 0 {
629 s.error("comment not terminated")
630 break
631 }
632 ch0 := ch
633 ch = s.next()
634 if ch0 == '*' && ch == '/' {
635 ch = s.next()
636 break
637 }
638 }
639 return ch
640 }
641
642
643
644
645
646
// Scan reads the next token or Unicode character from the source and
// returns it. Only tokens whose Mode bit is set are recognized; everything
// else is returned character by character. Scan returns EOF at the end of
// the source. Errors are reported through s.Error if it is non-nil and to
// os.Stderr otherwise. On return the embedded Position holds the start of
// the token and TokenText returns its text.
func (s *Scanner) Scan() rune {
	ch := s.Peek()

	// reset token text position
	s.tokPos = -1
	s.Line = 0

redo:
	// Skip white space. Shift counts of 64 or more yield 0, so only
	// characters below 64 can be configured as white space; negative
	// values such as EOF convert to a huge shift count and never match.
	for s.Whitespace&(1<<uint(ch)) != 0 {
		ch = s.next()
	}

	// Start collecting token text. ch has already been read, so the token
	// starts one character before srcPos.
	s.tokBuf.Reset()
	s.tokPos = s.srcPos - s.lastCharLen

	// Set the token position (a two-case variant of the logic in Pos).
	s.Offset = s.srcBufOffset + s.tokPos
	if s.column > 0 {
		// common case: the last character read was not a '\n'
		s.Line = s.line
		s.Column = s.column
	} else {
		// The last character read was a '\n': the token is that newline,
		// which sits at the end of the previous line.
		s.Line = s.line - 1
		s.Column = s.lastLineLen
	}

	// Determine the token value; by default it is the character itself.
	tok := ch
	switch {
	case s.isIdentRune(ch, 0):
		if s.Mode&ScanIdents != 0 {
			tok = Ident
			ch = s.scanIdentifier()
		} else {
			ch = s.next()
		}
	case isDecimal(ch):
		if s.Mode&(ScanInts|ScanFloats) != 0 {
			tok, ch = s.scanNumber(ch, false)
		} else {
			ch = s.next()
		}
	default:
		switch ch {
		case EOF:
			// Do not read past the end; EOF stays the look-ahead.
			break
		case '"':
			if s.Mode&ScanStrings != 0 {
				s.scanString('"')
				tok = String
			}
			ch = s.next()
		case '\'':
			if s.Mode&ScanChars != 0 {
				s.scanChar()
				tok = Char
			}
			ch = s.next()
		case '.':
			// A '.' followed by a digit starts a float such as ".5".
			ch = s.next()
			if isDecimal(ch) && s.Mode&ScanFloats != 0 {
				tok, ch = s.scanNumber(ch, true)
			}
		case '/':
			ch = s.next()
			if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
				if s.Mode&SkipComments != 0 {
					s.tokPos = -1 // don't collect token text
					ch = s.scanComment(ch)
					// Treat the comment like white space and start over.
					goto redo
				}
				ch = s.scanComment(ch)
				tok = Comment
			}
		case '`':
			if s.Mode&ScanRawStrings != 0 {
				s.scanRawString()
				tok = RawString
			}
			ch = s.next()
		default:
			ch = s.next()
		}
	}

	// End of token text: ch is the look-ahead and not part of the token.
	s.tokEnd = s.srcPos - s.lastCharLen

	s.ch = ch
	return tok
}
744
745
746
747
748
749 func (s *Scanner) Pos() (pos Position) {
750 pos.Filename = s.Filename
751 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
752 switch {
753 case s.column > 0:
754
755 pos.Line = s.line
756 pos.Column = s.column
757 case s.lastLineLen > 0:
758
759 pos.Line = s.line - 1
760 pos.Column = s.lastLineLen
761 default:
762
763 pos.Line = 1
764 pos.Column = 1
765 }
766 return
767 }
768
769
770
771 func (s *Scanner) TokenText() string {
772 if s.tokPos < 0 {
773
774 return ""
775 }
776
777 if s.tokEnd < s.tokPos {
778
779 s.tokEnd = s.tokPos
780 }
781
782
783 if s.tokBuf.Len() == 0 {
784
785 return string(s.srcBuf[s.tokPos:s.tokEnd])
786 }
787
788
789
790 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
791 s.tokPos = s.tokEnd
792 return s.tokBuf.String()
793 }
794
View as plain text