Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8
9 package scanner
10
11 import (
12 "bytes"
13 "fmt"
14 "go/token"
15 "path/filepath"
16 "strconv"
17 "unicode"
18 "unicode/utf8"
19 )
20
21
22
23
24
25
// An ErrorHandler may be provided to Scanner.Init. When the scanner reports
// an error and a handler is installed, the handler is called with the
// position of the error and an error message.
type ErrorHandler func(pos token.Position, msg string)
27
28
29
30
31
// A Scanner holds the scanner's internal state while processing a given
// source text. Init sets up (or resets) all of its fields.
type Scanner struct {
	// state set by Init and not modified while scanning
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source text
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; eof once the source is exhausted
	offset     int  // byte offset of ch in src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // offset of the beginning of the current line
	insertSemi bool // insert a semicolon before the next newline

	// public state
	ErrorCount int // number of errors reported so far; reset to 0 by Init
}
50
const (
	// bom is the Unicode byte order mark. Init skips it when it is the very
	// first character; next reports it as an error anywhere else.
	bom = 0xFEFF
	// eof is the value next stores in s.ch once the source is exhausted.
	eof = -1
)
55
56
57
58
59
60
61 func (s *Scanner) next() {
62 if s.rdOffset < len(s.src) {
63 s.offset = s.rdOffset
64 if s.ch == '\n' {
65 s.lineOffset = s.offset
66 s.file.AddLine(s.offset)
67 }
68 r, w := rune(s.src[s.rdOffset]), 1
69 switch {
70 case r == 0:
71 s.error(s.offset, "illegal character NUL")
72 case r >= utf8.RuneSelf:
73
74 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
75 if r == utf8.RuneError && w == 1 {
76 s.error(s.offset, "illegal UTF-8 encoding")
77 } else if r == bom && s.offset > 0 {
78 s.error(s.offset, "illegal byte order mark")
79 }
80 }
81 s.rdOffset += w
82 s.ch = r
83 } else {
84 s.offset = len(s.src)
85 if s.ch == '\n' {
86 s.lineOffset = s.offset
87 s.file.AddLine(s.offset)
88 }
89 s.ch = eof
90 }
91 }
92
93
94
95 func (s *Scanner) peek() byte {
96 if s.rdOffset < len(s.src) {
97 return s.src[s.rdOffset]
98 }
99 return 0
100 }
101
102
103
104
// A Mode value is a set of flags (or 0) that controls scanner behavior.
// It is passed to Init and consulted by Scan.
type Mode uint

const (
	// ScanComments makes Scan return comments as token.COMMENT tokens;
	// without it, Scan skips comments.
	ScanComments Mode = 1 << iota
	// dontInsertSemis stops Scan from recording that a semicolon should be
	// inserted after the token it just scanned.
	dontInsertSemis
)
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
128
129 if file.Size() != len(src) {
130 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
131 }
132 s.file = file
133 s.dir, _ = filepath.Split(file.Name())
134 s.src = src
135 s.err = err
136 s.mode = mode
137
138 s.ch = ' '
139 s.offset = 0
140 s.rdOffset = 0
141 s.lineOffset = 0
142 s.insertSemi = false
143 s.ErrorCount = 0
144
145 s.next()
146 if s.ch == bom {
147 s.next()
148 }
149 }
150
151 func (s *Scanner) error(offs int, msg string) {
152 if s.err != nil {
153 s.err(s.file.Position(s.file.Pos(offs)), msg)
154 }
155 s.ErrorCount++
156 }
157
158 func (s *Scanner) errorf(offs int, format string, args ...any) {
159 s.error(offs, fmt.Sprintf(format, args...))
160 }
161
// scanComment scans a //-style or /*-style comment and returns its text
// with carriage returns removed (see stripCR). The initial '/' has already
// been consumed, so on entry s.ch is '/' or '*'.
//
// An unterminated /*-style comment is reported as an error. A valid comment
// whose text starts with "line " is additionally handed to updateLineInfo.
func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means invalid comment
	numCR := 0           // number of '\r' characters seen inside the comment

	if s.ch == '/' {
		// //-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	// /*-style comment
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			next = s.offset
			goto exit
		}
	}

	// EOF reached without a closing "*/"; next stays negative.
	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// A //-style comment may end in '\r' (from a "\r\n" line ending).
	// Remove that final '\r' before analyzing the text for a line
	// directive; any other '\r' is removed afterwards.
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// interpret line directives
	// (//-style directives must start at the beginning of the current line)
	if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit)
}
228
// prefix is the text that must immediately follow the comment opener
// ("//" or "/*") for scanComment to treat a comment as a line directive.
var prefix = []byte("line ")
230
231
232
233
234 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
235
236 if text[1] == '*' {
237 text = text[:len(text)-2]
238 }
239 text = text[7:]
240 offs += 7
241
242 i, n, ok := trailingDigits(text)
243 if i == 0 {
244 return
245 }
246
247
248 if !ok {
249
250 s.error(offs+i, "invalid line number: "+string(text[i:]))
251 return
252 }
253
254 var line, col int
255 i2, n2, ok2 := trailingDigits(text[:i-1])
256 if ok2 {
257
258 i, i2 = i2, i
259 line, col = n2, n
260 if col == 0 {
261 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
262 return
263 }
264 text = text[:i2-1]
265 } else {
266
267 line = n
268 }
269
270 if line == 0 {
271 s.error(offs+i, "invalid line number: "+string(text[i:]))
272 return
273 }
274
275
276
277 filename := string(text[:i-1])
278 if filename == "" && ok2 {
279 filename = s.file.Position(s.file.Pos(offs)).Filename
280 } else if filename != "" {
281
282
283
284 filename = filepath.Clean(filename)
285 if !filepath.IsAbs(filename) {
286 filename = filepath.Join(s.dir, filename)
287 }
288 }
289
290 s.file.AddLineColumnInfo(next, filename, line, col)
291 }
292
// trailingDigits looks for a suffix of the form ":digits" in text, searching
// for the ':' from the right (filenames may themselves contain ':').
//
// It returns the index just past the last ':' (or 0 if text contains no
// ':'), the value of the digits, and whether the suffix is a valid unsigned
// decimal number that fits in an int. When the final result is false the
// returned value is 0.
func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // look from right
	if i < 0 {
		return 0, 0, false // no ":"
	}
	// i >= 0

	// ParseUint with bitSize 0 accepts the full uint range; anything above
	// the largest int would wrap to a negative number on conversion, so it
	// is treated as invalid instead.
	const maxInt = uint64(^uint(0) >> 1)
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	if err != nil || n > maxInt {
		return i + 1, 0, false
	}
	return i + 1, int(n), true
}
302
// findLineEnd reports whether the comment that starts at the already
// consumed '/' (possibly followed by more comments on the same line) is
// followed by a newline or EOF before any non-comment token, or itself
// contains a newline. Scan calls it only while s.insertSemi is set, to
// decide whether a semicolon must be inserted before the comment.
//
// The scanner position is restored on return, so the lookahead does not
// consume any input.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			// a //-style comment runs to the end of the line
			return true
		}
		// /*-style comment: look for a newline inside it
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set, so this stops at a newline
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// The last '/' consumed did not start another comment.
	return false
}
346
347 func isLetter(ch rune) bool {
348 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
349 }
350
351 func isDigit(ch rune) bool {
352 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
353 }
354
355
356
357
358
359
// scanIdentifier reads the string of valid identifier characters starting
// at s.offset and returns it. It must only be called when s.ch is known to
// be a valid letter (Scan calls it only after isLetter(s.ch) succeeded).
//
// The body is hand-tuned: it walks the raw bytes for the common all-ASCII
// case and falls back to s.next only when a non-ASCII or NUL byte shows up.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] avoids repeated bounds checks. The
	// loop variable rdOffset is relative to s.rdOffset; s.rdOffset itself
	// is only updated once the identifier's end has been found.
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ASCII character.
			continue
		}
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: an ASCII character that is not a letter or
			// digit ends the identifier. Avoid the call into s.next and
			// set the state directly. s.next does line accounting when
			// s.ch is '\n', so this shortcut is only valid because the
			// preceding character is part of the identifier and thus
			// not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// A NUL or non-ASCII byte: the preceding character is valid for an
		// identifier, so calling s.next at s.rdOffset resets the scanner
		// state and lets it do the full decoding and error reporting.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier extends to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
404
405 func digitVal(ch rune) int {
406 switch {
407 case '0' <= ch && ch <= '9':
408 return int(ch - '0')
409 case 'a' <= lower(ch) && lower(ch) <= 'f':
410 return int(lower(ch) - 'a' + 10)
411 }
412 return 16
413 }
414
// lower returns ch with the ASCII lower-case bit (0x20) set. For an ASCII
// letter this yields its lower-case form; other characters may be changed
// as well, so the result is only meaningful in range comparisons against
// lower-case letters.
func lower(ch rune) rune {
	return ch | ('a' - 'A')
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	lc := lower(ch)
	return lc >= 'a' && lc <= 'f'
}
418
419
420
421
422
423
424
425 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
426 if base <= 10 {
427 max := rune('0' + base)
428 for isDecimal(s.ch) || s.ch == '_' {
429 ds := 1
430 if s.ch == '_' {
431 ds = 2
432 } else if s.ch >= max && *invalid < 0 {
433 *invalid = s.offset
434 }
435 digsep |= ds
436 s.next()
437 }
438 } else {
439 for isHex(s.ch) || s.ch == '_' {
440 ds := 1
441 if s.ch == '_' {
442 ds = 2
443 }
444 digsep |= ds
445 s.next()
446 }
447 }
448 return
449 }
450
// scanNumber scans a numeric literal starting at the current character and
// returns its token (token.INT, token.FLOAT, or token.IMAG) together with
// the literal text. Malformed literals are reported via s.error/s.errorf,
// but scanning continues and the text consumed so far is still returned.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // offset of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0 counts as a digit
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		// With base 10 no decimal digit is out of range, so digits never
		// dereferences the nil invalid pointer.
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	// An out-of-range digit is only an error for integers.
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	// Check separator placement only if a '_' was seen at all.
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
537
// litname returns the human-readable name, as used in error messages, of
// the kind of numeric literal identified by prefix: 'x' (hexadecimal),
// 'o' or '0' (octal), 'b' (binary), or anything else (decimal).
func litname(prefix rune) string {
	name := "decimal literal"
	switch prefix {
	case 'b':
		name = "binary literal"
	case '0', 'o':
		name = "octal literal"
	case 'x':
		name = "hexadecimal literal"
	}
	return name
}
549
550
551 func invalidSep(x string) int {
552 x1 := ' '
553 d := '.'
554 i := 0
555
556
557 if len(x) >= 2 && x[0] == '0' {
558 x1 = lower(rune(x[1]))
559 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
560 d = '0'
561 i = 2
562 }
563 }
564
565
566 for ; i < len(x); i++ {
567 p := d
568 d = rune(x[i])
569 switch {
570 case d == '_':
571 if p != '0' {
572 return i
573 }
574 case isDecimal(d) || x1 == 'x' && isHex(d):
575 d = '0'
576 default:
577 if p == '_' {
578 return i - 1
579 }
580 d = '.'
581 }
582 }
583 if d == '_' {
584 return len(x) - 1
585 }
586
587 return -1
588 }
589
590
591
592
593
594 func (s *Scanner) scanEscape(quote rune) bool {
595 offs := s.offset
596
597 var n int
598 var base, max uint32
599 switch s.ch {
600 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
601 s.next()
602 return true
603 case '0', '1', '2', '3', '4', '5', '6', '7':
604 n, base, max = 3, 8, 255
605 case 'x':
606 s.next()
607 n, base, max = 2, 16, 255
608 case 'u':
609 s.next()
610 n, base, max = 4, 16, unicode.MaxRune
611 case 'U':
612 s.next()
613 n, base, max = 8, 16, unicode.MaxRune
614 default:
615 msg := "unknown escape sequence"
616 if s.ch < 0 {
617 msg = "escape sequence not terminated"
618 }
619 s.error(offs, msg)
620 return false
621 }
622
623 var x uint32
624 for n > 0 {
625 d := uint32(digitVal(s.ch))
626 if d >= base {
627 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
628 if s.ch < 0 {
629 msg = "escape sequence not terminated"
630 }
631 s.error(s.offset, msg)
632 return false
633 }
634 x = x*base + d
635 s.next()
636 n--
637 }
638
639 if x > max || 0xD800 <= x && x < 0xE000 {
640 s.error(offs, "escape sequence is invalid Unicode code point")
641 return false
642 }
643
644 return true
645 }
646
647 func (s *Scanner) scanRune() string {
648
649 offs := s.offset - 1
650
651 valid := true
652 n := 0
653 for {
654 ch := s.ch
655 if ch == '\n' || ch < 0 {
656
657 if valid {
658 s.error(offs, "rune literal not terminated")
659 valid = false
660 }
661 break
662 }
663 s.next()
664 if ch == '\'' {
665 break
666 }
667 n++
668 if ch == '\\' {
669 if !s.scanEscape('\'') {
670 valid = false
671 }
672
673 }
674 }
675
676 if valid && n != 1 {
677 s.error(offs, "illegal rune literal")
678 }
679
680 return string(s.src[offs:s.offset])
681 }
682
683 func (s *Scanner) scanString() string {
684
685 offs := s.offset - 1
686
687 for {
688 ch := s.ch
689 if ch == '\n' || ch < 0 {
690 s.error(offs, "string literal not terminated")
691 break
692 }
693 s.next()
694 if ch == '"' {
695 break
696 }
697 if ch == '\\' {
698 s.scanEscape('"')
699 }
700 }
701
702 return string(s.src[offs:s.offset])
703 }
704
// stripCR returns a copy of b with carriage returns removed. If comment is
// set, b is the text of a /*-style comment and a '\r' is kept when removing
// it would join a '*' and a '/' into a premature "*/" terminator. The
// exception is a '*' that belongs to the opening "/*": there the '\r' is
// removed, since "/*/" does not close the comment.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for j, ch := range b {
		if ch == '\r' {
			wouldClose := comment &&
				len(out) > len("/*") &&
				out[len(out)-1] == '*' &&
				j+1 < len(b) && b[j+1] == '/'
			if !wouldClose {
				continue
			}
		}
		out = append(out, ch)
	}
	return out
}
721
722 func (s *Scanner) scanRawString() string {
723
724 offs := s.offset - 1
725
726 hasCR := false
727 for {
728 ch := s.ch
729 if ch < 0 {
730 s.error(offs, "raw string literal not terminated")
731 break
732 }
733 s.next()
734 if ch == '`' {
735 break
736 }
737 if ch == '\r' {
738 hasCR = true
739 }
740 }
741
742 lit := s.src[offs:s.offset]
743 if hasCR {
744 lit = stripCR(lit, false)
745 }
746
747 return string(lit)
748 }
749
750 func (s *Scanner) skipWhitespace() {
751 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
752 s.next()
753 }
754 }
755
756
757
758
759
760
761
762 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
763 if s.ch == '=' {
764 s.next()
765 return tok1
766 }
767 return tok0
768 }
769
770 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
771 if s.ch == '=' {
772 s.next()
773 return tok1
774 }
775 if s.ch == ch2 {
776 s.next()
777 return tok2
778 }
779 return tok0
780 }
781
782 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
783 if s.ch == '=' {
784 s.next()
785 return tok1
786 }
787 if s.ch == ch2 {
788 s.next()
789 if s.ch == '=' {
790 s.next()
791 return tok3
792 }
793 return tok2
794 }
795 return tok0
796 }
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is an identifier, a keyword, a number, a rune,
// a string, or token.COMMENT, the literal string is the corresponding
// source text (with carriage returns removed from comments and raw
// strings).
//
// If the returned token is token.SEMICOLON, the literal string is ";" if
// the semicolon was present in the source, and "\n" if the semicolon was
// inserted because of a newline or at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// Scan returns a token even if an error was encountered; errors are
// reported through the handler passed to Init and counted in ErrorCount.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by the number case above
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// reset position to the beginning of the comment
					// so that it is scanned again by the next call
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// Remember whether a newline after this token requires a semicolon.
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
990
View as plain text