scanner.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package scanner implements a scanner for Go source text.
     6  // It takes a []byte as source which can then be tokenized
     7  // through repeated calls to the Scan method.
     8  //
     9  package scanner
    10  
    11  import (
    12  	"bytes"
    13  	"fmt"
    14  	"go/token"
    15  	"path/filepath"
    16  	"strconv"
    17  	"unicode"
    18  	"unicode/utf8"
    19  )
    20  
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// The handler is invoked synchronously by the scanner each time an error
// is reported; the scanner's ErrorCount is incremented whether or not a
// handler is installed.
type ErrorHandler func(pos token.Position, msg string)
    27  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (set by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; eof (< 0) at the end of the source
	offset     int  // character offset (byte index of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (offset of the first character of the line)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    50  
// Special character values recognized by the scanner.
const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; stored in Scanner.ch once the source is exhausted
)
    55  
    56  // Read the next Unicode char into s.ch.
    57  // s.ch < 0 means end-of-file.
    58  //
    59  // For optimization, there is some overlap between this method and
    60  // s.scanIdentifier.
    61  func (s *Scanner) next() {
    62  	if s.rdOffset < len(s.src) {
    63  		s.offset = s.rdOffset
    64  		if s.ch == '\n' {
    65  			s.lineOffset = s.offset
    66  			s.file.AddLine(s.offset)
    67  		}
    68  		r, w := rune(s.src[s.rdOffset]), 1
    69  		switch {
    70  		case r == 0:
    71  			s.error(s.offset, "illegal character NUL")
    72  		case r >= utf8.RuneSelf:
    73  			// not ASCII
    74  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    75  			if r == utf8.RuneError && w == 1 {
    76  				s.error(s.offset, "illegal UTF-8 encoding")
    77  			} else if r == bom && s.offset > 0 {
    78  				s.error(s.offset, "illegal byte order mark")
    79  			}
    80  		}
    81  		s.rdOffset += w
    82  		s.ch = r
    83  	} else {
    84  		s.offset = len(s.src)
    85  		if s.ch == '\n' {
    86  			s.lineOffset = s.offset
    87  			s.file.AddLine(s.offset)
    88  		}
    89  		s.ch = eof
    90  	}
    91  }
    92  
    93  // peek returns the byte following the most recently read character without
    94  // advancing the scanner. If the scanner is at EOF, peek returns 0.
    95  func (s *Scanner) peek() byte {
    96  	if s.rdOffset < len(s.src) {
    97  		return s.src[s.rdOffset]
    98  	}
    99  	return 0
   100  }
   101  
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint

// Flags that may be combined (with |) into a Mode.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
   111  
   112  // Init prepares the scanner s to tokenize the text src by setting the
   113  // scanner at the beginning of src. The scanner uses the file set file
   114  // for position information and it adds line information for each line.
   115  // It is ok to re-use the same file when re-scanning the same file as
   116  // line information which is already present is ignored. Init causes a
   117  // panic if the file size does not match the src size.
   118  //
   119  // Calls to Scan will invoke the error handler err if they encounter a
   120  // syntax error and err is not nil. Also, for each error encountered,
   121  // the Scanner field ErrorCount is incremented by one. The mode parameter
   122  // determines how comments are handled.
   123  //
   124  // Note that Init may call err if there is an error in the first character
   125  // of the file.
   126  //
   127  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   128  	// Explicitly initialize all fields since a scanner may be reused.
   129  	if file.Size() != len(src) {
   130  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   131  	}
   132  	s.file = file
   133  	s.dir, _ = filepath.Split(file.Name())
   134  	s.src = src
   135  	s.err = err
   136  	s.mode = mode
   137  
   138  	s.ch = ' '
   139  	s.offset = 0
   140  	s.rdOffset = 0
   141  	s.lineOffset = 0
   142  	s.insertSemi = false
   143  	s.ErrorCount = 0
   144  
   145  	s.next()
   146  	if s.ch == bom {
   147  		s.next() // ignore BOM at file beginning
   148  	}
   149  }
   150  
   151  func (s *Scanner) error(offs int, msg string) {
   152  	if s.err != nil {
   153  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   154  	}
   155  	s.ErrorCount++
   156  }
   157  
   158  func (s *Scanner) errorf(offs int, format string, args ...any) {
   159  	s.error(offs, fmt.Sprintf(format, args...))
   160  }
   161  
// scanComment scans a comment and returns its text, including the leading
// "//" or "/*" (and the trailing "*/" if present). On entry the initial
// '/' has already been consumed and s.ch is '/' (line comment) or '*'
// (general comment).
//
// The newline terminating a //-style comment is neither consumed nor part
// of the returned text. A /*-style comment that is not closed before EOF
// is reported as an error and the text read so far is returned. Carriage
// returns are removed from the result, except those retained by stripCR.
// A valid comment whose text starts with "line " after the comment marker
// is handed to updateLineInfo for interpretation as a line directive.
func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means invalid comment
	numCR := 0           // number of '\r' characters seen inside the comment

	if s.ch == '/' {
		// //-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	// /*-style comment
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			next = s.offset
			goto exit
		}
	}

	// EOF was reached without seeing the closing "*/"; next remains < 0.
	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a (//-comment) line may end in "\r\n".
	// Remove the final '\r' before analyzing the text for
	// line directives (matching the compiler). Remove any
	// other '\r' afterwards (matching the pre-existing be-
	// havior of the scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// interpret line directives
	// (//line directives must start at the beginning of the current line)
	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	// stripCR copies the text, so s.src itself is never modified.
	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit)
}
   228  
// prefix is the text that must immediately follow the "//" or "/*" of a
// comment for scanComment to treat the comment as a line directive.
var prefix = []byte("line ")
   230  
// updateLineInfo parses the incoming comment text at offset offs
// as a line directive. If successful, it updates the line info table
// for the position next per the line directive.
//
// text is the complete comment including its "//line " or "/*line "
// prefix. The accepted forms are "filename:line" and "filename:line:col".
// Text without a ':' is silently ignored; a line or column that is not a
// number, or that is 0, is reported as an error and the directive is
// dropped.
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7       // keep offs in sync so that offs+i is the source offset of text[i]

	// i is the index just past the last ':' in text, n the number after it.
	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // ignore (not a line directive)
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	var line, col int
	// Look for a second trailing number in front of the one just found.
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		//line filename:line:col
		// After the swap, i indexes the line number and i2 the column number.
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		//line filename:line
		line = n
	}

	if line == 0 {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// If we have a column (//line filename:line:col form),
	// an empty filename means to use the previous filename.
	filename := string(text[:i-1]) // lop off ":line", and trim white space
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the current directory.
		// This is for compatibility with earlier releases.
		// See issue 26671.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}
   292  
   293  func trailingDigits(text []byte) (int, int, bool) {
   294  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
   295  	if i < 0 {
   296  		return 0, 0, false // no ":"
   297  	}
   298  	// i >= 0
   299  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
   300  	return i + 1, int(n), err == nil
   301  }
   302  
// findLineEnd looks ahead from the start of a comment and reports whether
// a newline or EOF is reached before any non-comment token. It returns
// true for a //-style comment, for a /*-style comment that contains a
// newline, and when only whitespace and further comments separate the
// comment from a newline or EOF.
//
// On entry the initial '/' has already been consumed. Before returning,
// s.ch, s.offset and s.rdOffset are restored to their values at entry.
//
// NOTE(review): the lookahead is done with s.next, so errors that next
// reports for characters inside the comment (and line table updates) also
// happen during the lookahead, in addition to when the comment is scanned
// again afterwards.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			// //-style comment always contains a newline
			return true
		}
		// /*-style comment: look for newline
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// the '/' was not the start of another comment
	return false
}
   346  
   347  func isLetter(ch rune) bool {
   348  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   349  }
   350  
   351  func isDigit(ch rune) bool {
   352  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   353  }
   354  
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// It returns the identifier text and leaves s.ch at the first character
// after the identifier (eof if the identifier extends to the end of the
// source).
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		// b is the first byte that is not an ASCII identifier character;
		// move the read position up to it.
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The loop ran off the end of the source: the identifier extends to EOF.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
   404  
   405  func digitVal(ch rune) int {
   406  	switch {
   407  	case '0' <= ch && ch <= '9':
   408  		return int(ch - '0')
   409  	case 'a' <= lower(ch) && lower(ch) <= 'f':
   410  		return int(lower(ch) - 'a' + 10)
   411  	}
   412  	return 16 // larger than any legal digit val
   413  }
   414  
   415  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   416  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   417  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   418  
   419  // digits accepts the sequence { digit | '_' }.
   420  // If base <= 10, digits accepts any decimal digit but records
   421  // the offset (relative to the source start) of a digit >= base
   422  // in *invalid, if *invalid < 0.
   423  // digits returns a bitset describing whether the sequence contained
   424  // digits (bit 0 is set), or separators '_' (bit 1 is set).
   425  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   426  	if base <= 10 {
   427  		max := rune('0' + base)
   428  		for isDecimal(s.ch) || s.ch == '_' {
   429  			ds := 1
   430  			if s.ch == '_' {
   431  				ds = 2
   432  			} else if s.ch >= max && *invalid < 0 {
   433  				*invalid = s.offset // record invalid rune offset
   434  			}
   435  			digsep |= ds
   436  			s.next()
   437  		}
   438  	} else {
   439  		for isHex(s.ch) || s.ch == '_' {
   440  			ds := 1
   441  			if s.ch == '_' {
   442  				ds = 2
   443  			}
   444  			digsep |= ds
   445  			s.next()
   446  		}
   447  	}
   448  	return
   449  }
   450  
// scanNumber scans a numeric literal starting at the current character
// and returns its token kind (token.INT, token.FLOAT, or token.IMAG)
// together with the literal text.
//
// Malformed literals (missing digits, digits invalid for the base, a
// misplaced radix point or exponent, misplaced '_' separators) are
// reported via s.error but scanning continues, so a token and its text
// are returned in every case.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	// neither the integer part nor the fractional part had a digit
	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		// With base 10 no decimal digit is out of range, so digits never
		// dereferences the nil invalid pointer here.
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	// An out-of-range digit is only an error if the literal remained an
	// integer; invalid is a source offset, hence the adjustment by offs.
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	// Check separator placement only if a '_' was seen at all.
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
   537  
   538  func litname(prefix rune) string {
   539  	switch prefix {
   540  	case 'x':
   541  		return "hexadecimal literal"
   542  	case 'o', '0':
   543  		return "octal literal"
   544  	case 'b':
   545  		return "binary literal"
   546  	}
   547  	return "decimal literal"
   548  }
   549  
// invalidSep returns the index of the first invalid separator in x, or -1.
//
// x is the text of a numeric literal. A separator '_' is valid only if it
// is directly preceded by a digit (or by a base prefix such as "0x") and
// directly followed by a digit.
func invalidSep(x string) int {
	x1 := ' ' // prefix char, we only care if it's 'x'
	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
	i := 0

	// a prefix counts as a digit
	if len(x) >= 2 && x[0] == '0' {
		x1 = lower(rune(x[1]))
		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
			d = '0'
			i = 2
		}
	}

	// mantissa and exponent
	for ; i < len(x); i++ {
		p := d // previous digit
		d = rune(x[i])
		switch {
		case d == '_':
			// a separator must follow a digit
			if p != '0' {
				return i
			}
		case isDecimal(d) || x1 == 'x' && isHex(d):
			d = '0'
		default:
			// not a digit: a separator directly before it is misplaced
			if p == '_' {
				return i - 1
			}
			d = '.'
		}
	}
	// a separator must not end the literal
	if d == '_' {
		return len(x) - 1
	}

	return -1
}
   589  
   590  // scanEscape parses an escape sequence where rune is the accepted
   591  // escaped quote. In case of a syntax error, it stops at the offending
   592  // character (without consuming it) and returns false. Otherwise
   593  // it returns true.
   594  func (s *Scanner) scanEscape(quote rune) bool {
   595  	offs := s.offset
   596  
   597  	var n int
   598  	var base, max uint32
   599  	switch s.ch {
   600  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   601  		s.next()
   602  		return true
   603  	case '0', '1', '2', '3', '4', '5', '6', '7':
   604  		n, base, max = 3, 8, 255
   605  	case 'x':
   606  		s.next()
   607  		n, base, max = 2, 16, 255
   608  	case 'u':
   609  		s.next()
   610  		n, base, max = 4, 16, unicode.MaxRune
   611  	case 'U':
   612  		s.next()
   613  		n, base, max = 8, 16, unicode.MaxRune
   614  	default:
   615  		msg := "unknown escape sequence"
   616  		if s.ch < 0 {
   617  			msg = "escape sequence not terminated"
   618  		}
   619  		s.error(offs, msg)
   620  		return false
   621  	}
   622  
   623  	var x uint32
   624  	for n > 0 {
   625  		d := uint32(digitVal(s.ch))
   626  		if d >= base {
   627  			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
   628  			if s.ch < 0 {
   629  				msg = "escape sequence not terminated"
   630  			}
   631  			s.error(s.offset, msg)
   632  			return false
   633  		}
   634  		x = x*base + d
   635  		s.next()
   636  		n--
   637  	}
   638  
   639  	if x > max || 0xD800 <= x && x < 0xE000 {
   640  		s.error(offs, "escape sequence is invalid Unicode code point")
   641  		return false
   642  	}
   643  
   644  	return true
   645  }
   646  
   647  func (s *Scanner) scanRune() string {
   648  	// '\'' opening already consumed
   649  	offs := s.offset - 1
   650  
   651  	valid := true
   652  	n := 0
   653  	for {
   654  		ch := s.ch
   655  		if ch == '\n' || ch < 0 {
   656  			// only report error if we don't have one already
   657  			if valid {
   658  				s.error(offs, "rune literal not terminated")
   659  				valid = false
   660  			}
   661  			break
   662  		}
   663  		s.next()
   664  		if ch == '\'' {
   665  			break
   666  		}
   667  		n++
   668  		if ch == '\\' {
   669  			if !s.scanEscape('\'') {
   670  				valid = false
   671  			}
   672  			// continue to read to closing quote
   673  		}
   674  	}
   675  
   676  	if valid && n != 1 {
   677  		s.error(offs, "illegal rune literal")
   678  	}
   679  
   680  	return string(s.src[offs:s.offset])
   681  }
   682  
   683  func (s *Scanner) scanString() string {
   684  	// '"' opening already consumed
   685  	offs := s.offset - 1
   686  
   687  	for {
   688  		ch := s.ch
   689  		if ch == '\n' || ch < 0 {
   690  			s.error(offs, "string literal not terminated")
   691  			break
   692  		}
   693  		s.next()
   694  		if ch == '"' {
   695  			break
   696  		}
   697  		if ch == '\\' {
   698  			s.scanEscape('"')
   699  		}
   700  	}
   701  
   702  	return string(s.src[offs:s.offset])
   703  }
   704  
   705  func stripCR(b []byte, comment bool) []byte {
   706  	c := make([]byte, len(b))
   707  	i := 0
   708  	for j, ch := range b {
   709  		// In a /*-style comment, don't strip \r from *\r/ (incl.
   710  		// sequences of \r from *\r\r...\r/) since the resulting
   711  		// */ would terminate the comment too early unless the \r
   712  		// is immediately following the opening /* in which case
   713  		// it's ok because /*/ is not closed yet (issue #11151).
   714  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   715  			c[i] = ch
   716  			i++
   717  		}
   718  	}
   719  	return c[:i]
   720  }
   721  
   722  func (s *Scanner) scanRawString() string {
   723  	// '`' opening already consumed
   724  	offs := s.offset - 1
   725  
   726  	hasCR := false
   727  	for {
   728  		ch := s.ch
   729  		if ch < 0 {
   730  			s.error(offs, "raw string literal not terminated")
   731  			break
   732  		}
   733  		s.next()
   734  		if ch == '`' {
   735  			break
   736  		}
   737  		if ch == '\r' {
   738  			hasCR = true
   739  		}
   740  	}
   741  
   742  	lit := s.src[offs:s.offset]
   743  	if hasCR {
   744  		lit = stripCR(lit, false)
   745  	}
   746  
   747  	return string(lit)
   748  }
   749  
   750  func (s *Scanner) skipWhitespace() {
   751  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   752  		s.next()
   753  	}
   754  }
   755  
   756  // Helper functions for scanning multi-byte tokens such as >> += >>= .
   757  // Different routines recognize different length tok_i based on matches
   758  // of ch_i. If a token ends in '=', the result is tok1 or tok3
   759  // respectively. Otherwise, the result is tok0 if there was no other
   760  // matching character, or tok2 if the matching character was ch2.
   761  
   762  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   763  	if s.ch == '=' {
   764  		s.next()
   765  		return tok1
   766  	}
   767  	return tok0
   768  }
   769  
   770  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   771  	if s.ch == '=' {
   772  		s.next()
   773  		return tok1
   774  	}
   775  	if s.ch == ch2 {
   776  		s.next()
   777  		return tok2
   778  	}
   779  	return tok0
   780  }
   781  
   782  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   783  	if s.ch == '=' {
   784  		s.next()
   785  		return tok1
   786  	}
   787  	if s.ch == ch2 {
   788  		s.next()
   789  		if s.ch == '=' {
   790  			s.next()
   791  			return tok3
   792  		}
   793  		return tok2
   794  	}
   795  	return tok0
   796  }
   797  
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	// (insertSemi records whether a newline directly after this token
	// must be turned into a semicolon; it is stored in s.insertSemi at
	// the end)
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1: // eof
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// The comment runs up to a line end, so the pending
					// semicolon is returned first; the comment itself is
					// scanned by the next call.
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// In dontInsertSemis mode s.insertSemi is never set, so no semicolons
	// are inserted.
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
   990
View as plain text