Source file src/strconv/quote.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"unicode/utf8"
    11  )
    12  
const (
	// lowerhex lists the sixteen lowercase hexadecimal digits in order, so
	// indexing it with a 4-bit value yields that value's hex digit.
	lowerhex = "0123456789abcdef"
	// upperhex lists the sixteen uppercase hexadecimal digits in order.
	upperhex = "0123456789ABCDEF"
)
    17  
    18  // contains reports whether the string contains the byte c.
    19  func contains(s string, c byte) bool {
    20  	return index(s, c) != -1
    21  }
    22  
    23  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    28  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    29  }
    30  
    31  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    32  	// Often called with big strings, so preallocate. If there's quoting,
    33  	// this is conservative but still helps a lot.
    34  	if cap(buf)-len(buf) < len(s) {
    35  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    36  		copy(nBuf, buf)
    37  		buf = nBuf
    38  	}
    39  	buf = append(buf, quote)
    40  	for width := 0; len(s) > 0; s = s[width:] {
    41  		r := rune(s[0])
    42  		width = 1
    43  		if r >= utf8.RuneSelf {
    44  			r, width = utf8.DecodeRuneInString(s)
    45  		}
    46  		if width == 1 && r == utf8.RuneError {
    47  			buf = append(buf, `\x`...)
    48  			buf = append(buf, lowerhex[s[0]>>4])
    49  			buf = append(buf, lowerhex[s[0]&0xF])
    50  			continue
    51  		}
    52  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    53  	}
    54  	buf = append(buf, quote)
    55  	return buf
    56  }
    57  
    58  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    59  	buf = append(buf, quote)
    60  	if !utf8.ValidRune(r) {
    61  		r = utf8.RuneError
    62  	}
    63  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    64  	buf = append(buf, quote)
    65  	return buf
    66  }
    67  
    68  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    69  	var runeTmp [utf8.UTFMax]byte
    70  	if r == rune(quote) || r == '\\' { // always backslashed
    71  		buf = append(buf, '\\')
    72  		buf = append(buf, byte(r))
    73  		return buf
    74  	}
    75  	if ASCIIonly {
    76  		if r < utf8.RuneSelf && IsPrint(r) {
    77  			buf = append(buf, byte(r))
    78  			return buf
    79  		}
    80  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    81  		n := utf8.EncodeRune(runeTmp[:], r)
    82  		buf = append(buf, runeTmp[:n]...)
    83  		return buf
    84  	}
    85  	switch r {
    86  	case '\a':
    87  		buf = append(buf, `\a`...)
    88  	case '\b':
    89  		buf = append(buf, `\b`...)
    90  	case '\f':
    91  		buf = append(buf, `\f`...)
    92  	case '\n':
    93  		buf = append(buf, `\n`...)
    94  	case '\r':
    95  		buf = append(buf, `\r`...)
    96  	case '\t':
    97  		buf = append(buf, `\t`...)
    98  	case '\v':
    99  		buf = append(buf, `\v`...)
   100  	default:
   101  		switch {
   102  		case r < ' ':
   103  			buf = append(buf, `\x`...)
   104  			buf = append(buf, lowerhex[byte(r)>>4])
   105  			buf = append(buf, lowerhex[byte(r)&0xF])
   106  		case !utf8.ValidRune(r):
   107  			r = 0xFFFD
   108  			fallthrough
   109  		case r < 0x10000:
   110  			buf = append(buf, `\u`...)
   111  			for s := 12; s >= 0; s -= 4 {
   112  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   113  			}
   114  		default:
   115  			buf = append(buf, `\U`...)
   116  			for s := 28; s >= 0; s -= 4 {
   117  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   118  			}
   119  		}
   120  	}
   121  	return buf
   122  }
   123  
   124  // Quote returns a double-quoted Go string literal representing s. The
   125  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   126  // control characters and non-printable characters as defined by
   127  // IsPrint.
   128  func Quote(s string) string {
   129  	return quoteWith(s, '"', false, false)
   130  }
   131  
   132  // AppendQuote appends a double-quoted Go string literal representing s,
   133  // as generated by Quote, to dst and returns the extended buffer.
   134  func AppendQuote(dst []byte, s string) []byte {
   135  	return appendQuotedWith(dst, s, '"', false, false)
   136  }
   137  
   138  // QuoteToASCII returns a double-quoted Go string literal representing s.
   139  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   140  // non-ASCII characters and non-printable characters as defined by IsPrint.
   141  func QuoteToASCII(s string) string {
   142  	return quoteWith(s, '"', true, false)
   143  }
   144  
   145  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   146  // as generated by QuoteToASCII, to dst and returns the extended buffer.
   147  func AppendQuoteToASCII(dst []byte, s string) []byte {
   148  	return appendQuotedWith(dst, s, '"', true, false)
   149  }
   150  
   151  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   152  // The returned string leaves Unicode graphic characters, as defined by
   153  // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
   154  // for non-graphic characters.
   155  func QuoteToGraphic(s string) string {
   156  	return quoteWith(s, '"', false, true)
   157  }
   158  
   159  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   160  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
   161  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   162  	return appendQuotedWith(dst, s, '"', false, true)
   163  }
   164  
   165  // QuoteRune returns a single-quoted Go character literal representing the
   166  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   167  // for control characters and non-printable characters as defined by IsPrint.
   168  func QuoteRune(r rune) string {
   169  	return quoteRuneWith(r, '\'', false, false)
   170  }
   171  
   172  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   173  // as generated by QuoteRune, to dst and returns the extended buffer.
   174  func AppendQuoteRune(dst []byte, r rune) []byte {
   175  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   176  }
   177  
   178  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   179  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   180  // \u0100) for non-ASCII characters and non-printable characters as defined
   181  // by IsPrint.
   182  func QuoteRuneToASCII(r rune) string {
   183  	return quoteRuneWith(r, '\'', true, false)
   184  }
   185  
   186  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   187  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   188  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   189  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   190  }
   191  
   192  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   193  // the rune. If the rune is not a Unicode graphic character,
   194  // as defined by IsGraphic, the returned string will use a Go escape sequence
   195  // (\t, \n, \xFF, \u0100).
   196  func QuoteRuneToGraphic(r rune) string {
   197  	return quoteRuneWith(r, '\'', false, true)
   198  }
   199  
   200  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   201  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   202  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   203  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   204  }
   205  
   206  // CanBackquote reports whether the string s can be represented
   207  // unchanged as a single-line backquoted string without control
   208  // characters other than tab.
   209  func CanBackquote(s string) bool {
   210  	for len(s) > 0 {
   211  		r, wid := utf8.DecodeRuneInString(s)
   212  		s = s[wid:]
   213  		if wid > 1 {
   214  			if r == '\ufeff' {
   215  				return false // BOMs are invisible and should not be quoted.
   216  			}
   217  			continue // All other multibyte runes are correctly encoded and assumed printable.
   218  		}
   219  		if r == utf8.RuneError {
   220  			return false
   221  		}
   222  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   223  			return false
   224  		}
   225  	}
   226  	return true
   227  }
   228  
   229  func unhex(b byte) (v rune, ok bool) {
   230  	c := rune(b)
   231  	switch {
   232  	case '0' <= c && c <= '9':
   233  		return c - '0', true
   234  	case 'a' <= c && c <= 'f':
   235  		return c - 'a' + 10, true
   236  	case 'A' <= c && c <= 'F':
   237  		return c - 'A' + 10, true
   238  	}
   239  	return
   240  }
   241  
   242  // UnquoteChar decodes the first character or byte in the escaped string
   243  // or character literal represented by the string s.
   244  // It returns four values:
   245  //
   246  //	1) value, the decoded Unicode code point or byte value;
   247  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   248  //	3) tail, the remainder of the string after the character; and
   249  //	4) an error that will be nil if the character is syntactically valid.
   250  //
   251  // The second argument, quote, specifies the type of literal being parsed
   252  // and therefore which escaped quote character is permitted.
   253  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   254  // If set to a double quote, it permits \" and disallows unescaped ".
   255  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   256  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   257  	// easy cases
   258  	if len(s) == 0 {
   259  		err = ErrSyntax
   260  		return
   261  	}
   262  	switch c := s[0]; {
   263  	case c == quote && (quote == '\'' || quote == '"'):
   264  		err = ErrSyntax
   265  		return
   266  	case c >= utf8.RuneSelf:
   267  		r, size := utf8.DecodeRuneInString(s)
   268  		return r, true, s[size:], nil
   269  	case c != '\\':
   270  		return rune(s[0]), false, s[1:], nil
   271  	}
   272  
   273  	// hard case: c is backslash
   274  	if len(s) <= 1 {
   275  		err = ErrSyntax
   276  		return
   277  	}
   278  	c := s[1]
   279  	s = s[2:]
   280  
   281  	switch c {
   282  	case 'a':
   283  		value = '\a'
   284  	case 'b':
   285  		value = '\b'
   286  	case 'f':
   287  		value = '\f'
   288  	case 'n':
   289  		value = '\n'
   290  	case 'r':
   291  		value = '\r'
   292  	case 't':
   293  		value = '\t'
   294  	case 'v':
   295  		value = '\v'
   296  	case 'x', 'u', 'U':
   297  		n := 0
   298  		switch c {
   299  		case 'x':
   300  			n = 2
   301  		case 'u':
   302  			n = 4
   303  		case 'U':
   304  			n = 8
   305  		}
   306  		var v rune
   307  		if len(s) < n {
   308  			err = ErrSyntax
   309  			return
   310  		}
   311  		for j := 0; j < n; j++ {
   312  			x, ok := unhex(s[j])
   313  			if !ok {
   314  				err = ErrSyntax
   315  				return
   316  			}
   317  			v = v<<4 | x
   318  		}
   319  		s = s[n:]
   320  		if c == 'x' {
   321  			// single-byte string, possibly not UTF-8
   322  			value = v
   323  			break
   324  		}
   325  		if !utf8.ValidRune(v) {
   326  			err = ErrSyntax
   327  			return
   328  		}
   329  		value = v
   330  		multibyte = true
   331  	case '0', '1', '2', '3', '4', '5', '6', '7':
   332  		v := rune(c) - '0'
   333  		if len(s) < 2 {
   334  			err = ErrSyntax
   335  			return
   336  		}
   337  		for j := 0; j < 2; j++ { // one digit already; two more
   338  			x := rune(s[j]) - '0'
   339  			if x < 0 || x > 7 {
   340  				err = ErrSyntax
   341  				return
   342  			}
   343  			v = (v << 3) | x
   344  		}
   345  		s = s[2:]
   346  		if v > 255 {
   347  			err = ErrSyntax
   348  			return
   349  		}
   350  		value = v
   351  	case '\\':
   352  		value = '\\'
   353  	case '\'', '"':
   354  		if c != quote {
   355  			err = ErrSyntax
   356  			return
   357  		}
   358  		value = rune(c)
   359  	default:
   360  		err = ErrSyntax
   361  		return
   362  	}
   363  	tail = s
   364  	return
   365  }
   366  
   367  // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
   368  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
   369  func QuotedPrefix(s string) (string, error) {
   370  	out, _, err := unquote(s, false)
   371  	return out, err
   372  }
   373  
   374  // Unquote interprets s as a single-quoted, double-quoted,
   375  // or backquoted Go string literal, returning the string value
   376  // that s quotes.  (If s is single-quoted, it would be a Go
   377  // character literal; Unquote returns the corresponding
   378  // one-character string.)
   379  func Unquote(s string) (string, error) {
   380  	out, rem, err := unquote(s, true)
   381  	if len(rem) > 0 {
   382  		return "", ErrSyntax
   383  	}
   384  	return out, err
   385  }
   386  
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
//
// The first byte of in selects the literal form: a backquote starts a raw
// string, while a double or single quote starts an interpreted string or
// character literal. Any other first byte, or an input shorter than two
// bytes, is reported as ErrSyntax. On failure out is empty and rem holds
// the input exactly as it was passed in.
func unquote(in string, unescape bool) (out, rem string, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	if len(in) < 2 {
		return "", in, ErrSyntax
	}
	quote := in[0]
	// Look for the next occurrence of the opening quote byte after position 0.
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings have no escapes, so end is exact here.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			// The capacity assumes at least one '\r' will be dropped.
			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = string(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mention that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		// With no backslash before the first closing quote, end is exact and
		// the content can be returned as a substring of the input.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// A character literal must hold exactly one rune, and that
				// rune must not be the result of a decoding failure.
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
			// Otherwise fall through to the general path below, which
			// re-parses the literal one character at a time.
		}

		// Handle quoted strings with escape sequences.
		var buf []byte
		in0 := in   // keep the full input for error returns and verbatim output
		in = in[1:] // skip starting quote
		if unescape {
			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			if unescape {
				// Single bytes (ASCII or \x and octal escapes) are appended
				// raw; everything else is re-encoded as UTF-8.
				if r < utf8.RuneSelf || !multibyte {
					buf = append(buf, byte(r))
				} else {
					var arr [utf8.UTFMax]byte
					n := utf8.EncodeRune(arr[:], r)
					buf = append(buf, arr[:n]...)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return string(buf), in, nil
		}
		// Verbatim mode: return everything consumed so far, quotes included.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}
   494  
   495  // bsearch16 returns the smallest i such that a[i] >= x.
   496  // If there is no such i, bsearch16 returns len(a).
   497  func bsearch16(a []uint16, x uint16) int {
   498  	i, j := 0, len(a)
   499  	for i < j {
   500  		h := i + (j-i)>>1
   501  		if a[h] < x {
   502  			i = h + 1
   503  		} else {
   504  			j = h
   505  		}
   506  	}
   507  	return i
   508  }
   509  
   510  // bsearch32 returns the smallest i such that a[i] >= x.
   511  // If there is no such i, bsearch32 returns len(a).
   512  func bsearch32(a []uint32, x uint32) int {
   513  	i, j := 0, len(a)
   514  	for i < j {
   515  		h := i + (j-i)>>1
   516  		if a[h] < x {
   517  			i = h + 1
   518  		} else {
   519  			j = h
   520  		}
   521  	}
   522  	return i
   523  }
   524  
   525  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   526  // to give the same answer. It allows this package not to depend on unicode,
   527  // and therefore not pull in all the Unicode tables. If the linker were better
   528  // at tossing unused tables, we could get rid of this implementation.
   529  // That would be nice.
   530  
   531  // IsPrint reports whether the rune is defined as printable by Go, with
   532  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   533  // symbols and ASCII space.
   534  func IsPrint(r rune) bool {
   535  	// Fast check for Latin-1
   536  	if r <= 0xFF {
   537  		if 0x20 <= r && r <= 0x7E {
   538  			// All the ASCII is printable from space through DEL-1.
   539  			return true
   540  		}
   541  		if 0xA1 <= r && r <= 0xFF {
   542  			// Similarly for ¡ through ÿ...
   543  			return r != 0xAD // ...except for the bizarre soft hyphen.
   544  		}
   545  		return false
   546  	}
   547  
   548  	// Same algorithm, either on uint16 or uint32 value.
   549  	// First, find first i such that isPrint[i] >= x.
   550  	// This is the index of either the start or end of a pair that might span x.
   551  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   552  	// If we find x in a range, make sure x is not in isNotPrint list.
   553  
   554  	if 0 <= r && r < 1<<16 {
   555  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   556  		i := bsearch16(isPrint, rr)
   557  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   558  			return false
   559  		}
   560  		j := bsearch16(isNotPrint, rr)
   561  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   562  	}
   563  
   564  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   565  	i := bsearch32(isPrint, rr)
   566  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   567  		return false
   568  	}
   569  	if r >= 0x20000 {
   570  		return true
   571  	}
   572  	r -= 0x10000
   573  	j := bsearch16(isNotPrint, uint16(r))
   574  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   575  }
   576  
   577  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   578  // characters include letters, marks, numbers, punctuation, symbols, and
   579  // spaces, from categories L, M, N, P, S, and Zs.
   580  func IsGraphic(r rune) bool {
   581  	if IsPrint(r) {
   582  		return true
   583  	}
   584  	return isInGraphicList(r)
   585  }
   586  
   587  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   588  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   589  // Should be called only if IsPrint fails.
   590  func isInGraphicList(r rune) bool {
   591  	// We know r must fit in 16 bits - see makeisprint.go.
   592  	if r > 0xFFFF {
   593  		return false
   594  	}
   595  	rr := uint16(r)
   596  	i := bsearch16(isGraphic, rr)
   597  	return i < len(isGraphic) && rr == isGraphic[i]
   598  }
   599  

View as plain text