Source file src/vendor/golang.org/x/text/unicode/norm/normalize.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Note: the file data_test.go that is generated should not be checked in.
     6  //go:generate go run maketables.go triegen.go
     7  //go:generate go test -tags test
     8  
     9  // Package norm contains types and functions for normalizing Unicode strings.
    10  package norm // import "golang.org/x/text/unicode/norm"
    11  
    12  import (
    13  	"unicode/utf8"
    14  
    15  	"golang.org/x/text/transform"
    16  )
    17  
    18  // A Form denotes a canonical representation of Unicode code points.
    19  // The Unicode-defined normalization and equivalence forms are:
    20  //
    21  //   NFC   Unicode Normalization Form C
    22  //   NFD   Unicode Normalization Form D
    23  //   NFKC  Unicode Normalization Form KC
    24  //   NFKD  Unicode Normalization Form KD
    25  //
    26  // For a Form f, this documentation uses the notation f(x) to mean
    27  // the bytes or string x converted to the given form.
    28  // A position n in x is called a boundary if conversion to the form can
    29  // proceed independently on both sides:
    30  //   f(x) == append(f(x[0:n]), f(x[n:])...)
    31  //
    32  // References: https://unicode.org/reports/tr15/ and
    33  // https://unicode.org/notes/tn5/.
    34  type Form int
    35  
    36  const (
    37  	NFC Form = iota
    38  	NFD
    39  	NFKC
    40  	NFKD
    41  )
    42  
    43  // Bytes returns f(b). May return b if f(b) = b.
    44  func (f Form) Bytes(b []byte) []byte {
    45  	src := inputBytes(b)
    46  	ft := formTable[f]
    47  	n, ok := ft.quickSpan(src, 0, len(b), true)
    48  	if ok {
    49  		return b
    50  	}
    51  	out := make([]byte, n, len(b))
    52  	copy(out, b[0:n])
    53  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
    54  	return doAppendInner(&rb, n)
    55  }
    56  
    57  // String returns f(s).
    58  func (f Form) String(s string) string {
    59  	src := inputString(s)
    60  	ft := formTable[f]
    61  	n, ok := ft.quickSpan(src, 0, len(s), true)
    62  	if ok {
    63  		return s
    64  	}
    65  	out := make([]byte, n, len(s))
    66  	copy(out, s[0:n])
    67  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
    68  	return string(doAppendInner(&rb, n))
    69  }
    70  
    71  // IsNormal returns true if b == f(b).
    72  func (f Form) IsNormal(b []byte) bool {
    73  	src := inputBytes(b)
    74  	ft := formTable[f]
    75  	bp, ok := ft.quickSpan(src, 0, len(b), true)
    76  	if ok {
    77  		return true
    78  	}
    79  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
    80  	rb.setFlusher(nil, cmpNormalBytes)
    81  	for bp < len(b) {
    82  		rb.out = b[bp:]
    83  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
    84  			return false
    85  		}
    86  		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
    87  	}
    88  	return true
    89  }
    90  
    91  func cmpNormalBytes(rb *reorderBuffer) bool {
    92  	b := rb.out
    93  	for i := 0; i < rb.nrune; i++ {
    94  		info := rb.rune[i]
    95  		if int(info.size) > len(b) {
    96  			return false
    97  		}
    98  		p := info.pos
    99  		pe := p + info.size
   100  		for ; p < pe; p++ {
   101  			if b[0] != rb.byte[p] {
   102  				return false
   103  			}
   104  			b = b[1:]
   105  		}
   106  	}
   107  	return true
   108  }
   109  
   110  // IsNormalString returns true if s == f(s).
   111  func (f Form) IsNormalString(s string) bool {
   112  	src := inputString(s)
   113  	ft := formTable[f]
   114  	bp, ok := ft.quickSpan(src, 0, len(s), true)
   115  	if ok {
   116  		return true
   117  	}
   118  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
   119  	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
   120  		for i := 0; i < rb.nrune; i++ {
   121  			info := rb.rune[i]
   122  			if bp+int(info.size) > len(s) {
   123  				return false
   124  			}
   125  			p := info.pos
   126  			pe := p + info.size
   127  			for ; p < pe; p++ {
   128  				if s[bp] != rb.byte[p] {
   129  					return false
   130  				}
   131  				bp++
   132  			}
   133  		}
   134  		return true
   135  	})
   136  	for bp < len(s) {
   137  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
   138  			return false
   139  		}
   140  		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
   141  	}
   142  	return true
   143  }
   144  
   145  // patchTail fixes a case where a rune may be incorrectly normalized
   146  // if it is followed by illegal continuation bytes. It returns the
   147  // patched buffer and whether the decomposition is still in progress.
   148  func patchTail(rb *reorderBuffer) bool {
   149  	info, p := lastRuneStart(&rb.f, rb.out)
   150  	if p == -1 || info.size == 0 {
   151  		return true
   152  	}
   153  	end := p + int(info.size)
   154  	extra := len(rb.out) - end
   155  	if extra > 0 {
   156  		// Potentially allocating memory. However, this only
   157  		// happens with ill-formed UTF-8.
   158  		x := make([]byte, 0)
   159  		x = append(x, rb.out[len(rb.out)-extra:]...)
   160  		rb.out = rb.out[:end]
   161  		decomposeToLastBoundary(rb)
   162  		rb.doFlush()
   163  		rb.out = append(rb.out, x...)
   164  		return false
   165  	}
   166  	buf := rb.out[p:]
   167  	rb.out = rb.out[:p]
   168  	decomposeToLastBoundary(rb)
   169  	if s := rb.ss.next(info); s == ssStarter {
   170  		rb.doFlush()
   171  		rb.ss.first(info)
   172  	} else if s == ssOverflow {
   173  		rb.doFlush()
   174  		rb.insertCGJ()
   175  		rb.ss = 0
   176  	}
   177  	rb.insertUnsafe(inputBytes(buf), 0, info)
   178  	return true
   179  }
   180  
   181  func appendQuick(rb *reorderBuffer, i int) int {
   182  	if rb.nsrc == i {
   183  		return i
   184  	}
   185  	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
   186  	rb.out = rb.src.appendSlice(rb.out, i, end)
   187  	return end
   188  }
   189  
   190  // Append returns f(append(out, b...)).
   191  // The buffer out must be nil, empty, or equal to f(out).
   192  func (f Form) Append(out []byte, src ...byte) []byte {
   193  	return f.doAppend(out, inputBytes(src), len(src))
   194  }
   195  
   196  func (f Form) doAppend(out []byte, src input, n int) []byte {
   197  	if n == 0 {
   198  		return out
   199  	}
   200  	ft := formTable[f]
   201  	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
   202  	if len(out) == 0 {
   203  		p, _ := ft.quickSpan(src, 0, n, true)
   204  		out = src.appendSlice(out, 0, p)
   205  		if p == n {
   206  			return out
   207  		}
   208  		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
   209  		return doAppendInner(&rb, p)
   210  	}
   211  	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
   212  	return doAppend(&rb, out, 0)
   213  }
   214  
   215  func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
   216  	rb.setFlusher(out, appendFlush)
   217  	src, n := rb.src, rb.nsrc
   218  	doMerge := len(out) > 0
   219  	if q := src.skipContinuationBytes(p); q > p {
   220  		// Move leading non-starters to destination.
   221  		rb.out = src.appendSlice(rb.out, p, q)
   222  		p = q
   223  		doMerge = patchTail(rb)
   224  	}
   225  	fd := &rb.f
   226  	if doMerge {
   227  		var info Properties
   228  		if p < n {
   229  			info = fd.info(src, p)
   230  			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
   231  				if p == 0 {
   232  					decomposeToLastBoundary(rb)
   233  				}
   234  				p = decomposeSegment(rb, p, true)
   235  			}
   236  		}
   237  		if info.size == 0 {
   238  			rb.doFlush()
   239  			// Append incomplete UTF-8 encoding.
   240  			return src.appendSlice(rb.out, p, n)
   241  		}
   242  		if rb.nrune > 0 {
   243  			return doAppendInner(rb, p)
   244  		}
   245  	}
   246  	p = appendQuick(rb, p)
   247  	return doAppendInner(rb, p)
   248  }
   249  
   250  func doAppendInner(rb *reorderBuffer, p int) []byte {
   251  	for n := rb.nsrc; p < n; {
   252  		p = decomposeSegment(rb, p, true)
   253  		p = appendQuick(rb, p)
   254  	}
   255  	return rb.out
   256  }
   257  
   258  // AppendString returns f(append(out, []byte(s))).
   259  // The buffer out must be nil, empty, or equal to f(out).
   260  func (f Form) AppendString(out []byte, src string) []byte {
   261  	return f.doAppend(out, inputString(src), len(src))
   262  }
   263  
   264  // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
   265  // It is not guaranteed to return the largest such n.
   266  func (f Form) QuickSpan(b []byte) int {
   267  	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
   268  	return n
   269  }
   270  
   271  // Span implements transform.SpanningTransformer. It returns a boundary n such
   272  // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
   273  func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
   274  	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
   275  	if n < len(b) {
   276  		if !ok {
   277  			err = transform.ErrEndOfSpan
   278  		} else {
   279  			err = transform.ErrShortSrc
   280  		}
   281  	}
   282  	return n, err
   283  }
   284  
   285  // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
   286  // It is not guaranteed to return the largest such n.
   287  func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
   288  	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
   289  	if n < len(s) {
   290  		if !ok {
   291  			err = transform.ErrEndOfSpan
   292  		} else {
   293  			err = transform.ErrShortSrc
   294  		}
   295  	}
   296  	return n, err
   297  }
   298  
   299  // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
   300  // whether any non-normalized parts were found. If atEOF is false, n will
   301  // not point past the last segment if this segment might be become
   302  // non-normalized by appending other runes.
   303  func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
   304  	var lastCC uint8
   305  	ss := streamSafe(0)
   306  	lastSegStart := i
   307  	for n = end; i < n; {
   308  		if j := src.skipASCII(i, n); i != j {
   309  			i = j
   310  			lastSegStart = i - 1
   311  			lastCC = 0
   312  			ss = 0
   313  			continue
   314  		}
   315  		info := f.info(src, i)
   316  		if info.size == 0 {
   317  			if atEOF {
   318  				// include incomplete runes
   319  				return n, true
   320  			}
   321  			return lastSegStart, true
   322  		}
   323  		// This block needs to be before the next, because it is possible to
   324  		// have an overflow for runes that are starters (e.g. with U+FF9E).
   325  		switch ss.next(info) {
   326  		case ssStarter:
   327  			lastSegStart = i
   328  		case ssOverflow:
   329  			return lastSegStart, false
   330  		case ssSuccess:
   331  			if lastCC > info.ccc {
   332  				return lastSegStart, false
   333  			}
   334  		}
   335  		if f.composing {
   336  			if !info.isYesC() {
   337  				break
   338  			}
   339  		} else {
   340  			if !info.isYesD() {
   341  				break
   342  			}
   343  		}
   344  		lastCC = info.ccc
   345  		i += int(info.size)
   346  	}
   347  	if i == n {
   348  		if !atEOF {
   349  			n = lastSegStart
   350  		}
   351  		return n, true
   352  	}
   353  	return lastSegStart, false
   354  }
   355  
   356  // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
   357  // It is not guaranteed to return the largest such n.
   358  func (f Form) QuickSpanString(s string) int {
   359  	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
   360  	return n
   361  }
   362  
   363  // FirstBoundary returns the position i of the first boundary in b
   364  // or -1 if b contains no boundary.
   365  func (f Form) FirstBoundary(b []byte) int {
   366  	return f.firstBoundary(inputBytes(b), len(b))
   367  }
   368  
   369  func (f Form) firstBoundary(src input, nsrc int) int {
   370  	i := src.skipContinuationBytes(0)
   371  	if i >= nsrc {
   372  		return -1
   373  	}
   374  	fd := formTable[f]
   375  	ss := streamSafe(0)
   376  	// We should call ss.first here, but we can't as the first rune is
   377  	// skipped already. This means FirstBoundary can't really determine
   378  	// CGJ insertion points correctly. Luckily it doesn't have to.
   379  	for {
   380  		info := fd.info(src, i)
   381  		if info.size == 0 {
   382  			return -1
   383  		}
   384  		if s := ss.next(info); s != ssSuccess {
   385  			return i
   386  		}
   387  		i += int(info.size)
   388  		if i >= nsrc {
   389  			if !info.BoundaryAfter() && !ss.isMax() {
   390  				return -1
   391  			}
   392  			return nsrc
   393  		}
   394  	}
   395  }
   396  
   397  // FirstBoundaryInString returns the position i of the first boundary in s
   398  // or -1 if s contains no boundary.
   399  func (f Form) FirstBoundaryInString(s string) int {
   400  	return f.firstBoundary(inputString(s), len(s))
   401  }
   402  
   403  // NextBoundary reports the index of the boundary between the first and next
   404  // segment in b or -1 if atEOF is false and there are not enough bytes to
   405  // determine this boundary.
   406  func (f Form) NextBoundary(b []byte, atEOF bool) int {
   407  	return f.nextBoundary(inputBytes(b), len(b), atEOF)
   408  }
   409  
   410  // NextBoundaryInString reports the index of the boundary between the first and
   411  // next segment in b or -1 if atEOF is false and there are not enough bytes to
   412  // determine this boundary.
   413  func (f Form) NextBoundaryInString(s string, atEOF bool) int {
   414  	return f.nextBoundary(inputString(s), len(s), atEOF)
   415  }
   416  
   417  func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
   418  	if nsrc == 0 {
   419  		if atEOF {
   420  			return 0
   421  		}
   422  		return -1
   423  	}
   424  	fd := formTable[f]
   425  	info := fd.info(src, 0)
   426  	if info.size == 0 {
   427  		if atEOF {
   428  			return 1
   429  		}
   430  		return -1
   431  	}
   432  	ss := streamSafe(0)
   433  	ss.first(info)
   434  
   435  	for i := int(info.size); i < nsrc; i += int(info.size) {
   436  		info = fd.info(src, i)
   437  		if info.size == 0 {
   438  			if atEOF {
   439  				return i
   440  			}
   441  			return -1
   442  		}
   443  		// TODO: Using streamSafe to determine the boundary isn't the same as
   444  		// using BoundaryBefore. Determine which should be used.
   445  		if s := ss.next(info); s != ssSuccess {
   446  			return i
   447  		}
   448  	}
   449  	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
   450  		return -1
   451  	}
   452  	return nsrc
   453  }
   454  
   455  // LastBoundary returns the position i of the last boundary in b
   456  // or -1 if b contains no boundary.
   457  func (f Form) LastBoundary(b []byte) int {
   458  	return lastBoundary(formTable[f], b)
   459  }
   460  
   461  func lastBoundary(fd *formInfo, b []byte) int {
   462  	i := len(b)
   463  	info, p := lastRuneStart(fd, b)
   464  	if p == -1 {
   465  		return -1
   466  	}
   467  	if info.size == 0 { // ends with incomplete rune
   468  		if p == 0 { // starts with incomplete rune
   469  			return -1
   470  		}
   471  		i = p
   472  		info, p = lastRuneStart(fd, b[:i])
   473  		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
   474  			return i
   475  		}
   476  	}
   477  	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
   478  		return i
   479  	}
   480  	if info.BoundaryAfter() {
   481  		return i
   482  	}
   483  	ss := streamSafe(0)
   484  	v := ss.backwards(info)
   485  	for i = p; i >= 0 && v != ssStarter; i = p {
   486  		info, p = lastRuneStart(fd, b[:i])
   487  		if v = ss.backwards(info); v == ssOverflow {
   488  			break
   489  		}
   490  		if p+int(info.size) != i {
   491  			if p == -1 { // no boundary found
   492  				return -1
   493  			}
   494  			return i // boundary after an illegal UTF-8 encoding
   495  		}
   496  	}
   497  	return i
   498  }
   499  
   500  // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
   501  // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
   502  // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
   503  func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
   504  	// Force one character to be consumed.
   505  	info := rb.f.info(rb.src, sp)
   506  	if info.size == 0 {
   507  		return 0
   508  	}
   509  	if s := rb.ss.next(info); s == ssStarter {
   510  		// TODO: this could be removed if we don't support merging.
   511  		if rb.nrune > 0 {
   512  			goto end
   513  		}
   514  	} else if s == ssOverflow {
   515  		rb.insertCGJ()
   516  		goto end
   517  	}
   518  	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   519  		return int(err)
   520  	}
   521  	for {
   522  		sp += int(info.size)
   523  		if sp >= rb.nsrc {
   524  			if !atEOF && !info.BoundaryAfter() {
   525  				return int(iShortSrc)
   526  			}
   527  			break
   528  		}
   529  		info = rb.f.info(rb.src, sp)
   530  		if info.size == 0 {
   531  			if !atEOF {
   532  				return int(iShortSrc)
   533  			}
   534  			break
   535  		}
   536  		if s := rb.ss.next(info); s == ssStarter {
   537  			break
   538  		} else if s == ssOverflow {
   539  			rb.insertCGJ()
   540  			break
   541  		}
   542  		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   543  			return int(err)
   544  		}
   545  	}
   546  end:
   547  	if !rb.doFlush() {
   548  		return int(iShortDst)
   549  	}
   550  	return sp
   551  }
   552  
   553  // lastRuneStart returns the runeInfo and position of the last
   554  // rune in buf or the zero runeInfo and -1 if no rune was found.
   555  func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
   556  	p := len(buf) - 1
   557  	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
   558  	}
   559  	if p < 0 {
   560  		return Properties{}, -1
   561  	}
   562  	return fd.info(inputBytes(buf), p), p
   563  }
   564  
   565  // decomposeToLastBoundary finds an open segment at the end of the buffer
   566  // and scans it into rb. Returns the buffer minus the last segment.
   567  func decomposeToLastBoundary(rb *reorderBuffer) {
   568  	fd := &rb.f
   569  	info, i := lastRuneStart(fd, rb.out)
   570  	if int(info.size) != len(rb.out)-i {
   571  		// illegal trailing continuation bytes
   572  		return
   573  	}
   574  	if info.BoundaryAfter() {
   575  		return
   576  	}
   577  	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
   578  	padd := 0
   579  	ss := streamSafe(0)
   580  	p := len(rb.out)
   581  	for {
   582  		add[padd] = info
   583  		v := ss.backwards(info)
   584  		if v == ssOverflow {
   585  			// Note that if we have an overflow, it the string we are appending to
   586  			// is not correctly normalized. In this case the behavior is undefined.
   587  			break
   588  		}
   589  		padd++
   590  		p -= int(info.size)
   591  		if v == ssStarter || p < 0 {
   592  			break
   593  		}
   594  		info, i = lastRuneStart(fd, rb.out[:p])
   595  		if int(info.size) != p-i {
   596  			break
   597  		}
   598  	}
   599  	rb.ss = ss
   600  	// Copy bytes for insertion as we may need to overwrite rb.out.
   601  	var buf [maxBufferSize * utf8.UTFMax]byte
   602  	cp := buf[:copy(buf[:], rb.out[p:])]
   603  	rb.out = rb.out[:p]
   604  	for padd--; padd >= 0; padd-- {
   605  		info = add[padd]
   606  		rb.insertUnsafe(inputBytes(cp), 0, info)
   607  		cp = cp[info.size:]
   608  	}
   609  }
   610  

View as plain text