Source file src/vendor/golang.org/x/text/unicode/norm/forminfo.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import "encoding/binary"
     8  
     9  // This file contains Form-specific logic and wrappers for data in tables.go.
    10  
    11  // Rune info is stored in a separate trie per composing form. A composing form
    12  // and its corresponding decomposing form share the same trie.  Each trie maps
    13  // a rune to a uint16. The values take two forms.  For v >= 0x8000:
    14  //   bits
    15  //   15:    1 (inverse of NFD_QC bit of qcInfo)
    16  //   13..7: qcInfo (see below). isYesD is always true (no decomposition).
    17  //    6..0: ccc (compressed CCC value).
    18  // For v < 0x8000, the respective rune has a decomposition and v is an index
    19  // into a byte array of UTF-8 decomposition sequences and additional info and
    20  // has the form:
    21  //    <header> <decomp_byte>* [<tccc> [<lccc>]]
    22  // The header contains the number of bytes in the decomposition (excluding this
    23  // length byte). The two most significant bits of this length byte correspond
    24  // to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
    25  // The byte sequence is followed by a trailing and leading CCC if the values
    26  // for these are not zero.  The value of v determines which ccc are appended
    27  // to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
    28  // the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
    29  // there is an additional leading ccc. The value of tccc itself is the
    30  // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
    31  // are the number of trailing non-starters.
    32  
    // Bit masks used to unpack the rune information described at the top of
    // this file.
    const (
    	// qcInfoMask keeps the six low bits of a qcInfo value and clears the
    	// rest.
    	qcInfoMask = 1<<6 - 1 // 0x3F

    	// headerLenMask extracts the length field (the six low bits) from a
    	// decomposition header byte.
    	headerLenMask = 1<<6 - 1 // 0x3F

    	// headerFlagsMask extracts the two high bits of a decomposition header
    	// byte, which carry qcInfo bits.
    	headerFlagsMask = 3 << 6 // 0xC0
    )
    38  
    // Properties provides access to normalization properties of a rune.
    //
    // The ccc and tccc fields hold compressed values; the accessor methods
    // expand them through the ccc table.
    type Properties struct {
    	pos   uint8  // start position in reorderBuffer; used in composition.go
    	size  uint8  // length of UTF-8 encoding of this rune
    	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
    	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
    	nLead uint8  // number of leading non-starters.
    	flags qcInfo // quick check flags
    	index uint16 // offset of the decomposition header in decomps; 0 if there is no decomposition
    }
    49  
    // lookupFunc is the type of the property lookup functions that are
    // dispatchable per form. It returns the Properties found for index i of
    // input b.
    type lookupFunc func(b input, i int) Properties
    52  
    // formInfo holds Form-specific functions and tables.
    type formInfo struct {
    	form                     Form       // the normalization form this entry describes
    	composing, compatibility bool       // form type
    	info                     lookupFunc // property lookup used for this form
    	nextMain                 iterFunc   // iteration function for this form (iterFunc is declared elsewhere)
    }
    60  
    61  var formTable = []*formInfo{{
    62  	form:          NFC,
    63  	composing:     true,
    64  	compatibility: false,
    65  	info:          lookupInfoNFC,
    66  	nextMain:      nextComposed,
    67  }, {
    68  	form:          NFD,
    69  	composing:     false,
    70  	compatibility: false,
    71  	info:          lookupInfoNFC,
    72  	nextMain:      nextDecomposed,
    73  }, {
    74  	form:          NFKC,
    75  	composing:     true,
    76  	compatibility: true,
    77  	info:          lookupInfoNFKC,
    78  	nextMain:      nextComposed,
    79  }, {
    80  	form:          NFKD,
    81  	composing:     false,
    82  	compatibility: true,
    83  	info:          lookupInfoNFKC,
    84  	nextMain:      nextDecomposed,
    85  }}
    86  
    87  // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
    88  // unexpected behavior for the user.  For example, in NFD, there is a boundary
    89  // after 'a'.  However, 'a' might combine with modifiers, so from the application's
    90  // perspective it is not a good boundary. We will therefore always use the
    91  // boundaries for the combining variants.
    92  
    93  // BoundaryBefore returns true if this rune starts a new segment and
    94  // cannot combine with any rune on the left.
    95  func (p Properties) BoundaryBefore() bool {
    96  	if p.ccc == 0 && !p.combinesBackward() {
    97  		return true
    98  	}
    99  	// We assume that the CCC of the first character in a decomposition
   100  	// is always non-zero if different from info.ccc and that we can return
   101  	// false at this point. This is verified by maketables.
   102  	return false
   103  }
   104  
   // BoundaryAfter returns true if runes cannot combine with or otherwise
   // interact with this or previous runes.
   //
   // The current implementation is conservative: it reports true only for
   // inert runes (see isInert).
   func (p Properties) BoundaryAfter() bool {
   	// TODO: loosen these conditions.
   	return p.isInert()
   }
   111  
   // qcInfo holds quick check data, packed in 6 bits:
   //   5:    Combines forward  (0 == false, 1 == true)
   //   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
   //   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
   //   1..0: Number of trailing non-starters.
   //
   // When all 6 bits are zero and the ccc is zero as well, the character is
   // inert, meaning it is never influenced by normalization.
   type qcInfo uint8
   121  
   122  func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
   123  func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
   124  
   125  func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
   126  func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
   127  func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
   128  
   129  func (p Properties) isInert() bool {
   130  	return p.flags&qcInfoMask == 0 && p.ccc == 0
   131  }
   132  
   133  func (p Properties) multiSegment() bool {
   134  	return p.index >= firstMulti && p.index < endMulti
   135  }
   136  
   // nLeadingNonStarters returns the number of leading non-starters recorded
   // in the nLead field.
   func (p Properties) nLeadingNonStarters() uint8 {
   	return p.nLead
   }
   140  
   141  func (p Properties) nTrailingNonStarters() uint8 {
   142  	return uint8(p.flags & 0x03)
   143  }
   144  
   145  // Decomposition returns the decomposition for the underlying rune
   146  // or nil if there is none.
   147  func (p Properties) Decomposition() []byte {
   148  	// TODO: create the decomposition for Hangul?
   149  	if p.index == 0 {
   150  		return nil
   151  	}
   152  	i := p.index
   153  	n := decomps[i] & headerLenMask
   154  	i++
   155  	return decomps[i : i+uint16(n)]
   156  }
   157  
   // Size returns the length of UTF-8 encoding of the rune, as recorded in
   // the size field.
   func (p Properties) Size() int {
   	return int(p.size)
   }
   162  
   163  // CCC returns the canonical combining class of the underlying rune.
   164  func (p Properties) CCC() uint8 {
   165  	if p.index >= firstCCCZeroExcept {
   166  		return 0
   167  	}
   168  	return ccc[p.ccc]
   169  }
   170  
   // LeadCCC returns the CCC of the first rune in the decomposition.
   // If there is no decomposition, LeadCCC equals CCC.
   //
   // The compressed value stored in p.ccc is expanded through the ccc table.
   func (p Properties) LeadCCC() uint8 {
   	return ccc[p.ccc]
   }
   176  
   // TrailCCC returns the CCC of the last rune in the decomposition.
   // If there is no decomposition, TrailCCC equals CCC.
   //
   // The compressed value stored in p.tccc is expanded through the ccc table.
   func (p Properties) TrailCCC() uint8 {
   	return ccc[p.tccc]
   }
   182  
   183  func buildRecompMap() {
   184  	recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
   185  	var buf [8]byte
   186  	for i := 0; i < len(recompMapPacked); i += 8 {
   187  		copy(buf[:], recompMapPacked[i:i+8])
   188  		key := binary.BigEndian.Uint32(buf[:4])
   189  		val := binary.BigEndian.Uint32(buf[4:])
   190  		recompMap[key] = rune(val)
   191  	}
   192  }
   193  
   194  // Recomposition
   195  // We use 32-bit keys instead of 64-bit for the two codepoint keys.
   196  // This clips off the bits of three entries, but we know this will not
   197  // result in a collision. In the unlikely event that changes to
   198  // UnicodeData.txt introduce collisions, the compiler will catch it.
   199  // Note that the recomposition map for NFC and NFKC are identical.
   200  
   201  // combine returns the combined rune or 0 if it doesn't exist.
   202  //
   203  // The caller is responsible for calling
   204  // recompMapOnce.Do(buildRecompMap) sometime before this is called.
   205  func combine(a, b rune) rune {
   206  	key := uint32(uint16(a))<<16 + uint32(uint16(b))
   207  	if recompMap == nil {
   208  		panic("caller error") // see func comment
   209  	}
   210  	return recompMap[key]
   211  }
   212  
   213  func lookupInfoNFC(b input, i int) Properties {
   214  	v, sz := b.charinfoNFC(i)
   215  	return compInfo(v, sz)
   216  }
   217  
   218  func lookupInfoNFKC(b input, i int) Properties {
   219  	v, sz := b.charinfoNFKC(i)
   220  	return compInfo(v, sz)
   221  }
   222  
   223  // Properties returns properties for the first rune in s.
   224  func (f Form) Properties(s []byte) Properties {
   225  	if f == NFC || f == NFD {
   226  		return compInfo(nfcData.lookup(s))
   227  	}
   228  	return compInfo(nfkcData.lookup(s))
   229  }
   230  
   231  // PropertiesString returns properties for the first rune in s.
   232  func (f Form) PropertiesString(s string) Properties {
   233  	if f == NFC || f == NFD {
   234  		return compInfo(nfcData.lookupString(s))
   235  	}
   236  	return compInfo(nfkcData.lookupString(s))
   237  }
   238  
   239  // compInfo converts the information contained in v and sz
   240  // to a Properties.  See the comment at the top of the file
   241  // for more information on the format.
   242  func compInfo(v uint16, sz int) Properties {
   243  	if v == 0 {
   244  		return Properties{size: uint8(sz)}
   245  	} else if v >= 0x8000 {
   246  		p := Properties{
   247  			size:  uint8(sz),
   248  			ccc:   uint8(v),
   249  			tccc:  uint8(v),
   250  			flags: qcInfo(v >> 8),
   251  		}
   252  		if p.ccc > 0 || p.combinesBackward() {
   253  			p.nLead = uint8(p.flags & 0x3)
   254  		}
   255  		return p
   256  	}
   257  	// has decomposition
   258  	h := decomps[v]
   259  	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
   260  	p := Properties{size: uint8(sz), flags: f, index: v}
   261  	if v >= firstCCC {
   262  		v += uint16(h&headerLenMask) + 1
   263  		c := decomps[v]
   264  		p.tccc = c >> 2
   265  		p.flags |= qcInfo(c & 0x3)
   266  		if v >= firstLeadingCCC {
   267  			p.nLead = c & 0x3
   268  			if v >= firstStarterWithNLead {
   269  				// We were tricked. Remove the decomposition.
   270  				p.flags &= 0x03
   271  				p.index = 0
   272  				return p
   273  			}
   274  			p.ccc = decomps[v+1]
   275  		}
   276  	}
   277  	return p
   278  }
   279  

View as plain text