input.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package fuzzy
     6  
     7  import (
     8  	"unicode"
     9  )
    10  
// RuneRole specifies the role of a rune in the context of an input.
// RuneRoles assigns one RuneRole per input byte, so a role slice is indexed
// by byte offset rather than by rune index.
type RuneRole byte
    13  
const (
	// RNone specifies a rune without any role in the input: it is neither part of a word nor a
	// segment separator. Note that RuneRoles classifies bytes above unicode.MaxASCII like
	// lower-case letters, so those bytes get RHead/RTail rather than RNone.
	RNone RuneRole = iota
	// RSep specifies a rune with the role of segment separator ('.' or ':' in RuneRoles).
	RSep
	// RTail specifies a rune which is a lower-case tail in a word in the input.
	RTail
	// RUCTail specifies a rune which is an upper-case tail in a word in the input,
	// i.e. an upper-case letter continuing a run of upper-case letters.
	RUCTail
	// RHead specifies a rune which is the first character in a word in the input.
	RHead
)
    26  
    27  // RuneRoles detects the roles of each byte rune in an input string and stores it in the output
    28  // slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
    29  // or when it filled the output. If output is nil, then it gets created.
    30  func RuneRoles(candidate []byte, reuse []RuneRole) []RuneRole {
    31  	var output []RuneRole
    32  	if cap(reuse) < len(candidate) {
    33  		output = make([]RuneRole, 0, len(candidate))
    34  	} else {
    35  		output = reuse[:0]
    36  	}
    37  
    38  	prev, prev2 := rtNone, rtNone
    39  	for i := 0; i < len(candidate); i++ {
    40  		r := rune(candidate[i])
    41  
    42  		role := RNone
    43  
    44  		curr := rtLower
    45  		if candidate[i] <= unicode.MaxASCII {
    46  			curr = runeType(rt[candidate[i]] - '0')
    47  		}
    48  
    49  		if curr == rtLower {
    50  			if prev == rtNone || prev == rtPunct {
    51  				role = RHead
    52  			} else {
    53  				role = RTail
    54  			}
    55  		} else if curr == rtUpper {
    56  			role = RHead
    57  
    58  			if prev == rtUpper {
    59  				// This and previous characters are both upper case.
    60  
    61  				if i+1 == len(candidate) {
    62  					// This is last character, previous was also uppercase -> this is UCTail
    63  					// i.e., (current char is C): aBC / BC / ABC
    64  					role = RUCTail
    65  				}
    66  			}
    67  		} else if curr == rtPunct {
    68  			switch r {
    69  			case '.', ':':
    70  				role = RSep
    71  			}
    72  		}
    73  		if curr != rtLower {
    74  			if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
    75  				// The previous two characters were uppercase. The current one is not a lower case, so the
    76  				// previous one can't be a HEAD. Make it a UCTail.
    77  				// i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
    78  				output[i-1] = RUCTail
    79  			}
    80  		}
    81  
    82  		output = append(output, role)
    83  		prev2 = prev
    84  		prev = curr
    85  	}
    86  	return output
    87  }
    88  
// runeType is the coarse character class RuneRoles assigns to each input byte
// before deciding on its RuneRole.
type runeType byte
    90  
// The numeric values of these constants (0 through 3) are the digits stored in the rt lookup
// table, so their order must stay in sync with that table.
const (
	// rtNone is the class of bytes that are neither punctuation nor letters; it is also the
	// "no previous byte" state at the start of RuneRoles.
	rtNone runeType = iota
	// rtPunct is the class of punctuation bytes; RuneRoles turns '.' and ':' into RSep.
	rtPunct
	// rtLower is the class of lower-case word bytes; RuneRoles also uses it for every byte
	// above unicode.MaxASCII.
	rtLower
	// rtUpper is the class of upper-case word bytes.
	rtUpper
)
    97  
// rt is a lookup table indexed by ASCII byte value. Each character is a digit '0'..'3' whose
// value (after subtracting '0') is the runeType of that byte; RuneRoles only indexes it for
// bytes <= unicode.MaxASCII.
// NOTE(review): the layout appears to mark '.', '/' and ':' as rtPunct, digits and lower-case
// letters as rtLower, upper-case letters as rtUpper and everything else as rtNone — recount the
// positions before editing the table.
const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
    99  
   100  // LastSegment returns the substring representing the last segment from the input, where each
   101  // byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
   102  // or Filename type.
   103  func LastSegment(input string, roles []RuneRole) string {
   104  	// Exclude ending separators.
   105  	end := len(input) - 1
   106  	for end >= 0 && roles[end] == RSep {
   107  		end--
   108  	}
   109  	if end < 0 {
   110  		return ""
   111  	}
   112  
   113  	start := end - 1
   114  	for start >= 0 && roles[start] != RSep {
   115  		start--
   116  	}
   117  
   118  	return input[start+1 : end+1]
   119  }
   120  
   121  // fromChunks copies string chunks into the given buffer.
   122  func fromChunks(chunks []string, buffer []byte) []byte {
   123  	ii := 0
   124  	for _, chunk := range chunks {
   125  		for i := 0; i < len(chunk); i++ {
   126  			if ii >= cap(buffer) {
   127  				break
   128  			}
   129  			buffer[ii] = chunk[i]
   130  			ii++
   131  		}
   132  	}
   133  	return buffer[:ii]
   134  }
   135  
   136  // toLower transforms the input string to lower case, which is stored in the output byte slice.
   137  // The lower casing considers only ASCII values - non ASCII values are left unmodified.
   138  // Stops when parsed all input or when it filled the output slice. If output is nil, then it gets
   139  // created.
   140  func toLower(input []byte, reuse []byte) []byte {
   141  	output := reuse
   142  	if cap(reuse) < len(input) {
   143  		output = make([]byte, len(input))
   144  	}
   145  
   146  	for i := 0; i < len(input); i++ {
   147  		r := rune(input[i])
   148  		if input[i] <= unicode.MaxASCII {
   149  			if 'A' <= r && r <= 'Z' {
   150  				r += 'a' - 'A'
   151  			}
   152  		}
   153  		output[i] = byte(r)
   154  	}
   155  	return output[:len(input)]
   156  }
   157  
// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
// (start is inclusive, end is exclusive). Words invokes it once per word it finds.
type WordConsumer func(start, end int)
   161  
   162  // Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset
   163  // delimiters for each word are fed to the provided consumer function.
   164  func Words(roles []RuneRole, consume WordConsumer) {
   165  	var wordStart int
   166  	for i, r := range roles {
   167  		switch r {
   168  		case RUCTail, RTail:
   169  		case RHead, RNone, RSep:
   170  			if i != wordStart {
   171  				consume(wordStart, i)
   172  			}
   173  			wordStart = i
   174  			if r != RHead {
   175  				// Skip this character.
   176  				wordStart = i + 1
   177  			}
   178  		}
   179  	}
   180  	if wordStart != len(roles) {
   181  		consume(wordStart, len(roles))
   182  	}
   183  }
   184
View as plain text