Source file src/cmd/vendor/golang.org/x/tools/internal/lsp/fuzzy/symbol.go
1 // Copyright 2021 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package fuzzy 6 7 import ( 8 "unicode" 9 ) 10 11 // SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols 12 // of the form: 13 // example.com/path/to/package.object.field 14 // 15 // Knowing that we are matching symbols like this allows us to make the 16 // following optimizations: 17 // - We can incorporate right-to-left relevance directly into the score 18 // calculation. 19 // - We can match from right to left, discarding leading bytes if the input is 20 // too long. 21 // - We just take the right-most match without losing too much precision. This 22 // allows us to use an O(n) algorithm. 23 // - We can operate directly on chunked strings; in many cases we will 24 // be storing the package path and/or package name separately from the 25 // symbol or identifiers, so doing this avoids allocating strings. 26 // - We can return the index of the right-most match, allowing us to trim 27 // irrelevant qualification. 28 // 29 // This implementation is experimental, serving as a reference fast algorithm 30 // to compare to the fuzzy algorithm implemented by Matcher. 31 type SymbolMatcher struct { 32 // Using buffers of length 256 is both a reasonable size for most qualified 33 // symbols, and makes it easy to avoid bounds checks by using uint8 indexes. 34 pattern [256]rune 35 patternLen uint8 36 inputBuffer [256]rune // avoid allocating when considering chunks 37 roles [256]uint32 // which roles does a rune play (word start, etc.) 38 segments [256]uint8 // how many segments from the right is each rune 39 } 40 41 const ( 42 segmentStart uint32 = 1 << iota 43 wordStart 44 separator 45 ) 46 47 // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given 48 // search pattern. 49 // 50 // Currently this matcher only accepts case-insensitive fuzzy patterns. 51 // 52 // An empty pattern matches no input. 53 func NewSymbolMatcher(pattern string) *SymbolMatcher { 54 m := &SymbolMatcher{} 55 for _, p := range pattern { 56 m.pattern[m.patternLen] = unicode.ToLower(p) 57 m.patternLen++ 58 if m.patternLen == 255 || int(m.patternLen) == len(pattern) { 59 // break at 255 so that we can represent patternLen with a uint8. 60 break 61 } 62 } 63 return m 64 } 65 66 // Match looks for the right-most match of the search pattern within the symbol 67 // represented by concatenating the given chunks, returning its offset and 68 // score. 69 // 70 // If a match is found, the first return value will hold the absolute byte 71 // offset within all chunks for the start of the symbol. In other words, the 72 // index of the match within strings.Join(chunks, ""). If no match is found, 73 // the first return value will be -1. 74 // 75 // The second return value will be the score of the match, which is always 76 // between 0 and 1, inclusive. A score of 0 indicates no match. 77 func (m *SymbolMatcher) Match(chunks []string) (int, float64) { 78 // Explicit behavior for an empty pattern. 79 // 80 // As a minor optimization, this also avoids nilness checks later on, since 81 // the compiler can prove that m != nil. 82 if m.patternLen == 0 { 83 return -1, 0 84 } 85 86 // First phase: populate the input buffer with lower-cased runes. 87 // 88 // We could also check for a forward match here, but since we'd have to write 89 // the entire input anyway this has negligible impact on performance. 90 91 var ( 92 inputLen = uint8(0) 93 modifiers = wordStart | segmentStart 94 ) 95 96 input: 97 for _, chunk := range chunks { 98 for _, r := range chunk { 99 if r == '.' || r == '/' { 100 modifiers |= separator 101 } 102 // optimization: avoid calls to unicode.ToLower, which can't be inlined. 103 l := r 104 if r <= unicode.MaxASCII { 105 if 'A' <= r && r <= 'Z' { 106 l = r + 'a' - 'A' 107 } 108 } else { 109 l = unicode.ToLower(r) 110 } 111 if l != r { 112 modifiers |= wordStart 113 } 114 m.inputBuffer[inputLen] = l 115 m.roles[inputLen] = modifiers 116 inputLen++ 117 if m.roles[inputLen-1]&separator != 0 { 118 modifiers = wordStart | segmentStart 119 } else { 120 modifiers = 0 121 } 122 // TODO: we should prefer the right-most input if it overflows, rather 123 // than the left-most as we're doing here. 124 if inputLen == 255 { 125 break input 126 } 127 } 128 } 129 130 // Second phase: find the right-most match, and count segments from the 131 // right. 132 133 var ( 134 pi = uint8(m.patternLen - 1) // pattern index 135 p = m.pattern[pi] // pattern rune 136 start = -1 // start offset of match 137 rseg = uint8(0) 138 ) 139 const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes. 140 141 for ii := inputLen - 1; ; ii-- { 142 r := m.inputBuffer[ii] 143 if rseg < maxSeg && m.roles[ii]&separator != 0 { 144 rseg++ 145 } 146 m.segments[ii] = rseg 147 if p == r { 148 if pi == 0 { 149 start = int(ii) 150 break 151 } 152 pi-- 153 p = m.pattern[pi] 154 } 155 // Don't check ii >= 0 in the loop condition: ii is a uint8. 156 if ii == 0 { 157 break 158 } 159 } 160 161 if start < 0 { 162 // no match: skip scoring 163 return -1, 0 164 } 165 166 // Third phase: find the shortest match, and compute the score. 167 168 // Score is the average score for each character. 169 // 170 // A character score is the multiple of: 171 // 1. 1.0 if the character starts a segment, .8 if the character start a 172 // mid-segment word, otherwise 0.6. This carries over to immediately 173 // following characters. 174 // 2. For the final character match, the multiplier from (1) is reduced to 175 // .8 if the next character in the input is a mid-segment word, or 0.6 if 176 // the next character in the input is not a word or segment start. This 177 // ensures that we favor whole-word or whole-segment matches over prefix 178 // matches. 179 // 3. 1.0 if the character is part of the last segment, otherwise 180 // 1.0-.2*<segments from the right>, with a max segment count of 3. 181 // 182 // This is a very naive algorithm, but it is fast. There's lots of prior art 183 // here, and we should leverage it. For example, we could explicitly consider 184 // character distance, and exact matches of words or segments. 185 // 186 // Also note that this might not actually find the highest scoring match, as 187 // doing so could require a non-linear algorithm, depending on how the score 188 // is calculated. 189 190 pi = 0 191 p = m.pattern[pi] 192 193 const ( 194 segStreak = 1.0 195 wordStreak = 0.8 196 noStreak = 0.6 197 perSegment = 0.2 // we count at most 3 segments above 198 ) 199 200 streakBonus := noStreak 201 totScore := 0.0 202 for ii := uint8(start); ii < inputLen; ii++ { 203 r := m.inputBuffer[ii] 204 if r == p { 205 pi++ 206 p = m.pattern[pi] 207 // Note: this could be optimized with some bit operations. 208 switch { 209 case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus: 210 streakBonus = segStreak 211 case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus: 212 streakBonus = wordStreak 213 } 214 finalChar := pi >= m.patternLen 215 // finalCost := 1.0 216 if finalChar && streakBonus > noStreak { 217 switch { 218 case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0: 219 // Full segment: no reduction 220 case m.roles[ii+1]&wordStart != 0: 221 streakBonus = wordStreak 222 default: 223 streakBonus = noStreak 224 } 225 } 226 totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment) 227 if finalChar { 228 break 229 } 230 } else { 231 streakBonus = noStreak 232 } 233 } 234 235 return start, totScore / float64(m.patternLen) 236 } 237