symbol.go

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package fuzzy
     6  
     7  import (
     8  	"unicode"
     9  )
    10  
    11  // SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
    12  // of the form:
    13  //  example.com/path/to/package.object.field
    14  //
    15  // Knowing that we are matching symbols like this allows us to make the
    16  // following optimizations:
    17  //  - We can incorporate right-to-left relevance directly into the score
    18  //    calculation.
    19  //  - We can match from right to left, discarding leading bytes if the input is
    20  //    too long.
    21  //  - We just take the right-most match without losing too much precision. This
    22  //    allows us to use an O(n) algorithm.
    23  //  - We can operate directly on chunked strings; in many cases we will
    24  //    be storing the package path and/or package name separately from the
    25  //    symbol or identifiers, so doing this avoids allocating strings.
    26  //  - We can return the index of the right-most match, allowing us to trim
    27  //    irrelevant qualification.
    28  //
    29  // This implementation is experimental, serving as a reference fast algorithm
    30  // to compare to the fuzzy algorithm implemented by Matcher.
    31  type SymbolMatcher struct {
    32  	// Using buffers of length 256 is both a reasonable size for most qualified
    33  	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
    34  	pattern     [256]rune
    35  	patternLen  uint8
    36  	inputBuffer [256]rune   // avoid allocating when considering chunks
    37  	roles       [256]uint32 // which roles does a rune play (word start, etc.)
    38  	segments    [256]uint8  // how many segments from the right is each rune
    39  }
    40  
    41  const (
    42  	segmentStart uint32 = 1 << iota
    43  	wordStart
    44  	separator
    45  )
    46  
    47  // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
    48  // search pattern.
    49  //
    50  // Currently this matcher only accepts case-insensitive fuzzy patterns.
    51  //
    52  // An empty pattern matches no input.
    53  func NewSymbolMatcher(pattern string) *SymbolMatcher {
    54  	m := &SymbolMatcher{}
    55  	for _, p := range pattern {
    56  		m.pattern[m.patternLen] = unicode.ToLower(p)
    57  		m.patternLen++
    58  		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
    59  			// break at 255 so that we can represent patternLen with a uint8.
    60  			break
    61  		}
    62  	}
    63  	return m
    64  }
    65  
    66  // Match looks for the right-most match of the search pattern within the symbol
    67  // represented by concatenating the given chunks, returning its offset and
    68  // score.
    69  //
    70  // If a match is found, the first return value will hold the absolute byte
    71  // offset within all chunks for the start of the symbol. In other words, the
    72  // index of the match within strings.Join(chunks, ""). If no match is found,
    73  // the first return value will be -1.
    74  //
    75  // The second return value will be the score of the match, which is always
    76  // between 0 and 1, inclusive. A score of 0 indicates no match.
    77  func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
    78  	// Explicit behavior for an empty pattern.
    79  	//
    80  	// As a minor optimization, this also avoids nilness checks later on, since
    81  	// the compiler can prove that m != nil.
    82  	if m.patternLen == 0 {
    83  		return -1, 0
    84  	}
    85  
    86  	// First phase: populate the input buffer with lower-cased runes.
    87  	//
    88  	// We could also check for a forward match here, but since we'd have to write
    89  	// the entire input anyway this has negligible impact on performance.
    90  
    91  	var (
    92  		inputLen  = uint8(0)
    93  		modifiers = wordStart | segmentStart
    94  	)
    95  
    96  input:
    97  	for _, chunk := range chunks {
    98  		for _, r := range chunk {
    99  			if r == '.' || r == '/' {
   100  				modifiers |= separator
   101  			}
   102  			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
   103  			l := r
   104  			if r <= unicode.MaxASCII {
   105  				if 'A' <= r && r <= 'Z' {
   106  					l = r + 'a' - 'A'
   107  				}
   108  			} else {
   109  				l = unicode.ToLower(r)
   110  			}
   111  			if l != r {
   112  				modifiers |= wordStart
   113  			}
   114  			m.inputBuffer[inputLen] = l
   115  			m.roles[inputLen] = modifiers
   116  			inputLen++
   117  			if m.roles[inputLen-1]&separator != 0 {
   118  				modifiers = wordStart | segmentStart
   119  			} else {
   120  				modifiers = 0
   121  			}
   122  			// TODO: we should prefer the right-most input if it overflows, rather
   123  			//       than the left-most as we're doing here.
   124  			if inputLen == 255 {
   125  				break input
   126  			}
   127  		}
   128  	}
   129  
   130  	// Second phase: find the right-most match, and count segments from the
   131  	// right.
   132  
   133  	var (
   134  		pi    = uint8(m.patternLen - 1) // pattern index
   135  		p     = m.pattern[pi]           // pattern rune
   136  		start = -1                      // start offset of match
   137  		rseg  = uint8(0)
   138  	)
   139  	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
   140  
   141  	for ii := inputLen - 1; ; ii-- {
   142  		r := m.inputBuffer[ii]
   143  		if rseg < maxSeg && m.roles[ii]&separator != 0 {
   144  			rseg++
   145  		}
   146  		m.segments[ii] = rseg
   147  		if p == r {
   148  			if pi == 0 {
   149  				start = int(ii)
   150  				break
   151  			}
   152  			pi--
   153  			p = m.pattern[pi]
   154  		}
   155  		// Don't check ii >= 0 in the loop condition: ii is a uint8.
   156  		if ii == 0 {
   157  			break
   158  		}
   159  	}
   160  
   161  	if start < 0 {
   162  		// no match: skip scoring
   163  		return -1, 0
   164  	}
   165  
   166  	// Third phase: find the shortest match, and compute the score.
   167  
   168  	// Score is the average score for each character.
   169  	//
   170  	// A character score is the multiple of:
   171  	//   1. 1.0 if the character starts a segment, .8 if the character start a
   172  	//      mid-segment word, otherwise 0.6. This carries over to immediately
   173  	//      following characters.
   174  	//   2. For the final character match, the multiplier from (1) is reduced to
   175  	//     .8 if the next character in the input is a mid-segment word, or 0.6 if
   176  	//      the next character in the input is not a word or segment start. This
   177  	//      ensures that we favor whole-word or whole-segment matches over prefix
   178  	//      matches.
   179  	//   3. 1.0 if the character is part of the last segment, otherwise
   180  	//      1.0-.2*<segments from the right>, with a max segment count of 3.
   181  	//
   182  	// This is a very naive algorithm, but it is fast. There's lots of prior art
   183  	// here, and we should leverage it. For example, we could explicitly consider
   184  	// character distance, and exact matches of words or segments.
   185  	//
   186  	// Also note that this might not actually find the highest scoring match, as
   187  	// doing so could require a non-linear algorithm, depending on how the score
   188  	// is calculated.
   189  
   190  	pi = 0
   191  	p = m.pattern[pi]
   192  
   193  	const (
   194  		segStreak  = 1.0
   195  		wordStreak = 0.8
   196  		noStreak   = 0.6
   197  		perSegment = 0.2 // we count at most 3 segments above
   198  	)
   199  
   200  	streakBonus := noStreak
   201  	totScore := 0.0
   202  	for ii := uint8(start); ii < inputLen; ii++ {
   203  		r := m.inputBuffer[ii]
   204  		if r == p {
   205  			pi++
   206  			p = m.pattern[pi]
   207  			// Note: this could be optimized with some bit operations.
   208  			switch {
   209  			case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
   210  				streakBonus = segStreak
   211  			case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
   212  				streakBonus = wordStreak
   213  			}
   214  			finalChar := pi >= m.patternLen
   215  			// finalCost := 1.0
   216  			if finalChar && streakBonus > noStreak {
   217  				switch {
   218  				case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0:
   219  					// Full segment: no reduction
   220  				case m.roles[ii+1]&wordStart != 0:
   221  					streakBonus = wordStreak
   222  				default:
   223  					streakBonus = noStreak
   224  				}
   225  			}
   226  			totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
   227  			if finalChar {
   228  				break
   229  			}
   230  		} else {
   231  			streakBonus = noStreak
   232  		}
   233  	}
   234  
   235  	return start, totScore / float64(m.patternLen)
   236  }
   237
View as plain text