1
2
3
4
5
6 package fuzzy
7
8 import (
9 "bytes"
10 "fmt"
11 )
12
13 const (
14
15
16 MaxInputSize = 127
17
18
19 MaxPatternSize = 63
20 )
21
22 type scoreVal int
23
24 func (s scoreVal) val() int {
25 return int(s) >> 1
26 }
27
28 func (s scoreVal) prevK() int {
29 return int(s) & 1
30 }
31
32 func score(val int, prevK int ) scoreVal {
33 return scoreVal(val<<1 + prevK)
34 }
35
36
37
38 type Matcher struct {
39 pattern string
40 patternLower []byte
41 patternShort []byte
42 caseSensitive bool
43
44 patternRoles []RuneRole
45 roles []RuneRole
46
47 scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal
48
49 scoreScale float32
50
51 lastCandidateLen int
52 lastCandidateMatched bool
53
54
55
56
57
58
59 inputBuf [MaxInputSize]byte
60 lowerBuf [MaxInputSize]byte
61 rolesBuf [MaxInputSize]RuneRole
62 }
63
64 func (m *Matcher) bestK(i, j int) int {
65 if m.scores[i][j][0].val() < m.scores[i][j][1].val() {
66 return 1
67 }
68 return 0
69 }
70
71
72 func NewMatcher(pattern string) *Matcher {
73 if len(pattern) > MaxPatternSize {
74 pattern = pattern[:MaxPatternSize]
75 }
76
77 m := &Matcher{
78 pattern: pattern,
79 patternLower: toLower([]byte(pattern), nil),
80 }
81
82 for i, c := range m.patternLower {
83 if pattern[i] != c {
84 m.caseSensitive = true
85 break
86 }
87 }
88
89 if len(pattern) > 3 {
90 m.patternShort = m.patternLower[:3]
91 } else {
92 m.patternShort = m.patternLower
93 }
94
95 m.patternRoles = RuneRoles([]byte(pattern), nil)
96
97 if len(pattern) > 0 {
98 maxCharScore := 4
99 m.scoreScale = 1 / float32(maxCharScore*len(pattern))
100 }
101
102 return m
103 }
104
105
106
107
108 func (m *Matcher) Score(candidate string) float32 {
109 return m.ScoreChunks([]string{candidate})
110 }
111
112 func (m *Matcher) ScoreChunks(chunks []string) float32 {
113 candidate := fromChunks(chunks, m.inputBuf[:])
114 if len(candidate) > MaxInputSize {
115 candidate = candidate[:MaxInputSize]
116 }
117 lower := toLower(candidate, m.lowerBuf[:])
118 m.lastCandidateLen = len(candidate)
119
120 if len(m.pattern) == 0 {
121
122 return 1
123 }
124
125 if m.match(candidate, lower) {
126 sc := m.computeScore(candidate, lower)
127 if sc > minScore/2 && !m.poorMatch() {
128 m.lastCandidateMatched = true
129 if len(m.pattern) == len(candidate) {
130
131 return 1
132 }
133
134 if sc < 0 {
135 sc = 0
136 }
137 normalizedScore := float32(sc) * m.scoreScale
138 if normalizedScore > 1 {
139 normalizedScore = 1
140 }
141
142 return normalizedScore
143 }
144 }
145
146 m.lastCandidateMatched = false
147 return 0
148 }
149
150 const minScore = -10000
151
152
153
154 func (m *Matcher) MatchedRanges() []int {
155 if len(m.pattern) == 0 || !m.lastCandidateMatched {
156 return nil
157 }
158 i, j := m.lastCandidateLen, len(m.pattern)
159 if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 {
160 return nil
161 }
162
163 var ret []int
164 k := m.bestK(i, j)
165 for i > 0 {
166 take := (k == 1)
167 k = m.scores[i][j][k].prevK()
168 if take {
169 if len(ret) == 0 || ret[len(ret)-1] != i {
170 ret = append(ret, i)
171 ret = append(ret, i-1)
172 } else {
173 ret[len(ret)-1] = i - 1
174 }
175 j--
176 }
177 i--
178 }
179
180 for i := 0; i < len(ret)/2; i++ {
181 ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i]
182 }
183 return ret
184 }
185
186 func (m *Matcher) match(candidate []byte, candidateLower []byte) bool {
187 i, j := 0, 0
188 for ; i < len(candidateLower) && j < len(m.patternLower); i++ {
189 if candidateLower[i] == m.patternLower[j] {
190 j++
191 }
192 }
193 if j != len(m.patternLower) {
194 return false
195 }
196
197
198
199 m.roles = RuneRoles(candidate, m.rolesBuf[:])
200
201 return true
202 }
203
204 func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int {
205 pattLen, candLen := len(m.pattern), len(candidate)
206
207 for j := 0; j <= len(m.pattern); j++ {
208 m.scores[0][j][0] = minScore << 1
209 m.scores[0][j][1] = minScore << 1
210 }
211 m.scores[0][0][0] = score(0, 0)
212
213 segmentsLeft, lastSegStart := 1, 0
214 for i := 0; i < candLen; i++ {
215 if m.roles[i] == RSep {
216 segmentsLeft++
217 lastSegStart = i + 1
218 }
219 }
220
221
222 consecutiveBonus := 2
223 wordIdx := 0
224 for i := 1; i <= candLen; i++ {
225
226 role := m.roles[i-1]
227 isHead := role == RHead
228
229 if isHead {
230 wordIdx++
231 } else if role == RSep && segmentsLeft > 1 {
232 wordIdx = 0
233 segmentsLeft--
234 }
235
236 var skipPenalty int
237 if i == 1 || (i-1) == lastSegStart {
238
239 skipPenalty++
240 }
241
242 for j := 0; j <= pattLen; j++ {
243
244 m.scores[i][j][1] = minScore << 1
245
246
247 k := 0
248 if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() {
249 k = 1
250 }
251
252 skipScore := m.scores[i-1][j][k].val()
253
254 if j != pattLen {
255 skipScore -= skipPenalty
256 }
257 m.scores[i][j][0] = score(skipScore, k)
258
259 if j == 0 || candidateLower[i-1] != m.patternLower[j-1] {
260
261 continue
262 }
263 pRole := m.patternRoles[j-1]
264
265 if role == RTail && pRole == RHead {
266 if j > 1 {
267
268 continue
269 }
270
271
272
273 if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) {
274 continue
275 }
276 }
277
278
279 var charScore int
280
281 if segmentsLeft <= 1 {
282 charScore++
283 }
284
285
286
287 if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) {
288 charScore++
289 }
290
291
292 if role == RTail && pRole == RHead {
293 charScore--
294 }
295
296 if j == 1 && role == RTail {
297 charScore -= 4
298 }
299
300
301
302 for k := 0; k < 2; k++ {
303 sc := m.scores[i-1][j-1][k].val() + charScore
304
305 isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart
306 if isConsecutive {
307
308
309
310
311 sc += consecutiveBonus
312 }
313 if k == 0 {
314
315
316 if role == RTail || role == RUCTail {
317 sc -= 3
318 }
319 }
320
321 if sc > m.scores[i][j][1].val() {
322 m.scores[i][j][1] = score(sc, k)
323 }
324 }
325 }
326 }
327
328 result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val()
329
330 return result
331 }
332
333
334 func (m *Matcher) ScoreTable(candidate string) string {
335 var buf bytes.Buffer
336
337 var line1, line2, separator bytes.Buffer
338 line1.WriteString("\t")
339 line2.WriteString("\t")
340 for j := 0; j < len(m.pattern); j++ {
341 line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j]))
342 separator.WriteString("----------------")
343 }
344
345 buf.WriteString(line1.String())
346 buf.WriteString("\n")
347 buf.WriteString(separator.String())
348 buf.WriteString("\n")
349
350 for i := 1; i <= len(candidate); i++ {
351 line1.Reset()
352 line2.Reset()
353
354 line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1]))
355 line2.WriteString("\t")
356
357 for j := 1; j <= len(m.pattern); j++ {
358 line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK())))
359 line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK())))
360 }
361 buf.WriteString(line1.String())
362 buf.WriteString("\n")
363 buf.WriteString(line2.String())
364 buf.WriteString("\n")
365 buf.WriteString(separator.String())
366 buf.WriteString("\n")
367 }
368
369 return buf.String()
370 }
371
372 func dir(prevK int) rune {
373 if prevK == 0 {
374 return 'M'
375 }
376 return 'H'
377 }
378
379 func (m *Matcher) poorMatch() bool {
380 if len(m.pattern) < 2 {
381 return false
382 }
383
384 i, j := m.lastCandidateLen, len(m.pattern)
385 k := m.bestK(i, j)
386
387 var counter, len int
388 for i > 0 {
389 take := (k == 1)
390 k = m.scores[i][j][k].prevK()
391 if take {
392 len++
393 if k == 0 && len < 3 && m.roles[i-1] == RTail {
394
395 counter++
396 if counter > 1 {
397 return true
398 }
399 }
400 j--
401 } else {
402 len = 0
403 }
404 i--
405 }
406 return false
407 }
408
View as plain text