Source file src/index/suffixarray/sais2.go
1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Code generated by go generate; DO NOT EDIT. 6 7 package suffixarray 8 9 func text_64(text []byte, sa []int64) { 10 if int(int64(len(text))) != len(text) || len(text) != len(sa) { 11 panic("suffixarray: misuse of text_64") 12 } 13 sais_8_64(text, 256, sa, make([]int64, 2*256)) 14 } 15 16 func sais_8_64(text []byte, textMax int, sa, tmp []int64) { 17 if len(sa) != len(text) || len(tmp) < int(textMax) { 18 panic("suffixarray: misuse of sais_8_64") 19 } 20 21 // Trivial base cases. Sorting 0 or 1 things is easy. 22 if len(text) == 0 { 23 return 24 } 25 if len(text) == 1 { 26 sa[0] = 0 27 return 28 } 29 30 // Establish slices indexed by text character 31 // holding character frequency and bucket-sort offsets. 32 // If there's only enough tmp for one slice, 33 // we make it the bucket offsets and recompute 34 // the character frequency each time we need it. 35 var freq, bucket []int64 36 if len(tmp) >= 2*textMax { 37 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] 38 freq[0] = -1 // mark as uninitialized 39 } else { 40 freq, bucket = nil, tmp[:textMax] 41 } 42 43 // The SAIS algorithm. 44 // Each of these calls makes one scan through sa. 45 // See the individual functions for documentation 46 // about each's role in the algorithm. 47 numLMS := placeLMS_8_64(text, sa, freq, bucket) 48 if numLMS <= 1 { 49 // 0 or 1 items are already sorted. Do nothing. 50 } else { 51 induceSubL_8_64(text, sa, freq, bucket) 52 induceSubS_8_64(text, sa, freq, bucket) 53 length_8_64(text, sa, numLMS) 54 maxID := assignID_8_64(text, sa, numLMS) 55 if maxID < numLMS { 56 map_64(sa, numLMS) 57 recurse_64(sa, tmp, numLMS, maxID) 58 unmap_8_64(text, sa, numLMS) 59 } else { 60 // If maxID == numLMS, then each LMS-substring 61 // is unique, so the relative ordering of two LMS-suffixes 62 // is determined by just the leading LMS-substring. 63 // That is, the LMS-suffix sort order matches the 64 // (simpler) LMS-substring sort order. 65 // Copy the original LMS-substring order into the 66 // suffix array destination. 67 copy(sa, sa[len(sa)-numLMS:]) 68 } 69 expand_8_64(text, freq, bucket, sa, numLMS) 70 } 71 induceL_8_64(text, sa, freq, bucket) 72 induceS_8_64(text, sa, freq, bucket) 73 74 // Mark for caller that we overwrote tmp. 75 tmp[0] = -1 76 } 77 78 func sais_32(text []int32, textMax int, sa, tmp []int32) { 79 if len(sa) != len(text) || len(tmp) < int(textMax) { 80 panic("suffixarray: misuse of sais_32") 81 } 82 83 // Trivial base cases. Sorting 0 or 1 things is easy. 84 if len(text) == 0 { 85 return 86 } 87 if len(text) == 1 { 88 sa[0] = 0 89 return 90 } 91 92 // Establish slices indexed by text character 93 // holding character frequency and bucket-sort offsets. 94 // If there's only enough tmp for one slice, 95 // we make it the bucket offsets and recompute 96 // the character frequency each time we need it. 97 var freq, bucket []int32 98 if len(tmp) >= 2*textMax { 99 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] 100 freq[0] = -1 // mark as uninitialized 101 } else { 102 freq, bucket = nil, tmp[:textMax] 103 } 104 105 // The SAIS algorithm. 106 // Each of these calls makes one scan through sa. 107 // See the individual functions for documentation 108 // about each's role in the algorithm. 109 numLMS := placeLMS_32(text, sa, freq, bucket) 110 if numLMS <= 1 { 111 // 0 or 1 items are already sorted. Do nothing. 112 } else { 113 induceSubL_32(text, sa, freq, bucket) 114 induceSubS_32(text, sa, freq, bucket) 115 length_32(text, sa, numLMS) 116 maxID := assignID_32(text, sa, numLMS) 117 if maxID < numLMS { 118 map_32(sa, numLMS) 119 recurse_32(sa, tmp, numLMS, maxID) 120 unmap_32(text, sa, numLMS) 121 } else { 122 // If maxID == numLMS, then each LMS-substring 123 // is unique, so the relative ordering of two LMS-suffixes 124 // is determined by just the leading LMS-substring. 125 // That is, the LMS-suffix sort order matches the 126 // (simpler) LMS-substring sort order. 127 // Copy the original LMS-substring order into the 128 // suffix array destination. 129 copy(sa, sa[len(sa)-numLMS:]) 130 } 131 expand_32(text, freq, bucket, sa, numLMS) 132 } 133 induceL_32(text, sa, freq, bucket) 134 induceS_32(text, sa, freq, bucket) 135 136 // Mark for caller that we overwrote tmp. 137 tmp[0] = -1 138 } 139 140 func sais_64(text []int64, textMax int, sa, tmp []int64) { 141 if len(sa) != len(text) || len(tmp) < int(textMax) { 142 panic("suffixarray: misuse of sais_64") 143 } 144 145 // Trivial base cases. Sorting 0 or 1 things is easy. 146 if len(text) == 0 { 147 return 148 } 149 if len(text) == 1 { 150 sa[0] = 0 151 return 152 } 153 154 // Establish slices indexed by text character 155 // holding character frequency and bucket-sort offsets. 156 // If there's only enough tmp for one slice, 157 // we make it the bucket offsets and recompute 158 // the character frequency each time we need it. 159 var freq, bucket []int64 160 if len(tmp) >= 2*textMax { 161 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] 162 freq[0] = -1 // mark as uninitialized 163 } else { 164 freq, bucket = nil, tmp[:textMax] 165 } 166 167 // The SAIS algorithm. 168 // Each of these calls makes one scan through sa. 169 // See the individual functions for documentation 170 // about each's role in the algorithm. 171 numLMS := placeLMS_64(text, sa, freq, bucket) 172 if numLMS <= 1 { 173 // 0 or 1 items are already sorted. Do nothing. 174 } else { 175 induceSubL_64(text, sa, freq, bucket) 176 induceSubS_64(text, sa, freq, bucket) 177 length_64(text, sa, numLMS) 178 maxID := assignID_64(text, sa, numLMS) 179 if maxID < numLMS { 180 map_64(sa, numLMS) 181 recurse_64(sa, tmp, numLMS, maxID) 182 unmap_64(text, sa, numLMS) 183 } else { 184 // If maxID == numLMS, then each LMS-substring 185 // is unique, so the relative ordering of two LMS-suffixes 186 // is determined by just the leading LMS-substring. 187 // That is, the LMS-suffix sort order matches the 188 // (simpler) LMS-substring sort order. 189 // Copy the original LMS-substring order into the 190 // suffix array destination. 191 copy(sa, sa[len(sa)-numLMS:]) 192 } 193 expand_64(text, freq, bucket, sa, numLMS) 194 } 195 induceL_64(text, sa, freq, bucket) 196 induceS_64(text, sa, freq, bucket) 197 198 // Mark for caller that we overwrote tmp. 199 tmp[0] = -1 200 } 201 202 func freq_8_64(text []byte, freq, bucket []int64) []int64 { 203 if freq != nil && freq[0] >= 0 { 204 return freq // already computed 205 } 206 if freq == nil { 207 freq = bucket 208 } 209 210 freq = freq[:256] // eliminate bounds check for freq[c] below 211 for i := range freq { 212 freq[i] = 0 213 } 214 for _, c := range text { 215 freq[c]++ 216 } 217 return freq 218 } 219 220 func freq_32(text []int32, freq, bucket []int32) []int32 { 221 if freq != nil && freq[0] >= 0 { 222 return freq // already computed 223 } 224 if freq == nil { 225 freq = bucket 226 } 227 228 for i := range freq { 229 freq[i] = 0 230 } 231 for _, c := range text { 232 freq[c]++ 233 } 234 return freq 235 } 236 237 func freq_64(text []int64, freq, bucket []int64) []int64 { 238 if freq != nil && freq[0] >= 0 { 239 return freq // already computed 240 } 241 if freq == nil { 242 freq = bucket 243 } 244 245 for i := range freq { 246 freq[i] = 0 247 } 248 for _, c := range text { 249 freq[c]++ 250 } 251 return freq 252 } 253 254 func bucketMin_8_64(text []byte, freq, bucket []int64) { 255 freq = freq_8_64(text, freq, bucket) 256 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below 257 bucket = bucket[:256] // eliminate bounds check for bucket[i] below 258 total := int64(0) 259 for i, n := range freq { 260 bucket[i] = total 261 total += n 262 } 263 } 264 265 func bucketMin_32(text []int32, freq, bucket []int32) { 266 freq = freq_32(text, freq, bucket) 267 total := int32(0) 268 for i, n := range freq { 269 bucket[i] = total 270 total += n 271 } 272 } 273 274 func bucketMin_64(text []int64, freq, bucket []int64) { 275 freq = freq_64(text, freq, bucket) 276 total := int64(0) 277 for i, n := range freq { 278 bucket[i] = total 279 total += n 280 } 281 } 282 283 func bucketMax_8_64(text []byte, freq, bucket []int64) { 284 freq = freq_8_64(text, freq, bucket) 285 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below 286 bucket = bucket[:256] // eliminate bounds check for bucket[i] below 287 total := int64(0) 288 for i, n := range freq { 289 total += n 290 bucket[i] = total 291 } 292 } 293 294 func bucketMax_32(text []int32, freq, bucket []int32) { 295 freq = freq_32(text, freq, bucket) 296 total := int32(0) 297 for i, n := range freq { 298 total += n 299 bucket[i] = total 300 } 301 } 302 303 func bucketMax_64(text []int64, freq, bucket []int64) { 304 freq = freq_64(text, freq, bucket) 305 total := int64(0) 306 for i, n := range freq { 307 total += n 308 bucket[i] = total 309 } 310 } 311 312 func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int { 313 bucketMax_8_64(text, freq, bucket) 314 315 numLMS := 0 316 lastB := int64(-1) 317 bucket = bucket[:256] // eliminate bounds check for bucket[c1] below 318 319 // The next stanza of code (until the blank line) loop backward 320 // over text, stopping to execute a code body at each position i 321 // such that text[i] is an L-character and text[i+1] is an S-character. 322 // That is, i+1 is the position of the start of an LMS-substring. 323 // These could be hoisted out into a function with a callback, 324 // but at a significant speed cost. Instead, we just write these 325 // seven lines a few times in this source file. The copies below 326 // refer back to the pattern established by this original as the 327 // "LMS-substring iterator". 328 // 329 // In every scan through the text, c0, c1 are successive characters of text. 330 // In this backward scan, c0 == text[i] and c1 == text[i+1]. 331 // By scanning backward, we can keep track of whether the current 332 // position is type-S or type-L according to the usual definition: 333 // 334 // - position len(text) is type S with text[len(text)] == -1 (the sentinel) 335 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. 336 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. 337 // 338 // The backward scan lets us maintain the current type, 339 // update it when we see c0 != c1, and otherwise leave it alone. 340 // We want to identify all S positions with a preceding L. 341 // Position len(text) is one such position by definition, but we have 342 // nowhere to write it down, so we eliminate it by untruthfully 343 // setting isTypeS = false at the start of the loop. 344 c0, c1, isTypeS := byte(0), byte(0), false 345 for i := len(text) - 1; i >= 0; i-- { 346 c0, c1 = text[i], c0 347 if c0 < c1 { 348 isTypeS = true 349 } else if c0 > c1 && isTypeS { 350 isTypeS = false 351 352 // Bucket the index i+1 for the start of an LMS-substring. 353 b := bucket[c1] - 1 354 bucket[c1] = b 355 sa[b] = int64(i + 1) 356 lastB = b 357 numLMS++ 358 } 359 } 360 361 // We recorded the LMS-substring starts but really want the ends. 362 // Luckily, with two differences, the start indexes and the end indexes are the same. 363 // The first difference is that the rightmost LMS-substring's end index is len(text), 364 // so the caller must pretend that sa[-1] == len(text), as noted above. 365 // The second difference is that the first leftmost LMS-substring start index 366 // does not end an earlier LMS-substring, so as an optimization we can omit 367 // that leftmost LMS-substring start index (the last one we wrote). 368 // 369 // Exception: if numLMS <= 1, the caller is not going to bother with 370 // the recursion at all and will treat the result as containing LMS-substring starts. 371 // In that case, we don't remove the final entry. 372 if numLMS > 1 { 373 sa[lastB] = 0 374 } 375 return numLMS 376 } 377 378 func placeLMS_32(text []int32, sa, freq, bucket []int32) int { 379 bucketMax_32(text, freq, bucket) 380 381 numLMS := 0 382 lastB := int32(-1) 383 384 // The next stanza of code (until the blank line) loop backward 385 // over text, stopping to execute a code body at each position i 386 // such that text[i] is an L-character and text[i+1] is an S-character. 387 // That is, i+1 is the position of the start of an LMS-substring. 388 // These could be hoisted out into a function with a callback, 389 // but at a significant speed cost. Instead, we just write these 390 // seven lines a few times in this source file. The copies below 391 // refer back to the pattern established by this original as the 392 // "LMS-substring iterator". 393 // 394 // In every scan through the text, c0, c1 are successive characters of text. 395 // In this backward scan, c0 == text[i] and c1 == text[i+1]. 396 // By scanning backward, we can keep track of whether the current 397 // position is type-S or type-L according to the usual definition: 398 // 399 // - position len(text) is type S with text[len(text)] == -1 (the sentinel) 400 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. 401 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. 402 // 403 // The backward scan lets us maintain the current type, 404 // update it when we see c0 != c1, and otherwise leave it alone. 405 // We want to identify all S positions with a preceding L. 406 // Position len(text) is one such position by definition, but we have 407 // nowhere to write it down, so we eliminate it by untruthfully 408 // setting isTypeS = false at the start of the loop. 409 c0, c1, isTypeS := int32(0), int32(0), false 410 for i := len(text) - 1; i >= 0; i-- { 411 c0, c1 = text[i], c0 412 if c0 < c1 { 413 isTypeS = true 414 } else if c0 > c1 && isTypeS { 415 isTypeS = false 416 417 // Bucket the index i+1 for the start of an LMS-substring. 418 b := bucket[c1] - 1 419 bucket[c1] = b 420 sa[b] = int32(i + 1) 421 lastB = b 422 numLMS++ 423 } 424 } 425 426 // We recorded the LMS-substring starts but really want the ends. 427 // Luckily, with two differences, the start indexes and the end indexes are the same. 428 // The first difference is that the rightmost LMS-substring's end index is len(text), 429 // so the caller must pretend that sa[-1] == len(text), as noted above. 430 // The second difference is that the first leftmost LMS-substring start index 431 // does not end an earlier LMS-substring, so as an optimization we can omit 432 // that leftmost LMS-substring start index (the last one we wrote). 433 // 434 // Exception: if numLMS <= 1, the caller is not going to bother with 435 // the recursion at all and will treat the result as containing LMS-substring starts. 436 // In that case, we don't remove the final entry. 437 if numLMS > 1 { 438 sa[lastB] = 0 439 } 440 return numLMS 441 } 442 443 func placeLMS_64(text []int64, sa, freq, bucket []int64) int { 444 bucketMax_64(text, freq, bucket) 445 446 numLMS := 0 447 lastB := int64(-1) 448 449 // The next stanza of code (until the blank line) loop backward 450 // over text, stopping to execute a code body at each position i 451 // such that text[i] is an L-character and text[i+1] is an S-character. 452 // That is, i+1 is the position of the start of an LMS-substring. 453 // These could be hoisted out into a function with a callback, 454 // but at a significant speed cost. Instead, we just write these 455 // seven lines a few times in this source file. The copies below 456 // refer back to the pattern established by this original as the 457 // "LMS-substring iterator". 458 // 459 // In every scan through the text, c0, c1 are successive characters of text. 460 // In this backward scan, c0 == text[i] and c1 == text[i+1]. 461 // By scanning backward, we can keep track of whether the current 462 // position is type-S or type-L according to the usual definition: 463 // 464 // - position len(text) is type S with text[len(text)] == -1 (the sentinel) 465 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. 466 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. 467 // 468 // The backward scan lets us maintain the current type, 469 // update it when we see c0 != c1, and otherwise leave it alone. 470 // We want to identify all S positions with a preceding L. 471 // Position len(text) is one such position by definition, but we have 472 // nowhere to write it down, so we eliminate it by untruthfully 473 // setting isTypeS = false at the start of the loop. 474 c0, c1, isTypeS := int64(0), int64(0), false 475 for i := len(text) - 1; i >= 0; i-- { 476 c0, c1 = text[i], c0 477 if c0 < c1 { 478 isTypeS = true 479 } else if c0 > c1 && isTypeS { 480 isTypeS = false 481 482 // Bucket the index i+1 for the start of an LMS-substring. 483 b := bucket[c1] - 1 484 bucket[c1] = b 485 sa[b] = int64(i + 1) 486 lastB = b 487 numLMS++ 488 } 489 } 490 491 // We recorded the LMS-substring starts but really want the ends. 492 // Luckily, with two differences, the start indexes and the end indexes are the same. 493 // The first difference is that the rightmost LMS-substring's end index is len(text), 494 // so the caller must pretend that sa[-1] == len(text), as noted above. 495 // The second difference is that the first leftmost LMS-substring start index 496 // does not end an earlier LMS-substring, so as an optimization we can omit 497 // that leftmost LMS-substring start index (the last one we wrote). 498 // 499 // Exception: if numLMS <= 1, the caller is not going to bother with 500 // the recursion at all and will treat the result as containing LMS-substring starts. 501 // In that case, we don't remove the final entry. 502 if numLMS > 1 { 503 sa[lastB] = 0 504 } 505 return numLMS 506 } 507 508 func induceSubL_8_64(text []byte, sa, freq, bucket []int64) { 509 // Initialize positions for left side of character buckets. 510 bucketMin_8_64(text, freq, bucket) 511 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 512 513 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly 514 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 515 // Because j-1 is type L, inserting it into sa now will sort it correctly. 516 // But we want to distinguish a j-1 with j-2 of type L from type S. 517 // We can process the former but want to leave the latter for the caller. 518 // We record the difference by negating j-1 if it is preceded by type S. 519 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 520 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have 521 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 522 // and so on, in sorted but not necessarily adjacent order, until it finds 523 // one preceded by an index of type S, at which point it must stop. 524 // 525 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 526 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing 527 // only the indexes of the leftmost L-type indexes for each LMS-substring. 528 // 529 // The suffix array sa therefore serves simultaneously as input, output, 530 // and a miraculously well-tailored work queue. 531 532 // placeLMS_8_64 left out the implicit entry sa[-1] == len(text), 533 // corresponding to the identified type-L index len(text)-1. 534 // Process it before the left-to-right scan of sa proper. 535 // See body in loop for commentary. 536 k := len(text) - 1 537 c0, c1 := text[k-1], text[k] 538 if c0 < c1 { 539 k = -k 540 } 541 542 // Cache recently used bucket index: 543 // we're processing suffixes in sorted order 544 // and accessing buckets indexed by the 545 // byte before the sorted order, which still 546 // has very good locality. 547 // Invariant: b is cached, possibly dirty copy of bucket[cB]. 548 cB := c1 549 b := bucket[cB] 550 sa[b] = int64(k) 551 b++ 552 553 for i := 0; i < len(sa); i++ { 554 j := int(sa[i]) 555 if j == 0 { 556 // Skip empty entry. 557 continue 558 } 559 if j < 0 { 560 // Leave discovered type-S index for caller. 561 sa[i] = int64(-j) 562 continue 563 } 564 sa[i] = 0 565 566 // Index j was on work queue, meaning k := j-1 is L-type, 567 // so we can now place k correctly into sa. 568 // If k-1 is L-type, queue k for processing later in this loop. 569 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 570 k := j - 1 571 c0, c1 := text[k-1], text[k] 572 if c0 < c1 { 573 k = -k 574 } 575 576 if cB != c1 { 577 bucket[cB] = b 578 cB = c1 579 b = bucket[cB] 580 } 581 sa[b] = int64(k) 582 b++ 583 } 584 } 585 586 func induceSubL_32(text []int32, sa, freq, bucket []int32) { 587 // Initialize positions for left side of character buckets. 588 bucketMin_32(text, freq, bucket) 589 590 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly 591 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 592 // Because j-1 is type L, inserting it into sa now will sort it correctly. 593 // But we want to distinguish a j-1 with j-2 of type L from type S. 594 // We can process the former but want to leave the latter for the caller. 595 // We record the difference by negating j-1 if it is preceded by type S. 596 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 597 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have 598 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 599 // and so on, in sorted but not necessarily adjacent order, until it finds 600 // one preceded by an index of type S, at which point it must stop. 601 // 602 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 603 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing 604 // only the indexes of the leftmost L-type indexes for each LMS-substring. 605 // 606 // The suffix array sa therefore serves simultaneously as input, output, 607 // and a miraculously well-tailored work queue. 608 609 // placeLMS_32 left out the implicit entry sa[-1] == len(text), 610 // corresponding to the identified type-L index len(text)-1. 611 // Process it before the left-to-right scan of sa proper. 612 // See body in loop for commentary. 613 k := len(text) - 1 614 c0, c1 := text[k-1], text[k] 615 if c0 < c1 { 616 k = -k 617 } 618 619 // Cache recently used bucket index: 620 // we're processing suffixes in sorted order 621 // and accessing buckets indexed by the 622 // int32 before the sorted order, which still 623 // has very good locality. 624 // Invariant: b is cached, possibly dirty copy of bucket[cB]. 625 cB := c1 626 b := bucket[cB] 627 sa[b] = int32(k) 628 b++ 629 630 for i := 0; i < len(sa); i++ { 631 j := int(sa[i]) 632 if j == 0 { 633 // Skip empty entry. 634 continue 635 } 636 if j < 0 { 637 // Leave discovered type-S index for caller. 638 sa[i] = int32(-j) 639 continue 640 } 641 sa[i] = 0 642 643 // Index j was on work queue, meaning k := j-1 is L-type, 644 // so we can now place k correctly into sa. 645 // If k-1 is L-type, queue k for processing later in this loop. 646 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 647 k := j - 1 648 c0, c1 := text[k-1], text[k] 649 if c0 < c1 { 650 k = -k 651 } 652 653 if cB != c1 { 654 bucket[cB] = b 655 cB = c1 656 b = bucket[cB] 657 } 658 sa[b] = int32(k) 659 b++ 660 } 661 } 662 663 func induceSubL_64(text []int64, sa, freq, bucket []int64) { 664 // Initialize positions for left side of character buckets. 665 bucketMin_64(text, freq, bucket) 666 667 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly 668 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 669 // Because j-1 is type L, inserting it into sa now will sort it correctly. 670 // But we want to distinguish a j-1 with j-2 of type L from type S. 671 // We can process the former but want to leave the latter for the caller. 672 // We record the difference by negating j-1 if it is preceded by type S. 673 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 674 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have 675 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 676 // and so on, in sorted but not necessarily adjacent order, until it finds 677 // one preceded by an index of type S, at which point it must stop. 678 // 679 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 680 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing 681 // only the indexes of the leftmost L-type indexes for each LMS-substring. 682 // 683 // The suffix array sa therefore serves simultaneously as input, output, 684 // and a miraculously well-tailored work queue. 685 686 // placeLMS_64 left out the implicit entry sa[-1] == len(text), 687 // corresponding to the identified type-L index len(text)-1. 688 // Process it before the left-to-right scan of sa proper. 689 // See body in loop for commentary. 690 k := len(text) - 1 691 c0, c1 := text[k-1], text[k] 692 if c0 < c1 { 693 k = -k 694 } 695 696 // Cache recently used bucket index: 697 // we're processing suffixes in sorted order 698 // and accessing buckets indexed by the 699 // int64 before the sorted order, which still 700 // has very good locality. 701 // Invariant: b is cached, possibly dirty copy of bucket[cB]. 702 cB := c1 703 b := bucket[cB] 704 sa[b] = int64(k) 705 b++ 706 707 for i := 0; i < len(sa); i++ { 708 j := int(sa[i]) 709 if j == 0 { 710 // Skip empty entry. 711 continue 712 } 713 if j < 0 { 714 // Leave discovered type-S index for caller. 715 sa[i] = int64(-j) 716 continue 717 } 718 sa[i] = 0 719 720 // Index j was on work queue, meaning k := j-1 is L-type, 721 // so we can now place k correctly into sa. 722 // If k-1 is L-type, queue k for processing later in this loop. 723 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 724 k := j - 1 725 c0, c1 := text[k-1], text[k] 726 if c0 < c1 { 727 k = -k 728 } 729 730 if cB != c1 { 731 bucket[cB] = b 732 cB = c1 733 b = bucket[cB] 734 } 735 sa[b] = int64(k) 736 b++ 737 } 738 } 739 740 func induceSubS_8_64(text []byte, sa, freq, bucket []int64) { 741 // Initialize positions for right side of character buckets. 742 bucketMax_8_64(text, freq, bucket) 743 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 744 745 // Analogous to induceSubL_8_64 above, 746 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly 747 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. 748 // Because j-1 is type S, inserting it into sa now will sort it correctly. 749 // But we want to distinguish a j-1 with j-2 of type S from type L. 750 // We can process the former but want to leave the latter for the caller. 751 // We record the difference by negating j-1 if it is preceded by type L. 752 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 753 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have 754 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 755 // and so on, in sorted but not necessarily adjacent order, until it finds 756 // one preceded by an index of type L, at which point it must stop. 757 // That index (preceded by one of type L) is an LMS-substring start. 758 // 759 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 760 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, 761 // so that the loop finishes with the top of sa containing exactly 762 // the LMS-substring start indexes, sorted by LMS-substring. 763 764 // Cache recently used bucket index: 765 cB := byte(0) 766 b := bucket[cB] 767 768 top := len(sa) 769 for i := len(sa) - 1; i >= 0; i-- { 770 j := int(sa[i]) 771 if j == 0 { 772 // Skip empty entry. 773 continue 774 } 775 sa[i] = 0 776 if j < 0 { 777 // Leave discovered LMS-substring start index for caller. 778 top-- 779 sa[top] = int64(-j) 780 continue 781 } 782 783 // Index j was on work queue, meaning k := j-1 is S-type, 784 // so we can now place k correctly into sa. 785 // If k-1 is S-type, queue k for processing later in this loop. 786 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. 787 k := j - 1 788 c1 := text[k] 789 c0 := text[k-1] 790 if c0 > c1 { 791 k = -k 792 } 793 794 if cB != c1 { 795 bucket[cB] = b 796 cB = c1 797 b = bucket[cB] 798 } 799 b-- 800 sa[b] = int64(k) 801 } 802 } 803 804 func induceSubS_32(text []int32, sa, freq, bucket []int32) { 805 // Initialize positions for right side of character buckets. 806 bucketMax_32(text, freq, bucket) 807 808 // Analogous to induceSubL_32 above, 809 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly 810 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. 811 // Because j-1 is type S, inserting it into sa now will sort it correctly. 812 // But we want to distinguish a j-1 with j-2 of type S from type L. 813 // We can process the former but want to leave the latter for the caller. 814 // We record the difference by negating j-1 if it is preceded by type L. 815 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 816 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have 817 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 818 // and so on, in sorted but not necessarily adjacent order, until it finds 819 // one preceded by an index of type L, at which point it must stop. 820 // That index (preceded by one of type L) is an LMS-substring start. 821 // 822 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 823 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, 824 // so that the loop finishes with the top of sa containing exactly 825 // the LMS-substring start indexes, sorted by LMS-substring. 826 827 // Cache recently used bucket index: 828 cB := int32(0) 829 b := bucket[cB] 830 831 top := len(sa) 832 for i := len(sa) - 1; i >= 0; i-- { 833 j := int(sa[i]) 834 if j == 0 { 835 // Skip empty entry. 836 continue 837 } 838 sa[i] = 0 839 if j < 0 { 840 // Leave discovered LMS-substring start index for caller. 841 top-- 842 sa[top] = int32(-j) 843 continue 844 } 845 846 // Index j was on work queue, meaning k := j-1 is S-type, 847 // so we can now place k correctly into sa. 848 // If k-1 is S-type, queue k for processing later in this loop. 849 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. 850 k := j - 1 851 c1 := text[k] 852 c0 := text[k-1] 853 if c0 > c1 { 854 k = -k 855 } 856 857 if cB != c1 { 858 bucket[cB] = b 859 cB = c1 860 b = bucket[cB] 861 } 862 b-- 863 sa[b] = int32(k) 864 } 865 } 866 867 func induceSubS_64(text []int64, sa, freq, bucket []int64) { 868 // Initialize positions for right side of character buckets. 869 bucketMax_64(text, freq, bucket) 870 871 // Analogous to induceSubL_64 above, 872 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly 873 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. 874 // Because j-1 is type S, inserting it into sa now will sort it correctly. 875 // But we want to distinguish a j-1 with j-2 of type S from type L. 876 // We can process the former but want to leave the latter for the caller. 877 // We record the difference by negating j-1 if it is preceded by type L. 878 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 879 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have 880 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 881 // and so on, in sorted but not necessarily adjacent order, until it finds 882 // one preceded by an index of type L, at which point it must stop. 883 // That index (preceded by one of type L) is an LMS-substring start. 884 // 885 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 886 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, 887 // so that the loop finishes with the top of sa containing exactly 888 // the LMS-substring start indexes, sorted by LMS-substring. 889 890 // Cache recently used bucket index: 891 cB := int64(0) 892 b := bucket[cB] 893 894 top := len(sa) 895 for i := len(sa) - 1; i >= 0; i-- { 896 j := int(sa[i]) 897 if j == 0 { 898 // Skip empty entry. 899 continue 900 } 901 sa[i] = 0 902 if j < 0 { 903 // Leave discovered LMS-substring start index for caller. 904 top-- 905 sa[top] = int64(-j) 906 continue 907 } 908 909 // Index j was on work queue, meaning k := j-1 is S-type, 910 // so we can now place k correctly into sa. 911 // If k-1 is S-type, queue k for processing later in this loop. 912 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. 913 k := j - 1 914 c1 := text[k] 915 c0 := text[k-1] 916 if c0 > c1 { 917 k = -k 918 } 919 920 if cB != c1 { 921 bucket[cB] = b 922 cB = c1 923 b = bucket[cB] 924 } 925 b-- 926 sa[b] = int64(k) 927 } 928 } 929 930 func length_8_64(text []byte, sa []int64, numLMS int) { 931 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) 932 933 // The encoding of N text bytes into a “length” word 934 // adds 1 to each byte, packs them into the bottom 935 // N*8 bits of a word, and then bitwise inverts the result. 936 // That is, the text sequence A B C (hex 41 42 43) 937 // encodes as ^uint64(0x42_43_44). 938 // LMS-substrings can never start or end with 0xFF. 939 // Adding 1 ensures the encoded byte sequence never 940 // starts or ends with 0x00, so that present bytes can be 941 // distinguished from zero-padding in the top bits, 942 // so the length need not be separately encoded. 943 // Inverting the bytes increases the chance that a 944 // 4-byte encoding will still be ≥ len(text). 945 // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F) 946 // then the high bit of the inversion will be set, 947 // making it clearly not a valid length (it would be a negative one). 948 // 949 // cx holds the pre-inverted encoding (the packed incremented bytes). 950 cx := uint64(0) // byte-only 951 952 // This stanza (until the blank line) is the "LMS-substring iterator", 953 // described in placeLMS_8_64 above, with one line added to maintain cx. 954 c0, c1, isTypeS := byte(0), byte(0), false 955 for i := len(text) - 1; i >= 0; i-- { 956 c0, c1 = text[i], c0 957 cx = cx<<8 | uint64(c1+1) // byte-only 958 if c0 < c1 { 959 isTypeS = true 960 } else if c0 > c1 && isTypeS { 961 isTypeS = false 962 963 // Index j = i+1 is the start of an LMS-substring. 964 // Compute length or encoded text to store in sa[j/2]. 965 j := i + 1 966 var code int64 967 if end == 0 { 968 code = 0 969 } else { 970 code = int64(end - j) 971 if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only 972 code = int64(^cx) // byte-only 973 } // byte-only 974 } 975 sa[j>>1] = code 976 end = j + 1 977 cx = uint64(c1 + 1) // byte-only 978 } 979 } 980 } 981 982 func length_32(text []int32, sa []int32, numLMS int) { 983 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) 984 985 // The encoding of N text int32s into a “length” word 986 // adds 1 to each int32, packs them into the bottom 987 // N*8 bits of a word, and then bitwise inverts the result. 988 // That is, the text sequence A B C (hex 41 42 43) 989 // encodes as ^uint32(0x42_43_44). 990 // LMS-substrings can never start or end with 0xFF. 991 // Adding 1 ensures the encoded int32 sequence never 992 // starts or ends with 0x00, so that present int32s can be 993 // distinguished from zero-padding in the top bits, 994 // so the length need not be separately encoded. 995 // Inverting the int32s increases the chance that a 996 // 4-int32 encoding will still be ≥ len(text). 997 // In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F) 998 // then the high bit of the inversion will be set, 999 // making it clearly not a valid length (it would be a negative one). 1000 // 1001 // cx holds the pre-inverted encoding (the packed incremented int32s). 1002 1003 // This stanza (until the blank line) is the "LMS-substring iterator", 1004 // described in placeLMS_32 above, with one line added to maintain cx. 1005 c0, c1, isTypeS := int32(0), int32(0), false 1006 for i := len(text) - 1; i >= 0; i-- { 1007 c0, c1 = text[i], c0 1008 if c0 < c1 { 1009 isTypeS = true 1010 } else if c0 > c1 && isTypeS { 1011 isTypeS = false 1012 1013 // Index j = i+1 is the start of an LMS-substring. 1014 // Compute length or encoded text to store in sa[j/2]. 1015 j := i + 1 1016 var code int32 1017 if end == 0 { 1018 code = 0 1019 } else { 1020 code = int32(end - j) 1021 } 1022 sa[j>>1] = code 1023 end = j + 1 1024 } 1025 } 1026 } 1027 1028 func length_64(text []int64, sa []int64, numLMS int) { 1029 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) 1030 1031 // The encoding of N text int64s into a “length” word 1032 // adds 1 to each int64, packs them into the bottom 1033 // N*8 bits of a word, and then bitwise inverts the result. 1034 // That is, the text sequence A B C (hex 41 42 43) 1035 // encodes as ^uint64(0x42_43_44). 1036 // LMS-substrings can never start or end with 0xFF. 1037 // Adding 1 ensures the encoded int64 sequence never 1038 // starts or ends with 0x00, so that present int64s can be 1039 // distinguished from zero-padding in the top bits, 1040 // so the length need not be separately encoded. 1041 // Inverting the int64s increases the chance that a 1042 // 4-int64 encoding will still be ≥ len(text). 1043 // In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F) 1044 // then the high bit of the inversion will be set, 1045 // making it clearly not a valid length (it would be a negative one). 1046 // 1047 // cx holds the pre-inverted encoding (the packed incremented int64s). 1048 1049 // This stanza (until the blank line) is the "LMS-substring iterator", 1050 // described in placeLMS_64 above, with one line added to maintain cx. 1051 c0, c1, isTypeS := int64(0), int64(0), false 1052 for i := len(text) - 1; i >= 0; i-- { 1053 c0, c1 = text[i], c0 1054 if c0 < c1 { 1055 isTypeS = true 1056 } else if c0 > c1 && isTypeS { 1057 isTypeS = false 1058 1059 // Index j = i+1 is the start of an LMS-substring. 1060 // Compute length or encoded text to store in sa[j/2]. 1061 j := i + 1 1062 var code int64 1063 if end == 0 { 1064 code = 0 1065 } else { 1066 code = int64(end - j) 1067 } 1068 sa[j>>1] = code 1069 end = j + 1 1070 } 1071 } 1072 } 1073 1074 func assignID_8_64(text []byte, sa []int64, numLMS int) int { 1075 id := 0 1076 lastLen := int64(-1) // impossible 1077 lastPos := int64(0) 1078 for _, j := range sa[len(sa)-numLMS:] { 1079 // Is the LMS-substring at index j new, or is it the same as the last one we saw? 1080 n := sa[j/2] 1081 if n != lastLen { 1082 goto New 1083 } 1084 if uint64(n) >= uint64(len(text)) { 1085 // “Length” is really encoded full text, and they match. 1086 goto Same 1087 } 1088 { 1089 // Compare actual texts. 1090 n := int(n) 1091 this := text[j:][:n] 1092 last := text[lastPos:][:n] 1093 for i := 0; i < n; i++ { 1094 if this[i] != last[i] { 1095 goto New 1096 } 1097 } 1098 goto Same 1099 } 1100 New: 1101 id++ 1102 lastPos = j 1103 lastLen = n 1104 Same: 1105 sa[j/2] = int64(id) 1106 } 1107 return id 1108 } 1109 1110 func assignID_32(text []int32, sa []int32, numLMS int) int { 1111 id := 0 1112 lastLen := int32(-1) // impossible 1113 lastPos := int32(0) 1114 for _, j := range sa[len(sa)-numLMS:] { 1115 // Is the LMS-substring at index j new, or is it the same as the last one we saw? 1116 n := sa[j/2] 1117 if n != lastLen { 1118 goto New 1119 } 1120 if uint32(n) >= uint32(len(text)) { 1121 // “Length” is really encoded full text, and they match. 1122 goto Same 1123 } 1124 { 1125 // Compare actual texts. 1126 n := int(n) 1127 this := text[j:][:n] 1128 last := text[lastPos:][:n] 1129 for i := 0; i < n; i++ { 1130 if this[i] != last[i] { 1131 goto New 1132 } 1133 } 1134 goto Same 1135 } 1136 New: 1137 id++ 1138 lastPos = j 1139 lastLen = n 1140 Same: 1141 sa[j/2] = int32(id) 1142 } 1143 return id 1144 } 1145 1146 func assignID_64(text []int64, sa []int64, numLMS int) int { 1147 id := 0 1148 lastLen := int64(-1) // impossible 1149 lastPos := int64(0) 1150 for _, j := range sa[len(sa)-numLMS:] { 1151 // Is the LMS-substring at index j new, or is it the same as the last one we saw? 1152 n := sa[j/2] 1153 if n != lastLen { 1154 goto New 1155 } 1156 if uint64(n) >= uint64(len(text)) { 1157 // “Length” is really encoded full text, and they match. 1158 goto Same 1159 } 1160 { 1161 // Compare actual texts. 1162 n := int(n) 1163 this := text[j:][:n] 1164 last := text[lastPos:][:n] 1165 for i := 0; i < n; i++ { 1166 if this[i] != last[i] { 1167 goto New 1168 } 1169 } 1170 goto Same 1171 } 1172 New: 1173 id++ 1174 lastPos = j 1175 lastLen = n 1176 Same: 1177 sa[j/2] = int64(id) 1178 } 1179 return id 1180 } 1181 1182 func map_64(sa []int64, numLMS int) { 1183 w := len(sa) 1184 for i := len(sa) / 2; i >= 0; i-- { 1185 j := sa[i] 1186 if j > 0 { 1187 w-- 1188 sa[w] = j - 1 1189 } 1190 } 1191 } 1192 1193 func recurse_64(sa, oldTmp []int64, numLMS, maxID int) { 1194 dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:] 1195 1196 // Set up temporary space for recursive call. 1197 // We must pass sais_64 a tmp buffer wiith at least maxID entries. 1198 // 1199 // The subproblem is guaranteed to have length at most len(sa)/2, 1200 // so that sa can hold both the subproblem and its suffix array. 1201 // Nearly all the time, however, the subproblem has length < len(sa)/3, 1202 // in which case there is a subproblem-sized middle of sa that 1203 // we can reuse for temporary space (saTmp). 1204 // When recurse_64 is called from sais_8_64, oldTmp is length 512 1205 // (from text_64), and saTmp will typically be much larger, so we'll use saTmp. 1206 // When deeper recursions come back to recurse_64, now oldTmp is 1207 // the saTmp from the top-most recursion, it is typically larger than 1208 // the current saTmp (because the current sa gets smaller and smaller 1209 // as the recursion gets deeper), and we keep reusing that top-most 1210 // large saTmp instead of the offered smaller ones. 1211 // 1212 // Why is the subproblem length so often just under len(sa)/3? 1213 // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation. 1214 // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern 1215 // in the input, perfect alternation of larger and smaller input bytes. 1216 // Real text doesn't do that. If each L-type index is randomly followed 1217 // by either an L-type or S-type index, then half the substrings will 1218 // be of the form SLS, but the other half will be longer. Of that half, 1219 // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on. 1220 // Not counting the final S in each (which overlaps the first S in the next), 1221 // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3. 1222 // The space we need is further reduced by the fact that many of the 1223 // short patterns like SLS will often be the same character sequences 1224 // repeated throughout the text, reducing maxID relative to numLMS. 1225 // 1226 // For short inputs, the averages may not run in our favor, but then we 1227 // can often fall back to using the length-512 tmp available in the 1228 // top-most call. (Also a short allocation would not be a big deal.) 1229 // 1230 // For pathological inputs, we fall back to allocating a new tmp of length 1231 // max(maxID, numLMS/2). This level of the recursion needs maxID, 1232 // and all deeper levels of the recursion will need no more than numLMS/2, 1233 // so this one allocation is guaranteed to suffice for the entire stack 1234 // of recursive calls. 1235 tmp := oldTmp 1236 if len(tmp) < len(saTmp) { 1237 tmp = saTmp 1238 } 1239 if len(tmp) < numLMS { 1240 // TestSAIS/forcealloc reaches this code. 1241 n := maxID 1242 if n < numLMS/2 { 1243 n = numLMS / 2 1244 } 1245 tmp = make([]int64, n) 1246 } 1247 1248 // sais_64 requires that the caller arrange to clear dst, 1249 // because in general the caller may know dst is 1250 // freshly-allocated and already cleared. But this one is not. 1251 for i := range dst { 1252 dst[i] = 0 1253 } 1254 sais_64(text, maxID, dst, tmp) 1255 } 1256 1257 func unmap_8_64(text []byte, sa []int64, numLMS int) { 1258 unmap := sa[len(sa)-numLMS:] 1259 j := len(unmap) 1260 1261 // "LMS-substring iterator" (see placeLMS_8_64 above). 1262 c0, c1, isTypeS := byte(0), byte(0), false 1263 for i := len(text) - 1; i >= 0; i-- { 1264 c0, c1 = text[i], c0 1265 if c0 < c1 { 1266 isTypeS = true 1267 } else if c0 > c1 && isTypeS { 1268 isTypeS = false 1269 1270 // Populate inverse map. 1271 j-- 1272 unmap[j] = int64(i + 1) 1273 } 1274 } 1275 1276 // Apply inverse map to subproblem suffix array. 1277 sa = sa[:numLMS] 1278 for i := 0; i < len(sa); i++ { 1279 sa[i] = unmap[sa[i]] 1280 } 1281 } 1282 1283 func unmap_32(text []int32, sa []int32, numLMS int) { 1284 unmap := sa[len(sa)-numLMS:] 1285 j := len(unmap) 1286 1287 // "LMS-substring iterator" (see placeLMS_32 above). 1288 c0, c1, isTypeS := int32(0), int32(0), false 1289 for i := len(text) - 1; i >= 0; i-- { 1290 c0, c1 = text[i], c0 1291 if c0 < c1 { 1292 isTypeS = true 1293 } else if c0 > c1 && isTypeS { 1294 isTypeS = false 1295 1296 // Populate inverse map. 1297 j-- 1298 unmap[j] = int32(i + 1) 1299 } 1300 } 1301 1302 // Apply inverse map to subproblem suffix array. 1303 sa = sa[:numLMS] 1304 for i := 0; i < len(sa); i++ { 1305 sa[i] = unmap[sa[i]] 1306 } 1307 } 1308 1309 func unmap_64(text []int64, sa []int64, numLMS int) { 1310 unmap := sa[len(sa)-numLMS:] 1311 j := len(unmap) 1312 1313 // "LMS-substring iterator" (see placeLMS_64 above). 1314 c0, c1, isTypeS := int64(0), int64(0), false 1315 for i := len(text) - 1; i >= 0; i-- { 1316 c0, c1 = text[i], c0 1317 if c0 < c1 { 1318 isTypeS = true 1319 } else if c0 > c1 && isTypeS { 1320 isTypeS = false 1321 1322 // Populate inverse map. 1323 j-- 1324 unmap[j] = int64(i + 1) 1325 } 1326 } 1327 1328 // Apply inverse map to subproblem suffix array. 1329 sa = sa[:numLMS] 1330 for i := 0; i < len(sa); i++ { 1331 sa[i] = unmap[sa[i]] 1332 } 1333 } 1334 1335 func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) { 1336 bucketMax_8_64(text, freq, bucket) 1337 bucket = bucket[:256] // eliminate bound check for bucket[c] below 1338 1339 // Loop backward through sa, always tracking 1340 // the next index to populate from sa[:numLMS]. 1341 // When we get to one, populate it. 1342 // Zero the rest of the slots; they have dead values in them. 1343 x := numLMS - 1 1344 saX := sa[x] 1345 c := text[saX] 1346 b := bucket[c] - 1 1347 bucket[c] = b 1348 1349 for i := len(sa) - 1; i >= 0; i-- { 1350 if i != int(b) { 1351 sa[i] = 0 1352 continue 1353 } 1354 sa[i] = saX 1355 1356 // Load next entry to put down (if any). 1357 if x > 0 { 1358 x-- 1359 saX = sa[x] // TODO bounds check 1360 c = text[saX] 1361 b = bucket[c] - 1 1362 bucket[c] = b 1363 } 1364 } 1365 } 1366 1367 func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) { 1368 bucketMax_32(text, freq, bucket) 1369 1370 // Loop backward through sa, always tracking 1371 // the next index to populate from sa[:numLMS]. 1372 // When we get to one, populate it. 1373 // Zero the rest of the slots; they have dead values in them. 1374 x := numLMS - 1 1375 saX := sa[x] 1376 c := text[saX] 1377 b := bucket[c] - 1 1378 bucket[c] = b 1379 1380 for i := len(sa) - 1; i >= 0; i-- { 1381 if i != int(b) { 1382 sa[i] = 0 1383 continue 1384 } 1385 sa[i] = saX 1386 1387 // Load next entry to put down (if any). 1388 if x > 0 { 1389 x-- 1390 saX = sa[x] // TODO bounds check 1391 c = text[saX] 1392 b = bucket[c] - 1 1393 bucket[c] = b 1394 } 1395 } 1396 } 1397 1398 func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) { 1399 bucketMax_64(text, freq, bucket) 1400 1401 // Loop backward through sa, always tracking 1402 // the next index to populate from sa[:numLMS]. 1403 // When we get to one, populate it. 1404 // Zero the rest of the slots; they have dead values in them. 1405 x := numLMS - 1 1406 saX := sa[x] 1407 c := text[saX] 1408 b := bucket[c] - 1 1409 bucket[c] = b 1410 1411 for i := len(sa) - 1; i >= 0; i-- { 1412 if i != int(b) { 1413 sa[i] = 0 1414 continue 1415 } 1416 sa[i] = saX 1417 1418 // Load next entry to put down (if any). 1419 if x > 0 { 1420 x-- 1421 saX = sa[x] // TODO bounds check 1422 c = text[saX] 1423 b = bucket[c] - 1 1424 bucket[c] = b 1425 } 1426 } 1427 } 1428 1429 func induceL_8_64(text []byte, sa, freq, bucket []int64) { 1430 // Initialize positions for left side of character buckets. 1431 bucketMin_8_64(text, freq, bucket) 1432 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 1433 1434 // This scan is similar to the one in induceSubL_8_64 above. 1435 // That one arranges to clear all but the leftmost L-type indexes. 1436 // This scan leaves all the L-type indexes and the original S-type 1437 // indexes, but it negates the positive leftmost L-type indexes 1438 // (the ones that induceS_8_64 needs to process). 1439 1440 // expand_8_64 left out the implicit entry sa[-1] == len(text), 1441 // corresponding to the identified type-L index len(text)-1. 1442 // Process it before the left-to-right scan of sa proper. 1443 // See body in loop for commentary. 1444 k := len(text) - 1 1445 c0, c1 := text[k-1], text[k] 1446 if c0 < c1 { 1447 k = -k 1448 } 1449 1450 // Cache recently used bucket index. 1451 cB := c1 1452 b := bucket[cB] 1453 sa[b] = int64(k) 1454 b++ 1455 1456 for i := 0; i < len(sa); i++ { 1457 j := int(sa[i]) 1458 if j <= 0 { 1459 // Skip empty or negated entry (including negated zero). 1460 continue 1461 } 1462 1463 // Index j was on work queue, meaning k := j-1 is L-type, 1464 // so we can now place k correctly into sa. 1465 // If k-1 is L-type, queue k for processing later in this loop. 1466 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 1467 // If k is zero, k-1 doesn't exist, so we only need to leave it 1468 // for the caller. The caller can't tell the difference between 1469 // an empty slot and a non-empty zero, but there's no need 1470 // to distinguish them anyway: the final suffix array will end up 1471 // with one zero somewhere, and that will be a real zero. 1472 k := j - 1 1473 c1 := text[k] 1474 if k > 0 { 1475 if c0 := text[k-1]; c0 < c1 { 1476 k = -k 1477 } 1478 } 1479 1480 if cB != c1 { 1481 bucket[cB] = b 1482 cB = c1 1483 b = bucket[cB] 1484 } 1485 sa[b] = int64(k) 1486 b++ 1487 } 1488 } 1489 1490 func induceL_32(text []int32, sa, freq, bucket []int32) { 1491 // Initialize positions for left side of character buckets. 1492 bucketMin_32(text, freq, bucket) 1493 1494 // This scan is similar to the one in induceSubL_32 above. 1495 // That one arranges to clear all but the leftmost L-type indexes. 1496 // This scan leaves all the L-type indexes and the original S-type 1497 // indexes, but it negates the positive leftmost L-type indexes 1498 // (the ones that induceS_32 needs to process). 1499 1500 // expand_32 left out the implicit entry sa[-1] == len(text), 1501 // corresponding to the identified type-L index len(text)-1. 1502 // Process it before the left-to-right scan of sa proper. 1503 // See body in loop for commentary. 1504 k := len(text) - 1 1505 c0, c1 := text[k-1], text[k] 1506 if c0 < c1 { 1507 k = -k 1508 } 1509 1510 // Cache recently used bucket index. 1511 cB := c1 1512 b := bucket[cB] 1513 sa[b] = int32(k) 1514 b++ 1515 1516 for i := 0; i < len(sa); i++ { 1517 j := int(sa[i]) 1518 if j <= 0 { 1519 // Skip empty or negated entry (including negated zero). 1520 continue 1521 } 1522 1523 // Index j was on work queue, meaning k := j-1 is L-type, 1524 // so we can now place k correctly into sa. 1525 // If k-1 is L-type, queue k for processing later in this loop. 1526 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 1527 // If k is zero, k-1 doesn't exist, so we only need to leave it 1528 // for the caller. The caller can't tell the difference between 1529 // an empty slot and a non-empty zero, but there's no need 1530 // to distinguish them anyway: the final suffix array will end up 1531 // with one zero somewhere, and that will be a real zero. 1532 k := j - 1 1533 c1 := text[k] 1534 if k > 0 { 1535 if c0 := text[k-1]; c0 < c1 { 1536 k = -k 1537 } 1538 } 1539 1540 if cB != c1 { 1541 bucket[cB] = b 1542 cB = c1 1543 b = bucket[cB] 1544 } 1545 sa[b] = int32(k) 1546 b++ 1547 } 1548 } 1549 1550 func induceL_64(text []int64, sa, freq, bucket []int64) { 1551 // Initialize positions for left side of character buckets. 1552 bucketMin_64(text, freq, bucket) 1553 1554 // This scan is similar to the one in induceSubL_64 above. 1555 // That one arranges to clear all but the leftmost L-type indexes. 1556 // This scan leaves all the L-type indexes and the original S-type 1557 // indexes, but it negates the positive leftmost L-type indexes 1558 // (the ones that induceS_64 needs to process). 1559 1560 // expand_64 left out the implicit entry sa[-1] == len(text), 1561 // corresponding to the identified type-L index len(text)-1. 1562 // Process it before the left-to-right scan of sa proper. 1563 // See body in loop for commentary. 1564 k := len(text) - 1 1565 c0, c1 := text[k-1], text[k] 1566 if c0 < c1 { 1567 k = -k 1568 } 1569 1570 // Cache recently used bucket index. 1571 cB := c1 1572 b := bucket[cB] 1573 sa[b] = int64(k) 1574 b++ 1575 1576 for i := 0; i < len(sa); i++ { 1577 j := int(sa[i]) 1578 if j <= 0 { 1579 // Skip empty or negated entry (including negated zero). 1580 continue 1581 } 1582 1583 // Index j was on work queue, meaning k := j-1 is L-type, 1584 // so we can now place k correctly into sa. 1585 // If k-1 is L-type, queue k for processing later in this loop. 1586 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 1587 // If k is zero, k-1 doesn't exist, so we only need to leave it 1588 // for the caller. The caller can't tell the difference between 1589 // an empty slot and a non-empty zero, but there's no need 1590 // to distinguish them anyway: the final suffix array will end up 1591 // with one zero somewhere, and that will be a real zero. 1592 k := j - 1 1593 c1 := text[k] 1594 if k > 0 { 1595 if c0 := text[k-1]; c0 < c1 { 1596 k = -k 1597 } 1598 } 1599 1600 if cB != c1 { 1601 bucket[cB] = b 1602 cB = c1 1603 b = bucket[cB] 1604 } 1605 sa[b] = int64(k) 1606 b++ 1607 } 1608 } 1609 1610 func induceS_8_64(text []byte, sa, freq, bucket []int64) { 1611 // Initialize positions for right side of character buckets. 1612 bucketMax_8_64(text, freq, bucket) 1613 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 1614 1615 cB := byte(0) 1616 b := bucket[cB] 1617 1618 for i := len(sa) - 1; i >= 0; i-- { 1619 j := int(sa[i]) 1620 if j >= 0 { 1621 // Skip non-flagged entry. 1622 // (This loop can't see an empty entry; 0 means the real zero index.) 1623 continue 1624 } 1625 1626 // Negative j is a work queue entry; rewrite to positive j for final suffix array. 1627 j = -j 1628 sa[i] = int64(j) 1629 1630 // Index j was on work queue (encoded as -j but now decoded), 1631 // meaning k := j-1 is L-type, 1632 // so we can now place k correctly into sa. 1633 // If k-1 is S-type, queue -k for processing later in this loop. 1634 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. 1635 // If k is zero, k-1 doesn't exist, so we only need to leave it 1636 // for the caller. 1637 k := j - 1 1638 c1 := text[k] 1639 if k > 0 { 1640 if c0 := text[k-1]; c0 <= c1 { 1641 k = -k 1642 } 1643 } 1644 1645 if cB != c1 { 1646 bucket[cB] = b 1647 cB = c1 1648 b = bucket[cB] 1649 } 1650 b-- 1651 sa[b] = int64(k) 1652 } 1653 } 1654 1655 func induceS_32(text []int32, sa, freq, bucket []int32) { 1656 // Initialize positions for right side of character buckets. 1657 bucketMax_32(text, freq, bucket) 1658 1659 cB := int32(0) 1660 b := bucket[cB] 1661 1662 for i := len(sa) - 1; i >= 0; i-- { 1663 j := int(sa[i]) 1664 if j >= 0 { 1665 // Skip non-flagged entry. 1666 // (This loop can't see an empty entry; 0 means the real zero index.) 1667 continue 1668 } 1669 1670 // Negative j is a work queue entry; rewrite to positive j for final suffix array. 1671 j = -j 1672 sa[i] = int32(j) 1673 1674 // Index j was on work queue (encoded as -j but now decoded), 1675 // meaning k := j-1 is L-type, 1676 // so we can now place k correctly into sa. 1677 // If k-1 is S-type, queue -k for processing later in this loop. 1678 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. 1679 // If k is zero, k-1 doesn't exist, so we only need to leave it 1680 // for the caller. 1681 k := j - 1 1682 c1 := text[k] 1683 if k > 0 { 1684 if c0 := text[k-1]; c0 <= c1 { 1685 k = -k 1686 } 1687 } 1688 1689 if cB != c1 { 1690 bucket[cB] = b 1691 cB = c1 1692 b = bucket[cB] 1693 } 1694 b-- 1695 sa[b] = int32(k) 1696 } 1697 } 1698 1699 func induceS_64(text []int64, sa, freq, bucket []int64) { 1700 // Initialize positions for right side of character buckets. 1701 bucketMax_64(text, freq, bucket) 1702 1703 cB := int64(0) 1704 b := bucket[cB] 1705 1706 for i := len(sa) - 1; i >= 0; i-- { 1707 j := int(sa[i]) 1708 if j >= 0 { 1709 // Skip non-flagged entry. 1710 // (This loop can't see an empty entry; 0 means the real zero index.) 1711 continue 1712 } 1713 1714 // Negative j is a work queue entry; rewrite to positive j for final suffix array. 1715 j = -j 1716 sa[i] = int64(j) 1717 1718 // Index j was on work queue (encoded as -j but now decoded), 1719 // meaning k := j-1 is L-type, 1720 // so we can now place k correctly into sa. 1721 // If k-1 is S-type, queue -k for processing later in this loop. 1722 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. 1723 // If k is zero, k-1 doesn't exist, so we only need to leave it 1724 // for the caller. 1725 k := j - 1 1726 c1 := text[k] 1727 if k > 0 { 1728 if c0 := text[k-1]; c0 <= c1 { 1729 k = -k 1730 } 1731 } 1732 1733 if cB != c1 { 1734 bucket[cB] = b 1735 cB = c1 1736 b = bucket[cB] 1737 } 1738 b-- 1739 sa[b] = int64(k) 1740 } 1741 } 1742