sais.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Suffix array construction by induced sorting (SAIS).
     6  // See Ge Nong, Sen Zhang, and Wai Hong Chen,
     7  // "Two Efficient Algorithms for Linear Time Suffix Array Construction",
     8  // especially section 3 (https://ieeexplore.ieee.org/document/5582081).
     9  // See also http://zork.net/~st/jottings/sais.html.
    10  //
    11  // With optimizations inspired by Yuta Mori's sais-lite
    12  // (https://sites.google.com/site/yuta256/sais).
    13  //
    14  // And with other new optimizations.
    15  
    16  // Many of these functions are parameterized by the sizes of
    17  // the types they operate on. The generator gen.go makes
    18  // copies of these functions for use with other sizes.
    19  // Specifically:
    20  //
    21  // - A function with a name ending in _8_32 takes []byte and []int32 arguments
    22  //   and is duplicated into _32_32, _8_64, and _64_64 forms.
//   The _32_32 and _64_64 suffixes are shortened to plain _32 and _64.
    24  //   Any lines in the function body that contain the text "byte-only" or "256"
    25  //   are stripped when creating _32_32 and _64_64 forms.
    26  //   (Those lines are typically 8-bit-specific optimizations.)
    27  //
    28  // - A function with a name ending only in _32 operates on []int32
    29  //   and is duplicated into a _64 form. (Note that it may still take a []byte,
    30  //   but there is no need for a version of the function in which the []byte
    31  //   is widened to a full integer array.)
    32  
    33  // The overall runtime of this code is linear in the input size:
    34  // it runs a sequence of linear passes to reduce the problem to
    35  // a subproblem at most half as big, invokes itself recursively,
    36  // and then runs a sequence of linear passes to turn the answer
    37  // for the subproblem into the answer for the original problem.
    38  // This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
    39  //
    40  // The outline of the code, with the forward and backward scans
    41  // through O(N)-sized arrays called out, is:
    42  //
    43  // sais_I_N
    44  //	placeLMS_I_B
    45  //		bucketMax_I_B
    46  //			freq_I_B
    47  //				<scan +text> (1)
    48  //			<scan +freq> (2)
    49  //		<scan -text, random bucket> (3)
    50  //	induceSubL_I_B
    51  //		bucketMin_I_B
    52  //			freq_I_B
    53  //				<scan +text, often optimized away> (4)
    54  //			<scan +freq> (5)
    55  //		<scan +sa, random text, random bucket> (6)
    56  //	induceSubS_I_B
    57  //		bucketMax_I_B
    58  //			freq_I_B
    59  //				<scan +text, often optimized away> (7)
    60  //			<scan +freq> (8)
    61  //		<scan -sa, random text, random bucket> (9)
    62  //	assignID_I_B
    63  //		<scan +sa, random text substrings> (10)
    64  //	map_B
    65  //		<scan -sa> (11)
    66  //	recurse_B
    67  //		(recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
    68  //	unmap_I_B
    69  //		<scan -text> (12)
    70  //		<scan +sa> (13)
    71  //	expand_I_B
    72  //		bucketMax_I_B
    73  //			freq_I_B
    74  //				<scan +text, often optimized away> (14)
    75  //			<scan +freq> (15)
    76  //		<scan -sa, random text, random bucket> (16)
    77  //	induceL_I_B
    78  //		bucketMin_I_B
    79  //			freq_I_B
    80  //				<scan +text, often optimized away> (17)
    81  //			<scan +freq> (18)
    82  //		<scan +sa, random text, random bucket> (19)
    83  //	induceS_I_B
    84  //		bucketMax_I_B
    85  //			freq_I_B
    86  //				<scan +text, often optimized away> (20)
    87  //			<scan +freq> (21)
    88  //		<scan -sa, random text, random bucket> (22)
    89  //
    90  // Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
    91  //
    92  // The outline shows there are in general 22 scans through
    93  // O(N)-sized arrays for a given level of the recursion.
    94  // In the top level, operating on 8-bit input text,
    95  // the six freq scans are fixed size (256) instead of potentially
    96  // input-sized. Also, the frequency is counted once and cached
    97  // whenever there is room to do so (there is nearly always room in general,
    98  // and always room at the top level), which eliminates all but
    99  // the first freq_I_B text scans (that is, 5 of the 6).
   100  // So the top level of the recursion only does 22 - 6 - 5 = 11
   101  // input-sized scans and a typical level does 16 scans.
   102  //
   103  // The linear scans do not cost anywhere near as much as
   104  // the random accesses to the text made during a few of
   105  // the scans (specifically #6, #9, #16, #19, #22 marked above).
// In real texts, there is some (though not much) locality in
// the accesses, due to the repetitive structure of the text
   108  // (the same reason Burrows-Wheeler compression is so effective).
   109  // For random inputs, there is no locality, which makes those
   110  // accesses even more expensive, especially once the text
   111  // no longer fits in cache.
   112  // For example, running on 50 MB of Go source code, induceSubL_8_32
   113  // (which runs only once, at the top level of the recursion)
   114  // takes 0.44s, while on 50 MB of random input, it takes 2.55s.
   115  // Nearly all the relative slowdown is explained by the text access:
   116  //
   117  //		c0, c1 := text[k-1], text[k]
   118  //
   119  // That line runs for 0.23s on the Go text and 2.02s on random text.
   120  
   121  //go:generate go run gen.go
   122  
   123  package suffixarray
   124  
   125  // text_32 returns the suffix array for the input text.
   126  // It requires that len(text) fit in an int32
   127  // and that the caller zero sa.
   128  func text_32(text []byte, sa []int32) {
   129  	if int(int32(len(text))) != len(text) || len(text) != len(sa) {
   130  		panic("suffixarray: misuse of text_32")
   131  	}
   132  	sais_8_32(text, 256, sa, make([]int32, 2*256))
   133  }
   134  
   135  // sais_8_32 computes the suffix array of text.
   136  // The text must contain only values in [0, textMax).
   137  // The suffix array is stored in sa, which the caller
   138  // must ensure is already zeroed.
   139  // The caller must also provide temporary space tmp
   140  // with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
   141  // then the algorithm runs a little faster.
   142  // If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
   143  func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
   144  	if len(sa) != len(text) || len(tmp) < int(textMax) {
   145  		panic("suffixarray: misuse of sais_8_32")
   146  	}
   147  
   148  	// Trivial base cases. Sorting 0 or 1 things is easy.
   149  	if len(text) == 0 {
   150  		return
   151  	}
   152  	if len(text) == 1 {
   153  		sa[0] = 0
   154  		return
   155  	}
   156  
   157  	// Establish slices indexed by text character
   158  	// holding character frequency and bucket-sort offsets.
   159  	// If there's only enough tmp for one slice,
   160  	// we make it the bucket offsets and recompute
   161  	// the character frequency each time we need it.
   162  	var freq, bucket []int32
   163  	if len(tmp) >= 2*textMax {
   164  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   165  		freq[0] = -1 // mark as uninitialized
   166  	} else {
   167  		freq, bucket = nil, tmp[:textMax]
   168  	}
   169  
   170  	// The SAIS algorithm.
   171  	// Each of these calls makes one scan through sa.
   172  	// See the individual functions for documentation
   173  	// about each's role in the algorithm.
   174  	numLMS := placeLMS_8_32(text, sa, freq, bucket)
   175  	if numLMS <= 1 {
   176  		// 0 or 1 items are already sorted. Do nothing.
   177  	} else {
   178  		induceSubL_8_32(text, sa, freq, bucket)
   179  		induceSubS_8_32(text, sa, freq, bucket)
   180  		length_8_32(text, sa, numLMS)
   181  		maxID := assignID_8_32(text, sa, numLMS)
   182  		if maxID < numLMS {
   183  			map_32(sa, numLMS)
   184  			recurse_32(sa, tmp, numLMS, maxID)
   185  			unmap_8_32(text, sa, numLMS)
   186  		} else {
   187  			// If maxID == numLMS, then each LMS-substring
   188  			// is unique, so the relative ordering of two LMS-suffixes
   189  			// is determined by just the leading LMS-substring.
   190  			// That is, the LMS-suffix sort order matches the
   191  			// (simpler) LMS-substring sort order.
   192  			// Copy the original LMS-substring order into the
   193  			// suffix array destination.
   194  			copy(sa, sa[len(sa)-numLMS:])
   195  		}
   196  		expand_8_32(text, freq, bucket, sa, numLMS)
   197  	}
   198  	induceL_8_32(text, sa, freq, bucket)
   199  	induceS_8_32(text, sa, freq, bucket)
   200  
   201  	// Mark for caller that we overwrote tmp.
   202  	tmp[0] = -1
   203  }
   204  
   205  // freq_8_32 returns the character frequencies
   206  // for text, as a slice indexed by character value.
   207  // If freq is nil, freq_8_32 uses and returns bucket.
   208  // If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
   209  // means the frequencies are already computed.
   210  // If the frequency data is overwritten or uninitialized,
   211  // the caller must set freq[0] = -1 to force recomputation
   212  // the next time it is needed.
   213  func freq_8_32(text []byte, freq, bucket []int32) []int32 {
   214  	if freq != nil && freq[0] >= 0 {
   215  		return freq // already computed
   216  	}
   217  	if freq == nil {
   218  		freq = bucket
   219  	}
   220  
   221  	freq = freq[:256] // eliminate bounds check for freq[c] below
   222  	for i := range freq {
   223  		freq[i] = 0
   224  	}
   225  	for _, c := range text {
   226  		freq[c]++
   227  	}
   228  	return freq
   229  }
   230  
   231  // bucketMin_8_32 stores into bucket[c] the minimum index
   232  // in the bucket for character c in a bucket-sort of text.
   233  func bucketMin_8_32(text []byte, freq, bucket []int32) {
   234  	freq = freq_8_32(text, freq, bucket)
   235  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   236  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   237  	total := int32(0)
   238  	for i, n := range freq {
   239  		bucket[i] = total
   240  		total += n
   241  	}
   242  }
   243  
   244  // bucketMax_8_32 stores into bucket[c] the maximum index
   245  // in the bucket for character c in a bucket-sort of text.
   246  // The bucket indexes for c are [min, max).
   247  // That is, max is one past the final index in that bucket.
   248  func bucketMax_8_32(text []byte, freq, bucket []int32) {
   249  	freq = freq_8_32(text, freq, bucket)
   250  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   251  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   252  	total := int32(0)
   253  	for i, n := range freq {
   254  		total += n
   255  		bucket[i] = total
   256  	}
   257  }
   258  
   259  // The SAIS algorithm proceeds in a sequence of scans through sa.
   260  // Each of the following functions implements one scan,
   261  // and the functions appear here in the order they execute in the algorithm.
   262  
// placeLMS_8_32 places into sa the indexes of the
// final characters of the LMS substrings of text,
// sorted into the rightmost ends of their correct buckets
// in the suffix array.
// It returns the number of LMS-substring starts found in text.
//
// The imaginary sentinel character at the end of the text
// is the final character of the final LMS substring, but there
// is no bucket for the imaginary sentinel character,
// which has a smaller value than any real character.
// The caller must therefore pretend that sa[-1] == len(text).
//
// The text indexes of LMS-substring characters are always ≥ 1
// (the first LMS-substring must be preceded by one or more L-type
// characters that are not part of any LMS-substring),
// so using 0 as a “not present” suffix array entry is safe,
// both in this function and in most later functions
// (until induceL_8_32 below).
func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
	// Start each bucket cursor one past the end of its bucket;
	// entries are then filled in right to left.
	bucketMax_8_32(text, freq, bucket)

	numLMS := 0
	lastB := int32(-1) // sa index of the most recently written entry; -1 if none yet
	bucket = bucket[:256] // eliminate bounds check for bucket[c1] below

	// The next stanza of code (until the blank line) loops backward
	// over text, stopping to execute a code body at each position i
	// such that text[i] is an L-character and text[i+1] is an S-character.
	// That is, i+1 is the position of the start of an LMS-substring.
	// These could be hoisted out into a function with a callback,
	// but at a significant speed cost. Instead, we just write these
	// seven lines a few times in this source file. The copies below
	// refer back to the pattern established by this original as the
	// "LMS-substring iterator".
	//
	// In every scan through the text, c0, c1 are successive characters of text.
	// In this backward scan, c0 == text[i] and c1 == text[i+1].
	// By scanning backward, we can keep track of whether the current
	// position is type-S or type-L according to the usual definition:
	//
	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
	//
	// The backward scan lets us maintain the current type,
	// update it when we see c0 != c1, and otherwise leave it alone.
	// We want to identify all S positions with a preceding L.
	// Position len(text) is one such position by definition, but we have
	// nowhere to write it down, so we eliminate it by untruthfully
	// setting isTypeS = false at the start of the loop.
	c0, c1, isTypeS := byte(0), byte(0), false
	for i := len(text) - 1; i >= 0; i-- {
		c0, c1 = text[i], c0
		if c0 < c1 {
			isTypeS = true
		} else if c0 > c1 && isTypeS {
			isTypeS = false

			// Bucket the index i+1 for the start of an LMS-substring.
			b := bucket[c1] - 1
			bucket[c1] = b
			sa[b] = int32(i + 1)
			lastB = b
			numLMS++
		}
	}

	// We recorded the LMS-substring starts but really want the ends.
	// Luckily, with two differences, the start indexes and the end indexes are the same.
	// The first difference is that the rightmost LMS-substring's end index is len(text),
	// so the caller must pretend that sa[-1] == len(text), as noted above.
	// The second difference is that the leftmost LMS-substring start index
	// does not end an earlier LMS-substring, so as an optimization we can omit
	// that leftmost LMS-substring start index (the last one we wrote,
	// since the scan above runs backward through text).
	//
	// Exception: if numLMS <= 1, the caller is not going to bother with
	// the recursion at all and will treat the result as containing LMS-substring starts.
	// In that case, we don't remove the final entry.
	if numLMS > 1 {
		sa[lastB] = 0
	}
	return numLMS
}
   345  
// induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
// into sa, assuming that the final characters of the LMS-substrings
// are already inserted into sa, sorted by final character, and at the
// right (not left) end of the corresponding character bucket.
// Each LMS-substring has the form (as a regexp) /S+L+S/:
// one or more S-type, one or more L-type, final S-type.
// induceSubL_8_32 leaves behind only the leftmost L-type text
// index for each LMS-substring. That is, it removes the final S-type
// indexes that are present on entry, and it inserts but then removes
// the interior L-type indexes too.
// (Only the leftmost L-type index is needed by induceSubS_8_32.)
//
// freq and bucket are the per-character scratch tables; bucket is
// reinitialized here to the left (minimum) edge of every bucket.
func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
	// Initialize positions for left side of character buckets.
	bucketMin_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below

	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
	// Because j-1 is type L, inserting it into sa now will sort it correctly.
	// But we want to distinguish a j-1 with j-2 of type L from type S.
	// We can process the former but want to leave the latter for the caller.
	// We record the difference by negating j-1 if it is preceded by type S.
	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
	// and so on, in sorted but not necessarily adjacent order, until it finds
	// one preceded by an index of type S, at which point it must stop.
	//
	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
	// only the indexes of the leftmost L-type indexes for each LMS-substring.
	//
	// The suffix array sa therefore serves simultaneously as input, output,
	// and a miraculously well-tailored work queue.

	// placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
	// corresponding to the identified type-L index len(text)-1.
	// Process it before the left-to-right scan of sa proper.
	// See body in loop for commentary.
	k := len(text) - 1
	c0, c1 := text[k-1], text[k]
	if c0 < c1 {
		k = -k
	}

	// Cache recently used bucket index:
	// we're processing suffixes in sorted order
	// and accessing buckets indexed by the
	// byte before the sorted order, which still
	// has very good locality.
	// Invariant: b is cached, possibly dirty copy of bucket[cB].
	cB := c1
	b := bucket[cB]
	sa[b] = int32(k)
	b++

	for i := 0; i < len(sa); i++ {
		j := int(sa[i])
		if j == 0 {
			// Skip empty entry.
			continue
		}
		if j < 0 {
			// Leave discovered type-S index for caller.
			sa[i] = int32(-j)
			continue
		}
		sa[i] = 0

		// Index j was on work queue, meaning k := j-1 is L-type,
		// so we can now place k correctly into sa.
		// If k-1 is L-type, queue k for processing later in this loop.
		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
		// (k, c0, and c1 here shadow the variables of the same names
		// used above for the implicit sa[-1] entry.)
		k := j - 1
		c0, c1 := text[k-1], text[k]
		if c0 < c1 {
			k = -k
		}

		// Switching buckets: write the dirty cursor back before
		// loading the cursor for the new character.
		if cB != c1 {
			bucket[cB] = b
			cB = c1
			b = bucket[cB]
		}
		sa[b] = int32(k)
		b++
	}
	// Note that the final cached b is not written back to bucket[cB];
	// on return bucket is only partially up to date.
}
   434  
// induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
// into sa, assuming that the leftmost L-type text indexes are already
// inserted into sa, sorted by LMS-substring suffix, and at the
// left end of the corresponding character bucket.
// Each LMS-substring has the form (as a regexp) /S+L+S/:
// one or more S-type, one or more L-type, final S-type.
// induceSubS_8_32 leaves behind only the leftmost S-type text
// index for each LMS-substring, in sorted order, at the right end of sa.
// That is, it removes the L-type indexes that are present on entry,
// and it inserts but then removes the interior S-type indexes too,
// leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
// (Only the LMS-substring start indexes are processed by the recursion.)
//
// freq and bucket are the per-character scratch tables; bucket is
// reinitialized here to the right (maximum) edge of every bucket.
func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
	// Initialize positions for right side of character buckets.
	bucketMax_8_32(text, freq, bucket)
	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below

	// Analogous to induceSubL_8_32 above,
	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
	// Because j-1 is type S, inserting it into sa now will sort it correctly.
	// But we want to distinguish a j-1 with j-2 of type S from type L.
	// We can process the former but want to leave the latter for the caller.
	// We record the difference by negating j-1 if it is preceded by type L.
	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
	// and so on, in sorted but not necessarily adjacent order, until it finds
	// one preceded by an index of type L, at which point it must stop.
	// That index (preceded by one of type L) is an LMS-substring start.
	//
	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
	// so that the loop finishes with the top of sa containing exactly
	// the LMS-substring start indexes, sorted by LMS-substring.

	// Cache recently used bucket index, as in induceSubL_8_32:
	// b is a cached, possibly dirty copy of bucket[cB].
	cB := byte(0)
	b := bucket[cB]

	// top is the lowest index of the compacted output region;
	// sa[top:] holds the LMS-substring starts collected so far.
	top := len(sa)
	for i := len(sa) - 1; i >= 0; i-- {
		j := int(sa[i])
		if j == 0 {
			// Skip empty entry.
			continue
		}
		// Clear the entry before any store below, so that a
		// compacted value written to sa[top] is not wiped out
		// when top == i.
		sa[i] = 0
		if j < 0 {
			// Leave discovered LMS-substring start index for caller.
			top--
			sa[top] = int32(-j)
			continue
		}

		// Index j was on work queue, meaning k := j-1 is S-type,
		// so we can now place k correctly into sa.
		// If k-1 is S-type, queue k for processing later in this loop.
		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
		k := j - 1
		c1 := text[k]
		c0 := text[k-1]
		if c0 > c1 {
			k = -k
		}

		// Switching buckets: write the dirty cursor back before
		// loading the cursor for the new character.
		if cB != c1 {
			bucket[cB] = b
			cB = c1
			b = bucket[cB]
		}
		// Buckets fill right to left here: decrement, then store.
		b--
		sa[b] = int32(k)
	}
}
   510  
// length_8_32 computes and records the length of each LMS-substring in text.
// The length of the LMS-substring at index j is stored at sa[j/2],
// avoiding the LMS-substring indexes already stored in the top half of sa.
// (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
// There are two exceptions, made for optimizations in assignID_8_32 below.
//
// First, the final LMS-substring is recorded as having length 0, which is otherwise
// impossible, instead of giving it a length that includes the implicit sentinel.
// This ensures the final LMS-substring has length unequal to all others
// and therefore can be detected as different without text comparison
// (it is unequal because it is the only one that ends in the implicit sentinel,
// and the text comparison would be problematic since the implicit sentinel
// is not actually present at text[len(text)]).
//
// Second, to avoid text comparison entirely, if an LMS-substring is very short,
// sa[j/2] records its actual text instead of its length, so that if two such
// substrings have matching “length,” the text need not be read at all.
// The definition of “very short” is that the text bytes must pack into a uint32,
// and the unsigned encoding e must be ≥ len(text), so that it can be
// distinguished from a valid length.
//
// The numLMS parameter is not used by the body.
func length_8_32(text []byte, sa []int32, numLMS int) {
	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)

	// The encoding of N text bytes into a “length” word
	// adds 1 to each byte, packs them into the bottom
	// N*8 bits of a word, and then bitwise inverts the result.
	// That is, the text sequence A B C (hex 41 42 43)
	// encodes as ^uint32(0x42_43_44).
	// LMS-substrings can never start or end with 0xFF.
	// Adding 1 ensures the encoded byte sequence never
	// starts or ends with 0x00, so that present bytes can be
	// distinguished from zero-padding in the top bits,
	// so the length need not be separately encoded.
	// Inverting the bytes increases the chance that a
	// 4-byte encoding will still be ≥ len(text).
	// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
	// then the high bit of the inversion will be set,
	// making it clearly not a valid length (it would be a negative one).
	//
	// cx holds the pre-inverted encoding (the packed incremented bytes).
	cx := uint32(0) // byte-only

	// This stanza (until the blank line) is the "LMS-substring iterator",
	// described in placeLMS_8_32 above, with one line added to maintain cx.
	c0, c1, isTypeS := byte(0), byte(0), false
	for i := len(text) - 1; i >= 0; i-- {
		c0, c1 = text[i], c0
		cx = cx<<8 | uint32(c1+1) // byte-only
		if c0 < c1 {
			isTypeS = true
		} else if c0 > c1 && isTypeS {
			isTypeS = false

			// Index j = i+1 is the start of an LMS-substring.
			// Compute length or encoded text to store in sa[j/2].
			j := i + 1
			var code int32
			if end == 0 {
				// First start seen in this backward scan,
				// that is, the final LMS-substring: record 0.
				code = 0
			} else {
				code = int32(end - j)
				if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
					code = int32(^cx) // byte-only
				} // byte-only
			}
			sa[j>>1] = code
			// This start is also the last character of the
			// LMS-substring to its left, so that one ends at j+1.
			end = j + 1
			// Restart the packed encoding with text[j] alone.
			cx = uint32(c1 + 1) // byte-only
		}
	}
}
   582  
   583  // assignID_8_32 assigns a dense ID numbering to the
   584  // set of LMS-substrings respecting string ordering and equality,
   585  // returning the maximum assigned ID.
   586  // For example given the input "ababab", the LMS-substrings
   587  // are "aba", "aba", and "ab", renumbered as 2 2 1.
   588  // sa[len(sa)-numLMS:] holds the LMS-substring indexes
   589  // sorted in string order, so to assign numbers we can
   590  // consider each in turn, removing adjacent duplicates.
   591  // The new ID for the LMS-substring at index j is written to sa[j/2],
   592  // overwriting the length previously stored there (by length_8_32 above).
   593  func assignID_8_32(text []byte, sa []int32, numLMS int) int {
   594  	id := 0
   595  	lastLen := int32(-1) // impossible
   596  	lastPos := int32(0)
   597  	for _, j := range sa[len(sa)-numLMS:] {
   598  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
   599  		n := sa[j/2]
   600  		if n != lastLen {
   601  			goto New
   602  		}
   603  		if uint32(n) >= uint32(len(text)) {
   604  			// “Length” is really encoded full text, and they match.
   605  			goto Same
   606  		}
   607  		{
   608  			// Compare actual texts.
   609  			n := int(n)
   610  			this := text[j:][:n]
   611  			last := text[lastPos:][:n]
   612  			for i := 0; i < n; i++ {
   613  				if this[i] != last[i] {
   614  					goto New
   615  				}
   616  			}
   617  			goto Same
   618  		}
   619  	New:
   620  		id++
   621  		lastPos = j
   622  		lastLen = n
   623  	Same:
   624  		sa[j/2] = int32(id)
   625  	}
   626  	return id
   627  }
   628  
   629  // map_32 maps the LMS-substrings in text to their new IDs,
   630  // producing the subproblem for the recursion.
   631  // The mapping itself was mostly applied by assignID_8_32:
   632  // sa[i] is either 0, the ID for the LMS-substring at index 2*i,
   633  // or the ID for the LMS-substring at index 2*i+1.
   634  // To produce the subproblem we need only remove the zeros
   635  // and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
   636  //
   637  // map_32 packs the result, which is the input to the recursion,
   638  // into the top of sa, so that the recursion result can be stored
   639  // in the bottom of sa, which sets up for expand_8_32 well.
   640  func map_32(sa []int32, numLMS int) {
   641  	w := len(sa)
   642  	for i := len(sa) / 2; i >= 0; i-- {
   643  		j := sa[i]
   644  		if j > 0 {
   645  			w--
   646  			sa[w] = j - 1
   647  		}
   648  	}
   649  }
   650  
   651  // recurse_32 calls sais_32 recursively to solve the subproblem we've built.
   652  // The subproblem is at the right end of sa, the suffix array result will be
   653  // written at the left end of sa, and the middle of sa is available for use as
   654  // temporary frequency and bucket storage.
   655  func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
   656  	dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
   657  
   658  	// Set up temporary space for recursive call.
   659  	// We must pass sais_32 a tmp buffer wiith at least maxID entries.
   660  	//
   661  	// The subproblem is guaranteed to have length at most len(sa)/2,
   662  	// so that sa can hold both the subproblem and its suffix array.
   663  	// Nearly all the time, however, the subproblem has length < len(sa)/3,
   664  	// in which case there is a subproblem-sized middle of sa that
   665  	// we can reuse for temporary space (saTmp).
   666  	// When recurse_32 is called from sais_8_32, oldTmp is length 512
   667  	// (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
   668  	// When deeper recursions come back to recurse_32, now oldTmp is
   669  	// the saTmp from the top-most recursion, it is typically larger than
   670  	// the current saTmp (because the current sa gets smaller and smaller
   671  	// as the recursion gets deeper), and we keep reusing that top-most
   672  	// large saTmp instead of the offered smaller ones.
   673  	//
   674  	// Why is the subproblem length so often just under len(sa)/3?
   675  	// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
   676  	// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
   677  	// in the input, perfect alternation of larger and smaller input bytes.
   678  	// Real text doesn't do that. If each L-type index is randomly followed
   679  	// by either an L-type or S-type index, then half the substrings will
   680  	// be of the form SLS, but the other half will be longer. Of that half,
   681  	// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
   682  	// Not counting the final S in each (which overlaps the first S in the next),
   683  	// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
   684  	// The space we need is further reduced by the fact that many of the
   685  	// short patterns like SLS will often be the same character sequences
   686  	// repeated throughout the text, reducing maxID relative to numLMS.
   687  	//
   688  	// For short inputs, the averages may not run in our favor, but then we
   689  	// can often fall back to using the length-512 tmp available in the
   690  	// top-most call. (Also a short allocation would not be a big deal.)
   691  	//
   692  	// For pathological inputs, we fall back to allocating a new tmp of length
   693  	// max(maxID, numLMS/2). This level of the recursion needs maxID,
   694  	// and all deeper levels of the recursion will need no more than numLMS/2,
   695  	// so this one allocation is guaranteed to suffice for the entire stack
   696  	// of recursive calls.
   697  	tmp := oldTmp
   698  	if len(tmp) < len(saTmp) {
   699  		tmp = saTmp
   700  	}
   701  	if len(tmp) < numLMS {
   702  		// TestSAIS/forcealloc reaches this code.
   703  		n := maxID
   704  		if n < numLMS/2 {
   705  			n = numLMS / 2
   706  		}
   707  		tmp = make([]int32, n)
   708  	}
   709  
   710  	// sais_32 requires that the caller arrange to clear dst,
   711  	// because in general the caller may know dst is
   712  	// freshly-allocated and already cleared. But this one is not.
   713  	for i := range dst {
   714  		dst[i] = 0
   715  	}
   716  	sais_32(text, maxID, dst, tmp)
   717  }
   718  
   719  // unmap_8_32 unmaps the subproblem back to the original.
   720  // sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
   721  // sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
   722  // The key part is that if the list says K that means the K'th substring.
   723  // We can replace sa[:numLMS] with the indexes of the LMS-substrings.
   724  // Then if the list says K it really means sa[K].
   725  // Having mapped the list back to LMS-substring indexes,
   726  // we can place those into the right buckets.
   727  func unmap_8_32(text []byte, sa []int32, numLMS int) {
   728  	unmap := sa[len(sa)-numLMS:]
   729  	j := len(unmap)
   730  
   731  	// "LMS-substring iterator" (see placeLMS_8_32 above).
   732  	c0, c1, isTypeS := byte(0), byte(0), false
   733  	for i := len(text) - 1; i >= 0; i-- {
   734  		c0, c1 = text[i], c0
   735  		if c0 < c1 {
   736  			isTypeS = true
   737  		} else if c0 > c1 && isTypeS {
   738  			isTypeS = false
   739  
   740  			// Populate inverse map.
   741  			j--
   742  			unmap[j] = int32(i + 1)
   743  		}
   744  	}
   745  
   746  	// Apply inverse map to subproblem suffix array.
   747  	sa = sa[:numLMS]
   748  	for i := 0; i < len(sa); i++ {
   749  		sa[i] = unmap[sa[i]]
   750  	}
   751  }
   752  
   753  // expand_8_32 distributes the compacted, sorted LMS-suffix indexes
   754  // from sa[:numLMS] into the tops of the appropriate buckets in sa,
   755  // preserving the sorted order and making room for the L-type indexes
   756  // to be slotted into the sorted sequence by induceL_8_32.
   757  func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
   758  	bucketMax_8_32(text, freq, bucket)
   759  	bucket = bucket[:256] // eliminate bound check for bucket[c] below
   760  
   761  	// Loop backward through sa, always tracking
   762  	// the next index to populate from sa[:numLMS].
   763  	// When we get to one, populate it.
   764  	// Zero the rest of the slots; they have dead values in them.
   765  	x := numLMS - 1
   766  	saX := sa[x]
   767  	c := text[saX]
   768  	b := bucket[c] - 1
   769  	bucket[c] = b
   770  
   771  	for i := len(sa) - 1; i >= 0; i-- {
   772  		if i != int(b) {
   773  			sa[i] = 0
   774  			continue
   775  		}
   776  		sa[i] = saX
   777  
   778  		// Load next entry to put down (if any).
   779  		if x > 0 {
   780  			x--
   781  			saX = sa[x] // TODO bounds check
   782  			c = text[saX]
   783  			b = bucket[c] - 1
   784  			bucket[c] = b
   785  		}
   786  	}
   787  }
   788  
   789  // induceL_8_32 inserts L-type text indexes into sa,
   790  // assuming that the leftmost S-type indexes are inserted
   791  // into sa, in sorted order, in the right bucket halves.
   792  // It leaves all the L-type indexes in sa, but the
   793  // leftmost L-type indexes are negated, to mark them
   794  // for processing by induceS_8_32.
   795  func induceL_8_32(text []byte, sa, freq, bucket []int32) {
   796  	// Initialize positions for left side of character buckets.
   797  	bucketMin_8_32(text, freq, bucket)
   798  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   799  
   800  	// This scan is similar to the one in induceSubL_8_32 above.
   801  	// That one arranges to clear all but the leftmost L-type indexes.
   802  	// This scan leaves all the L-type indexes and the original S-type
   803  	// indexes, but it negates the positive leftmost L-type indexes
   804  	// (the ones that induceS_8_32 needs to process).
   805  
   806  	// expand_8_32 left out the implicit entry sa[-1] == len(text),
   807  	// corresponding to the identified type-L index len(text)-1.
   808  	// Process it before the left-to-right scan of sa proper.
   809  	// See body in loop for commentary.
   810  	k := len(text) - 1
   811  	c0, c1 := text[k-1], text[k]
   812  	if c0 < c1 {
   813  		k = -k
   814  	}
   815  
   816  	// Cache recently used bucket index.
   817  	cB := c1
   818  	b := bucket[cB]
   819  	sa[b] = int32(k)
   820  	b++
   821  
   822  	for i := 0; i < len(sa); i++ {
   823  		j := int(sa[i])
   824  		if j <= 0 {
   825  			// Skip empty or negated entry (including negated zero).
   826  			continue
   827  		}
   828  
   829  		// Index j was on work queue, meaning k := j-1 is L-type,
   830  		// so we can now place k correctly into sa.
   831  		// If k-1 is L-type, queue k for processing later in this loop.
   832  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   833  		// If k is zero, k-1 doesn't exist, so we only need to leave it
   834  		// for the caller. The caller can't tell the difference between
   835  		// an empty slot and a non-empty zero, but there's no need
   836  		// to distinguish them anyway: the final suffix array will end up
   837  		// with one zero somewhere, and that will be a real zero.
   838  		k := j - 1
   839  		c1 := text[k]
   840  		if k > 0 {
   841  			if c0 := text[k-1]; c0 < c1 {
   842  				k = -k
   843  			}
   844  		}
   845  
   846  		if cB != c1 {
   847  			bucket[cB] = b
   848  			cB = c1
   849  			b = bucket[cB]
   850  		}
   851  		sa[b] = int32(k)
   852  		b++
   853  	}
   854  }
   855  
   856  func induceS_8_32(text []byte, sa, freq, bucket []int32) {
   857  	// Initialize positions for right side of character buckets.
   858  	bucketMax_8_32(text, freq, bucket)
   859  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   860  
   861  	cB := byte(0)
   862  	b := bucket[cB]
   863  
   864  	for i := len(sa) - 1; i >= 0; i-- {
   865  		j := int(sa[i])
   866  		if j >= 0 {
   867  			// Skip non-flagged entry.
   868  			// (This loop can't see an empty entry; 0 means the real zero index.)
   869  			continue
   870  		}
   871  
   872  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
   873  		j = -j
   874  		sa[i] = int32(j)
   875  
   876  		// Index j was on work queue (encoded as -j but now decoded),
   877  		// meaning k := j-1 is L-type,
   878  		// so we can now place k correctly into sa.
   879  		// If k-1 is S-type, queue -k for processing later in this loop.
   880  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
   881  		// If k is zero, k-1 doesn't exist, so we only need to leave it
   882  		// for the caller.
   883  		k := j - 1
   884  		c1 := text[k]
   885  		if k > 0 {
   886  			if c0 := text[k-1]; c0 <= c1 {
   887  				k = -k
   888  			}
   889  		}
   890  
   891  		if cB != c1 {
   892  			bucket[cB] = b
   893  			cB = c1
   894  			b = bucket[cB]
   895  		}
   896  		b--
   897  		sa[b] = int32(k)
   898  	}
   899  }
   900
View as plain text