suffixarray_test.go

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package suffixarray
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"io/fs"
    11  	"math/rand"
    12  	"os"
    13  	"path/filepath"
    14  	"regexp"
    15  	"sort"
    16  	"strings"
    17  	"testing"
    18  )
    19  
    20  type testCase struct {
    21  	name     string   // name of test case
    22  	source   string   // source to index
    23  	patterns []string // patterns to lookup
    24  }
    25  
    26  var testCases = []testCase{
    27  	{
    28  		"empty string",
    29  		"",
    30  		[]string{
    31  			"",
    32  			"foo",
    33  			"(foo)",
    34  			".*",
    35  			"a*",
    36  		},
    37  	},
    38  
    39  	{
    40  		"all a's",
    41  		"aaaaaaaaaa", // 10 a's
    42  		[]string{
    43  			"",
    44  			"a",
    45  			"aa",
    46  			"aaa",
    47  			"aaaa",
    48  			"aaaaa",
    49  			"aaaaaa",
    50  			"aaaaaaa",
    51  			"aaaaaaaa",
    52  			"aaaaaaaaa",
    53  			"aaaaaaaaaa",
    54  			"aaaaaaaaaaa", // 11 a's
    55  			".",
    56  			".*",
    57  			"a+",
    58  			"aa+",
    59  			"aaaa[b]?",
    60  			"aaa*",
    61  		},
    62  	},
    63  
    64  	{
    65  		"abc",
    66  		"abc",
    67  		[]string{
    68  			"a",
    69  			"b",
    70  			"c",
    71  			"ab",
    72  			"bc",
    73  			"abc",
    74  			"a.c",
    75  			"a(b|c)",
    76  			"abc?",
    77  		},
    78  	},
    79  
    80  	{
    81  		"barbara*3",
    82  		"barbarabarbarabarbara",
    83  		[]string{
    84  			"a",
    85  			"bar",
    86  			"rab",
    87  			"arab",
    88  			"barbar",
    89  			"bara?bar",
    90  		},
    91  	},
    92  
    93  	{
    94  		"typing drill",
    95  		"Now is the time for all good men to come to the aid of their country.",
    96  		[]string{
    97  			"Now",
    98  			"the time",
    99  			"to come the aid",
   100  			"is the time for all good men to come to the aid of their",
   101  			"to (come|the)?",
   102  		},
   103  	},
   104  
   105  	{
   106  		"godoc simulation",
   107  		"package main\n\nimport(\n    \"rand\"\n    ",
   108  		[]string{},
   109  	},
   110  }
   111  
   112  // find all occurrences of s in source; report at most n occurrences
   113  func find(src, s string, n int) []int {
   114  	var res []int
   115  	if s != "" && n != 0 {
   116  		// find at most n occurrences of s in src
   117  		for i := -1; n < 0 || len(res) < n; {
   118  			j := strings.Index(src[i+1:], s)
   119  			if j < 0 {
   120  				break
   121  			}
   122  			i += j + 1
   123  			res = append(res, i)
   124  		}
   125  	}
   126  	return res
   127  }
   128  
   129  func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
   130  	res := x.Lookup([]byte(s), n)
   131  	exp := find(tc.source, s, n)
   132  
   133  	// check that the lengths match
   134  	if len(res) != len(exp) {
   135  		t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res))
   136  	}
   137  
   138  	// if n >= 0 the number of results is limited --- unless n >= all results,
   139  	// we may obtain different positions from the Index and from find (because
   140  	// Index may not find the results in the same order as find) => in general
   141  	// we cannot simply check that the res and exp lists are equal
   142  
   143  	// check that each result is in fact a correct match and there are no duplicates
   144  	sort.Ints(res)
   145  	for i, r := range res {
   146  		if r < 0 || len(tc.source) <= r {
   147  			t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
   148  		} else if !strings.HasPrefix(tc.source[r:], s) {
   149  			t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
   150  		}
   151  		if i > 0 && res[i-1] == r {
   152  			t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
   153  		}
   154  	}
   155  
   156  	if n < 0 {
   157  		// all results computed - sorted res and exp must be equal
   158  		for i, r := range res {
   159  			e := exp[i]
   160  			if r != e {
   161  				t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
   162  			}
   163  		}
   164  	}
   165  }
   166  
   167  func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
   168  	res := x.FindAllIndex(rx, n)
   169  	exp := rx.FindAllStringIndex(tc.source, n)
   170  
   171  	// check that the lengths match
   172  	if len(res) != len(exp) {
   173  		t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
   174  	}
   175  
   176  	// if n >= 0 the number of results is limited --- unless n >= all results,
   177  	// we may obtain different positions from the Index and from regexp (because
   178  	// Index may not find the results in the same order as regexp) => in general
   179  	// we cannot simply check that the res and exp lists are equal
   180  
   181  	// check that each result is in fact a correct match and the result is sorted
   182  	for i, r := range res {
   183  		if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
   184  			t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
   185  		} else if !rx.MatchString(tc.source[r[0]:r[1]]) {
   186  			t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
   187  		}
   188  	}
   189  
   190  	if n < 0 {
   191  		// all results computed - sorted res and exp must be equal
   192  		for i, r := range res {
   193  			e := exp[i]
   194  			if r[0] != e[0] || r[1] != e[1] {
   195  				t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
   196  					tc.name, rx, i, e[0], e[1], r[0], r[1])
   197  			}
   198  		}
   199  	}
   200  }
   201  
   202  func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
   203  	for _, pat := range tc.patterns {
   204  		testLookup(t, tc, x, pat, n)
   205  		if rx, err := regexp.Compile(pat); err == nil {
   206  			testFindAllIndex(t, tc, x, rx, n)
   207  		}
   208  	}
   209  }
   210  
   211  // index is used to hide the sort.Interface
   212  type index Index
   213  
   214  func (x *index) Len() int           { return x.sa.len() }
   215  func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
   216  func (x *index) Swap(i, j int) {
   217  	if x.sa.int32 != nil {
   218  		x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i]
   219  	} else {
   220  		x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i]
   221  	}
   222  }
   223  
   224  func (x *index) at(i int) []byte {
   225  	return x.data[x.sa.get(i):]
   226  }
   227  
   228  func testConstruction(t *testing.T, tc *testCase, x *Index) {
   229  	if !sort.IsSorted((*index)(x)) {
   230  		t.Errorf("failed testConstruction %s", tc.name)
   231  	}
   232  }
   233  
   234  func equal(x, y *Index) bool {
   235  	if !bytes.Equal(x.data, y.data) {
   236  		return false
   237  	}
   238  	if x.sa.len() != y.sa.len() {
   239  		return false
   240  	}
   241  	n := x.sa.len()
   242  	for i := 0; i < n; i++ {
   243  		if x.sa.get(i) != y.sa.get(i) {
   244  			return false
   245  		}
   246  	}
   247  	return true
   248  }
   249  
   250  // returns the serialized index size
   251  func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
   252  	var buf bytes.Buffer
   253  	if err := x.Write(&buf); err != nil {
   254  		t.Errorf("failed writing index %s (%s)", tc.name, err)
   255  	}
   256  	size := buf.Len()
   257  	var y Index
   258  	if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
   259  		t.Errorf("failed reading index %s (%s)", tc.name, err)
   260  	}
   261  	if !equal(x, &y) {
   262  		t.Errorf("restored index doesn't match saved index %s", tc.name)
   263  	}
   264  
   265  	old := maxData32
   266  	defer func() {
   267  		maxData32 = old
   268  	}()
   269  	// Reread as forced 32.
   270  	y = Index{}
   271  	maxData32 = realMaxData32
   272  	if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
   273  		t.Errorf("failed reading index %s (%s)", tc.name, err)
   274  	}
   275  	if !equal(x, &y) {
   276  		t.Errorf("restored index doesn't match saved index %s", tc.name)
   277  	}
   278  
   279  	// Reread as forced 64.
   280  	y = Index{}
   281  	maxData32 = -1
   282  	if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
   283  		t.Errorf("failed reading index %s (%s)", tc.name, err)
   284  	}
   285  	if !equal(x, &y) {
   286  		t.Errorf("restored index doesn't match saved index %s", tc.name)
   287  	}
   288  
   289  	return size
   290  }
   291  
   292  func testIndex(t *testing.T) {
   293  	for _, tc := range testCases {
   294  		x := New([]byte(tc.source))
   295  		testConstruction(t, &tc, x)
   296  		testSaveRestore(t, &tc, x)
   297  		testLookups(t, &tc, x, 0)
   298  		testLookups(t, &tc, x, 1)
   299  		testLookups(t, &tc, x, 10)
   300  		testLookups(t, &tc, x, 2e9)
   301  		testLookups(t, &tc, x, -1)
   302  	}
   303  }
   304  
   305  func TestIndex32(t *testing.T) {
   306  	testIndex(t)
   307  }
   308  
   309  func TestIndex64(t *testing.T) {
   310  	maxData32 = -1
   311  	defer func() {
   312  		maxData32 = realMaxData32
   313  	}()
   314  	testIndex(t)
   315  }
   316  
   317  func TestNew32(t *testing.T) {
   318  	test(t, func(x []byte) []int {
   319  		sa := make([]int32, len(x))
   320  		text_32(x, sa)
   321  		out := make([]int, len(sa))
   322  		for i, v := range sa {
   323  			out[i] = int(v)
   324  		}
   325  		return out
   326  	})
   327  }
   328  
   329  func TestNew64(t *testing.T) {
   330  	test(t, func(x []byte) []int {
   331  		sa := make([]int64, len(x))
   332  		text_64(x, sa)
   333  		out := make([]int, len(sa))
   334  		for i, v := range sa {
   335  			out[i] = int(v)
   336  		}
   337  		return out
   338  	})
   339  }
   340  
   341  // test tests an arbitrary suffix array construction function.
   342  // Generates many inputs, builds and checks suffix arrays.
   343  func test(t *testing.T, build func([]byte) []int) {
   344  	t.Run("ababab...", func(t *testing.T) {
   345  		// Very repetitive input has numLMS = len(x)/2-1
   346  		// at top level, the largest it can be.
   347  		// But maxID is only two (aba and ab$).
   348  		size := 100000
   349  		if testing.Short() {
   350  			size = 10000
   351  		}
   352  		x := make([]byte, size)
   353  		for i := range x {
   354  			x[i] = "ab"[i%2]
   355  		}
   356  		testSA(t, x, build)
   357  	})
   358  
   359  	t.Run("forcealloc", func(t *testing.T) {
   360  		// Construct a pathological input that forces
   361  		// recurse_32 to allocate a new temporary buffer.
   362  		// The input must have more than N/3 LMS-substrings,
   363  		// which we arrange by repeating an SLSLSLSLSLSL pattern
   364  		// like ababab... above, but then we must also arrange
   365  		// for a large number of distinct LMS-substrings.
   366  		// We use this pattern:
   367  		// 1 255 1 254 1 253 1 ... 1 2 1 255 2 254 2 253 2 252 2 ...
   368  		// This gives approximately 2¹⁵ distinct LMS-substrings.
   369  		// We need to repeat at least one substring, though,
   370  		// or else the recursion can be bypassed entirely.
   371  		x := make([]byte, 100000, 100001)
   372  		lo := byte(1)
   373  		hi := byte(255)
   374  		for i := range x {
   375  			if i%2 == 0 {
   376  				x[i] = lo
   377  			} else {
   378  				x[i] = hi
   379  				hi--
   380  				if hi <= lo {
   381  					lo++
   382  					if lo == 0 {
   383  						lo = 1
   384  					}
   385  					hi = 255
   386  				}
   387  			}
   388  		}
   389  		x[:cap(x)][len(x)] = 0 // for sais.New
   390  		testSA(t, x, build)
   391  	})
   392  
   393  	t.Run("exhaustive2", func(t *testing.T) {
   394  		// All inputs over {0,1} up to length 21.
   395  		// Runs in about 10 seconds on my laptop.
   396  		x := make([]byte, 30)
   397  		numFail := 0
   398  		for n := 0; n <= 21; n++ {
   399  			if n > 12 && testing.Short() {
   400  				break
   401  			}
   402  			x[n] = 0 // for sais.New
   403  			testRec(t, x[:n], 0, 2, &numFail, build)
   404  		}
   405  	})
   406  
   407  	t.Run("exhaustive3", func(t *testing.T) {
   408  		// All inputs over {0,1,2} up to length 14.
   409  		// Runs in about 10 seconds on my laptop.
   410  		x := make([]byte, 30)
   411  		numFail := 0
   412  		for n := 0; n <= 14; n++ {
   413  			if n > 8 && testing.Short() {
   414  				break
   415  			}
   416  			x[n] = 0 // for sais.New
   417  			testRec(t, x[:n], 0, 3, &numFail, build)
   418  		}
   419  	})
   420  }
   421  
   422  // testRec fills x[i:] with all possible combinations of values in [1,max]
   423  // and then calls testSA(t, x, build) for each one.
   424  func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) {
   425  	if i < len(x) {
   426  		for x[i] = 1; x[i] <= byte(max); x[i]++ {
   427  			testRec(t, x, i+1, max, numFail, build)
   428  		}
   429  		return
   430  	}
   431  
   432  	if !testSA(t, x, build) {
   433  		*numFail++
   434  		if *numFail >= 10 {
   435  			t.Errorf("stopping after %d failures", *numFail)
   436  			t.FailNow()
   437  		}
   438  	}
   439  }
   440  
   441  // testSA tests the suffix array build function on the input x.
   442  // It constructs the suffix array and then checks that it is correct.
   443  func testSA(t *testing.T, x []byte, build func([]byte) []int) bool {
   444  	defer func() {
   445  		if e := recover(); e != nil {
   446  			t.Logf("build %v", x)
   447  			panic(e)
   448  		}
   449  	}()
   450  	sa := build(x)
   451  	if len(sa) != len(x) {
   452  		t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x))
   453  		return false
   454  	}
   455  	for i := 0; i+1 < len(sa); i++ {
   456  		if sa[i] < 0 || sa[i] >= len(x) || sa[i+1] < 0 || sa[i+1] >= len(x) {
   457  			t.Errorf("build %s: sa out of range: %v\n", x, sa)
   458  			return false
   459  		}
   460  		if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 {
   461  			t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1])
   462  			return false
   463  		}
   464  	}
   465  
   466  	return true
   467  }
   468  
   469  var (
   470  	benchdata = make([]byte, 1e6)
   471  	benchrand = make([]byte, 1e6)
   472  )
   473  
   474  // Of all possible inputs, the random bytes have the least amount of substring
   475  // repetition, and the repeated bytes have the most. For most algorithms,
   476  // the running time of every input will be between these two.
   477  func benchmarkNew(b *testing.B, random bool) {
   478  	b.ReportAllocs()
   479  	b.StopTimer()
   480  	data := benchdata
   481  	if random {
   482  		data = benchrand
   483  		if data[0] == 0 {
   484  			for i := range data {
   485  				data[i] = byte(rand.Intn(256))
   486  			}
   487  		}
   488  	}
   489  	b.StartTimer()
   490  	b.SetBytes(int64(len(data)))
   491  	for i := 0; i < b.N; i++ {
   492  		New(data)
   493  	}
   494  }
   495  
   496  func makeText(name string) ([]byte, error) {
   497  	var data []byte
   498  	switch name {
   499  	case "opticks":
   500  		var err error
   501  		data, err = os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt")
   502  		if err != nil {
   503  			return nil, err
   504  		}
   505  	case "go":
   506  		err := filepath.WalkDir("../..", func(path string, info fs.DirEntry, err error) error {
   507  			if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() {
   508  				file, err := os.ReadFile(path)
   509  				if err != nil {
   510  					return err
   511  				}
   512  				data = append(data, file...)
   513  			}
   514  			return nil
   515  		})
   516  		if err != nil {
   517  			return nil, err
   518  		}
   519  	case "zero":
   520  		data = make([]byte, 50e6)
   521  	case "rand":
   522  		data = make([]byte, 50e6)
   523  		for i := range data {
   524  			data[i] = byte(rand.Intn(256))
   525  		}
   526  	}
   527  	return data, nil
   528  }
   529  
   530  func setBits(bits int) (cleanup func()) {
   531  	if bits == 32 {
   532  		maxData32 = realMaxData32
   533  	} else {
   534  		maxData32 = -1 // force use of 64-bit code
   535  	}
   536  	return func() {
   537  		maxData32 = realMaxData32
   538  	}
   539  }
   540  
   541  func BenchmarkNew(b *testing.B) {
   542  	for _, text := range []string{"opticks", "go", "zero", "rand"} {
   543  		b.Run("text="+text, func(b *testing.B) {
   544  			data, err := makeText(text)
   545  			if err != nil {
   546  				b.Fatal(err)
   547  			}
   548  			if testing.Short() && len(data) > 5e6 {
   549  				data = data[:5e6]
   550  			}
   551  			for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} {
   552  				if len(data) < size {
   553  					continue
   554  				}
   555  				data := data[:size]
   556  				name := fmt.Sprintf("%dK", size/1e3)
   557  				if size >= 1e6 {
   558  					name = fmt.Sprintf("%dM", size/1e6)
   559  				}
   560  				b.Run("size="+name, func(b *testing.B) {
   561  					for _, bits := range []int{32, 64} {
   562  						if ^uint(0) == 0xffffffff && bits == 64 {
   563  							continue
   564  						}
   565  						b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
   566  							cleanup := setBits(bits)
   567  							defer cleanup()
   568  
   569  							b.SetBytes(int64(len(data)))
   570  							b.ReportAllocs()
   571  							for i := 0; i < b.N; i++ {
   572  								New(data)
   573  							}
   574  						})
   575  					}
   576  				})
   577  			}
   578  		})
   579  	}
   580  }
   581  
   582  func BenchmarkSaveRestore(b *testing.B) {
   583  	r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
   584  	data := make([]byte, 1<<20)             // 1MB of data to index
   585  	for i := range data {
   586  		data[i] = byte(r.Intn(256))
   587  	}
   588  	for _, bits := range []int{32, 64} {
   589  		if ^uint(0) == 0xffffffff && bits == 64 {
   590  			continue
   591  		}
   592  		b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
   593  			cleanup := setBits(bits)
   594  			defer cleanup()
   595  
   596  			b.StopTimer()
   597  			x := New(data)
   598  			size := testSaveRestore(nil, nil, x)       // verify correctness
   599  			buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
   600  			b.SetBytes(int64(size))
   601  			b.StartTimer()
   602  			b.ReportAllocs()
   603  			for i := 0; i < b.N; i++ {
   604  				buf.Reset()
   605  				if err := x.Write(buf); err != nil {
   606  					b.Fatal(err)
   607  				}
   608  				var y Index
   609  				if err := y.Read(buf); err != nil {
   610  					b.Fatal(err)
   611  				}
   612  			}
   613  		})
   614  	}
   615  }
   616
View as plain text