1
2
3
4
5 package suffixarray
6
7 import (
8 "bytes"
9 "fmt"
10 "io/fs"
11 "math/rand"
12 "os"
13 "path/filepath"
14 "regexp"
15 "sort"
16 "strings"
17 "testing"
18 )
19
20 type testCase struct {
21 name string
22 source string
23 patterns []string
24 }
25
26 var testCases = []testCase{
27 {
28 "empty string",
29 "",
30 []string{
31 "",
32 "foo",
33 "(foo)",
34 ".*",
35 "a*",
36 },
37 },
38
39 {
40 "all a's",
41 "aaaaaaaaaa",
42 []string{
43 "",
44 "a",
45 "aa",
46 "aaa",
47 "aaaa",
48 "aaaaa",
49 "aaaaaa",
50 "aaaaaaa",
51 "aaaaaaaa",
52 "aaaaaaaaa",
53 "aaaaaaaaaa",
54 "aaaaaaaaaaa",
55 ".",
56 ".*",
57 "a+",
58 "aa+",
59 "aaaa[b]?",
60 "aaa*",
61 },
62 },
63
64 {
65 "abc",
66 "abc",
67 []string{
68 "a",
69 "b",
70 "c",
71 "ab",
72 "bc",
73 "abc",
74 "a.c",
75 "a(b|c)",
76 "abc?",
77 },
78 },
79
80 {
81 "barbara*3",
82 "barbarabarbarabarbara",
83 []string{
84 "a",
85 "bar",
86 "rab",
87 "arab",
88 "barbar",
89 "bara?bar",
90 },
91 },
92
93 {
94 "typing drill",
95 "Now is the time for all good men to come to the aid of their country.",
96 []string{
97 "Now",
98 "the time",
99 "to come the aid",
100 "is the time for all good men to come to the aid of their",
101 "to (come|the)?",
102 },
103 },
104
105 {
106 "godoc simulation",
107 "package main\n\nimport(\n \"rand\"\n ",
108 []string{},
109 },
110 }
111
112
113 func find(src, s string, n int) []int {
114 var res []int
115 if s != "" && n != 0 {
116
117 for i := -1; n < 0 || len(res) < n; {
118 j := strings.Index(src[i+1:], s)
119 if j < 0 {
120 break
121 }
122 i += j + 1
123 res = append(res, i)
124 }
125 }
126 return res
127 }
128
129 func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
130 res := x.Lookup([]byte(s), n)
131 exp := find(tc.source, s, n)
132
133
134 if len(res) != len(exp) {
135 t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res))
136 }
137
138
139
140
141
142
143
144 sort.Ints(res)
145 for i, r := range res {
146 if r < 0 || len(tc.source) <= r {
147 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
148 } else if !strings.HasPrefix(tc.source[r:], s) {
149 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
150 }
151 if i > 0 && res[i-1] == r {
152 t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
153 }
154 }
155
156 if n < 0 {
157
158 for i, r := range res {
159 e := exp[i]
160 if r != e {
161 t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
162 }
163 }
164 }
165 }
166
167 func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
168 res := x.FindAllIndex(rx, n)
169 exp := rx.FindAllStringIndex(tc.source, n)
170
171
172 if len(res) != len(exp) {
173 t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
174 }
175
176
177
178
179
180
181
182 for i, r := range res {
183 if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
184 t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
185 } else if !rx.MatchString(tc.source[r[0]:r[1]]) {
186 t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
187 }
188 }
189
190 if n < 0 {
191
192 for i, r := range res {
193 e := exp[i]
194 if r[0] != e[0] || r[1] != e[1] {
195 t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
196 tc.name, rx, i, e[0], e[1], r[0], r[1])
197 }
198 }
199 }
200 }
201
202 func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
203 for _, pat := range tc.patterns {
204 testLookup(t, tc, x, pat, n)
205 if rx, err := regexp.Compile(pat); err == nil {
206 testFindAllIndex(t, tc, x, rx, n)
207 }
208 }
209 }
210
211
212 type index Index
213
214 func (x *index) Len() int { return x.sa.len() }
215 func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
216 func (x *index) Swap(i, j int) {
217 if x.sa.int32 != nil {
218 x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i]
219 } else {
220 x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i]
221 }
222 }
223
224 func (x *index) at(i int) []byte {
225 return x.data[x.sa.get(i):]
226 }
227
228 func testConstruction(t *testing.T, tc *testCase, x *Index) {
229 if !sort.IsSorted((*index)(x)) {
230 t.Errorf("failed testConstruction %s", tc.name)
231 }
232 }
233
234 func equal(x, y *Index) bool {
235 if !bytes.Equal(x.data, y.data) {
236 return false
237 }
238 if x.sa.len() != y.sa.len() {
239 return false
240 }
241 n := x.sa.len()
242 for i := 0; i < n; i++ {
243 if x.sa.get(i) != y.sa.get(i) {
244 return false
245 }
246 }
247 return true
248 }
249
250
251 func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
252 var buf bytes.Buffer
253 if err := x.Write(&buf); err != nil {
254 t.Errorf("failed writing index %s (%s)", tc.name, err)
255 }
256 size := buf.Len()
257 var y Index
258 if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
259 t.Errorf("failed reading index %s (%s)", tc.name, err)
260 }
261 if !equal(x, &y) {
262 t.Errorf("restored index doesn't match saved index %s", tc.name)
263 }
264
265 old := maxData32
266 defer func() {
267 maxData32 = old
268 }()
269
270 y = Index{}
271 maxData32 = realMaxData32
272 if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
273 t.Errorf("failed reading index %s (%s)", tc.name, err)
274 }
275 if !equal(x, &y) {
276 t.Errorf("restored index doesn't match saved index %s", tc.name)
277 }
278
279
280 y = Index{}
281 maxData32 = -1
282 if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
283 t.Errorf("failed reading index %s (%s)", tc.name, err)
284 }
285 if !equal(x, &y) {
286 t.Errorf("restored index doesn't match saved index %s", tc.name)
287 }
288
289 return size
290 }
291
292 func testIndex(t *testing.T) {
293 for _, tc := range testCases {
294 x := New([]byte(tc.source))
295 testConstruction(t, &tc, x)
296 testSaveRestore(t, &tc, x)
297 testLookups(t, &tc, x, 0)
298 testLookups(t, &tc, x, 1)
299 testLookups(t, &tc, x, 10)
300 testLookups(t, &tc, x, 2e9)
301 testLookups(t, &tc, x, -1)
302 }
303 }
304
305 func TestIndex32(t *testing.T) {
306 testIndex(t)
307 }
308
309 func TestIndex64(t *testing.T) {
310 maxData32 = -1
311 defer func() {
312 maxData32 = realMaxData32
313 }()
314 testIndex(t)
315 }
316
317 func TestNew32(t *testing.T) {
318 test(t, func(x []byte) []int {
319 sa := make([]int32, len(x))
320 text_32(x, sa)
321 out := make([]int, len(sa))
322 for i, v := range sa {
323 out[i] = int(v)
324 }
325 return out
326 })
327 }
328
329 func TestNew64(t *testing.T) {
330 test(t, func(x []byte) []int {
331 sa := make([]int64, len(x))
332 text_64(x, sa)
333 out := make([]int, len(sa))
334 for i, v := range sa {
335 out[i] = int(v)
336 }
337 return out
338 })
339 }
340
341
342
343 func test(t *testing.T, build func([]byte) []int) {
344 t.Run("ababab...", func(t *testing.T) {
345
346
347
348 size := 100000
349 if testing.Short() {
350 size = 10000
351 }
352 x := make([]byte, size)
353 for i := range x {
354 x[i] = "ab"[i%2]
355 }
356 testSA(t, x, build)
357 })
358
359 t.Run("forcealloc", func(t *testing.T) {
360
361
362
363
364
365
366
367
368
369
370
371 x := make([]byte, 100000, 100001)
372 lo := byte(1)
373 hi := byte(255)
374 for i := range x {
375 if i%2 == 0 {
376 x[i] = lo
377 } else {
378 x[i] = hi
379 hi--
380 if hi <= lo {
381 lo++
382 if lo == 0 {
383 lo = 1
384 }
385 hi = 255
386 }
387 }
388 }
389 x[:cap(x)][len(x)] = 0
390 testSA(t, x, build)
391 })
392
393 t.Run("exhaustive2", func(t *testing.T) {
394
395
396 x := make([]byte, 30)
397 numFail := 0
398 for n := 0; n <= 21; n++ {
399 if n > 12 && testing.Short() {
400 break
401 }
402 x[n] = 0
403 testRec(t, x[:n], 0, 2, &numFail, build)
404 }
405 })
406
407 t.Run("exhaustive3", func(t *testing.T) {
408
409
410 x := make([]byte, 30)
411 numFail := 0
412 for n := 0; n <= 14; n++ {
413 if n > 8 && testing.Short() {
414 break
415 }
416 x[n] = 0
417 testRec(t, x[:n], 0, 3, &numFail, build)
418 }
419 })
420 }
421
422
423
424 func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) {
425 if i < len(x) {
426 for x[i] = 1; x[i] <= byte(max); x[i]++ {
427 testRec(t, x, i+1, max, numFail, build)
428 }
429 return
430 }
431
432 if !testSA(t, x, build) {
433 *numFail++
434 if *numFail >= 10 {
435 t.Errorf("stopping after %d failures", *numFail)
436 t.FailNow()
437 }
438 }
439 }
440
441
442
443 func testSA(t *testing.T, x []byte, build func([]byte) []int) bool {
444 defer func() {
445 if e := recover(); e != nil {
446 t.Logf("build %v", x)
447 panic(e)
448 }
449 }()
450 sa := build(x)
451 if len(sa) != len(x) {
452 t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x))
453 return false
454 }
455 for i := 0; i+1 < len(sa); i++ {
456 if sa[i] < 0 || sa[i] >= len(x) || sa[i+1] < 0 || sa[i+1] >= len(x) {
457 t.Errorf("build %s: sa out of range: %v\n", x, sa)
458 return false
459 }
460 if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 {
461 t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1])
462 return false
463 }
464 }
465
466 return true
467 }
468
469 var (
470 benchdata = make([]byte, 1e6)
471 benchrand = make([]byte, 1e6)
472 )
473
474
475
476
477 func benchmarkNew(b *testing.B, random bool) {
478 b.ReportAllocs()
479 b.StopTimer()
480 data := benchdata
481 if random {
482 data = benchrand
483 if data[0] == 0 {
484 for i := range data {
485 data[i] = byte(rand.Intn(256))
486 }
487 }
488 }
489 b.StartTimer()
490 b.SetBytes(int64(len(data)))
491 for i := 0; i < b.N; i++ {
492 New(data)
493 }
494 }
495
496 func makeText(name string) ([]byte, error) {
497 var data []byte
498 switch name {
499 case "opticks":
500 var err error
501 data, err = os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt")
502 if err != nil {
503 return nil, err
504 }
505 case "go":
506 err := filepath.WalkDir("../..", func(path string, info fs.DirEntry, err error) error {
507 if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() {
508 file, err := os.ReadFile(path)
509 if err != nil {
510 return err
511 }
512 data = append(data, file...)
513 }
514 return nil
515 })
516 if err != nil {
517 return nil, err
518 }
519 case "zero":
520 data = make([]byte, 50e6)
521 case "rand":
522 data = make([]byte, 50e6)
523 for i := range data {
524 data[i] = byte(rand.Intn(256))
525 }
526 }
527 return data, nil
528 }
529
530 func setBits(bits int) (cleanup func()) {
531 if bits == 32 {
532 maxData32 = realMaxData32
533 } else {
534 maxData32 = -1
535 }
536 return func() {
537 maxData32 = realMaxData32
538 }
539 }
540
541 func BenchmarkNew(b *testing.B) {
542 for _, text := range []string{"opticks", "go", "zero", "rand"} {
543 b.Run("text="+text, func(b *testing.B) {
544 data, err := makeText(text)
545 if err != nil {
546 b.Fatal(err)
547 }
548 if testing.Short() && len(data) > 5e6 {
549 data = data[:5e6]
550 }
551 for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} {
552 if len(data) < size {
553 continue
554 }
555 data := data[:size]
556 name := fmt.Sprintf("%dK", size/1e3)
557 if size >= 1e6 {
558 name = fmt.Sprintf("%dM", size/1e6)
559 }
560 b.Run("size="+name, func(b *testing.B) {
561 for _, bits := range []int{32, 64} {
562 if ^uint(0) == 0xffffffff && bits == 64 {
563 continue
564 }
565 b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
566 cleanup := setBits(bits)
567 defer cleanup()
568
569 b.SetBytes(int64(len(data)))
570 b.ReportAllocs()
571 for i := 0; i < b.N; i++ {
572 New(data)
573 }
574 })
575 }
576 })
577 }
578 })
579 }
580 }
581
582 func BenchmarkSaveRestore(b *testing.B) {
583 r := rand.New(rand.NewSource(0x5a77a1))
584 data := make([]byte, 1<<20)
585 for i := range data {
586 data[i] = byte(r.Intn(256))
587 }
588 for _, bits := range []int{32, 64} {
589 if ^uint(0) == 0xffffffff && bits == 64 {
590 continue
591 }
592 b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
593 cleanup := setBits(bits)
594 defer cleanup()
595
596 b.StopTimer()
597 x := New(data)
598 size := testSaveRestore(nil, nil, x)
599 buf := bytes.NewBuffer(make([]byte, size))
600 b.SetBytes(int64(size))
601 b.StartTimer()
602 b.ReportAllocs()
603 for i := 0; i < b.N; i++ {
604 buf.Reset()
605 if err := x.Write(buf); err != nil {
606 b.Fatal(err)
607 }
608 var y Index
609 if err := y.Read(buf); err != nil {
610 b.Fatal(err)
611 }
612 }
613 })
614 }
615 }
616
View as plain text