Source file test/bench/go1/fasta_test.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package go1
     6  
     7  import "runtime"
     8  
     9  // Not a benchmark; input for revcomp.
    10  
    11  var fastabytes = makefasta()
    12  
    13  func makefasta() []byte {
    14  	var n int = 25e6
    15  	if runtime.GOARCH == "arm" || runtime.GOARCH == "mips" || runtime.GOARCH == "mips64" {
    16  		// TODO(dfc) remove this limitation after precise gc.
    17  		// A value of 25e6 consumes 465mb of heap on 32bit
    18  		// platforms, which is too much for some systems.
    19  		// A value of 25e5 produces a memory layout that
    20  		// confuses the gc on 32bit platforms. So 25e4 it is.
    21  		n = 25e4
    22  	}
    23  	return fasta(n)
    24  }
    25  
    26  func fasta(n int) []byte {
    27  	out := make(fastaBuffer, 0, 11*n)
    28  
    29  	iub := []fastaAcid{
    30  		{prob: 0.27, sym: 'a'},
    31  		{prob: 0.12, sym: 'c'},
    32  		{prob: 0.12, sym: 'g'},
    33  		{prob: 0.27, sym: 't'},
    34  		{prob: 0.02, sym: 'B'},
    35  		{prob: 0.02, sym: 'D'},
    36  		{prob: 0.02, sym: 'H'},
    37  		{prob: 0.02, sym: 'K'},
    38  		{prob: 0.02, sym: 'M'},
    39  		{prob: 0.02, sym: 'N'},
    40  		{prob: 0.02, sym: 'R'},
    41  		{prob: 0.02, sym: 'S'},
    42  		{prob: 0.02, sym: 'V'},
    43  		{prob: 0.02, sym: 'W'},
    44  		{prob: 0.02, sym: 'Y'},
    45  	}
    46  
    47  	homosapiens := []fastaAcid{
    48  		{prob: 0.3029549426680, sym: 'a'},
    49  		{prob: 0.1979883004921, sym: 'c'},
    50  		{prob: 0.1975473066391, sym: 'g'},
    51  		{prob: 0.3015094502008, sym: 't'},
    52  	}
    53  
    54  	alu := []byte(
    55  		"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" +
    56  			"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" +
    57  			"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" +
    58  			"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" +
    59  			"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" +
    60  			"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" +
    61  			"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA")
    62  
    63  	out.WriteString(">ONE Homo sapiens alu\n")
    64  	fastaRepeat(&out, alu, 2*n)
    65  	out.WriteString(">TWO IUB ambiguity codes\n")
    66  	fastaRandom(&out, iub, 3*n)
    67  	out.WriteString(">THREE Homo sapiens frequency\n")
    68  	fastaRandom(&out, homosapiens, 5*n)
    69  	return out
    70  }
    71  
    72  type fastaBuffer []byte
    73  
    74  func (b *fastaBuffer) Flush() {
    75  	panic("flush")
    76  }
    77  
    78  func (b *fastaBuffer) WriteString(s string) {
    79  	p := b.NextWrite(len(s))
    80  	copy(p, s)
    81  }
    82  
    83  func (b *fastaBuffer) NextWrite(n int) []byte {
    84  	p := *b
    85  	if len(p)+n > cap(p) {
    86  		b.Flush()
    87  		p = *b
    88  	}
    89  	out := p[len(p) : len(p)+n]
    90  	*b = p[:len(p)+n]
    91  	return out
    92  }
    93  
    94  const fastaLine = 60
    95  
    96  func fastaRepeat(out *fastaBuffer, alu []byte, n int) {
    97  	buf := append(alu, alu...)
    98  	off := 0
    99  	for n > 0 {
   100  		m := n
   101  		if m > fastaLine {
   102  			m = fastaLine
   103  		}
   104  		buf1 := out.NextWrite(m + 1)
   105  		copy(buf1, buf[off:])
   106  		buf1[m] = '\n'
   107  		if off += m; off >= len(alu) {
   108  			off -= len(alu)
   109  		}
   110  		n -= m
   111  	}
   112  }
   113  
   114  const (
   115  	fastaLookupSize          = 4096
   116  	fastaLookupScale float64 = fastaLookupSize - 1
   117  )
   118  
   119  var fastaRand uint32 = 42
   120  
   121  type fastaAcid struct {
   122  	sym   byte
   123  	prob  float64
   124  	cprob float64
   125  	next  *fastaAcid
   126  }
   127  
   128  func fastaComputeLookup(acid []fastaAcid) *[fastaLookupSize]*fastaAcid {
   129  	var lookup [fastaLookupSize]*fastaAcid
   130  	var p float64
   131  	for i := range acid {
   132  		p += acid[i].prob
   133  		acid[i].cprob = p * fastaLookupScale
   134  		if i > 0 {
   135  			acid[i-1].next = &acid[i]
   136  		}
   137  	}
   138  	acid[len(acid)-1].cprob = 1.0 * fastaLookupScale
   139  
   140  	j := 0
   141  	for i := range lookup {
   142  		for acid[j].cprob < float64(i) {
   143  			j++
   144  		}
   145  		lookup[i] = &acid[j]
   146  	}
   147  
   148  	return &lookup
   149  }
   150  
   151  func fastaRandom(out *fastaBuffer, acid []fastaAcid, n int) {
   152  	const (
   153  		IM = 139968
   154  		IA = 3877
   155  		IC = 29573
   156  	)
   157  	lookup := fastaComputeLookup(acid)
   158  	for n > 0 {
   159  		m := n
   160  		if m > fastaLine {
   161  			m = fastaLine
   162  		}
   163  		buf := out.NextWrite(m + 1)
   164  		f := fastaLookupScale / IM
   165  		myrand := fastaRand
   166  		for i := 0; i < m; i++ {
   167  			myrand = (myrand*IA + IC) % IM
   168  			r := float64(int(myrand)) * f
   169  			a := lookup[int(r)]
   170  			for a.cprob < r {
   171  				a = a.next
   172  			}
   173  			buf[i] = a.sym
   174  		}
   175  		fastaRand = myrand
   176  		buf[m] = '\n'
   177  		n -= m
   178  	}
   179  }
   180  

View as plain text