maphash.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package maphash provides hash functions on byte sequences.
     6  // These hash functions are intended to be used to implement hash tables or
     7  // other data structures that need to map arbitrary strings or byte
     8  // sequences to a uniform distribution on unsigned 64-bit integers.
     9  // Each different instance of a hash table or data structure should use its own Seed.
    10  //
    11  // The hash functions are not cryptographically secure.
    12  // (See crypto/sha256 and crypto/sha512 for cryptographic use.)
    13  //
    14  package maphash
    15  
    16  import (
    17  	"internal/unsafeheader"
    18  	"unsafe"
    19  )
    20  
    21  // A Seed is a random value that selects the specific hash function
    22  // computed by a Hash. If two Hashes use the same Seeds, they
    23  // will compute the same hash values for any given input.
    24  // If two Hashes use different Seeds, they are very likely to compute
    25  // distinct hash values for any given input.
    26  //
    27  // A Seed must be initialized by calling MakeSeed.
    28  // The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
    29  //
    30  // Each Seed value is local to a single process and cannot be serialized
    31  // or otherwise recreated in a different process.
    32  type Seed struct {
    33  	s uint64
    34  }
    35  
    36  // A Hash computes a seeded hash of a byte sequence.
    37  //
    38  // The zero Hash is a valid Hash ready to use.
    39  // A zero Hash chooses a random seed for itself during
    40  // the first call to a Reset, Write, Seed, or Sum64 method.
    41  // For control over the seed, use SetSeed.
    42  //
    43  // The computed hash values depend only on the initial seed and
    44  // the sequence of bytes provided to the Hash object, not on the way
    45  // in which the bytes are provided. For example, the three sequences
    46  //
    47  //     h.Write([]byte{'f','o','o'})
    48  //     h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
    49  //     h.WriteString("foo")
    50  //
    51  // all have the same effect.
    52  //
    53  // Hashes are intended to be collision-resistant, even for situations
    54  // where an adversary controls the byte sequences being hashed.
    55  //
    56  // A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
    57  // If multiple goroutines must compute the same seeded hash,
    58  // each can declare its own Hash and call SetSeed with a common Seed.
    59  type Hash struct {
    60  	_     [0]func()     // not comparable
    61  	seed  Seed          // initial seed used for this hash
    62  	state Seed          // current hash of all flushed bytes
    63  	buf   [bufSize]byte // unflushed byte buffer
    64  	n     int           // number of unflushed bytes
    65  }
    66  
    67  // bufSize is the size of the Hash write buffer.
    68  // The buffer ensures that writes depend only on the sequence of bytes,
    69  // not the sequence of WriteByte/Write/WriteString calls,
    70  // by always calling rthash with a full buffer (except for the tail).
    71  const bufSize = 128
    72  
    73  // initSeed seeds the hash if necessary.
    74  // initSeed is called lazily before any operation that actually uses h.seed/h.state.
    75  // Note that this does not include Write/WriteByte/WriteString in the case
    76  // where they only add to h.buf. (If they write too much, they call h.flush,
    77  // which does call h.initSeed.)
    78  func (h *Hash) initSeed() {
    79  	if h.seed.s == 0 {
    80  		seed := MakeSeed()
    81  		h.seed = seed
    82  		h.state = seed
    83  	}
    84  }
    85  
    86  // WriteByte adds b to the sequence of bytes hashed by h.
    87  // It never fails; the error result is for implementing io.ByteWriter.
    88  func (h *Hash) WriteByte(b byte) error {
    89  	if h.n == len(h.buf) {
    90  		h.flush()
    91  	}
    92  	h.buf[h.n] = b
    93  	h.n++
    94  	return nil
    95  }
    96  
    97  // Write adds b to the sequence of bytes hashed by h.
    98  // It always writes all of b and never fails; the count and error result are for implementing io.Writer.
    99  func (h *Hash) Write(b []byte) (int, error) {
   100  	size := len(b)
   101  	// Deal with bytes left over in h.buf.
   102  	// h.n <= bufSize is always true.
   103  	// Checking it is ~free and it lets the compiler eliminate a bounds check.
   104  	if h.n > 0 && h.n <= bufSize {
   105  		k := copy(h.buf[h.n:], b)
   106  		h.n += k
   107  		if h.n < bufSize {
   108  			// Copied the entirety of b to h.buf.
   109  			return size, nil
   110  		}
   111  		b = b[k:]
   112  		h.flush()
   113  		// No need to set h.n = 0 here; it happens just before exit.
   114  	}
   115  	// Process as many full buffers as possible, without copying, and calling initSeed only once.
   116  	if len(b) > bufSize {
   117  		h.initSeed()
   118  		for len(b) > bufSize {
   119  			h.state.s = rthash(&b[0], bufSize, h.state.s)
   120  			b = b[bufSize:]
   121  		}
   122  	}
   123  	// Copy the tail.
   124  	copy(h.buf[:], b)
   125  	h.n = len(b)
   126  	return size, nil
   127  }
   128  
   129  // WriteString adds the bytes of s to the sequence of bytes hashed by h.
   130  // It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
   131  func (h *Hash) WriteString(s string) (int, error) {
   132  	// WriteString mirrors Write. See Write for comments.
   133  	size := len(s)
   134  	if h.n > 0 && h.n <= bufSize {
   135  		k := copy(h.buf[h.n:], s)
   136  		h.n += k
   137  		if h.n < bufSize {
   138  			return size, nil
   139  		}
   140  		s = s[k:]
   141  		h.flush()
   142  	}
   143  	if len(s) > bufSize {
   144  		h.initSeed()
   145  		for len(s) > bufSize {
   146  			ptr := (*byte)((*unsafeheader.String)(unsafe.Pointer(&s)).Data)
   147  			h.state.s = rthash(ptr, bufSize, h.state.s)
   148  			s = s[bufSize:]
   149  		}
   150  	}
   151  	copy(h.buf[:], s)
   152  	h.n = len(s)
   153  	return size, nil
   154  }
   155  
   156  // Seed returns h's seed value.
   157  func (h *Hash) Seed() Seed {
   158  	h.initSeed()
   159  	return h.seed
   160  }
   161  
   162  // SetSeed sets h to use seed, which must have been returned by MakeSeed
   163  // or by another Hash's Seed method.
   164  // Two Hash objects with the same seed behave identically.
   165  // Two Hash objects with different seeds will very likely behave differently.
   166  // Any bytes added to h before this call will be discarded.
   167  func (h *Hash) SetSeed(seed Seed) {
   168  	if seed.s == 0 {
   169  		panic("maphash: use of uninitialized Seed")
   170  	}
   171  	h.seed = seed
   172  	h.state = seed
   173  	h.n = 0
   174  }
   175  
   176  // Reset discards all bytes added to h.
   177  // (The seed remains the same.)
   178  func (h *Hash) Reset() {
   179  	h.initSeed()
   180  	h.state = h.seed
   181  	h.n = 0
   182  }
   183  
   184  // precondition: buffer is full.
   185  func (h *Hash) flush() {
   186  	if h.n != len(h.buf) {
   187  		panic("maphash: flush of partially full buffer")
   188  	}
   189  	h.initSeed()
   190  	h.state.s = rthash(&h.buf[0], h.n, h.state.s)
   191  	h.n = 0
   192  }
   193  
   194  // Sum64 returns h's current 64-bit value, which depends on
   195  // h's seed and the sequence of bytes added to h since the
   196  // last call to Reset or SetSeed.
   197  //
   198  // All bits of the Sum64 result are close to uniformly and
   199  // independently distributed, so it can be safely reduced
   200  // by using bit masking, shifting, or modular arithmetic.
   201  func (h *Hash) Sum64() uint64 {
   202  	h.initSeed()
   203  	return rthash(&h.buf[0], h.n, h.state.s)
   204  }
   205  
   206  // MakeSeed returns a new random seed.
   207  func MakeSeed() Seed {
   208  	var s1, s2 uint64
   209  	for {
   210  		s1 = uint64(runtime_fastrand())
   211  		s2 = uint64(runtime_fastrand())
   212  		// We use seed 0 to indicate an uninitialized seed/hash,
   213  		// so keep trying until we get a non-zero seed.
   214  		if s1|s2 != 0 {
   215  			break
   216  		}
   217  	}
   218  	return Seed{s: s1<<32 + s2}
   219  }
   220  
   221  //go:linkname runtime_fastrand runtime.fastrand
   222  func runtime_fastrand() uint32
   223  
   224  func rthash(ptr *byte, len int, seed uint64) uint64 {
   225  	if len == 0 {
   226  		return seed
   227  	}
   228  	// The runtime hasher only works on uintptr. For 64-bit
   229  	// architectures, we use the hasher directly. Otherwise,
   230  	// we use two parallel hashers on the lower and upper 32 bits.
   231  	if unsafe.Sizeof(uintptr(0)) == 8 {
   232  		return uint64(runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len)))
   233  	}
   234  	lo := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len))
   235  	hi := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed>>32), uintptr(len))
   236  	return uint64(hi)<<32 | uint64(lo)
   237  }
   238  
   239  //go:linkname runtime_memhash runtime.memhash
   240  //go:noescape
   241  func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
   242  
   243  // Sum appends the hash's current 64-bit value to b.
   244  // It exists for implementing hash.Hash.
   245  // For direct calls, it is more efficient to use Sum64.
   246  func (h *Hash) Sum(b []byte) []byte {
   247  	x := h.Sum64()
   248  	return append(b,
   249  		byte(x>>0),
   250  		byte(x>>8),
   251  		byte(x>>16),
   252  		byte(x>>24),
   253  		byte(x>>32),
   254  		byte(x>>40),
   255  		byte(x>>48),
   256  		byte(x>>56))
   257  }
   258  
   259  // Size returns h's hash value size, 8 bytes.
   260  func (h *Hash) Size() int { return 8 }
   261  
   262  // BlockSize returns h's block size.
   263  func (h *Hash) BlockSize() int { return len(h.buf) }
   264
View as plain text