p256_ppc64le.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ppc64le
     6  
     7  package elliptic
     8  
     9  import (
    10  	"crypto/subtle"
    11  	"encoding/binary"
    12  	"math/big"
    13  )
    14  
    15  // This was ported from the s390x implementation for ppc64le.
    16  // Some hints are included here for changes that should be
    17  // in the big endian ppc64 implementation, however more
    18  // investigation and testing is needed for the ppc64 big
    19  // endian version to work.
    20  type p256CurveFast struct {
    21  	*CurveParams
    22  }
    23  
    24  type p256Point struct {
    25  	x [32]byte
    26  	y [32]byte
    27  	z [32]byte
    28  }
    29  
    30  var (
    31  	p256        Curve
    32  	p256PreFast *[37][64]p256Point
    33  )
    34  
    35  func initP256Arch() {
    36  	p256 = p256CurveFast{p256Params}
    37  	initTable()
    38  	return
    39  }
    40  
    41  func (curve p256CurveFast) Params() *CurveParams {
    42  	return curve.CurveParams
    43  }
    44  
    45  // Functions implemented in p256_asm_ppc64le.s
    46  // Montgomery multiplication modulo P256
    47  //
    48  //go:noescape
    49  func p256MulAsm(res, in1, in2 []byte)
    50  
    51  // Montgomery square modulo P256
    52  //
    53  func p256Sqr(res, in []byte) {
    54  	p256MulAsm(res, in, in)
    55  }
    56  
    57  // Montgomery multiplication by 1
    58  //
    59  //go:noescape
    60  func p256FromMont(res, in []byte)
    61  
    62  // iff cond == 1  val <- -val
    63  //
    64  //go:noescape
    65  func p256NegCond(val *p256Point, cond int)
    66  
    67  // if cond == 0 res <- b; else res <- a
    68  //
    69  //go:noescape
    70  func p256MovCond(res, a, b *p256Point, cond int)
    71  
    72  // Constant time table access
    73  //
    74  //go:noescape
    75  func p256Select(point *p256Point, table []p256Point, idx int)
    76  
    77  //
    78  //go:noescape
    79  func p256SelectBase(point *p256Point, table []p256Point, idx int)
    80  
    81  // Point add with P2 being affine point
    82  // If sign == 1 -> P2 = -P2
    83  // If sel == 0 -> P3 = P1
    84  // if zero == 0 -> P3 = P2
    85  //
    86  //go:noescape
    87  func p256PointAddAffineAsm(res, in1, in2 *p256Point, sign, sel, zero int)
    88  
    89  // Point add
    90  //
    91  //go:noescape
    92  func p256PointAddAsm(res, in1, in2 *p256Point) int
    93  
    94  //
    95  //go:noescape
    96  func p256PointDoubleAsm(res, in *p256Point)
    97  
    98  // The result should be a slice in LE order, but the slice
    99  // from big.Bytes is in BE order.
   100  // TODO: For big endian implementation, do not reverse bytes.
   101  //
   102  func fromBig(big *big.Int) []byte {
   103  	// This could be done a lot more efficiently...
   104  	res := big.Bytes()
   105  	t := make([]byte, 32)
   106  	if len(res) < 32 {
   107  		copy(t[32-len(res):], res)
   108  	} else if len(res) == 32 {
   109  		copy(t, res)
   110  	} else {
   111  		copy(t, res[len(res)-32:])
   112  	}
   113  	p256ReverseBytes(t, t)
   114  	return t
   115  }
   116  
   117  // p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar
   118  // is equal or greater than the order of the group, it's reduced modulo that order.
   119  func p256GetMultiplier(in []byte) []byte {
   120  	n := new(big.Int).SetBytes(in)
   121  
   122  	if n.Cmp(p256Params.N) >= 0 {
   123  		n.Mod(n, p256Params.N)
   124  	}
   125  	return fromBig(n)
   126  }
   127  
   128  // p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the
   129  // underlying field of the curve. (See initP256 for the value.) Thus rr here is
   130  // R×R mod p. See comment in Inverse about how this is used.
   131  // TODO: For big endian implementation, the bytes in these slices should be in reverse order,
   132  // as found in the s390x implementation.
   133  var rr = []byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0, 0xff, 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00}
   134  
   135  // (This is one, in the Montgomery domain.)
   136  var one = []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}
   137  
   138  func maybeReduceModP(in *big.Int) *big.Int {
   139  	if in.Cmp(p256Params.P) < 0 {
   140  		return in
   141  	}
   142  	return new(big.Int).Mod(in, p256Params.P)
   143  }
   144  
   145  // p256ReverseBytes copies the first 32 bytes from in to res in reverse order.
   146  func p256ReverseBytes(res, in []byte) {
   147  	// remove bounds check
   148  	in = in[:32]
   149  	res = res[:32]
   150  
   151  	// Load in reverse order
   152  	a := binary.BigEndian.Uint64(in[0:])
   153  	b := binary.BigEndian.Uint64(in[8:])
   154  	c := binary.BigEndian.Uint64(in[16:])
   155  	d := binary.BigEndian.Uint64(in[24:])
   156  
   157  	// Store in normal order
   158  	binary.LittleEndian.PutUint64(res[0:], d)
   159  	binary.LittleEndian.PutUint64(res[8:], c)
   160  	binary.LittleEndian.PutUint64(res[16:], b)
   161  	binary.LittleEndian.PutUint64(res[24:], a)
   162  }
   163  
   164  func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
   165  	var r1, r2 p256Point
   166  
   167  	scalarReduced := p256GetMultiplier(baseScalar)
   168  	r1IsInfinity := scalarIsZero(scalarReduced)
   169  	r1.p256BaseMult(scalarReduced)
   170  
   171  	copy(r2.x[:], fromBig(maybeReduceModP(bigX)))
   172  	copy(r2.y[:], fromBig(maybeReduceModP(bigY)))
   173  	copy(r2.z[:], one)
   174  	p256MulAsm(r2.x[:], r2.x[:], rr[:])
   175  	p256MulAsm(r2.y[:], r2.y[:], rr[:])
   176  
   177  	scalarReduced = p256GetMultiplier(scalar)
   178  	r2IsInfinity := scalarIsZero(scalarReduced)
   179  	r2.p256ScalarMult(scalarReduced)
   180  
   181  	var sum, double p256Point
   182  	pointsEqual := p256PointAddAsm(&sum, &r1, &r2)
   183  	p256PointDoubleAsm(&double, &r1)
   184  	p256MovCond(&sum, &double, &sum, pointsEqual)
   185  	p256MovCond(&sum, &r1, &sum, r2IsInfinity)
   186  	p256MovCond(&sum, &r2, &sum, r1IsInfinity)
   187  	return sum.p256PointToAffine()
   188  }
   189  
   190  func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
   191  	var r p256Point
   192  	reducedScalar := p256GetMultiplier(scalar)
   193  	r.p256BaseMult(reducedScalar)
   194  	return r.p256PointToAffine()
   195  }
   196  
   197  func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
   198  	scalarReduced := p256GetMultiplier(scalar)
   199  	var r p256Point
   200  	copy(r.x[:], fromBig(maybeReduceModP(bigX)))
   201  	copy(r.y[:], fromBig(maybeReduceModP(bigY)))
   202  	copy(r.z[:], one)
   203  	p256MulAsm(r.x[:], r.x[:], rr[:])
   204  	p256MulAsm(r.y[:], r.y[:], rr[:])
   205  	r.p256ScalarMult(scalarReduced)
   206  	return r.p256PointToAffine()
   207  }
   208  
   209  func scalarIsZero(scalar []byte) int {
   210  	// If any byte is not zero, return 0.
   211  	// Check for -0.... since that appears to compare to 0.
   212  	b := byte(0)
   213  	for _, s := range scalar {
   214  		b |= s
   215  	}
   216  	return subtle.ConstantTimeByteEq(b, 0)
   217  }
   218  
   219  func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
   220  	zInv := make([]byte, 32)
   221  	zInvSq := make([]byte, 32)
   222  
   223  	p256Inverse(zInv, p.z[:])
   224  	p256Sqr(zInvSq, zInv)
   225  	p256MulAsm(zInv, zInv, zInvSq)
   226  
   227  	p256MulAsm(zInvSq, p.x[:], zInvSq)
   228  	p256MulAsm(zInv, p.y[:], zInv)
   229  
   230  	p256FromMont(zInvSq, zInvSq)
   231  	p256FromMont(zInv, zInv)
   232  
   233  	// SetBytes expects a slice in big endian order,
   234  	// since ppc64le is little endian, reverse the bytes.
   235  	// TODO: For big endian, bytes don't need to be reversed.
   236  	p256ReverseBytes(zInvSq, zInvSq)
   237  	p256ReverseBytes(zInv, zInv)
   238  	rx := new(big.Int).SetBytes(zInvSq)
   239  	ry := new(big.Int).SetBytes(zInv)
   240  	return rx, ry
   241  }
   242  
   243  // p256Inverse sets out to in^-1 mod p.
   244  func p256Inverse(out, in []byte) {
   245  	var stack [6 * 32]byte
   246  	p2 := stack[32*0 : 32*0+32]
   247  	p4 := stack[32*1 : 32*1+32]
   248  	p8 := stack[32*2 : 32*2+32]
   249  	p16 := stack[32*3 : 32*3+32]
   250  	p32 := stack[32*4 : 32*4+32]
   251  
   252  	p256Sqr(out, in)
   253  	p256MulAsm(p2, out, in) // 3*p
   254  
   255  	p256Sqr(out, p2)
   256  	p256Sqr(out, out)
   257  	p256MulAsm(p4, out, p2) // f*p
   258  
   259  	p256Sqr(out, p4)
   260  	p256Sqr(out, out)
   261  	p256Sqr(out, out)
   262  	p256Sqr(out, out)
   263  	p256MulAsm(p8, out, p4) // ff*p
   264  
   265  	p256Sqr(out, p8)
   266  
   267  	for i := 0; i < 7; i++ {
   268  		p256Sqr(out, out)
   269  	}
   270  	p256MulAsm(p16, out, p8) // ffff*p
   271  
   272  	p256Sqr(out, p16)
   273  	for i := 0; i < 15; i++ {
   274  		p256Sqr(out, out)
   275  	}
   276  	p256MulAsm(p32, out, p16) // ffffffff*p
   277  
   278  	p256Sqr(out, p32)
   279  
   280  	for i := 0; i < 31; i++ {
   281  		p256Sqr(out, out)
   282  	}
   283  	p256MulAsm(out, out, in)
   284  
   285  	for i := 0; i < 32*4; i++ {
   286  		p256Sqr(out, out)
   287  	}
   288  	p256MulAsm(out, out, p32)
   289  
   290  	for i := 0; i < 32; i++ {
   291  		p256Sqr(out, out)
   292  	}
   293  	p256MulAsm(out, out, p32)
   294  
   295  	for i := 0; i < 16; i++ {
   296  		p256Sqr(out, out)
   297  	}
   298  	p256MulAsm(out, out, p16)
   299  
   300  	for i := 0; i < 8; i++ {
   301  		p256Sqr(out, out)
   302  	}
   303  	p256MulAsm(out, out, p8)
   304  
   305  	p256Sqr(out, out)
   306  	p256Sqr(out, out)
   307  	p256Sqr(out, out)
   308  	p256Sqr(out, out)
   309  	p256MulAsm(out, out, p4)
   310  
   311  	p256Sqr(out, out)
   312  	p256Sqr(out, out)
   313  	p256MulAsm(out, out, p2)
   314  
   315  	p256Sqr(out, out)
   316  	p256Sqr(out, out)
   317  	p256MulAsm(out, out, in)
   318  }
   319  
   320  func boothW5(in uint) (int, int) {
   321  	var s uint = ^((in >> 5) - 1)
   322  	var d uint = (1 << 6) - in - 1
   323  	d = (d & s) | (in & (^s))
   324  	d = (d >> 1) + (d & 1)
   325  	return int(d), int(s & 1)
   326  }
   327  
   328  func boothW6(in uint) (int, int) {
   329  	var s uint = ^((in >> 6) - 1)
   330  	var d uint = (1 << 7) - in - 1
   331  	d = (d & s) | (in & (^s))
   332  	d = (d >> 1) + (d & 1)
   333  	return int(d), int(s & 1)
   334  }
   335  
   336  func boothW7(in uint) (int, int) {
   337  	var s uint = ^((in >> 7) - 1)
   338  	var d uint = (1 << 8) - in - 1
   339  	d = (d & s) | (in & (^s))
   340  	d = (d >> 1) + (d & 1)
   341  	return int(d), int(s & 1)
   342  }
   343  
   344  func initTable() {
   345  
   346  	p256PreFast = new([37][64]p256Point)
   347  
   348  	// TODO: For big endian, these slices should be in reverse byte order,
   349  	// as found in the s390x implementation.
   350  	basePoint := p256Point{
   351  		x: [32]byte{0x3c, 0x14, 0xa9, 0x18, 0xd4, 0x30, 0xe7, 0x79, 0x01, 0xb6, 0xed, 0x5f, 0xfc, 0x95, 0xba, 0x75,
   352  			0x10, 0x25, 0x62, 0x77, 0x2b, 0x73, 0xfb, 0x79, 0xc6, 0x55, 0x37, 0xa5, 0x76, 0x5f, 0x90, 0x18}, //(p256.x*2^256)%p
   353  		y: [32]byte{0x0a, 0x56, 0x95, 0xce, 0x57, 0x53, 0xf2, 0xdd, 0x5c, 0xe4, 0x19, 0xba, 0xe4, 0xb8, 0x4a, 0x8b,
   354  			0x25, 0xf3, 0x21, 0xdd, 0x88, 0x86, 0xe8, 0xd2, 0x85, 0x5d, 0x88, 0x25, 0x18, 0xff, 0x71, 0x85}, //(p256.y*2^256)%p
   355  		z: [32]byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
   356  			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, //(p256.z*2^256)%p
   357  
   358  	}
   359  
   360  	t1 := new(p256Point)
   361  	t2 := new(p256Point)
   362  	*t2 = basePoint
   363  
   364  	zInv := make([]byte, 32)
   365  	zInvSq := make([]byte, 32)
   366  	for j := 0; j < 64; j++ {
   367  		*t1 = *t2
   368  		for i := 0; i < 37; i++ {
   369  			// The window size is 7 so we need to double 7 times.
   370  			if i != 0 {
   371  				for k := 0; k < 7; k++ {
   372  					p256PointDoubleAsm(t1, t1)
   373  				}
   374  			}
   375  			// Convert the point to affine form. (Its values are
   376  			// still in Montgomery form however.)
   377  			p256Inverse(zInv, t1.z[:])
   378  			p256Sqr(zInvSq, zInv)
   379  			p256MulAsm(zInv, zInv, zInvSq)
   380  
   381  			p256MulAsm(t1.x[:], t1.x[:], zInvSq)
   382  			p256MulAsm(t1.y[:], t1.y[:], zInv)
   383  
   384  			copy(t1.z[:], basePoint.z[:])
   385  			// Update the table entry
   386  			copy(p256PreFast[i][j].x[:], t1.x[:])
   387  			copy(p256PreFast[i][j].y[:], t1.y[:])
   388  		}
   389  		if j == 0 {
   390  			p256PointDoubleAsm(t2, &basePoint)
   391  		} else {
   392  			p256PointAddAsm(t2, t2, &basePoint)
   393  		}
   394  	}
   395  }
   396  
   397  func (p *p256Point) p256BaseMult(scalar []byte) {
   398  	// TODO: For big endian, the index should be 31 not 0.
   399  	wvalue := (uint(scalar[0]) << 1) & 0xff
   400  	sel, sign := boothW7(uint(wvalue))
   401  	p256SelectBase(p, p256PreFast[0][:], sel)
   402  	p256NegCond(p, sign)
   403  
   404  	copy(p.z[:], one[:])
   405  	var t0 p256Point
   406  
   407  	copy(t0.z[:], one[:])
   408  
   409  	index := uint(6)
   410  	zero := sel
   411  	for i := 1; i < 37; i++ {
   412  		// TODO: For big endian, use the same index values as found
   413  		// in the  s390x implementation.
   414  		if index < 247 {
   415  			wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0xff
   416  		} else {
   417  			wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0xff
   418  		}
   419  		index += 7
   420  		sel, sign = boothW7(uint(wvalue))
   421  		p256SelectBase(&t0, p256PreFast[i][:], sel)
   422  		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
   423  		zero |= sel
   424  	}
   425  }
   426  
   427  func (p *p256Point) p256ScalarMult(scalar []byte) {
   428  	// precomp is a table of precomputed points that stores powers of p
   429  	// from p^1 to p^16.
   430  	var precomp [16]p256Point
   431  	var t0, t1, t2, t3 p256Point
   432  
   433  	*&precomp[0] = *p
   434  	p256PointDoubleAsm(&t0, p)
   435  	p256PointDoubleAsm(&t1, &t0)
   436  	p256PointDoubleAsm(&t2, &t1)
   437  	p256PointDoubleAsm(&t3, &t2)
   438  	*&precomp[1] = t0
   439  	*&precomp[3] = t1
   440  	*&precomp[7] = t2
   441  	*&precomp[15] = t3
   442  
   443  	p256PointAddAsm(&t0, &t0, p)
   444  	p256PointAddAsm(&t1, &t1, p)
   445  	p256PointAddAsm(&t2, &t2, p)
   446  
   447  	*&precomp[2] = t0
   448  	*&precomp[4] = t1
   449  	*&precomp[8] = t2
   450  
   451  	p256PointDoubleAsm(&t0, &t0)
   452  	p256PointDoubleAsm(&t1, &t1)
   453  	*&precomp[5] = t0
   454  	*&precomp[9] = t1
   455  
   456  	p256PointAddAsm(&t2, &t0, p)
   457  	p256PointAddAsm(&t1, &t1, p)
   458  	*&precomp[6] = t2
   459  	*&precomp[10] = t1
   460  
   461  	p256PointDoubleAsm(&t0, &t0)
   462  	p256PointDoubleAsm(&t2, &t2)
   463  	*&precomp[11] = t0
   464  	*&precomp[13] = t2
   465  
   466  	p256PointAddAsm(&t0, &t0, p)
   467  	p256PointAddAsm(&t2, &t2, p)
   468  	*&precomp[12] = t0
   469  	*&precomp[14] = t2
   470  
   471  	// Start scanning the window from top bit
   472  	index := uint(254)
   473  	var sel, sign int
   474  
   475  	// TODO: For big endian, use index found in s390x implementation.
   476  	wvalue := (uint(scalar[index/8]) >> (index % 8)) & 0x3f
   477  	sel, _ = boothW5(uint(wvalue))
   478  	p256Select(p, precomp[:], sel)
   479  	zero := sel
   480  
   481  	for index > 4 {
   482  		index -= 5
   483  		p256PointDoubleAsm(p, p)
   484  		p256PointDoubleAsm(p, p)
   485  		p256PointDoubleAsm(p, p)
   486  		p256PointDoubleAsm(p, p)
   487  		p256PointDoubleAsm(p, p)
   488  
   489  		// TODO: For big endian, use index values as found in s390x implementation.
   490  		if index < 247 {
   491  			wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0x3f
   492  		} else {
   493  			wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0x3f
   494  		}
   495  
   496  		sel, sign = boothW5(uint(wvalue))
   497  
   498  		p256Select(&t0, precomp[:], sel)
   499  		p256NegCond(&t0, sign)
   500  		p256PointAddAsm(&t1, p, &t0)
   501  		p256MovCond(&t1, &t1, p, sel)
   502  		p256MovCond(p, &t1, &t0, zero)
   503  		zero |= sel
   504  	}
   505  
   506  	p256PointDoubleAsm(p, p)
   507  	p256PointDoubleAsm(p, p)
   508  	p256PointDoubleAsm(p, p)
   509  	p256PointDoubleAsm(p, p)
   510  	p256PointDoubleAsm(p, p)
   511  
   512  	// TODO: Use index for big endian as found in s390x implementation.
   513  	wvalue = (uint(scalar[0]) << 1) & 0x3f
   514  	sel, sign = boothW5(uint(wvalue))
   515  
   516  	p256Select(&t0, precomp[:], sel)
   517  	p256NegCond(&t0, sign)
   518  	p256PointAddAsm(&t1, p, &t0)
   519  	p256MovCond(&t1, &t1, p, sel)
   520  	p256MovCond(p, &t1, &t0, zero)
   521  }
   522
View as plain text