p256_asm_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains constant-time, 64-bit assembly implementation of
     6  // P256. The optimizations performed here are described in detail in:
     7  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8  //                          256-bit primes"
     9  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10  // https://eprint.iacr.org/2013/816.pdf
    11  
    12  #include "textflag.h"
    13  
    14  #define res_ptr R0	// pointer to the output buffer
    15  #define a_ptr R1	// pointer to the first input
    16  #define b_ptr R2	// pointer to the second input; p256Sqr and p256OrdSqr keep the repeat count n here
    17  
    18  #define acc0 R3	// acc0..acc3: low four limbs of the accumulator, later the reduced result
    19  #define acc1 R4
    20  #define acc2 R5
    21  #define acc3 R6
    22  
    23  #define acc4 R7	// acc4..acc7: limbs collecting the upper half of a double-width product
    24  #define acc5 R8
    25  #define acc6 R9
    26  #define acc7 R10
    27  #define t0 R11	// t0..t3: scratch registers
    28  #define t1 R12
    29  #define t2 R13
    30  #define t3 R14
    31  #define const0 R15	// const0/const1: p256const0/p256const1 in the field routines, p256ord limbs 0/1 in the Ord routines
    32  #define const1 R16
    33  
    34  #define hlp0 R17	// extra scratch register
    35  #define hlp1 res_ptr	// same register as res_ptr (R0): code that uses hlp1 must load res afterwards
    36  
    37  #define x0 R19	// x0..x3: first operand of the *Internal routines
    38  #define x1 R20
    39  #define x2 R21
    40  #define x3 R22
    41  #define y0 R23	// y0..y3: second operand; p256SqrInternal and p256MulInternal return their result here
    42  #define y1 R24
    43  #define y2 R25
    44  #define y3 R26
    45  
    46  #define const2 t2	// const2/const3 are the same registers as t2/t3; the Ord routines keep p256ord limbs 2/3 here
    47  #define const3 t3
    48  
    49  DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff	// limb 1 of the field prime p (limbs 0 and 2 are written as the immediates $-1 and $0)
    50  DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001	// limb 3 of the field prime p
    51  DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f	// multiplier the Ord reduction steps apply to the lowest limb to get the per-step factor
    52  DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551	// p256ord: modulus of p256OrdSqr/p256OrdMul, four limbs, least significant first
    53  DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    54  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    55  DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
    56  DATA p256one<>+0x00(SB)/8, $0x0000000000000001	// p256one: 2^256 mod p, four limbs, least significant first; p256PointAddAffineAsm uses it for "z3 = 1"
    57  DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    58  DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    59  DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    60  GLOBL p256const0<>(SB), 8, $8	// flag value 8 is RODATA in textflag.h
    61  GLOBL p256const1<>(SB), 8, $8
    62  GLOBL p256ordK0<>(SB), 8, $8
    63  GLOBL p256ord<>(SB), 8, $32
    64  GLOBL p256one<>(SB), 8, $32
    65  
    66  /* ---------------------------------------*/
    67  // func p256LittleToBig(res []byte, in []uint64)
        // Both arguments are slices, so res and in sit at the frame offsets that
        // p256BigToLittle reads; branch there and let it do the conversion.
    68  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    69  	B	·p256BigToLittle(SB)
    70  /* ---------------------------------------*/
    71  // func p256BigToLittle(res []uint64, in []byte)
        // Reads 32 bytes from in as four 64-bit words, byte-swaps each word and
        // writes the words to res in reverse order. Every load is issued before
        // the first store.
    72  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    73  	MOVD	in+24(FP), a_ptr
    74  	MOVD	res+0(FP), res_ptr
    75  
    76  	LDP	16(a_ptr), (acc2, acc3)
    77  	LDP	0(a_ptr), (acc0, acc1)
    78  
    79  	REV	acc3, acc3
    80  	REV	acc2, acc2
    81  	REV	acc1, acc1
    82  	REV	acc0, acc0
    83  
    84  	STP	(acc1, acc0), 16(res_ptr)
    85  	STP	(acc3, acc2), 0(res_ptr)
    86  	RET
    87  /* ---------------------------------------*/
    88  // func p256MovCond(res, a, b []uint64, cond int)
    89  // res = a when cond != 0, res = b when cond == 0 (12 limbs, 96 bytes).
        // Both sources are always read in full; the choice is made per limb with
        // CSEL. The copy runs in two 48-byte halves: the second half is loaded
        // only after the first half has been stored.
    90  TEXT ·p256MovCond(SB),NOSPLIT,$0
    91  	MOVD	cond+72(FP), R3
    92  	MOVD	b+48(FP), b_ptr
    93  	MOVD	a+24(FP), a_ptr
    94  	MOVD	res+0(FP), res_ptr
    95  
    96  	// Two remarks:
    97  	// 1) NEON is worth revisiting once support for it is better
    98  	// 2) CSEL might not be constant time on every ARM processor
    99  	CMP	$0, R3
   100  	LDP	0*16(b_ptr), (R16, R17)
   101  	LDP	1*16(b_ptr), (R19, R20)
   102  	LDP	2*16(b_ptr), (R21, R22)
   103  	LDP	0*16(a_ptr), (R4, R5)
   104  	LDP	1*16(a_ptr), (R6, R7)
   105  	LDP	2*16(a_ptr), (R8, R9)
   106  	CSEL	NE, R4, R16, R4
   107  	CSEL	NE, R5, R17, R5
   108  	STP	(R4, R5), 0*16(res_ptr)
   109  	CSEL	NE, R6, R19, R6
   110  	CSEL	NE, R7, R20, R7
   111  	STP	(R6, R7), 1*16(res_ptr)
   112  	CSEL	NE, R8, R21, R8
   113  	CSEL	NE, R9, R22, R9
   114  	STP	(R8, R9), 2*16(res_ptr)
   115  
   116  	LDP	3*16(b_ptr), (R16, R17)
   117  	LDP	4*16(b_ptr), (R19, R20)
   118  	LDP	5*16(b_ptr), (R21, R22)
   119  	LDP	3*16(a_ptr), (R4, R5)
   120  	LDP	4*16(a_ptr), (R6, R7)
   121  	LDP	5*16(a_ptr), (R8, R9)
   122  	CSEL	NE, R4, R16, R4	// flags are still those of the CMP above
   123  	CSEL	NE, R5, R17, R5
   124  	STP	(R4, R5), 3*16(res_ptr)
   125  	CSEL	NE, R6, R19, R6
   126  	CSEL	NE, R7, R20, R7
   127  	STP	(R6, R7), 4*16(res_ptr)
   128  	CSEL	NE, R8, R21, R8
   129  	CSEL	NE, R9, R22, R9
   130  	STP	(R8, R9), 5*16(res_ptr)
   131  
   132  	RET
   133  /* ---------------------------------------*/
   134  // func p256NegCond(val []uint64, cond int)
        // Replaces the four limbs at val with p - val when cond != 0 and leaves
        // them as they are when cond == 0. The difference is always computed and
        // the choice is made with CSEL. The difference is stored without a further
        // reduction, so val = 0 with cond != 0 stores p itself.
   135  TEXT ·p256NegCond(SB),NOSPLIT,$0
   136  	MOVD	cond+24(FP), hlp0
   137  	MOVD	val+0(FP), res_ptr
   138  
   139  	// Fetch the current value
   140  	LDP	0*16(res_ptr), (t0, t1)
   141  	LDP	1*16(res_ptr), (t2, t3)
   142  	// acc = p
   143  	MOVD	p256const1<>(SB), acc3
   144  	MOVD	ZR, acc2
   145  	MOVD	p256const0<>(SB), acc1
   146  	MOVD	$-1, acc0
   147  	// acc = p - val, computed whatever cond is
   148  	SUBS	t0, acc0, acc0
   149  	SBCS	t1, acc1, acc1
   150  	SBCS	t2, acc2, acc2
   151  	SBC	t3, acc3, acc3
   152  	// cond != 0 keeps the difference, cond == 0 puts val back
   153  	CMP	$0, hlp0
   154  	CSEL	NE, acc0, t0, acc0
   155  	CSEL	NE, acc1, t1, acc1
   156  	CSEL	NE, acc2, t2, acc2
   157  	CSEL	NE, acc3, t3, acc3
   158  	// Write back in place
   159  	STP	(acc2, acc3), 1*16(res_ptr)
   160  	STP	(acc0, acc1), 0*16(res_ptr)
   161  
   162  	RET
   163  /* ---------------------------------------*/
   164  // func p256Sqr(res, in []uint64, n int)
        // Runs p256SqrInternal n times, feeding each result back in as the next
        // input, and stores the final four limbs at res.
        // n must be at least 1: the counter is decremented before it is tested.
   165  TEXT ·p256Sqr(SB),NOSPLIT,$0
   166  	MOVD	n+48(FP), b_ptr
   167  	MOVD	in+24(FP), a_ptr
   168  	MOVD	res+0(FP), res_ptr
   169  
   170  	MOVD	p256const1<>(SB), const1
   171  	MOVD	p256const0<>(SB), const0
   172  
   173  	LDP	1*16(a_ptr), (x2, x3)
   174  	LDP	0*16(a_ptr), (x0, x1)
   175  
   176  sqrAgain:
   177  	CALL	p256SqrInternal<>(SB)
   178  	SUB	$1, b_ptr	// p256SqrInternal leaves b_ptr untouched
   179  	MOVD	y3, x3
   180  	MOVD	y2, x2
   181  	MOVD	y1, x1
   182  	MOVD	y0, x0
   183  	CBNZ	b_ptr, sqrAgain
   184  
   185  	STP	(x2, x3), 1*16(res_ptr)	// x holds a copy of the last result
   186  	STP	(x0, x1), 0*16(res_ptr)
   187  	RET
   188  /* ---------------------------------------*/
   189  // func p256Mul(res, in1, in2 []uint64)
        // Loads in1 into x and in2 into y, calls p256MulInternal and stores the
        // four result limbs it leaves in y at res.
   190  TEXT ·p256Mul(SB),NOSPLIT,$0
   191  	MOVD	in2+48(FP), b_ptr
   192  	MOVD	in1+24(FP), a_ptr
   193  	MOVD	res+0(FP), res_ptr
   194  
   195  	LDP	0*16(b_ptr), (y0, y1)
   196  	LDP	1*16(b_ptr), (y2, y3)
   197  
   198  	LDP	0*16(a_ptr), (x0, x1)
   199  	LDP	1*16(a_ptr), (x2, x3)
   200  
   201  	MOVD	p256const0<>(SB), const0
   202  	MOVD	p256const1<>(SB), const1
   203  
   204  	CALL	p256MulInternal<>(SB)
   205  
   206  	STP	(y2, y3), 1*16(res_ptr)
   207  	STP	(y0, y1), 0*16(res_ptr)
   208  	RET
   209  /* ---------------------------------------*/
   210  // func p256FromMont(res, in []uint64)
        // Runs the four limbs at in through four reduction steps and one
        // conditional subtraction of p, then stores the result at res.
        //
        // Each step adds (lowest limb)·p to the accumulator, which clears that limb:
        //   - the -limb term cancels the limb itself,
        //   - limb·2^96 is split into limb<<32 (next limb) and limb>>32 (limb after),
        //   - limb·(2^256 - 2^224 + 2^192) = limb·const1·2^192 is the MUL/UMULH pair.
        // The cleared register is then reused as the new top limb.
   211  TEXT ·p256FromMont(SB),NOSPLIT,$0
   212  	MOVD	res+0(FP), res_ptr
   213  	MOVD	in+24(FP), a_ptr
   214  
   215  	MOVD	p256const0<>(SB), const0
   216  	MOVD	p256const1<>(SB), const1
   217  
   218  	LDP	0*16(a_ptr), (acc0, acc1)
   219  	LDP	1*16(a_ptr), (acc2, acc3)
   220  	// Only reduce, no multiplications are needed
   221  	// First reduction step
   222  	ADDS	acc0<<32, acc1, acc1	// low 32 bits of acc0, moved up by 32
   223  	LSR	$32, acc0, t0	// high 32 bits of acc0
   224  	MUL	acc0, const1, t1	// low half of acc0·const1
   225  	UMULH	acc0, const1, acc0	// high half; acc0 now serves as the top limb
   226  	ADCS	t0, acc2
   227  	ADCS	t1, acc3
   228  	ADC	$0, acc0
   229  	// Second reduction step
   230  	ADDS	acc1<<32, acc2, acc2
   231  	LSR	$32, acc1, t0
   232  	MUL	acc1, const1, t1
   233  	UMULH	acc1, const1, acc1
   234  	ADCS	t0, acc3
   235  	ADCS	t1, acc0
   236  	ADC	$0, acc1
   237  	// Third reduction step
   238  	ADDS	acc2<<32, acc3, acc3
   239  	LSR	$32, acc2, t0
   240  	MUL	acc2, const1, t1
   241  	UMULH	acc2, const1, acc2
   242  	ADCS	t0, acc0
   243  	ADCS	t1, acc1
   244  	ADC	$0, acc2
   245  	// Last reduction step
   246  	ADDS	acc3<<32, acc0, acc0
   247  	LSR	$32, acc3, t0
   248  	MUL	acc3, const1, t1
   249  	UMULH	acc3, const1, acc3
   250  	ADCS	t0, acc1
   251  	ADCS	t1, acc2
   252  	ADC	$0, acc3
   253  
   254  	SUBS	$-1, acc0, t0	// t = acc - p, limb by limb
   255  	SBCS	const0, acc1, t1
   256  	SBCS	$0, acc2, t2
   257  	SBCS	const1, acc3, t3
   258  
   259  	CSEL	CS, t0, acc0, acc0	// carry set = no borrow, i.e. acc >= p: take t
   260  	CSEL	CS, t1, acc1, acc1
   261  	CSEL	CS, t2, acc2, acc2
   262  	CSEL	CS, t3, acc3, acc3
   263  
   264  	STP	(acc0, acc1), 0*16(res_ptr)
   265  	STP	(acc2, acc3), 1*16(res_ptr)
   266  
   267  	RET
   268  /* ---------------------------------------*/
   269  // Constant time lookup in a table of 16 entries of 12 limbs (96 bytes) each.
   270  // Every entry is read; entry k, counting from 1, is kept when k == idx.
   271  // idx = 0 matches no entry and leaves the all-zero output (point at infinity).
   272  // func p256Select(point, table []uint64, idx int)
   273  TEXT ·p256Select(SB),NOSPLIT,$0
   274  	MOVD	point+0(FP), res_ptr
   275  	MOVD	table+24(FP), b_ptr
   276  	MOVD	idx+48(FP), const0
   277  
   278  	MOVD	ZR, t3
   279  	MOVD	ZR, t2
   280  	MOVD	ZR, t1
   281  	MOVD	ZR, t0
   282  	MOVD	ZR, y3
   283  	MOVD	ZR, y2
   284  	MOVD	ZR, y1
   285  	MOVD	ZR, y0
   286  	MOVD	ZR, x3
   287  	MOVD	ZR, x2
   288  	MOVD	ZR, x1
   289  	MOVD	ZR, x0
   290  
   291  	MOVD	ZR, const1
   292  
   293  selectNext:
   294  		ADD	$1, const1
   295  		CMP	const0, const1	// these flags drive every CSEL of this iteration
   296  		LDP.P	16(b_ptr), (acc0, acc1)
   297  		CSEL	NE, x0, acc0, x0
   298  		CSEL	NE, x1, acc1, x1
   299  		LDP.P	16(b_ptr), (acc2, acc3)
   300  		CSEL	NE, x2, acc2, x2
   301  		CSEL	NE, x3, acc3, x3
   302  		LDP.P	16(b_ptr), (acc4, acc5)
   303  		CSEL	NE, y0, acc4, y0
   304  		CSEL	NE, y1, acc5, y1
   305  		LDP.P	16(b_ptr), (acc6, acc7)
   306  		CSEL	NE, y2, acc6, y2
   307  		CSEL	NE, y3, acc7, y3
   308  		LDP.P	16(b_ptr), (acc0, acc1)
   309  		CSEL	NE, t0, acc0, t0
   310  		CSEL	NE, t1, acc1, t1
   311  		LDP.P	16(b_ptr), (acc2, acc3)
   312  		CSEL	NE, t2, acc2, t2
   313  		CSEL	NE, t3, acc3, t3
   314  
   315  		CMP	$16, const1
   316  		BNE	selectNext
   317  
   318  	STP	(t2, t3), 5*16(res_ptr)
   319  	STP	(t0, t1), 4*16(res_ptr)
   320  	STP	(y2, y3), 3*16(res_ptr)
   321  	STP	(y0, y1), 2*16(res_ptr)
   322  	STP	(x2, x3), 1*16(res_ptr)
   323  	STP	(x0, x1), 0*16(res_ptr)
   324  	RET
   325  /* ---------------------------------------*/
   326  // Constant time lookup in a base point table of 32 entries of 8 limbs (64 bytes) each.
        // Every entry is read; entry k, counting from 1, is kept when k == idx.
        // An idx that matches no entry leaves the first 8 limbs of point all zero.
   327  // func p256SelectBase(point *[12]uint64, table string, idx int)
   328  TEXT ·p256SelectBase(SB),NOSPLIT,$0
   329  	MOVD	point+0(FP), res_ptr
   330  	MOVD	table_base+8(FP), t1
   331  	MOVD	idx+24(FP), t0
   332  
   333  	MOVD	ZR, y3
   334  	MOVD	ZR, y2
   335  	MOVD	ZR, y1
   336  	MOVD	ZR, y0
   337  	MOVD	ZR, x3
   338  	MOVD	ZR, x2
   339  	MOVD	ZR, x1
   340  	MOVD	ZR, x0
   341  
   342  	MOVD	ZR, t2
   343  
   344  selectBaseNext:
   345  		ADD	$1, t2
   346  		CMP	t0, t2	// these flags drive every CSEL of this iteration
   347  		LDP.P	16(t1), (acc0, acc1)
   348  		CSEL	NE, x0, acc0, x0
   349  		CSEL	NE, x1, acc1, x1
   350  		LDP.P	16(t1), (acc2, acc3)
   351  		CSEL	NE, x2, acc2, x2
   352  		CSEL	NE, x3, acc3, x3
   353  		LDP.P	16(t1), (acc4, acc5)
   354  		CSEL	NE, y0, acc4, y0
   355  		CSEL	NE, y1, acc5, y1
   356  		LDP.P	16(t1), (acc6, acc7)
   357  		CSEL	NE, y2, acc6, y2
   358  		CSEL	NE, y3, acc7, y3
   359  
   360  		CMP	$32, t2
   361  		BNE	selectBaseNext
   362  
   363  	STP	(y2, y3), 3*16(res_ptr)
   364  	STP	(y0, y1), 2*16(res_ptr)
   365  	STP	(x2, x3), 1*16(res_ptr)
   366  	STP	(x0, x1), 0*16(res_ptr)
   367  	RET
   368  /* ---------------------------------------*/
   369  // func p256OrdSqr(res, in []uint64, n int)
        // Squares the four-limb value at in n times. After each squaring the
        // 8-limb product is brought back to four limbs modulo p256ord by four
        // word-wise reduction steps and one conditional subtraction. The final
        // value is stored at res.
        // n must be at least 1: the counter is decremented before it is tested.
        // hlp1 is the same register as res_ptr and holds p256ordK0 for the whole
        // loop, so res is loaded only after the loop. const2/const3 occupy t2/t3,
        // so the temporaries used here are t0, t1, hlp0 and y0.
   370  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   371  	MOVD	in+24(FP), a_ptr
   372  	MOVD	n+48(FP), b_ptr
   373  
   374  	MOVD	p256ordK0<>(SB), hlp1
   375  	LDP	p256ord<>+0x00(SB), (const0, const1)
   376  	LDP	p256ord<>+0x10(SB), (const2, const3)
   377  
   378  	LDP	0*16(a_ptr), (x0, x1)
   379  	LDP	1*16(a_ptr), (x2, x3)
   380  
   381  ordSqrLoop:
   382  	SUB	$1, b_ptr
   383  
   384  	// x[1:] * x[0]
   385  	MUL	x0, x1, acc1
   386  	UMULH	x0, x1, acc2
   387  
   388  	MUL	x0, x2, t0
   389  	ADDS	t0, acc2, acc2
   390  	UMULH	x0, x2, acc3
   391  
   392  	MUL	x0, x3, t0
   393  	ADCS	t0, acc3, acc3
   394  	UMULH	x0, x3, acc4
   395  	ADC	$0, acc4, acc4
   396  	// x[2:] * x[1]
   397  	MUL	x1, x2, t0
   398  	ADDS	t0, acc3
   399  	UMULH	x1, x2, t1
   400  	ADCS	t1, acc4
   401  	ADC	$0, ZR, acc5
   402  
   403  	MUL	x1, x3, t0
   404  	ADDS	t0, acc4
   405  	UMULH	x1, x3, t1
   406  	ADC	t1, acc5
   407  	// x[3] * x[2]
   408  	MUL	x2, x3, t0
   409  	ADDS	t0, acc5
   410  	UMULH	x2, x3, acc6
   411  	ADC	$0, acc6
   412  
   413  	MOVD	$0, acc7
   414  	// *2
   415  	ADDS	acc1, acc1
   416  	ADCS	acc2, acc2
   417  	ADCS	acc3, acc3
   418  	ADCS	acc4, acc4
   419  	ADCS	acc5, acc5
   420  	ADCS	acc6, acc6
   421  	ADC	$0, acc7
   422  	// Missing products
   423  	MUL	x0, x0, acc0
   424  	UMULH	x0, x0, t0
   425  	ADDS	t0, acc1, acc1
   426  
   427  	MUL	x1, x1, t0
   428  	ADCS	t0, acc2, acc2
   429  	UMULH	x1, x1, t1
   430  	ADCS	t1, acc3, acc3
   431  
   432  	MUL	x2, x2, t0
   433  	ADCS	t0, acc4, acc4
   434  	UMULH	x2, x2, t1
   435  	ADCS	t1, acc5, acc5
   436  
   437  	MUL	x3, x3, t0
   438  	ADCS	t0, acc6, acc6
   439  	UMULH	x3, x3, t1
   440  	ADC	t1, acc7, acc7
   441  	// First reduction step
   442  	MUL	acc0, hlp1, hlp0	// hlp0 = low 64 bits of acc0·K0: the multiple of ord added in this step
   443  
        	// NOTE(review): the next MUL uses hlp1 (K0), not hlp0. Only the carry of
        	// the ADDS is consumed (acc0 is overwritten by a UMULH below). This
        	// presumably relies on ord[0]·K0 being all ones modulo 2^64, which makes
        	// the carry equal to the one acc0 + lo(ord[0]·hlp0) would produce - confirm.
        	// The same pattern opens every reduction step in this file's Ord routines.
   444  	MUL	const0, hlp1, t0
   445  	ADDS	t0, acc0, acc0
   446  	UMULH	const0, hlp0, t1
   447  
   448  	MUL	const1, hlp0, t0
   449  	ADCS	t0, acc1, acc1
   450  	UMULH	const1, hlp0, y0
   451  
   452  	MUL	const2, hlp0, t0
   453  	ADCS	t0, acc2, acc2
   454  	UMULH	const2, hlp0, acc0
   455  
   456  	MUL	const3, hlp0, t0
   457  	ADCS	t0, acc3, acc3
   458  
   459  	UMULH	const3, hlp0, hlp0
   460  	ADC	$0, hlp0	// carry of the low-half chain joins the topmost high half
   461  
   462  	ADDS	t1, acc1, acc1	// now add the high halves, one limb further up
   463  	ADCS	y0, acc2, acc2
   464  	ADCS	acc0, acc3, acc3
   465  	ADC	$0, hlp0, acc0	// acc0 is reused as the new top limb
   466  	// Second reduction step
   467  	MUL	acc1, hlp1, hlp0
   468  
   469  	MUL	const0, hlp1, t0
   470  	ADDS	t0, acc1, acc1
   471  	UMULH	const0, hlp0, t1
   472  
   473  	MUL	const1, hlp0, t0
   474  	ADCS	t0, acc2, acc2
   475  	UMULH	const1, hlp0, y0
   476  
   477  	MUL	const2, hlp0, t0
   478  	ADCS	t0, acc3, acc3
   479  	UMULH	const2, hlp0, acc1
   480  
   481  	MUL	const3, hlp0, t0
   482  	ADCS	t0, acc0, acc0
   483  
   484  	UMULH	const3, hlp0, hlp0
   485  	ADC	$0, hlp0
   486  
   487  	ADDS	t1, acc2, acc2
   488  	ADCS	y0, acc3, acc3
   489  	ADCS	acc1, acc0, acc0
   490  	ADC	$0, hlp0, acc1
   491  	// Third reduction step
   492  	MUL	acc2, hlp1, hlp0
   493  
   494  	MUL	const0, hlp1, t0
   495  	ADDS	t0, acc2, acc2
   496  	UMULH	const0, hlp0, t1
   497  
   498  	MUL	const1, hlp0, t0
   499  	ADCS	t0, acc3, acc3
   500  	UMULH	const1, hlp0, y0
   501  
   502  	MUL	const2, hlp0, t0
   503  	ADCS	t0, acc0, acc0
   504  	UMULH	const2, hlp0, acc2
   505  
   506  	MUL	const3, hlp0, t0
   507  	ADCS	t0, acc1, acc1
   508  
   509  	UMULH	const3, hlp0, hlp0
   510  	ADC	$0, hlp0
   511  
   512  	ADDS	t1, acc3, acc3
   513  	ADCS	y0, acc0, acc0
   514  	ADCS	acc2, acc1, acc1
   515  	ADC	$0, hlp0, acc2
   516  
   517  	// Last reduction step
   518  	MUL	acc3, hlp1, hlp0
   519  
   520  	MUL	const0, hlp1, t0
   521  	ADDS	t0, acc3, acc3
   522  	UMULH	const0, hlp0, t1
   523  
   524  	MUL	const1, hlp0, t0
   525  	ADCS	t0, acc0, acc0
   526  	UMULH	const1, hlp0, y0
   527  
   528  	MUL	const2, hlp0, t0
   529  	ADCS	t0, acc1, acc1
   530  	UMULH	const2, hlp0, acc3
   531  
   532  	MUL	const3, hlp0, t0
   533  	ADCS	t0, acc2, acc2
   534  
   535  	UMULH	const3, hlp0, hlp0
   536  	ADC	$0, acc7	// in this step the low-half carry goes to acc7, which is added to acc3 below
   537  
   538  	ADDS	t1, acc0, acc0
   539  	ADCS	y0, acc1, acc1
   540  	ADCS	acc3, acc2, acc2
   541  	ADC	$0, hlp0, acc3
   542  
   543  	ADDS	acc4, acc0, acc0	// add the upper four limbs of the square
   544  	ADCS	acc5, acc1, acc1
   545  	ADCS	acc6, acc2, acc2
   546  	ADCS	acc7, acc3, acc3
   547  	ADC	$0, ZR, acc4	// acc4 = carry out of the four-limb sum
   548  
   549  	SUBS	const0, acc0, y0	// y = acc - ord, the borrow running through acc4
   550  	SBCS	const1, acc1, y1
   551  	SBCS	const2, acc2, y2
   552  	SBCS	const3, acc3, y3
   553  	SBCS	$0, acc4, acc4
   554  
   555  	CSEL	CS, y0, acc0, x0	// no borrow: take y; the result becomes the next input
   556  	CSEL	CS, y1, acc1, x1
   557  	CSEL	CS, y2, acc2, x2
   558  	CSEL	CS, y3, acc3, x3
   559  
   560  	CBNZ	b_ptr, ordSqrLoop
   561  
   562  	MOVD	res+0(FP), res_ptr	// hlp1 is no longer needed, so R0 can hold res
   563  	STP	(x0, x1), 0*16(res_ptr)
   564  	STP	(x2, x3), 1*16(res_ptr)
   565  
   566  	RET
   567  /* ---------------------------------------*/
   568  // func p256OrdMul(res, in1, in2 []uint64)
        // Multiplies the four-limb values at in1 (x) and in2 (y) and reduces the
        // product modulo p256ord. One row y[i]·x is accumulated at a time and a
        // word-wise reduction step follows each row. A final conditional
        // subtraction of p256ord follows and the result is stored at res.
        // hlp1 is the same register as res_ptr and holds p256ordK0, so res is
        // loaded only at the end. const2/const3 occupy t2/t3; the temporaries are
        // t0, t1, hlp0, plus y0 and y1 once their own rows have been consumed.
        // Each reduction step opens with MUL const0, hlp1 (K0, not hlp0): only the
        // carry of the following ADDS is used, the summed register is overwritten.
   569  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   570  	MOVD	in1+24(FP), a_ptr
   571  	MOVD	in2+48(FP), b_ptr
   572  
   573  	MOVD	p256ordK0<>(SB), hlp1
   574  	LDP	p256ord<>+0x00(SB), (const0, const1)
   575  	LDP	p256ord<>+0x10(SB), (const2, const3)
   576  
   577  	LDP	0*16(a_ptr), (x0, x1)
   578  	LDP	1*16(a_ptr), (x2, x3)
   579  	LDP	0*16(b_ptr), (y0, y1)
   580  	LDP	1*16(b_ptr), (y2, y3)
   581  
   582  	// y[0] * x
   583  	MUL	y0, x0, acc0
   584  	UMULH	y0, x0, acc1
   585  
   586  	MUL	y0, x1, t0
   587  	ADDS	t0, acc1
   588  	UMULH	y0, x1, acc2
   589  
   590  	MUL	y0, x2, t0
   591  	ADCS	t0, acc2
   592  	UMULH	y0, x2, acc3
   593  
   594  	MUL	y0, x3, t0
   595  	ADCS	t0, acc3
   596  	UMULH	y0, x3, acc4
   597  	ADC	$0, acc4
   598  	// First reduction step
   599  	MUL	acc0, hlp1, hlp0	// hlp0 = low 64 bits of acc0·K0: the multiple of ord added in this step
   600  
   601  	MUL	const0, hlp1, t0
   602  	ADDS	t0, acc0, acc0
   603  	UMULH	const0, hlp0, t1
   604  
   605  	MUL	const1, hlp0, t0
   606  	ADCS	t0, acc1, acc1
   607  	UMULH	const1, hlp0, y0	// y[0] is no longer needed; y0 becomes a temporary
   608  
   609  	MUL	const2, hlp0, t0
   610  	ADCS	t0, acc2, acc2
   611  	UMULH	const2, hlp0, acc0
   612  
   613  	MUL	const3, hlp0, t0
   614  	ADCS	t0, acc3, acc3
   615  
   616  	UMULH	const3, hlp0, hlp0
   617  	ADC	$0, acc4	// carry of the low-half chain is parked in acc4
   618  
   619  	ADDS	t1, acc1, acc1
   620  	ADCS	y0, acc2, acc2
   621  	ADCS	acc0, acc3, acc3
   622  	ADC	$0, hlp0, acc0	// acc0 is reused as a top limb alongside acc4
   623  	// y[1] * x
   624  	MUL	y1, x0, t0
   625  	ADDS	t0, acc1
   626  	UMULH	y1, x0, t1
   627  
   628  	MUL	y1, x1, t0
   629  	ADCS	t0, acc2
   630  	UMULH	y1, x1, hlp0
   631  
   632  	MUL	y1, x2, t0
   633  	ADCS	t0, acc3
   634  	UMULH	y1, x2, y0
   635  
   636  	MUL	y1, x3, t0
   637  	ADCS	t0, acc4
   638  	UMULH	y1, x3, y1	// last use of y[1]; y1 is a temporary from here on
   639  	ADC	$0, ZR, acc5
   640  
   641  	ADDS	t1, acc2
   642  	ADCS	hlp0, acc3
   643  	ADCS	y0, acc4
   644  	ADC	y1, acc5
   645  	// Second reduction step
   646  	MUL	acc1, hlp1, hlp0
   647  
   648  	MUL	const0, hlp1, t0
   649  	ADDS	t0, acc1, acc1
   650  	UMULH	const0, hlp0, t1
   651  
   652  	MUL	const1, hlp0, t0
   653  	ADCS	t0, acc2, acc2
   654  	UMULH	const1, hlp0, y0
   655  
   656  	MUL	const2, hlp0, t0
   657  	ADCS	t0, acc3, acc3
   658  	UMULH	const2, hlp0, acc1
   659  
   660  	MUL	const3, hlp0, t0
   661  	ADCS	t0, acc0, acc0
   662  
   663  	UMULH	const3, hlp0, hlp0
   664  	ADC	$0, acc5
   665  
   666  	ADDS	t1, acc2, acc2
   667  	ADCS	y0, acc3, acc3
   668  	ADCS	acc1, acc0, acc0
   669  	ADC	$0, hlp0, acc1
   670  	// y[2] * x
   671  	MUL	y2, x0, t0
   672  	ADDS	t0, acc2
   673  	UMULH	y2, x0, t1
   674  
   675  	MUL	y2, x1, t0
   676  	ADCS	t0, acc3
   677  	UMULH	y2, x1, hlp0
   678  
   679  	MUL	y2, x2, t0
   680  	ADCS	t0, acc4
   681  	UMULH	y2, x2, y0
   682  
   683  	MUL	y2, x3, t0
   684  	ADCS	t0, acc5
   685  	UMULH	y2, x3, y1
   686  	ADC	$0, ZR, acc6
   687  
   688  	ADDS	t1, acc3
   689  	ADCS	hlp0, acc4
   690  	ADCS	y0, acc5
   691  	ADC	y1, acc6
   692  	// Third reduction step
   693  	MUL	acc2, hlp1, hlp0
   694  
   695  	MUL	const0, hlp1, t0
   696  	ADDS	t0, acc2, acc2
   697  	UMULH	const0, hlp0, t1
   698  
   699  	MUL	const1, hlp0, t0
   700  	ADCS	t0, acc3, acc3
   701  	UMULH	const1, hlp0, y0
   702  
   703  	MUL	const2, hlp0, t0
   704  	ADCS	t0, acc0, acc0
   705  	UMULH	const2, hlp0, acc2
   706  
   707  	MUL	const3, hlp0, t0
   708  	ADCS	t0, acc1, acc1
   709  
   710  	UMULH	const3, hlp0, hlp0
   711  	ADC	$0, acc6
   712  
   713  	ADDS	t1, acc3, acc3
   714  	ADCS	y0, acc0, acc0
   715  	ADCS	acc2, acc1, acc1
   716  	ADC	$0, hlp0, acc2
   717  	// y[3] * x
   718  	MUL	y3, x0, t0
   719  	ADDS	t0, acc3
   720  	UMULH	y3, x0, t1
   721  
   722  	MUL	y3, x1, t0
   723  	ADCS	t0, acc4
   724  	UMULH	y3, x1, hlp0
   725  
   726  	MUL	y3, x2, t0
   727  	ADCS	t0, acc5
   728  	UMULH	y3, x2, y0
   729  
   730  	MUL	y3, x3, t0
   731  	ADCS	t0, acc6
   732  	UMULH	y3, x3, y1
   733  	ADC	$0, ZR, acc7
   734  
   735  	ADDS	t1, acc4
   736  	ADCS	hlp0, acc5
   737  	ADCS	y0, acc6
   738  	ADC	y1, acc7
   739  	// Last reduction step
   740  	MUL	acc3, hlp1, hlp0
   741  
   742  	MUL	const0, hlp1, t0
   743  	ADDS	t0, acc3, acc3
   744  	UMULH	const0, hlp0, t1
   745  
   746  	MUL	const1, hlp0, t0
   747  	ADCS	t0, acc0, acc0
   748  	UMULH	const1, hlp0, y0
   749  
   750  	MUL	const2, hlp0, t0
   751  	ADCS	t0, acc1, acc1
   752  	UMULH	const2, hlp0, acc3
   753  
   754  	MUL	const3, hlp0, t0
   755  	ADCS	t0, acc2, acc2
   756  
   757  	UMULH	const3, hlp0, hlp0
   758  	ADC	$0, acc7
   759  
   760  	ADDS	t1, acc0, acc0
   761  	ADCS	y0, acc1, acc1
   762  	ADCS	acc3, acc2, acc2
   763  	ADC	$0, hlp0, acc3
   764  
   765  	ADDS	acc4, acc0, acc0	// fold in the limbs parked in acc4..acc7
   766  	ADCS	acc5, acc1, acc1
   767  	ADCS	acc6, acc2, acc2
   768  	ADCS	acc7, acc3, acc3
   769  	ADC	$0, ZR, acc4	// acc4 = carry out of the four-limb sum
   770  
   771  	SUBS	const0, acc0, t0	// t = acc - ord; const2/const3 are read before t2/t3 are overwritten
   772  	SBCS	const1, acc1, t1
   773  	SBCS	const2, acc2, t2
   774  	SBCS	const3, acc3, t3
   775  	SBCS	$0, acc4, acc4
   776  
   777  	CSEL	CS, t0, acc0, acc0	// no borrow: take t
   778  	CSEL	CS, t1, acc1, acc1
   779  	CSEL	CS, t2, acc2, acc2
   780  	CSEL	CS, t3, acc3, acc3
   781  
   782  	MOVD	res+0(FP), res_ptr	// hlp1 is no longer needed, so R0 can hold res
   783  	STP	(acc0, acc1), 0*16(res_ptr)
   784  	STP	(acc2, acc3), 1*16(res_ptr)
   785  
   786  	RET
   787  /* ---------------------------------------*/
        // p256SubInternal: x = y - x, with p added back when the subtraction borrows.
        // In:  x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
        // Out: x0..x3. y0..y3 are not modified.
        // Clobbers acc0..acc7, t0 and the flags.
        // The result is below p provided both inputs are - presumably guaranteed
        // by the callers; this routine does not check it.
   788  TEXT p256SubInternal<>(SB),NOSPLIT,$0
   789  	SUBS	x0, y0, acc0
   790  	SBCS	x1, y1, acc1
   791  	SBCS	x2, y2, acc2
   792  	SBCS	x3, y3, acc3
   793  	SBC	$0, ZR, t0	// t0 = 0 when there was no borrow, all ones when there was
   794  
   795  	ADDS	$-1, acc0, acc4	// acc4..acc7 = (y - x) + p, computed unconditionally
   796  	ADCS	const0, acc1, acc5
   797  	ADCS	$0, acc2, acc6
   798  	ADC	const1, acc3, acc7
   799  
   800  	ANDS	$1, t0
   801  	CSEL	EQ, acc0, acc4, x0	// no borrow: plain difference, otherwise the version with p added
   802  	CSEL	EQ, acc1, acc5, x1
   803  	CSEL	EQ, acc2, acc6, x2
   804  	CSEL	EQ, acc3, acc7, x3
   805  
   806  	RET
   807  /* ---------------------------------------*/
        // p256SqrInternal: y = x·x followed by four reduction steps modulo p and
        // one conditional subtraction of p.
        // In:  x0..x3; const0/const1 must hold p256const0/p256const1.
        // Out: y0..y3. x0..x3 are not modified.
        // Clobbers acc0..acc7, t0..t3 and the flags.
        // The off-diagonal products are computed once and doubled, then the
        // squares x[i]·x[i] are added. Each reduction step adds (lowest limb)·p:
        // limb·2^96 is split into limb<<32 and limb>>32, and limb·const1 supplies
        // the two limbs above; the cleared register is reused as the top limb.
   808  TEXT p256SqrInternal<>(SB),NOSPLIT,$0
   809  	// x[1:] * x[0]
   810  	MUL	x0, x1, acc1
   811  	UMULH	x0, x1, acc2
   812  
   813  	MUL	x0, x2, t0
   814  	ADDS	t0, acc2, acc2
   815  	UMULH	x0, x2, acc3
   816  
   817  	MUL	x0, x3, t0
   818  	ADCS	t0, acc3, acc3
   819  	UMULH	x0, x3, acc4
   820  	ADC	$0, acc4, acc4
   821  	// x[2:] * x[1]
   822  	MUL	x1, x2, t0
   823  	ADDS	t0, acc3
   824  	UMULH	x1, x2, t1
   825  	ADCS	t1, acc4
   826  	ADC	$0, ZR, acc5
   827  
   828  	MUL	x1, x3, t0
   829  	ADDS	t0, acc4
   830  	UMULH	x1, x3, t1
   831  	ADC	t1, acc5
   832  	// x[3] * x[2]
   833  	MUL	x2, x3, t0
   834  	ADDS	t0, acc5
   835  	UMULH	x2, x3, acc6
   836  	ADC	$0, acc6
   837  
   838  	MOVD	$0, acc7
   839  	// *2
   840  	ADDS	acc1, acc1
   841  	ADCS	acc2, acc2
   842  	ADCS	acc3, acc3
   843  	ADCS	acc4, acc4
   844  	ADCS	acc5, acc5
   845  	ADCS	acc6, acc6
   846  	ADC	$0, acc7
   847  	// Missing products
   848  	MUL	x0, x0, acc0
   849  	UMULH	x0, x0, t0
   850  	ADDS	t0, acc1, acc1
   851  
   852  	MUL	x1, x1, t0
   853  	ADCS	t0, acc2, acc2
   854  	UMULH	x1, x1, t1
   855  	ADCS	t1, acc3, acc3
   856  
   857  	MUL	x2, x2, t0
   858  	ADCS	t0, acc4, acc4
   859  	UMULH	x2, x2, t1
   860  	ADCS	t1, acc5, acc5
   861  
   862  	MUL	x3, x3, t0
   863  	ADCS	t0, acc6, acc6
   864  	UMULH	x3, x3, t1
   865  	ADCS	t1, acc7, acc7	// the flags set here are not read: the next flag consumer follows an ADDS
   866  	// First reduction step
   867  	ADDS	acc0<<32, acc1, acc1	// low 32 bits of acc0, moved up by 32
   868  	LSR	$32, acc0, t0	// high 32 bits of acc0
   869  	MUL	acc0, const1, t1	// low half of acc0·const1
   870  	UMULH	acc0, const1, acc0	// high half; acc0 now serves as the top limb
   871  	ADCS	t0, acc2, acc2
   872  	ADCS	t1, acc3, acc3
   873  	ADC	$0, acc0, acc0
   874  	// Second reduction step
   875  	ADDS	acc1<<32, acc2, acc2
   876  	LSR	$32, acc1, t0
   877  	MUL	acc1, const1, t1
   878  	UMULH	acc1, const1, acc1
   879  	ADCS	t0, acc3, acc3
   880  	ADCS	t1, acc0, acc0
   881  	ADC	$0, acc1, acc1
   882  	// Third reduction step
   883  	ADDS	acc2<<32, acc3, acc3
   884  	LSR	$32, acc2, t0
   885  	MUL	acc2, const1, t1
   886  	UMULH	acc2, const1, acc2
   887  	ADCS	t0, acc0, acc0
   888  	ADCS	t1, acc1, acc1
   889  	ADC	$0, acc2, acc2
   890  	// Last reduction step
   891  	ADDS	acc3<<32, acc0, acc0
   892  	LSR	$32, acc3, t0
   893  	MUL	acc3, const1, t1
   894  	UMULH	acc3, const1, acc3
   895  	ADCS	t0, acc1, acc1
   896  	ADCS	t1, acc2, acc2
   897  	ADC	$0, acc3, acc3
   898  	// Add bits [511:256] of the sqr result
   899  	ADDS	acc4, acc0, acc0
   900  	ADCS	acc5, acc1, acc1
   901  	ADCS	acc6, acc2, acc2
   902  	ADCS	acc7, acc3, acc3
   903  	ADC	$0, ZR, acc4	// acc4 = carry out of the four-limb sum
   904  
   905  	SUBS	$-1, acc0, t0	// t = acc - p, the borrow running through acc4
   906  	SBCS	const0, acc1, t1
   907  	SBCS	$0, acc2, t2
   908  	SBCS	const1, acc3, t3
   909  	SBCS	$0, acc4, acc4
   910  
   911  	CSEL	CS, t0, acc0, y0	// no borrow: take t
   912  	CSEL	CS, t1, acc1, y1
   913  	CSEL	CS, t2, acc2, y2
   914  	CSEL	CS, t3, acc3, y3
   915  	RET
   916  /* ---------------------------------------*/
        // p256MulInternal: y = x·y with a reduction step modulo p after every row
        // y[i]·x, and one conditional subtraction of p at the end.
        // In:  x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
        // Out: y0..y3. x0..x3 are not modified.
        // Clobbers acc0..acc7, t0..t3, hlp0 and the flags.
        // Each reduction step adds (lowest limb)·p: limb·2^96 is split into
        // limb<<32 and limb>>32, and limb·const1 supplies the two limbs above;
        // the cleared register is reused as a top limb next to acc4..acc7.
   917  TEXT p256MulInternal<>(SB),NOSPLIT,$0
   918  	// y[0] * x
   919  	MUL	y0, x0, acc0
   920  	UMULH	y0, x0, acc1
   921  
   922  	MUL	y0, x1, t0
   923  	ADDS	t0, acc1
   924  	UMULH	y0, x1, acc2
   925  
   926  	MUL	y0, x2, t0
   927  	ADCS	t0, acc2
   928  	UMULH	y0, x2, acc3
   929  
   930  	MUL	y0, x3, t0
   931  	ADCS	t0, acc3
   932  	UMULH	y0, x3, acc4
   933  	ADC	$0, acc4
   934  	// First reduction step
   935  	ADDS	acc0<<32, acc1, acc1	// low 32 bits of acc0, moved up by 32
   936  	LSR	$32, acc0, t0	// high 32 bits of acc0
   937  	MUL	acc0, const1, t1	// low half of acc0·const1
   938  	UMULH	acc0, const1, acc0	// high half; acc0 now serves as a top limb
   939  	ADCS	t0, acc2
   940  	ADCS	t1, acc3
   941  	ADC	$0, acc0
   942  	// y[1] * x
   943  	MUL	y1, x0, t0
   944  	ADDS	t0, acc1
   945  	UMULH	y1, x0, t1
   946  
   947  	MUL	y1, x1, t0
   948  	ADCS	t0, acc2
   949  	UMULH	y1, x1, t2
   950  
   951  	MUL	y1, x2, t0
   952  	ADCS	t0, acc3
   953  	UMULH	y1, x2, t3
   954  
   955  	MUL	y1, x3, t0
   956  	ADCS	t0, acc4
   957  	UMULH	y1, x3, hlp0
   958  	ADC	$0, ZR, acc5
   959  
   960  	ADDS	t1, acc2	// high halves of the row, one limb further up
   961  	ADCS	t2, acc3
   962  	ADCS	t3, acc4
   963  	ADC	hlp0, acc5
   964  	// Second reduction step
   965  	ADDS	acc1<<32, acc2, acc2
   966  	LSR	$32, acc1, t0
   967  	MUL	acc1, const1, t1
   968  	UMULH	acc1, const1, acc1
   969  	ADCS	t0, acc3
   970  	ADCS	t1, acc0
   971  	ADC	$0, acc1
   972  	// y[2] * x
   973  	MUL	y2, x0, t0
   974  	ADDS	t0, acc2
   975  	UMULH	y2, x0, t1
   976  
   977  	MUL	y2, x1, t0
   978  	ADCS	t0, acc3
   979  	UMULH	y2, x1, t2
   980  
   981  	MUL	y2, x2, t0
   982  	ADCS	t0, acc4
   983  	UMULH	y2, x2, t3
   984  
   985  	MUL	y2, x3, t0
   986  	ADCS	t0, acc5
   987  	UMULH	y2, x3, hlp0
   988  	ADC	$0, ZR, acc6
   989  
   990  	ADDS	t1, acc3
   991  	ADCS	t2, acc4
   992  	ADCS	t3, acc5
   993  	ADC	hlp0, acc6
   994  	// Third reduction step
   995  	ADDS	acc2<<32, acc3, acc3
   996  	LSR	$32, acc2, t0
   997  	MUL	acc2, const1, t1
   998  	UMULH	acc2, const1, acc2
   999  	ADCS	t0, acc0
  1000  	ADCS	t1, acc1
  1001  	ADC	$0, acc2
  1002  	// y[3] * x
  1003  	MUL	y3, x0, t0
  1004  	ADDS	t0, acc3
  1005  	UMULH	y3, x0, t1
  1006  
  1007  	MUL	y3, x1, t0
  1008  	ADCS	t0, acc4
  1009  	UMULH	y3, x1, t2
  1010  
  1011  	MUL	y3, x2, t0
  1012  	ADCS	t0, acc5
  1013  	UMULH	y3, x2, t3
  1014  
  1015  	MUL	y3, x3, t0
  1016  	ADCS	t0, acc6
  1017  	UMULH	y3, x3, hlp0
  1018  	ADC	$0, ZR, acc7
  1019  
  1020  	ADDS	t1, acc4
  1021  	ADCS	t2, acc5
  1022  	ADCS	t3, acc6
  1023  	ADC	hlp0, acc7
  1024  	// Last reduction step
  1025  	ADDS	acc3<<32, acc0, acc0
  1026  	LSR	$32, acc3, t0
  1027  	MUL	acc3, const1, t1
  1028  	UMULH	acc3, const1, acc3
  1029  	ADCS	t0, acc1
  1030  	ADCS	t1, acc2
  1031  	ADC	$0, acc3
  1032  	// Add bits [511:256] of the mul result
  1033  	ADDS	acc4, acc0, acc0
  1034  	ADCS	acc5, acc1, acc1
  1035  	ADCS	acc6, acc2, acc2
  1036  	ADCS	acc7, acc3, acc3
  1037  	ADC	$0, ZR, acc4	// acc4 = carry out of the four-limb sum
  1038  
  1039  	SUBS	$-1, acc0, t0	// t = acc - p, the borrow running through acc4
  1040  	SBCS	const0, acc1, t1
  1041  	SBCS	$0, acc2, t2
  1042  	SBCS	const1, acc3, t3
  1043  	SBCS	$0, acc4, acc4
  1044  
  1045  	CSEL	CS, t0, acc0, y0	// no borrow: take t
  1046  	CSEL	CS, t1, acc1, y1
  1047  	CSEL	CS, t2, acc2, y2
  1048  	CSEL	CS, t3, acc3, y3
  1049  	RET
  1050  /* ---------------------------------------*/
        // p256MulBy2Inline: x = 2·y, with p subtracted once when the doubled value
        // (including its carry bit, kept in hlp0) is not below p.
        // In:  y0..y3; const0/const1 must hold p256const0/p256const1.
        // Out: x0..x3. y0..y3 are not modified.
        // Clobbers hlp0, t0..t3 and the flags.
        // The final CSELs use CC (a borrow occurred): keep the doubled value,
        // otherwise take the value with p subtracted.
  1051  #define p256MulBy2Inline       \
  1052  	ADDS	y0, y0, x0;    \
  1053  	ADCS	y1, y1, x1;    \
  1054  	ADCS	y2, y2, x2;    \
  1055  	ADCS	y3, y3, x3;    \
  1056  	ADC	$0, ZR, hlp0;  \
  1057  	SUBS	$-1, x0, t0;   \
  1058  	SBCS	const0, x1, t1;\
  1059  	SBCS	$0, x2, t2;    \
  1060  	SBCS	const1, x3, t3;\
  1061  	SBCS	$0, hlp0, hlp0;\
  1062  	CSEL	CC, x0, t0, x0;\
  1063  	CSEL	CC, x1, t1, x1;\
  1064  	CSEL	CC, x2, t2, x2;\
  1065  	CSEL	CC, x3, t3, x3;
  1066  /* ---------------------------------------*/
        // Address macros for 32-byte fields of the points behind a_ptr, b_ptr and
        // res_ptr: the x field is at byte offset 0, y at 32 and z at 64, and off
        // is a byte offset inside the field. There is no b_ptr-based y2in here;
        // y2in is defined below as a stack slot.
        // LDx/LDy load 32 bytes into x0..x3 / y0..y3 and STx/STy store them. Their
        // argument is the NAME of an address macro, which is expanded with
        // offsets 0 and 16.
  1067  #define x1in(off) (off)(a_ptr)
  1068  #define y1in(off) (off + 32)(a_ptr)
  1069  #define z1in(off) (off + 64)(a_ptr)
  1070  #define x2in(off) (off)(b_ptr)
  1071  #define z2in(off) (off + 64)(b_ptr)
  1072  #define x3out(off) (off)(res_ptr)
  1073  #define y3out(off) (off + 32)(res_ptr)
  1074  #define z3out(off) (off + 64)(res_ptr)
  1075  #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1076  #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1077  #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1078  #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1079  /* ---------------------------------------*/
        // 32-byte scratch slots on the stack: slot k starts 8 + 32*k bytes above
        // RSP and off is a byte offset inside the slot. They can be passed to
        // LDx/LDy/STx/STy like the pointer-based address macros.
        // Slots 0..7 end at byte 264, which is the frame size declared by
        // p256PointAddAffineAsm. Slots 8..11 (z2sqr, s1, u1, u2) lie beyond that
        // frame - presumably for a routine with a larger frame that is not
        // visible in this part of the file.
  1080  #define y2in(off)  (32*0 + 8 + off)(RSP)
  1081  #define s2(off)    (32*1 + 8 + off)(RSP)
  1082  #define z1sqr(off) (32*2 + 8 + off)(RSP)
  1083  #define h(off)	   (32*3 + 8 + off)(RSP)
  1084  #define r(off)	   (32*4 + 8 + off)(RSP)
  1085  #define hsqr(off)  (32*5 + 8 + off)(RSP)
  1086  #define rsqr(off)  (32*6 + 8 + off)(RSP)
  1087  #define hcub(off)  (32*7 + 8 + off)(RSP)
  1088  
  1089  #define z2sqr(off) (32*8 + 8 + off)(RSP)
  1090  #define s1(off) (32*9 + 8 + off)(RSP)
  1091  #define u1(off) (32*10 + 8 + off)(RSP)
  1092  #define u2(off) (32*11 + 8 + off)(RSP)
  1093  
  1094  // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
        //
        // Adds the point at in1 to the point at in2 and writes the x, y and z
        // of the result to byte offsets 0, 32 and 64 of res. in1 is read as
        // x, y, z at byte offsets 0, 32 and 64; the explicit loads below read
        // in2 only at offsets 0 (x2) and 32 (y2).
        //
        //   sign != 0: y2 is replaced by p - y2 before the addition.
        //   sel  == 0: the computed sum is discarded and in1 is written to res.
        //   zero == 0: (x2, y2 after the sign step, p256one) is written to res;
        //              this select is applied last, so it overrides sel.
        //
        // All three choices are made with CSEL, not with branches.
        //
        // hlp1 is an alias of res_ptr (R0), so R0 carries the two select bits
        // for the whole function and the res pointer is re-read from the
        // frame into t0 before each store.
        //
        // p256SqrInternal, p256MulInternal, p256SubInternal, p256MulBy2Inline,
        // LDx/LDy/STx/STy and the x1in/y1in/z1in/x2in/y2in/z1sqr/s2 macros are
        // defined earlier in the file. The step comments below rely on how
        // they are used here (Sqr: y = x^2, Mul: y = x*y, Sub: x = y - x,
        // MulBy2Inline: x = 2*y) - confirm against their definitions.
  1095  TEXT ·p256PointAddAffineAsm(SB),0,$264-96
  1096  	MOVD	in1+24(FP), a_ptr
  1097  	MOVD	in2+48(FP), b_ptr
  1098  	MOVD	sign+72(FP), hlp0
  1099  	MOVD	sel+80(FP), hlp1
  1100  	MOVD	zero+88(FP), t2
  1101  
        	// Normalise sel and zero to 0/1 and pack them into hlp1:
        	// bit 0 = (sel != 0), bit 1 = (zero != 0).
  1102  	MOVD	$1, t0
  1103  	CMP	$0, t2
  1104  	CSEL	EQ, ZR, t0, t2
  1105  	CMP	$0, hlp1
  1106  	CSEL	EQ, ZR, t0, hlp1
  1107  
  1108  	MOVD	p256const0<>(SB), const0
  1109  	MOVD	p256const1<>(SB), const1
  1110  	EOR	t2<<1, hlp1
  1111  
  1112  	// Negate y2in based on sign
  1113  	LDP	2*16(b_ptr), (y0, y1)
  1114  	LDP	3*16(b_ptr), (y2, y3)
  1115  	MOVD	$-1, acc0
  1116  
        	// acc0..acc3 = (-1, const0, 0, const1) - y2, i.e. p - y2; t0 records
        	// the borrow out of the subtraction.
  1117  	SUBS	y0, acc0, acc0
  1118  	SBCS	y1, const0, acc1
  1119  	SBCS	y2, ZR, acc2
  1120  	SBCS	y3, const1, acc3
  1121  	SBC	$0, ZR, t0
  1122  
        	// acc4..acc7 = acc0..acc3 + p; the carry out is folded into t0, and
        	// t0 == 0 selects this second candidate over the plain difference.
  1123  	ADDS	$-1, acc0, acc4
  1124  	ADCS	const0, acc1, acc5
  1125  	ADCS	$0, acc2, acc6
  1126  	ADCS	const1, acc3, acc7
  1127  	ADC	$0, t0, t0
  1128  
  1129  	CMP	$0, t0
  1130  	CSEL	EQ, acc4, acc0, acc0
  1131  	CSEL	EQ, acc5, acc1, acc1
  1132  	CSEL	EQ, acc6, acc2, acc2
  1133  	CSEL	EQ, acc7, acc3, acc3
  1134  	// If condition is 0, keep original value
  1135  	CMP	$0, hlp0
  1136  	CSEL	EQ, y0, acc0, y0
  1137  	CSEL	EQ, y1, acc1, y1
  1138  	CSEL	EQ, y2, acc2, y2
  1139  	CSEL	EQ, y3, acc3, y3
  1140  	// Store result
  1141  	STy(y2in)
  1142  	// Begin point add
  1143  	LDx(z1in)
  1144  	CALL	p256SqrInternal<>(SB)    // z1ˆ2
  1145  	STy(z1sqr)
  1146  
  1147  	LDx(x2in)
  1148  	CALL	p256MulInternal<>(SB)    // x2 * z1ˆ2
  1149  
  1150  	LDx(x1in)
  1151  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1152  	STx(h)
  1153  
  1154  	LDy(z1in)
  1155  	CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1156  
  1157  	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1158  	LDP	5*16(a_ptr), (acc2, acc3)
  1159  	ANDS	$1, hlp1, ZR
  1160  	CSEL	EQ, acc0, y0, y0
  1161  	CSEL	EQ, acc1, y1, y1
  1162  	CSEL	EQ, acc2, y2, y2
  1163  	CSEL	EQ, acc3, y3, y3
  1164  	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1165  	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1166  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1167  	CSEL	EQ, acc0, y0, y0
  1168  	CSEL	EQ, acc1, y1, y1
  1169  	CSEL	EQ, acc2, y2, y2
  1170  	CSEL	EQ, acc3, y3, y3
  1171  	LDx(z1in)
  1172  	MOVD	res+0(FP), t0            // R0 holds hlp1, so fetch res into t0
  1173  	STP	(y0, y1), 4*16(t0)
  1174  	STP	(y2, y3), 5*16(t0)
  1175  
  1176  	LDy(z1sqr)
  1177  	CALL	p256MulInternal<>(SB)    // z1 ^ 3
  1178  
  1179  	LDx(y2in)
  1180  	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1ˆ3
  1181  	STy(s2)
  1182  
  1183  	LDx(y1in)
  1184  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1185  	STx(r)
  1186  
  1187  	CALL	p256SqrInternal<>(SB)    // rsqr = rˆ2
  1188  	STy	(rsqr)
  1189  
  1190  	LDx(h)
  1191  	CALL	p256SqrInternal<>(SB)    // hsqr = hˆ2
  1192  	STy(hsqr)
  1193  
  1194  	CALL	p256MulInternal<>(SB)    // hcub = hˆ3
  1195  	STy(hcub)
  1196  
  1197  	LDx(y1in)
  1198  	CALL	p256MulInternal<>(SB)    // y1 * hˆ3
  1199  	STy(s2)                          // the s2 slot is reused for y1 * h^3
  1200  
  1201  	LDP	hsqr(0*8), (x0, x1)
  1202  	LDP	hsqr(2*8), (x2, x3)
  1203  	LDP	0*16(a_ptr), (y0, y1)
  1204  	LDP	1*16(a_ptr), (y2, y3)
  1205  	CALL	p256MulInternal<>(SB)    // u1 * hˆ2
  1206  	STP	(y0, y1), h(0*8)         // the h slot is reused for u1 * h^2
  1207  	STP	(y2, y3), h(2*8)
  1208  
  1209  	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
  1210  
  1211  	LDy(rsqr)
  1212  	CALL	p256SubInternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
  1213  
  1214  	MOVD	x0, y0
  1215  	MOVD	x1, y1
  1216  	MOVD	x2, y2
  1217  	MOVD	x3, y3
  1218  	LDx(hcub)
  1219  	CALL	p256SubInternal<>(SB)    // x3 = r^2 - 2*u1*h^2 - h^3
  1220  
  1221  	LDP	0*16(a_ptr), (acc0, acc1)
  1222  	LDP	1*16(a_ptr), (acc2, acc3)
  1223  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1224  	CSEL	EQ, acc0, x0, x0
  1225  	CSEL	EQ, acc1, x1, x1
  1226  	CSEL	EQ, acc2, x2, x2
  1227  	CSEL	EQ, acc3, x3, x3
  1228  	LDP	0*16(b_ptr), (acc0, acc1)
  1229  	LDP	1*16(b_ptr), (acc2, acc3)
  1230  	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1231  	CSEL	EQ, acc0, x0, x0
  1232  	CSEL	EQ, acc1, x1, x1
  1233  	CSEL	EQ, acc2, x2, x2
  1234  	CSEL	EQ, acc3, x3, x3
  1235  	MOVD	res+0(FP), t0
  1236  	STP	(x0, x1), 0*16(t0)
  1237  	STP	(x2, x3), 1*16(t0)
  1238  
  1239  	LDP	h(0*8), (y0, y1)
  1240  	LDP	h(2*8), (y2, y3)
  1241  	CALL	p256SubInternal<>(SB)    // u1*h^2 - x3 (x still holds the selected x3)
  1242  
  1243  	LDP	r(0*8), (y0, y1)
  1244  	LDP	r(2*8), (y2, y3)
  1245  	CALL	p256MulInternal<>(SB)    // r * (u1*h^2 - x3)
  1246  
  1247  	LDP	s2(0*8), (x0, x1)
  1248  	LDP	s2(2*8), (x2, x3)
  1249  	CALL	p256SubInternal<>(SB)    // y3 = r*(u1*h^2 - x3) - y1*h^3
  1250  	LDP	2*16(a_ptr), (acc0, acc1)
  1251  	LDP	3*16(a_ptr), (acc2, acc3)
  1252  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1253  	CSEL	EQ, acc0, x0, x0
  1254  	CSEL	EQ, acc1, x1, x1
  1255  	CSEL	EQ, acc2, x2, x2
  1256  	CSEL	EQ, acc3, x3, x3
  1257  	LDP	y2in(0*8), (acc0, acc1)
  1258  	LDP	y2in(2*8), (acc2, acc3)
  1259  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1260  	CSEL	EQ, acc0, x0, x0
  1261  	CSEL	EQ, acc1, x1, x1
  1262  	CSEL	EQ, acc2, x2, x2
  1263  	CSEL	EQ, acc3, x3, x3
  1264  	MOVD	res+0(FP), t0
  1265  	STP	(x0, x1), 2*16(t0)
  1266  	STP	(x2, x3), 3*16(t0)
  1267  
  1268  	RET
  1269  
        // p256AddInline: x0..x3 = (x0..x3 + y0..y3) reduced by one conditional
        // subtraction of p = (-1, const0, 0, const1).
        //
        // The four-limb sum is formed in place and its carry-out is kept in
        // hlp0. t0..t3 then receive sum - p, with the borrow chain continued
        // through hlp0. If that subtraction borrows (condition CC) the plain
        // sum is kept, otherwise the reduced value t0..t3 is selected.
        //
        // Reads x0..x3, y0..y3, const0, const1; writes x0..x3; clobbers
        // t0..t3, hlp0 and the condition flags. No comments are placed inside
        // the macro body because a // comment would hide the line
        // continuation backslash.
  1270  #define p256AddInline          \
  1271  	ADDS	y0, x0, x0;    \
  1272  	ADCS	y1, x1, x1;    \
  1273  	ADCS	y2, x2, x2;    \
  1274  	ADCS	y3, x3, x3;    \
  1275  	ADC	$0, ZR, hlp0;  \
  1276  	SUBS	$-1, x0, t0;   \
  1277  	SBCS	const0, x1, t1;\
  1278  	SBCS	$0, x2, t2;    \
  1279  	SBCS	const1, x3, t3;\
  1280  	SBCS	$0, hlp0, hlp0;\
  1281  	CSEL	CC, x0, t0, x0;\
  1282  	CSEL	CC, x1, t1, x1;\
  1283  	CSEL	CC, x2, t2, x2;\
  1284  	CSEL	CC, x3, t3, x3;
  1285  
  1286  #define s(off)	(32*0 + 8 + off)(RSP)
  1287  #define m(off)	(32*1 + 8 + off)(RSP)
  1288  #define zsqr(off) (32*2 + 8 + off)(RSP)
  1289  #define tmp(off)  (32*3 + 8 + off)(RSP)
        // The four macros above name stack slots 0-3 (slot k starts 32*k + 8
        // bytes above RSP; `off` is a byte offset inside the slot). They are
        // the scratch space of p256PointDoubleAsm below, whose declared frame
        // size is 8 + 4*32 = 136 bytes. tmp expands to the same address as the
        // h macro defined further up; no function in this part of the file
        // uses both names.
  1290  
  1291  //func p256PointDoubleAsm(res, in []uint64)
        //
        // Doubles the point stored at in and writes the result through the
        // x3out/y3out/z3out macros. The explicit loads below read in at byte
        // offsets 0 (called X here) and 64 (called Z); Y is reached through
        // y1in.
        //
        // x1in/y1in/z1in, x3out/y3out/z3out, LDx/LDy/STx/STy, the p256*Internal
        // routines and p256MulBy2Inline are defined earlier in the file. The
        // step comments assume the semantics implied by their commented uses
        // elsewhere in this file (Sqr: y = x^2, Mul: y = x*y, Sub: x = y - x,
        // MulBy2Inline: x = 2*y) - confirm against their definitions.
  1292  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
  1293  	MOVD	res+0(FP), res_ptr
  1294  	MOVD	in+24(FP), a_ptr
  1295  
  1296  	MOVD	p256const0<>(SB), const0
  1297  	MOVD	p256const1<>(SB), const1
  1298  
  1299  	// Begin point double
  1300  	LDP	4*16(a_ptr), (x0, x1)
  1301  	LDP	5*16(a_ptr), (x2, x3)
  1302  	CALL	p256SqrInternal<>(SB)    // zsqr = Z^2
  1303  	STP	(y0, y1), zsqr(0*8)
  1304  	STP	(y2, y3), zsqr(2*8)
  1305  
  1306  	LDP	0*16(a_ptr), (x0, x1)
  1307  	LDP	1*16(a_ptr), (x2, x3)
  1308  	p256AddInline                    // X + Z^2
  1309  	STx(m)
  1310  
  1311  	LDx(z1in)
  1312  	LDy(y1in)
  1313  	CALL	p256MulInternal<>(SB)    // Y * Z
  1314  	p256MulBy2Inline                 // z3 = 2 * Y * Z
  1315  	STx(z3out)
  1316  
  1317  	LDy(x1in)
  1318  	LDx(zsqr)
  1319  	CALL	p256SubInternal<>(SB)    // X - Z^2
  1320  	LDy(m)
  1321  	CALL	p256MulInternal<>(SB)    // (X - Z^2) * (X + Z^2)
  1322  
  1323  	// Multiply by 3
  1324  	p256MulBy2Inline
  1325  	p256AddInline
  1326  	STx(m)                           // m = 3 * (X - Z^2) * (X + Z^2)
  1327  
  1328  	LDy(y1in)
  1329  	p256MulBy2Inline                 // 2 * Y
  1330  	CALL	p256SqrInternal<>(SB)    // 4 * Y^2
  1331  	STy(s)
  1332  	MOVD	y0, x0
  1333  	MOVD	y1, x1
  1334  	MOVD	y2, x2
  1335  	MOVD	y3, x3
  1336  	CALL	p256SqrInternal<>(SB)    // 16 * Y^4
  1337  
  1338  	// Divide by 2
        	// t0..t3 = y + p with the carry-out in hlp0. When bit 0 of y is
        	// clear, y itself is selected instead and the AND clears hlp0, so
        	// the value hlp0:t3..t0 is even in both cases. The EXTR chain then
        	// shifts that 257-bit value right by one bit into y0..y3.
  1339  	ADDS	$-1, y0, t0
  1340  	ADCS	const0, y1, t1
  1341  	ADCS	$0, y2, t2
  1342  	ADCS	const1, y3, t3
  1343  	ADC	$0, ZR, hlp0
  1344  
  1345  	ANDS	$1, y0, ZR
  1346  	CSEL	EQ, y0, t0, t0
  1347  	CSEL	EQ, y1, t1, t1
  1348  	CSEL	EQ, y2, t2, t2
  1349  	CSEL	EQ, y3, t3, t3
  1350  	AND	y0, hlp0, hlp0
  1351  
  1352  	EXTR	$1, t0, t1, y0
  1353  	EXTR	$1, t1, t2, y1
  1354  	EXTR	$1, t2, t3, y2
  1355  	EXTR	$1, t3, hlp0, y3
  1356  	STy(y3out)                       // y3out temporarily holds 8 * Y^4
  1357  
  1358  	LDx(x1in)
  1359  	LDy(s)
  1360  	CALL	p256MulInternal<>(SB)    // s = 4 * X * Y^2
  1361  	STy(s)
  1362  	p256MulBy2Inline                 // tmp = 2 * s
  1363  	STx(tmp)
  1364  
  1365  	LDx(m)
  1366  	CALL	p256SqrInternal<>(SB)    // m^2
  1367  	LDx(tmp)
  1368  	CALL	p256SubInternal<>(SB)    // x3 = m^2 - 2*s
  1369  
  1370  	STx(x3out)
  1371  
  1372  	LDy(s)
  1373  	CALL	p256SubInternal<>(SB)    // s - x3
  1374  
  1375  	LDy(m)
  1376  	CALL	p256MulInternal<>(SB)    // m * (s - x3)
  1377  
  1378  	LDx(y3out)
  1379  	CALL	p256SubInternal<>(SB)    // y3 = m*(s - x3) - 8*Y^4
  1380  	STx(y3out)
  1381  	RET
  1382  /* ---------------------------------------*/
        // Rebind y2in and the three output macros for p256PointAddAsm below.
        // That function keeps its result flag in hlp1, which is an alias of
        // res_ptr (R0), so it cannot hold the res pointer in R0. It addresses
        // in2 through b_ptr first and later reloads b_ptr with res, after
        // which x3out/y3out/z3out address the result at byte offsets 0, 32 and
        // 64. y2in and y3out expand to the same expression; which buffer they
        // touch depends on what b_ptr holds at that point in the function.
  1383  #undef y2in
  1384  #undef x3out
  1385  #undef y3out
  1386  #undef z3out
  1387  #define y2in(off) (off + 32)(b_ptr)
  1388  #define x3out(off) (off)(b_ptr)
  1389  #define y3out(off) (off + 32)(b_ptr)
  1390  #define z3out(off) (off + 64)(b_ptr)
  1391  //func p256PointAddAsm(res, in1, in2 []uint64) int
        //
        // Adds the points at in1 and in2 and writes x3, y3, z3 to byte offsets
        // 0, 32 and 64 of res, following the add-2007-bl formulas linked below.
        //
        // Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p, and
        // 0 otherwise. Each value is tested against the two encodings 0 and
        // p = (-1, const0, 0, const1). The flag is built with CSEL/AND, and no
        // branch depends on it inside this function.
        //
        // hlp1 is an alias of res_ptr (R0) and holds the flag, so res is
        // loaded into b_ptr once in2 is no longer needed and the outputs are
        // stored through the b_ptr-based x3out/y3out/z3out macros.
        //
        // LDx/LDy/STx/STy, the x1in/y1in/z1in/x2in/z2in/z1sqr/s2 macros and the
        // p256*Internal routines are defined earlier in the file. The step
        // comments rely on how they are used here (Sqr: y = x^2, Mul: y = x*y,
        // Sub: x = y - x, MulBy2Inline: x = 2*y) - confirm against their
        // definitions.
  1392  TEXT ·p256PointAddAsm(SB),0,$392-80
  1393  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1394  	// Move input to stack in order to free registers
  1395  	MOVD	in1+24(FP), a_ptr
  1396  	MOVD	in2+48(FP), b_ptr
  1397  
  1398  	MOVD	p256const0<>(SB), const0
  1399  	MOVD	p256const1<>(SB), const1
  1400  
  1401  	// Begin point add
  1402  	LDx(z2in)
  1403  	CALL	p256SqrInternal<>(SB)    // z2^2
  1404  	STy(z2sqr)
  1405  
  1406  	CALL	p256MulInternal<>(SB)    // z2^3
  1407  
  1408  	LDx(y1in)
  1409  	CALL	p256MulInternal<>(SB)    // s1 = z2ˆ3*y1
  1410  	STy(s1)
  1411  
  1412  	LDx(z1in)
  1413  	CALL	p256SqrInternal<>(SB)    // z1^2
  1414  	STy(z1sqr)
  1415  
  1416  	CALL	p256MulInternal<>(SB)    // z1^3
  1417  
  1418  	LDx(y2in)
  1419  	CALL	p256MulInternal<>(SB)    // s2 = z1ˆ3*y2
  1420  
  1421  	LDx(s1)
  1422  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1423  	STx(r)
  1424  
        	// hlp1 = 1 if r (still in x0..x3) is all-zero, else 0.
  1425  	MOVD	$1, t2
  1426  	ORR	x0, x1, t0             // Check if zero mod p256
  1427  	ORR	x2, x3, t1
  1428  	ORR	t1, t0, t0
  1429  	CMP	$0, t0
  1430  	CSEL	EQ, t2, ZR, hlp1
  1431  
        	// Also set hlp1 = 1 if r equals p itself: XOR each limb with the
        	// matching limb of p (limb 2 of p is 0, so x2 is used directly) and
        	// test that nothing is left.
  1432  	EOR	$-1, x0, t0
  1433  	EOR	const0, x1, t1
  1434  	EOR	const1, x3, t3
  1435  
  1436  	ORR	t0, t1, t0
  1437  	ORR	x2, t3, t1
  1438  	ORR	t1, t0, t0
  1439  	CMP	$0, t0
  1440  	CSEL	EQ, t2, hlp1, hlp1
  1441  
  1442  	LDx(z2sqr)
  1443  	LDy(x1in)
  1444  	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2ˆ2
  1445  	STy(u1)
  1446  
  1447  	LDx(z1sqr)
  1448  	LDy(x2in)
  1449  	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1ˆ2
  1450  	STy(u2)
  1451  
  1452  	LDx(u1)
  1453  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1454  	STx(h)
  1455  
        	// Same two-step test for h, collected in hlp0.
  1456  	MOVD	$1, t2
  1457  	ORR	x0, x1, t0             // Check if zero mod p256
  1458  	ORR	x2, x3, t1
  1459  	ORR	t1, t0, t0
  1460  	CMP	$0, t0
  1461  	CSEL	EQ, t2, ZR, hlp0
  1462  
  1463  	EOR	$-1, x0, t0
  1464  	EOR	const0, x1, t1
  1465  	EOR	const1, x3, t3
  1466  
  1467  	ORR	t0, t1, t0
  1468  	ORR	x2, t3, t1
  1469  	ORR	t1, t0, t0
  1470  	CMP	$0, t0
  1471  	CSEL	EQ, t2, hlp0, hlp0
  1472  
  1473  	AND	hlp0, hlp1, hlp1       // flag = (r is zero mod p) AND (h is zero mod p)
  1474  
  1475  	LDx(r)
  1476  	CALL	p256SqrInternal<>(SB)    // rsqr = rˆ2
  1477  	STy(rsqr)
  1478  
  1479  	LDx(h)
  1480  	CALL	p256SqrInternal<>(SB)    // hsqr = hˆ2
  1481  	STy(hsqr)
  1482  
  1483  	LDx(h)
  1484  	CALL	p256MulInternal<>(SB)    // hcub = hˆ3
  1485  	STy(hcub)
  1486  
  1487  	LDx(s1)
  1488  	CALL	p256MulInternal<>(SB)    // s1 * h^3
  1489  	STy(s2)                          // the s2 slot holds s1 * h^3 from here on
  1490  
  1491  	LDx(z1in)
  1492  	LDy(z2in)
  1493  	CALL	p256MulInternal<>(SB)    // z1 * z2
  1494  	LDx(h)
  1495  	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
  1496  	MOVD	res+0(FP), b_ptr         // in2 is not read after this point; b_ptr now addresses res
  1497  	STy(z3out)
  1498  
  1499  	LDx(hsqr)
  1500  	LDy(u1)
  1501  	CALL	p256MulInternal<>(SB)    // hˆ2 * u1
  1502  	STy(u2)                          // the u2 slot is reused for u1 * h^2
  1503  
  1504  	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
  1505  	LDy(rsqr)
  1506  	CALL	p256SubInternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
  1507  
  1508  	MOVD	x0, y0
  1509  	MOVD	x1, y1
  1510  	MOVD	x2, y2
  1511  	MOVD	x3, y3
  1512  	LDx(hcub)
  1513  	CALL	p256SubInternal<>(SB)    // x3 = r^2 - 2*u1*h^2 - h^3
  1514  	STx(x3out)
  1515  
  1516  	LDy(u2)
  1517  	CALL	p256SubInternal<>(SB)    // u1*h^2 - x3
  1518  
  1519  	LDy(r)
  1520  	CALL	p256MulInternal<>(SB)    // r * (u1*h^2 - x3)
  1521  
  1522  	LDx(s2)
  1523  	CALL	p256SubInternal<>(SB)    // y3 = r*(u1*h^2 - x3) - s1*h^3
  1524  	STx(y3out)
  1525  
        	// hlp1 already is R0, so the first move copies R0 onto itself.
  1526  	MOVD	hlp1, R0
  1527  	MOVD	R0, ret+72(FP)
  1528  
  1529  	RET
  1530
View as plain text