// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register aliases shared by every routine in this file.
// R18 is deliberately not assigned to any alias.

// Pointer registers: destination and the two source operands.
// p256Sqr and p256OrdSqr also reuse b_ptr as their loop counter.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc3: low four 64-bit limbs of the working accumulator
// (limb 0 is the least significant).
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

// acc4..acc7: high four limbs of the accumulator; they receive the upper
// half of a 512-bit product before it is folded back into acc0..acc3.
#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: scratch registers.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1: hold p256const0/p256const1 in the field routines and the
// two low limbs of p256ord in p256OrdSqr/p256OrdMul.
#define const0 R15
#define const1 R16

// hlp0 is an additional scratch register. hlp1 shares R0 with res_ptr, so a
// routine that keeps a value in hlp1 must load the result pointer elsewhere
// (or only after hlp1 is dead).
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3: the two 256-bit operands of p256SubInternal,
// p256SqrInternal and p256MulInternal, four limbs each.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 alias the scratch registers t2/t3; p256OrdSqr and p256OrdMul
// use them for the two high limbs of p256ord.
#define const2 t2
#define const3 t3

// p256const0 and p256const1 are limbs 1 and 3 of the value p256NegCond calls
// "poly"; limbs 0 and 2 of that value are supplied as the immediates -1 and 0.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the per-step multiplier used by the reduction steps of
// p256OrdSqr and p256OrdMul.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord: four limbs, least significant first; the modulus subtracted at the
// end of p256OrdSqr and p256OrdMul (presumably the P-256 group order -- TODO confirm).
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one: four limbs, least significant first; the value p256PointAddAffineAsm
// substitutes for z3 under its "z3 = 1" selection (presumably the Montgomery
// representation of 1 -- TODO confirm).
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// Symbol flag 8 is the numeric value of RODATA in Go's textflag.h; the last
// operand is the symbol size in bytes.
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
// Jumps straight into p256BigToLittle, which then reads the arguments from
// the same frame offsets (res at +0, in at +24); the 32-byte reversal it
// performs serves both conversion directions.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
// Writes the 32 bytes at in to res in fully reversed byte order: each 64-bit
// word is byte-swapped and the four words are stored in reverse sequence.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD in+24(FP), a_ptr
	MOVD res+0(FP), res_ptr

	// Fetch all four input words before anything is written.
	LDP 1*16(a_ptr), (acc2, acc3)
	LDP 0*16(a_ptr), (acc0, acc1)

	// Byte-swap every word in place.
	REV acc3, acc3
	REV acc2, acc2
	REV acc1, acc1
	REV acc0, acc0

	// Word 3 lands in slot 0, word 0 in slot 3.
	STP (acc1, acc0), 1*16(res_ptr)
	STP (acc3, acc2), 0*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// Branch-free copy of twelve 64-bit words: res = b when cond == 0,
// res = a otherwise. The copy is done in two halves of six words; within
// each half every load completes before the first store.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD cond+72(FP), R3
	MOVD b+48(FP), b_ptr
	MOVD a+24(FP), a_ptr
	MOVD res+0(FP), res_ptr

	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors

	// Words 0..5.
	LDP 0*16(b_ptr), (R16, R17)
	LDP 1*16(b_ptr), (R19, R20)
	LDP 2*16(b_ptr), (R21, R22)
	LDP 0*16(a_ptr), (R4, R5)
	LDP 1*16(a_ptr), (R6, R7)
	LDP 2*16(a_ptr), (R8, R9)
	// EQ <=> cond == 0. Nothing below modifies the flags, so this single
	// compare drives both groups of selects.
	CMP $0, R3
	CSEL EQ, R16, R4, R4
	CSEL EQ, R17, R5, R5
	CSEL EQ, R19, R6, R6
	CSEL EQ, R20, R7, R7
	CSEL EQ, R21, R8, R8
	CSEL EQ, R22, R9, R9
	STP (R4, R5), 0*16(res_ptr)
	STP (R6, R7), 1*16(res_ptr)
	STP (R8, R9), 2*16(res_ptr)

	// Words 6..11.
	LDP 3*16(b_ptr), (R16, R17)
	LDP 4*16(b_ptr), (R19, R20)
	LDP 5*16(b_ptr), (R21, R22)
	LDP 3*16(a_ptr), (R4, R5)
	LDP 4*16(a_ptr), (R6, R7)
	LDP 5*16(a_ptr), (R8, R9)
	CSEL EQ, R16, R4, R4
	CSEL EQ, R17, R5, R5
	CSEL EQ, R19, R6, R6
	CSEL EQ, R20, R7, R7
	CSEL EQ, R21, R8, R8
	CSEL EQ, R22, R9, R9
	STP (R4, R5), 3*16(res_ptr)
	STP (R6, R7), 4*16(res_ptr)
	STP (R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
// In-place conditional negation of the four-limb value at val:
// when cond != 0 the value is replaced by poly - val, when cond == 0 it is
// written back unchanged. The subtraction is always performed; only the
// final selection depends on cond.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD cond+24(FP), hlp0
	MOVD val+0(FP), a_ptr
	MOVD a_ptr, res_ptr
	// Current value
	LDP 0*16(a_ptr), (t0, t1)
	LDP 1*16(a_ptr), (t2, t3)
	// acc = poly
	MOVD $-1, acc0
	MOVD p256const0<>(SB), acc1
	MOVD $0, acc2
	MOVD p256const1<>(SB), acc3
	// acc = poly - val, computed regardless of cond
	SUBS t0, acc0, acc0
	SBCS t1, acc1, acc1
	SBCS t2, acc2, acc2
	SBC t3, acc3, acc3
	// cond == 0 selects the untouched limbs
	CMP $0, hlp0
	CSEL EQ, t0, acc0, acc0
	CSEL EQ, t1, acc1, acc1
	CSEL EQ, t2, acc2, acc2
	CSEL EQ, t3, acc3, acc3
	// Write back over the input
	STP (acc0, acc1), 0*16(res_ptr)
	STP (acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in []uint64, n int)
// Applies p256SqrInternal n times to the four-limb value at in and stores
// the final value at res.
//
// When n <= 0 no squaring is performed and in is copied to res unchanged.
// (Without this guard n == 0 would decrement the counter to -1 and spin
// through roughly 2^64 iterations before the CBNZ test could terminate.)
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), a_ptr
	MOVD n+48(FP), b_ptr // b_ptr doubles as the iteration counter

	MOVD p256const0<>(SB), const0
	MOVD p256const1<>(SB), const1

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)

	// Nothing to do for a non-positive count: store the input as is.
	CMP $0, b_ptr
	BLE sqrDone

sqrLoop:
	SUB $1, b_ptr
	CALL p256SqrInternal<>(SB)
	// Feed the result (y) back in as the next operand (x).
	MOVD y0, x0
	MOVD y1, x1
	MOVD y2, x2
	MOVD y3, x3
	CBNZ b_ptr, sqrLoop

sqrDone:
	// x equals y after every iteration, so storing x covers both the
	// looped and the guarded path.
	STP (x0, x1), 0*16(res_ptr)
	STP (x2, x3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
// Thin wrapper around p256MulInternal: in1 is loaded into x, in2 into y,
// and the product left in y is stored to res.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD in2+48(FP), b_ptr
	MOVD in1+24(FP), a_ptr
	MOVD res+0(FP), res_ptr

	// Second operand -> y
	LDP 0*16(b_ptr), (y0, y1)
	LDP 1*16(b_ptr), (y2, y3)

	// First operand -> x
	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)

	// Modulus limbs expected by p256MulInternal
	MOVD p256const1<>(SB), const1
	MOVD p256const0<>(SB), const0

	CALL p256MulInternal<>(SB)

	STP (y0, y1), 0*16(res_ptr)
	STP (y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
// Runs the same four reduction steps that p256SqrInternal and
// p256MulInternal apply after multiplying, but directly on the four input
// limbs (no multiplication of operands), then subtracts the modulus once if
// the result is not below it. By its name this converts a value out of the
// Montgomery domain.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), a_ptr

	MOVD p256const0<>(SB), const0
	MOVD p256const1<>(SB), const1

	LDP 0*16(a_ptr), (acc0, acc1)
	LDP 1*16(a_ptr), (acc2, acc3)
	// Reduction step 1: eliminate acc0; its slot becomes the new top limb
	ADDS acc0<<32, acc1, acc1
	LSR $32, acc0, t0
	MUL acc0, const1, t1
	UMULH acc0, const1, acc0
	ADCS t0, acc2, acc2
	ADCS t1, acc3, acc3
	ADC $0, acc0, acc0
	// Reduction step 2: eliminate acc1
	ADDS acc1<<32, acc2, acc2
	LSR $32, acc1, t0
	MUL acc1, const1, t1
	UMULH acc1, const1, acc1
	ADCS t0, acc3, acc3
	ADCS t1, acc0, acc0
	ADC $0, acc1, acc1
	// Reduction step 3: eliminate acc2
	ADDS acc2<<32, acc3, acc3
	LSR $32, acc2, t0
	MUL acc2, const1, t1
	UMULH acc2, const1, acc2
	ADCS t0, acc0, acc0
	ADCS t1, acc1, acc1
	ADC $0, acc2, acc2
	// Reduction step 4: eliminate acc3
	ADDS acc3<<32, acc0, acc0
	LSR $32, acc3, t0
	MUL acc3, const1, t1
	UMULH acc3, const1, acc3
	ADCS t0, acc1, acc1
	ADCS t1, acc2, acc2
	ADC $0, acc3, acc3

	// t = acc - (-1, const0, 0, const1); carry set means no borrow
	SUBS $-1, acc0, t0
	SBCS const0, acc1, t1
	SBCS $0, acc2, t2
	SBCS const1, acc3, t3

	// Keep the subtracted value only when the subtraction did not borrow
	CSEL CS, t0, acc0, acc0
	CSEL CS, t1, acc1, acc1
	CSEL CS, t2, acc2, acc2
	CSEL CS, t3, acc3, acc3

	STP (acc0, acc1), 0*16(res_ptr)
	STP (acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// func p256Select(point, table []uint64, idx int)
// Walks all 16 table entries (96 bytes each) and keeps the one whose
// 1-based position equals idx; every entry is read no matter what idx is.
// If idx matches no position in 1..16 (for example idx == 0) the output is
// twelve zero words.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD point+0(FP), res_ptr
	MOVD table+24(FP), b_ptr
	MOVD idx+48(FP), const0

	// Result registers start out as zero
	MOVD $0, x0
	MOVD $0, x1
	MOVD $0, x2
	MOVD $0, x3
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3
	MOVD $0, t0
	MOVD $0, t1
	MOVD $0, t2
	MOVD $0, t3

	// const1 = 1-based position of the entry being visited
	MOVD $0, const1

loop_select:
	ADD $1, const1, const1
	// EQ <=> this is the requested entry; the loads below leave the flags alone
	CMP const0, const1
	LDP.P 16(b_ptr), (acc0, acc1)
	CSEL EQ, acc0, x0, x0
	CSEL EQ, acc1, x1, x1
	LDP.P 16(b_ptr), (acc2, acc3)
	CSEL EQ, acc2, x2, x2
	CSEL EQ, acc3, x3, x3
	LDP.P 16(b_ptr), (acc4, acc5)
	CSEL EQ, acc4, y0, y0
	CSEL EQ, acc5, y1, y1
	LDP.P 16(b_ptr), (acc6, acc7)
	CSEL EQ, acc6, y2, y2
	CSEL EQ, acc7, y3, y3
	LDP.P 16(b_ptr), (acc0, acc1)
	CSEL EQ, acc0, t0, t0
	CSEL EQ, acc1, t1, t1
	LDP.P 16(b_ptr), (acc2, acc3)
	CSEL EQ, acc2, t2, t2
	CSEL EQ, acc3, t3, t3

	CMP $16, const1
	BNE loop_select

	STP (x0, x1), 0*16(res_ptr)
	STP (x2, x3), 1*16(res_ptr)
	STP (y0, y1), 2*16(res_ptr)
	STP (y2, y3), 3*16(res_ptr)
	STP (t0, t1), 4*16(res_ptr)
	STP (t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// Constant time point access to base point table.
// func p256SelectBase(point *[12]uint64, table string, idx int)
// Walks all 32 table entries (64 bytes each) and keeps the one whose
// 1-based position equals idx; every entry is read no matter what idx is.
// Only the first eight words of point are written. If idx matches no
// position in 1..32 those eight words are zero.
TEXT ·p256SelectBase(SB),NOSPLIT,$0
	MOVD point+0(FP), res_ptr
	MOVD table_base+8(FP), t1
	MOVD idx+24(FP), t0

	// Result registers start out as zero
	MOVD $0, x0
	MOVD $0, x1
	MOVD $0, x2
	MOVD $0, x3
	MOVD $0, y0
	MOVD $0, y1
	MOVD $0, y2
	MOVD $0, y3

	// t2 = 1-based position of the entry being visited
	MOVD $0, t2

loop_select:
	ADD $1, t2, t2
	// EQ <=> this is the requested entry; the loads below leave the flags alone
	CMP t0, t2
	LDP.P 16(t1), (acc0, acc1)
	CSEL EQ, acc0, x0, x0
	CSEL EQ, acc1, x1, x1
	LDP.P 16(t1), (acc2, acc3)
	CSEL EQ, acc2, x2, x2
	CSEL EQ, acc3, x3, x3
	LDP.P 16(t1), (acc4, acc5)
	CSEL EQ, acc4, y0, y0
	CSEL EQ, acc5, y1, y1
	LDP.P 16(t1), (acc6, acc7)
	CSEL EQ, acc6, y2, y2
	CSEL EQ, acc7, y3, y3

	CMP $32, t2
	BNE loop_select

	STP (x0, x1), 0*16(res_ptr)
	STP (x2, x3), 1*16(res_ptr)
	STP (y0, y1), 2*16(res_ptr)
	STP (y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
// Squares the four-limb value at in n times, reducing after every squaring
// with four steps that use p256ordK0 and the limbs of p256ord, followed by
// one conditional subtraction of p256ord. The final value is stored at res.
//
// When n <= 0 no squaring is performed and in is copied to res unchanged.
// (Without this guard n == 0 would decrement the counter to -1 and spin
// through roughly 2^64 iterations before the CBNZ test could terminate.)
//
// hlp1 shares R0 with res_ptr and holds p256ordK0 for the whole loop, so
// the result pointer is loaded only after the loop. y0 is used as scratch
// inside the reduction steps.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD in+24(FP), a_ptr
	MOVD n+48(FP), b_ptr // b_ptr doubles as the iteration counter

	MOVD p256ordK0<>(SB), hlp1
	LDP p256ord<>+0x00(SB), (const0, const1)
	LDP p256ord<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)

	// Nothing to do for a non-positive count: store the input as is.
	CMP $0, b_ptr
	BLE ordSqrDone

ordSqrLoop:
	SUB $1, b_ptr

	// x[1:] * x[0]
	MUL x0, x1, acc1
	UMULH x0, x1, acc2

	MUL x0, x2, t0
	ADDS t0, acc2, acc2
	UMULH x0, x2, acc3

	MUL x0, x3, t0
	ADCS t0, acc3, acc3
	UMULH x0, x3, acc4
	ADC $0, acc4, acc4
	// x[2:] * x[1]
	MUL x1, x2, t0
	ADDS t0, acc3
	UMULH x1, x2, t1
	ADCS t1, acc4
	ADC $0, ZR, acc5

	MUL x1, x3, t0
	ADDS t0, acc4
	UMULH x1, x3, t1
	ADC t1, acc5
	// x[3] * x[2]
	MUL x2, x3, t0
	ADDS t0, acc5
	UMULH x2, x3, acc6
	ADC $0, acc6

	MOVD $0, acc7
	// *2
	ADDS acc1, acc1
	ADCS acc2, acc2
	ADCS acc3, acc3
	ADCS acc4, acc4
	ADCS acc5, acc5
	ADCS acc6, acc6
	ADC $0, acc7
	// Missing products
	MUL x0, x0, acc0
	UMULH x0, x0, t0
	ADDS t0, acc1, acc1

	MUL x1, x1, t0
	ADCS t0, acc2, acc2
	UMULH x1, x1, t1
	ADCS t1, acc3, acc3

	MUL x2, x2, t0
	ADCS t0, acc4, acc4
	UMULH x2, x2, t1
	ADCS t1, acc5, acc5

	MUL x3, x3, t0
	ADCS t0, acc6, acc6
	UMULH x3, x3, t1
	ADC t1, acc7, acc7
	// First reduction step
	// hlp0 = acc0 * K0 (low 64 bits): the multiple of p256ord added in this step.
	MUL acc0, hlp1, hlp0

	// NOTE(review): this multiplies by hlp1 (K0) while the neighbouring
	// UMULH uses hlp0. Only the carry out of the ADDS is consumed (acc0 is
	// overwritten below before being read); presumably this relies on
	// p256ord[0]*p256ordK0 == -1 mod 2^64 so the carry is the same -- confirm.
	// The same pattern repeats in every reduction step.
	MUL const0, hlp1, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp1, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp1, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, hlp0

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2

	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp1, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	UMULH const3, hlp0, hlp0
	// Unlike the earlier steps the carry is parked in acc7, which is added
	// into the same limb (acc3) a few instructions below.
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3

	// Add the upper half of the square (acc4..acc7); acc4 then holds the carry.
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// y = acc - p256ord; carry set after the last SBCS means no borrow.
	SUBS const0, acc0, y0
	SBCS const1, acc1, y1
	SBCS const2, acc2, y2
	SBCS const3, acc3, y3
	SBCS $0, acc4, acc4

	// The selected value becomes the operand of the next iteration.
	CSEL CS, y0, acc0, x0
	CSEL CS, y1, acc1, x1
	CSEL CS, y2, acc2, x2
	CSEL CS, y3, acc3, x3

	CBNZ b_ptr, ordSqrLoop

ordSqrDone:
	// hlp1 (R0) is dead from here on, so R0 can take the result pointer.
	MOVD res+0(FP), res_ptr
	STP (x0, x1), 0*16(res_ptr)
	STP (x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
// Multiplies the four-limb values at in1 (x) and in2 (y). Each partial
// product y[i]*x is followed by a reduction step that uses p256ordK0 and the
// limbs of p256ord; a single conditional subtraction of p256ord finishes the
// computation and the result is stored at res.
//
// hlp1 shares R0 with res_ptr and holds p256ordK0 throughout, so the result
// pointer is loaded only at the very end. y0 and y1 are reused as scratch
// once their limb has been consumed.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD in1+24(FP), a_ptr
	MOVD in2+48(FP), b_ptr

	MOVD p256ordK0<>(SB), hlp1
	LDP p256ord<>+0x00(SB), (const0, const1)
	LDP p256ord<>+0x10(SB), (const2, const3)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)
	LDP 0*16(b_ptr), (y0, y1)
	LDP 1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4
	// First reduction step
	// hlp0 = acc0 * K0 (low 64 bits): the multiple of p256ord added in this step.
	MUL acc0, hlp1, hlp0

	// NOTE(review): this multiplies by hlp1 (K0) while the neighbouring
	// UMULH uses hlp0. Only the carry out of the ADDS is consumed (acc0 is
	// overwritten below before being read); presumably this relies on
	// p256ord[0]*p256ordK0 == -1 mod 2^64 so the carry is the same -- confirm.
	// The same pattern repeats in every reduction step.
	MUL const0, hlp1, t0
	ADDS t0, acc0, acc0
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const2, hlp0, acc0

	MUL const3, hlp0, t0
	ADCS t0, acc3, acc3

	UMULH const3, hlp0, hlp0
	ADC $0, acc4

	ADDS t1, acc1, acc1
	ADCS y0, acc2, acc2
	ADCS acc0, acc3, acc3
	ADC $0, hlp0, acc0
	// y[1] * x
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, hlp0

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, y0

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, y1
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS hlp0, acc3
	ADCS y0, acc4
	ADC y1, acc5
	// Second reduction step
	MUL acc1, hlp1, hlp0

	MUL const0, hlp1, t0
	ADDS t0, acc1, acc1
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc2, acc2
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const2, hlp0, acc1

	MUL const3, hlp0, t0
	ADCS t0, acc0, acc0

	UMULH const3, hlp0, hlp0
	ADC $0, acc5

	ADDS t1, acc2, acc2
	ADCS y0, acc3, acc3
	ADCS acc1, acc0, acc0
	ADC $0, hlp0, acc1
	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, hlp0

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, y0

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, y1
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS hlp0, acc4
	ADCS y0, acc5
	ADC y1, acc6
	// Third reduction step
	MUL acc2, hlp1, hlp0

	MUL const0, hlp1, t0
	ADDS t0, acc2, acc2
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc3, acc3
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const2, hlp0, acc2

	MUL const3, hlp0, t0
	ADCS t0, acc1, acc1

	UMULH const3, hlp0, hlp0
	ADC $0, acc6

	ADDS t1, acc3, acc3
	ADCS y0, acc0, acc0
	ADCS acc2, acc1, acc1
	ADC $0, hlp0, acc2
	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, hlp0

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, y0

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, y1
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS hlp0, acc5
	ADCS y0, acc6
	ADC y1, acc7
	// Last reduction step
	MUL acc3, hlp1, hlp0

	MUL const0, hlp1, t0
	ADDS t0, acc3, acc3
	UMULH const0, hlp0, t1

	MUL const1, hlp0, t0
	ADCS t0, acc0, acc0
	UMULH const1, hlp0, y0

	MUL const2, hlp0, t0
	ADCS t0, acc1, acc1
	UMULH const2, hlp0, acc3

	MUL const3, hlp0, t0
	ADCS t0, acc2, acc2

	UMULH const3, hlp0, hlp0
	ADC $0, acc7

	ADDS t1, acc0, acc0
	ADCS y0, acc1, acc1
	ADCS acc3, acc2, acc2
	ADC $0, hlp0, acc3

	// Fold the upper limbs (acc4..acc7) into the reduced lower limbs;
	// acc4 then holds the carry out.
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// t = acc - p256ord; carry set after the last SBCS means no borrow.
	SUBS const0, acc0, t0
	SBCS const1, acc1, t1
	SBCS const2, acc2, t2
	SBCS const3, acc3, t3
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, acc0
	CSEL CS, t1, acc1, acc1
	CSEL CS, t2, acc2, acc2
	CSEL CS, t3, acc3, acc3

	// hlp1 (R0) is dead from here on, so R0 can take the result pointer.
	MOVD res+0(FP), res_ptr
	STP (acc0, acc1), 0*16(res_ptr)
	STP (acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// p256SubInternal: x = y - x, with the modulus (-1, const0, 0, const1) added
// back when the subtraction borrows.
// In:      x0..x3, y0..y3, const0, const1
// Out:     x0..x3 (y0..y3 are left untouched)
// Scratch: acc0..acc7, t0, flags
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	// acc0..3 = y - x; t0 = 0 - borrow, i.e. all ones iff the subtraction borrowed
	SUBS x0, y0, acc0
	SBCS x1, y1, acc1
	SBCS x2, y2, acc2
	SBCS x3, y3, acc3
	SBC $0, ZR, t0

	// acc4..7 = (y - x) + modulus, computed unconditionally
	ADDS $-1, acc0, acc4
	ADCS const0, acc1, acc5
	ADCS $0, acc2, acc6
	ADC const1, acc3, acc7

	// EQ <=> no borrow: keep the plain difference, otherwise the corrected one
	ANDS $1, t0
	CSEL EQ, acc0, acc4, x0
	CSEL EQ, acc1, acc5, x1
	CSEL EQ, acc2, acc6, x2
	CSEL EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// p256SqrInternal: y = x squared, followed by four reduction steps and one
// conditional subtraction of the modulus (-1, const0, 0, const1).
// In:      x0..x3, const0, const1
// Out:     y0..y3 (x0..x3 are left untouched)
// Scratch: acc0..acc7, t0..t3, flags
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// The off-diagonal products x[i]*x[j] (i < j) are accumulated once in
	// acc1..acc6, doubled, and then the diagonal squares are added.
	// x[1:] * x[0]
	MUL x0, x1, acc1
	UMULH x0, x1, acc2

	MUL x0, x2, t0
	ADDS t0, acc2, acc2
	UMULH x0, x2, acc3

	MUL x0, x3, t0
	ADCS t0, acc3, acc3
	UMULH x0, x3, acc4
	ADC $0, acc4, acc4
	// x[2:] * x[1]
	MUL x1, x2, t0
	ADDS t0, acc3
	UMULH x1, x2, t1
	ADCS t1, acc4
	ADC $0, ZR, acc5

	MUL x1, x3, t0
	ADDS t0, acc4
	UMULH x1, x3, t1
	ADC t1, acc5
	// x[3] * x[2]
	MUL x2, x3, t0
	ADDS t0, acc5
	UMULH x2, x3, acc6
	ADC $0, acc6

	MOVD $0, acc7
	// *2
	ADDS acc1, acc1
	ADCS acc2, acc2
	ADCS acc3, acc3
	ADCS acc4, acc4
	ADCS acc5, acc5
	ADCS acc6, acc6
	ADC $0, acc7
	// Missing products
	MUL x0, x0, acc0
	UMULH x0, x0, t0
	ADDS t0, acc1, acc1

	MUL x1, x1, t0
	ADCS t0, acc2, acc2
	UMULH x1, x1, t1
	ADCS t1, acc3, acc3

	MUL x2, x2, t0
	ADCS t0, acc4, acc4
	UMULH x2, x2, t1
	ADCS t1, acc5, acc5

	MUL x3, x3, t0
	ADCS t0, acc6, acc6
	UMULH x3, x3, t1
	ADCS t1, acc7, acc7
	// Each reduction step eliminates the current lowest limb using only a
	// shift and a multiplication by const1; the eliminated limb's register
	// then receives the new top limb.
	// First reduction step
	ADDS acc0<<32, acc1, acc1
	LSR $32, acc0, t0
	MUL acc0, const1, t1
	UMULH acc0, const1, acc0
	ADCS t0, acc2, acc2
	ADCS t1, acc3, acc3
	ADC $0, acc0, acc0
	// Second reduction step
	ADDS acc1<<32, acc2, acc2
	LSR $32, acc1, t0
	MUL acc1, const1, t1
	UMULH acc1, const1, acc1
	ADCS t0, acc3, acc3
	ADCS t1, acc0, acc0
	ADC $0, acc1, acc1
	// Third reduction step
	ADDS acc2<<32, acc3, acc3
	LSR $32, acc2, t0
	MUL acc2, const1, t1
	UMULH acc2, const1, acc2
	ADCS t0, acc0, acc0
	ADCS t1, acc1, acc1
	ADC $0, acc2, acc2
	// Last reduction step
	ADDS acc3<<32, acc0, acc0
	LSR $32, acc3, t0
	MUL acc3, const1, t1
	UMULH acc3, const1, acc3
	ADCS t0, acc1, acc1
	ADCS t1, acc2, acc2
	ADC $0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// t = acc - modulus, with acc4 as the fifth limb; carry set after the
	// last SBCS means no borrow.
	SUBS $-1, acc0, t0
	SBCS const0, acc1, t1
	SBCS $0, acc2, t2
	SBCS const1, acc3, t3
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, t2, acc2, y2
	CSEL CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulInternal: y = x * y, with a reduction step interleaved after each
// partial product y[i]*x and one conditional subtraction of the modulus
// (-1, const0, 0, const1) at the end.
// In:      x0..x3, y0..y3, const0, const1
// Out:     y0..y3 (x0..x3 are left untouched)
// Scratch: acc0..acc7, t0..t3, hlp0, flags
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL y0, x0, acc0
	UMULH y0, x0, acc1

	MUL y0, x1, t0
	ADDS t0, acc1
	UMULH y0, x1, acc2

	MUL y0, x2, t0
	ADCS t0, acc2
	UMULH y0, x2, acc3

	MUL y0, x3, t0
	ADCS t0, acc3
	UMULH y0, x3, acc4
	ADC $0, acc4
	// First reduction step
	ADDS acc0<<32, acc1, acc1
	LSR $32, acc0, t0
	MUL acc0, const1, t1
	UMULH acc0, const1, acc0
	ADCS t0, acc2
	ADCS t1, acc3
	ADC $0, acc0
	// y[1] * x
	// Low halves are added first; the high halves (t1, t2, t3, hlp0) are
	// added in a second carry chain one limb higher.
	MUL y1, x0, t0
	ADDS t0, acc1
	UMULH y1, x0, t1

	MUL y1, x1, t0
	ADCS t0, acc2
	UMULH y1, x1, t2

	MUL y1, x2, t0
	ADCS t0, acc3
	UMULH y1, x2, t3

	MUL y1, x3, t0
	ADCS t0, acc4
	UMULH y1, x3, hlp0
	ADC $0, ZR, acc5

	ADDS t1, acc2
	ADCS t2, acc3
	ADCS t3, acc4
	ADC hlp0, acc5
	// Second reduction step
	ADDS acc1<<32, acc2, acc2
	LSR $32, acc1, t0
	MUL acc1, const1, t1
	UMULH acc1, const1, acc1
	ADCS t0, acc3
	ADCS t1, acc0
	ADC $0, acc1
	// y[2] * x
	MUL y2, x0, t0
	ADDS t0, acc2
	UMULH y2, x0, t1

	MUL y2, x1, t0
	ADCS t0, acc3
	UMULH y2, x1, t2

	MUL y2, x2, t0
	ADCS t0, acc4
	UMULH y2, x2, t3

	MUL y2, x3, t0
	ADCS t0, acc5
	UMULH y2, x3, hlp0
	ADC $0, ZR, acc6

	ADDS t1, acc3
	ADCS t2, acc4
	ADCS t3, acc5
	ADC hlp0, acc6
	// Third reduction step
	ADDS acc2<<32, acc3, acc3
	LSR $32, acc2, t0
	MUL acc2, const1, t1
	UMULH acc2, const1, acc2
	ADCS t0, acc0
	ADCS t1, acc1
	ADC $0, acc2
	// y[3] * x
	MUL y3, x0, t0
	ADDS t0, acc3
	UMULH y3, x0, t1

	MUL y3, x1, t0
	ADCS t0, acc4
	UMULH y3, x1, t2

	MUL y3, x2, t0
	ADCS t0, acc5
	UMULH y3, x2, t3

	MUL y3, x3, t0
	ADCS t0, acc6
	UMULH y3, x3, hlp0
	ADC $0, ZR, acc7

	ADDS t1, acc4
	ADCS t2, acc5
	ADCS t3, acc6
	ADC hlp0, acc7
	// Last reduction step
	ADDS acc3<<32, acc0, acc0
	LSR $32, acc3, t0
	MUL acc3, const1, t1
	UMULH acc3, const1, acc3
	ADCS t0, acc1
	ADCS t1, acc2
	ADC $0, acc3
	// Add bits [511:256] of the mul result
	ADDS acc4, acc0, acc0
	ADCS acc5, acc1, acc1
	ADCS acc6, acc2, acc2
	ADCS acc7, acc3, acc3
	ADC $0, ZR, acc4

	// t = acc - modulus, with acc4 as the fifth limb; carry set after the
	// last SBCS means no borrow.
	SUBS $-1, acc0, t0
	SBCS const0, acc1, t1
	SBCS $0, acc2, t2
	SBCS const1, acc3, t3
	SBCS $0, acc4, acc4

	CSEL CS, t0, acc0, y0
	CSEL CS, t1, acc1, y1
	CSEL CS, t2, acc2, y2
	CSEL CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline: x = y + y, then the modulus (-1, const0, 0, const1) is
// subtracted once unless that subtraction borrows.
// In:      y0..y3, const0, const1
// Out:     x0..x3 (y0..y3 are left untouched)
// Scratch: t0..t3, hlp0, flags
// hlp0 carries the bit shifted out of the 256-bit sum into the comparison;
// CC (carry clear) after the final SBCS means the subtraction borrowed, so
// the unsubtracted sum is kept.
#define p256MulBy2Inline \
	ADDS y0, y0, x0; \
	ADCS y1, y1, x1; \
	ADCS y2, y2, x2; \
	ADCS y3, y3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS $-1, x0, t0; \
	SBCS const0, x1, t1;\
	SBCS $0, x2, t2; \
	SBCS const1, x3, t3;\
	SBCS $0, hlp0, hlp0;\
	CSEL CC, x0, t0, x0;\
	CSEL CC, x1, t1, x1;\
	CSEL CC, x2, t2, x2;\
	CSEL CC, x3, t3, x3;
/* ---------------------------------------*/
// Operand accessors. A point occupies 96 bytes: x at offset 0, y at 32 and
// z at 64. The *1in macros address the point at a_ptr, the *2in macros the
// point at b_ptr, and the *3out macros the result at res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// LDx/LDy load, and STx/STy store, a full four-limb value between one of the
// accessors above (or a stack slot below) and the x or y register set.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// 32-byte stack slots used by p256PointAddAffineAsm (first eight) and
// p256PointAddAsm. Every slot starts 8 bytes above RSP (presumably to skip
// the saved link register at 0(RSP) -- TODO confirm).
// Here y2in is a stack slot holding the conditionally negated y2; it is
// redefined before p256PointAddAsm.
#define y2in(off) (32*0 + 8 + off)(RSP)
#define s2(off) (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off) (32*3 + 8 + off)(RSP)
#define r(off) (32*4 + 8 + off)(RSP)
#define hsqr(off) (32*5 + 8 + off)(RSP)
#define rsqr(off) (32*6 + 8 + off)(RSP)
#define hcub(off) (32*7 + 8 + off)(RSP)

// Additional slots used only by p256PointAddAsm.
#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
// Adds the point at in1 (x1, y1, z1) and the point at in2, of which only
// x2 and y2 are read, and writes x3, y3, z3 to res.
//   sign != 0: y2 is replaced by its negation before the addition.
//   sel  == 0: the computed sum is discarded and in1 is copied to res.
//   zero == 0: the result is replaced by (x2, y2, p256one).
// The second override is applied after the first, so it wins when both
// sel and zero are 0.
// Both overrides are applied with CSEL after the full computation; the
// instruction sequence does not depend on sign, sel or zero.
//
// The frame holds eight 32-byte temporaries (264 bytes together with the
// 8-byte offset). hlp1 shares R0 with res_ptr and carries the selection
// bits, so the result pointer is loaded into t0 each time it is needed.
TEXT ·p256PointAddAffineAsm(SB),0,$264-96
	MOVD in1+24(FP), a_ptr
	MOVD in2+48(FP), b_ptr
	MOVD sign+72(FP), hlp0
	MOVD sel+80(FP), hlp1
	MOVD zero+88(FP), t2

	// Normalise the flags: t2 = (zero != 0), hlp1 = (sel != 0).
	MOVD $1, t0
	CMP $0, t2
	CSEL EQ, ZR, t0, t2
	CMP $0, hlp1
	CSEL EQ, ZR, t0, hlp1

	MOVD p256const0<>(SB), const0
	MOVD p256const1<>(SB), const1
	// hlp1: bit 0 = (sel != 0), bit 1 = (zero != 0). These are the
	// select[0] / select[1] bits referred to below.
	EOR t2<<1, hlp1

	// Negate y2in based on sign
	LDP 2*16(b_ptr), (y0, y1)
	LDP 3*16(b_ptr), (y2, y3)
	MOVD $-1, acc0

	// acc0..3 = modulus - y2; t0 = 0 - borrow
	SUBS y0, acc0, acc0
	SBCS y1, const0, acc1
	SBCS y2, ZR, acc2
	SBCS y3, const1, acc3
	SBC $0, ZR, t0

	// acc4..7 = acc0..3 + modulus; the carry out is added into t0
	ADDS $-1, acc0, acc4
	ADCS const0, acc1, acc5
	ADCS $0, acc2, acc6
	ADCS const1, acc3, acc7
	ADC $0, t0, t0

	// t0 == 0 selects the value with the modulus added
	CMP $0, t0
	CSEL EQ, acc4, acc0, acc0
	CSEL EQ, acc5, acc1, acc1
	CSEL EQ, acc6, acc2, acc2
	CSEL EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP $0, hlp0
	CSEL EQ, y0, acc0, y0
	CSEL EQ, y1, acc1, y1
	CSEL EQ, y2, acc2, y2
	CSEL EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL p256SqrInternal<>(SB) // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL p256MulInternal<>(SB) // x2 * z1^2

	LDx(x1in)
	CALL p256SubInternal<>(SB) // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL p256MulInternal<>(SB) // z3 = h * z1

	LDP 4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP 5*16(a_ptr), (acc2, acc3)
	ANDS $1, hlp1, ZR
	CSEL EQ, acc0, y0, y0
	CSEL EQ, acc1, y1, y1
	CSEL EQ, acc2, y2, y2
	CSEL EQ, acc3, y3, y3
	LDP p256one<>+0x00(SB), (acc0, acc1)
	LDP p256one<>+0x10(SB), (acc2, acc3)
	ANDS $2, hlp1, ZR // iff select[1] == 0, z3 = 1
	CSEL EQ, acc0, y0, y0
	CSEL EQ, acc1, y1, y1
	CSEL EQ, acc2, y2, y2
	CSEL EQ, acc3, y3, y3
	// z1 is read into x before z3 is stored to res.
	LDx(z1in)
	MOVD res+0(FP), t0
	STP (y0, y1), 4*16(t0)
	STP (y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL p256MulInternal<>(SB) // z1 ^ 3

	LDx(y2in)
	CALL p256MulInternal<>(SB) // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL p256SubInternal<>(SB) // r = s2 - s1
	STx(r)

	CALL p256SqrInternal<>(SB) // rsqr = r^2
	STy (rsqr)

	LDx(h)
	CALL p256SqrInternal<>(SB) // hsqr = h^2
	STy(hsqr)

	CALL p256MulInternal<>(SB) // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL p256MulInternal<>(SB) // y1 * h^3
	// The s2 slot is reused for y1 * h^3 from here on.
	STy(s2)

	LDP hsqr(0*8), (x0, x1)
	LDP hsqr(2*8), (x2, x3)
	LDP 0*16(a_ptr), (y0, y1)
	LDP 1*16(a_ptr), (y2, y3)
	CALL p256MulInternal<>(SB) // u1 * h^2
	// The h slot is reused for u1 * h^2 from here on.
	STP (y0, y1), h(0*8)
	STP (y2, y3), h(2*8)

	p256MulBy2Inline // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL p256SubInternal<>(SB) // r^2 - u1 * h^2 * 2

	MOVD x0, y0
	MOVD x1, y1
	MOVD x2, y2
	MOVD x3, y3
	LDx(hcub)
	CALL p256SubInternal<>(SB) // x3 = r^2 - u1 * h^2 * 2 - h^3

	LDP 0*16(a_ptr), (acc0, acc1)
	LDP 1*16(a_ptr), (acc2, acc3)
	ANDS $1, hlp1, ZR // iff select[0] == 0, x3 = x1
	CSEL EQ, acc0, x0, x0
	CSEL EQ, acc1, x1, x1
	CSEL EQ, acc2, x2, x2
	CSEL EQ, acc3, x3, x3
	LDP 0*16(b_ptr), (acc0, acc1)
	LDP 1*16(b_ptr), (acc2, acc3)
	ANDS $2, hlp1, ZR // iff select[1] == 0, x3 = x2
	CSEL EQ, acc0, x0, x0
	CSEL EQ, acc1, x1, x1
	CSEL EQ, acc2, x2, x2
	CSEL EQ, acc3, x3, x3
	MOVD res+0(FP), t0
	STP (x0, x1), 0*16(t0)
	STP (x2, x3), 1*16(t0)

	// x still holds the value just stored as x3 (after selection).
	LDP h(0*8), (y0, y1)
	LDP h(2*8), (y2, y3)
	CALL p256SubInternal<>(SB) // u1 * h^2 - x3

	LDP r(0*8), (y0, y1)
	LDP r(2*8), (y2, y3)
	CALL p256MulInternal<>(SB) // r * (u1 * h^2 - x3)

	LDP s2(0*8), (x0, x1)
	LDP s2(2*8), (x2, x3)
	CALL p256SubInternal<>(SB) // y3 = r * (u1 * h^2 - x3) - y1 * h^3
	LDP 2*16(a_ptr), (acc0, acc1)
	LDP 3*16(a_ptr), (acc2, acc3)
	ANDS $1, hlp1, ZR // iff select[0] == 0, y3 = y1
	CSEL EQ, acc0, x0, x0
	CSEL EQ, acc1, x1, x1
	CSEL EQ, acc2, x2, x2
	CSEL EQ, acc3, x3, x3
	LDP y2in(0*8), (acc0, acc1)
	LDP y2in(2*8), (acc2, acc3)
	ANDS $2, hlp1, ZR // iff select[1] == 0, y3 = y2
	CSEL EQ, acc0, x0, x0
	CSEL EQ, acc1, x1, x1
	CSEL EQ, acc2, x2, x2
	CSEL EQ, acc3, x3, x3
	MOVD res+0(FP), t0
	STP (x0, x1), 2*16(t0)
	STP (x2, x3), 3*16(t0)

	RET

// p256AddInline: x = x + y, then the modulus (-1, const0, 0, const1) is
// subtracted once unless that subtraction borrows.
// In:      x0..x3, y0..y3, const0, const1
// Out:     x0..x3 (y0..y3 are left untouched)
// Scratch: t0..t3, hlp0, flags
// hlp0 carries the bit out of the 256-bit sum into the comparison; CC (carry
// clear) after the final SBCS means the subtraction borrowed, so the
// unsubtracted sum is kept.
#define p256AddInline \
	ADDS y0, x0, x0; \
	ADCS y1, x1, x1; \
	ADCS y2, x2, x2; \
	ADCS y3, x3, x3; \
	ADC $0, ZR, hlp0; \
	SUBS $-1, x0, t0; \
	SBCS const0, x1, t1;\
	SBCS $0, x2, t2; \
	SBCS const1, x3, t3;\
	SBCS $0, hlp0, hlp0;\
	CSEL CC, x0, t0, x0;\
	CSEL CC, x1, t1, x1;\
	CSEL CC, x2, t2, x2;\
	CSEL CC, x3, t3, x3;

// 32-byte stack slots used by p256PointDoubleAsm. They occupy the same frame
// offsets as the first four slots defined for the point-addition routines.
#define s(off) (32*0 + 8 + off)(RSP)
#define m(off) (32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off) (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in []uint64)
// Doubles the point (x, y, z) at in and writes (x3, y3, z3) to res.
// With zsqr = z^2, m = 3*(x - zsqr)*(x + zsqr) and s = 4*x*y^2 the code
// computes
//   z3 = 2*y*z
//   x3 = m^2 - 2*s
//   y3 = m*(s - x3) - 8*y^4
// Each output coordinate is stored only after the last read of the input
// coordinate it would overwrite if res and in refer to the same memory.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), a_ptr

	MOVD p256const0<>(SB), const0
	MOVD p256const1<>(SB), const1

	// Begin point double
	LDP 4*16(a_ptr), (x0, x1)
	LDP 5*16(a_ptr), (x2, x3)
	CALL p256SqrInternal<>(SB) // y = z^2
	STP (y0, y1), zsqr(0*8)
	STP (y2, y3), zsqr(2*8)

	LDP 0*16(a_ptr), (x0, x1)
	LDP 1*16(a_ptr), (x2, x3)
	p256AddInline // x + z^2, parked in slot m for now
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL p256MulInternal<>(SB) // y * z
	p256MulBy2Inline // z3 = 2 * y * z
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL p256SubInternal<>(SB) // x - z^2
	LDy(m)
	CALL p256MulInternal<>(SB) // (x - z^2) * (x + z^2)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	LDy(y1in)
	p256MulBy2Inline // 2 * y
	CALL p256SqrInternal<>(SB) // 4 * y^2
	STy(s)
	MOVD y0, x0
	MOVD y1, x1
	MOVD y2, x2
	MOVD y3, x3
	CALL p256SqrInternal<>(SB) // 16 * y^4

	// Divide by 2
	// t = y + modulus, with the carry out kept in hlp0. When y is even the
	// unmodified y is used and the carry is masked to zero; either way the
	// five-limb value is then shifted right by one bit.
	ADDS $-1, y0, t0
	ADCS const0, y1, t1
	ADCS $0, y2, t2
	ADCS const1, y3, t3
	ADC $0, ZR, hlp0

	ANDS $1, y0, ZR
	CSEL EQ, y0, t0, t0
	CSEL EQ, y1, t1, t1
	CSEL EQ, y2, t2, t2
	CSEL EQ, y3, t3, t3
	AND y0, hlp0, hlp0

	EXTR $1, t0, t1, y0
	EXTR $1, t1, t2, y1
	EXTR $1, t2, t3, y2
	EXTR $1, t3, hlp0, y3
	// 8 * y^4 is parked in the y3 output slot until the final subtraction.
	STy(y3out)

	LDx(x1in)
	LDy(s)
	CALL p256MulInternal<>(SB) // s = 4 * x * y^2
	STy(s)
	p256MulBy2Inline // 2 * s
	STx(tmp)

	LDx(m)
	CALL p256SqrInternal<>(SB) // m^2
	LDx(tmp)
	CALL p256SubInternal<>(SB) // x3 = m^2 - 2 * s

	STx(x3out)

	LDy(s)
	CALL p256SubInternal<>(SB) // s - x3

	LDy(m)
	CALL p256MulInternal<>(SB) // m * (s - x3)

	LDx(y3out)
	CALL p256SubInternal<>(SB) // y3 = m * (s - x3) - 8 * y^4
	STx(y3out)
	RET
/* ---------------------------------------*/
// Redefinitions for p256PointAddAsm:
//  - y2in now reads the y coordinate of the second input directly through
//    b_ptr instead of a stack slot;
//  - the output accessors go through b_ptr, which p256PointAddAsm reloads
//    with the result pointer once the second input is no longer needed.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
//func p256PointAddAsm(res, in1, in2 []uint64) int
// Adds the points at in1 (x1, y1, z1) and in2 (x2, y2, z2) and writes
// (x3, y3, z3) to res.
// Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero modulo the field
// modulus (each is tested against both 0 and the modulus itself), and 0
// otherwise. No branch depends on that outcome; the sum is computed and
// stored either way.
//
// The frame holds twelve 32-byte temporaries (392 bytes together with the
// 8-byte offset). hlp1 shares R0 with res_ptr and accumulates the return
// value, so the result pointer is loaded into b_ptr instead once in2 has
// been fully consumed.
TEXT ·p256PointAddAsm(SB),0,$392-80
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVD in1+24(FP), a_ptr
	MOVD in2+48(FP), b_ptr

	MOVD p256const0<>(SB), const0
	MOVD p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL p256SqrInternal<>(SB) // z2^2
	STy(z2sqr)

	CALL p256MulInternal<>(SB) // z2^3

	LDx(y1in)
	CALL p256MulInternal<>(SB) // s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL p256SqrInternal<>(SB) // z1^2
	STy(z1sqr)

	CALL p256MulInternal<>(SB) // z1^3

	LDx(y2in)
	CALL p256MulInternal<>(SB) // s2 = z1^3*y2

	LDx(s1)
	CALL p256SubInternal<>(SB) // r = s2 - s1
	STx(r)

	// hlp1 = 1 if r is all zero ...
	MOVD $1, t2
	ORR x0, x1, t0 // Check if zero mod p256
	ORR x2, x3, t1
	ORR t1, t0, t0
	CMP $0, t0
	CSEL EQ, t2, ZR, hlp1

	// ... or if r equals the modulus (-1, const0, 0, const1) limb for limb
	EOR $-1, x0, t0
	EOR const0, x1, t1
	EOR const1, x3, t3

	ORR t0, t1, t0
	ORR x2, t3, t1
	ORR t1, t0, t0
	CMP $0, t0
	CSEL EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL p256MulInternal<>(SB) // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL p256MulInternal<>(SB) // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL p256SubInternal<>(SB) // h = u2 - u1
	STx(h)

	// hlp0 = 1 if h is all zero ...
	MOVD $1, t2
	ORR x0, x1, t0 // Check if zero mod p256
	ORR x2, x3, t1
	ORR t1, t0, t0
	CMP $0, t0
	CSEL EQ, t2, ZR, hlp0

	// ... or if h equals the modulus limb for limb
	EOR $-1, x0, t0
	EOR const0, x1, t1
	EOR const1, x3, t3

	ORR t0, t1, t0
	ORR x2, t3, t1
	ORR t1, t0, t0
	CMP $0, t0
	CSEL EQ, t2, hlp0, hlp0

	// Return flag: both r and h are zero. It must be combined here because
	// p256MulInternal uses hlp0 as scratch.
	AND hlp0, hlp1, hlp1

	LDx(r)
	CALL p256SqrInternal<>(SB) // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL p256SqrInternal<>(SB) // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL p256MulInternal<>(SB) // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL p256MulInternal<>(SB) // s1 * h^3
	// The s2 slot holds s1 * h^3 from here on.
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL p256MulInternal<>(SB) // z1 * z2
	LDx(h)
	CALL p256MulInternal<>(SB) // z1 * z2 * h
	// in2 is no longer needed: b_ptr now addresses the result.
	MOVD res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL p256MulInternal<>(SB) // h^2 * u1
	// The u2 slot holds u1 * h^2 from here on.
	STy(u2)

	p256MulBy2Inline // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL p256SubInternal<>(SB) // r^2 - u1 * h^2 * 2

	MOVD x0, y0
	MOVD x1, y1
	MOVD x2, y2
	MOVD x3, y3
	LDx(hcub)
	CALL p256SubInternal<>(SB) // x3 = r^2 - u1 * h^2 * 2 - h^3
	STx(x3out)

	LDy(u2)
	CALL p256SubInternal<>(SB) // u1 * h^2 - x3

	LDy(r)
	CALL p256MulInternal<>(SB) // r * (u1 * h^2 - x3)

	LDx(s2)
	CALL p256SubInternal<>(SB) // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	MOVD hlp1, R0
	MOVD R0, ret+72(FP)

	RET
