1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This file contains constant-time, 64-bit assembly implementation of
6 // P256. The optimizations performed here are described in detail in:
7 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
8 // 256-bit primes"
9 // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
10 // https://eprint.iacr.org/2013/816.pdf
11
12 #include "textflag.h"
13
14 #define res_ptr DI
15 #define x_ptr SI
16 #define y_ptr CX
17
18 #define acc0 R8
19 #define acc1 R9
20 #define acc2 R10
21 #define acc3 R11
22 #define acc4 R12
23 #define acc5 R13
24 #define t0 R14
25 #define t1 R15
26
// p256const0 and p256const1 are the two non-trivial 64-bit words of the
// P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1; the remaining words
// (all-ones and zero) are synthesized inline with $-1 / $0 where needed.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the Montgomery reduction constant for the group order:
// it is multiplied with the low accumulator word in each p256Ord*
// reduction step to pick the multiple of ord to add (k0 = -ord^-1 mod 2^64).
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord is the order of the P-256 base point, as four little-endian
// 64-bit words.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one is the value 1 in the Montgomery domain, i.e. R mod p
// with R = 2^256.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// Flag 8 = RODATA: these constants live in read-only storage.
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
43
44 /* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
// Byte-order reversal of a 256-bit value is an involution, so this is the
// same operation as p256BigToLittle; tail-jump to it. The argument frame
// layouts are compatible (both pass two slices: pointers at +0 and +24).
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP ·p256BigToLittle(SB)
48 /* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
// Fully reverses the byte order of a 32-byte value: each 64-bit word is
// byte-swapped and the four words are stored in reverse order. Also used
// in the opposite direction via the p256LittleToBig tail-jump above.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// Store in reverse word order to complete the 32-byte reversal.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
70 /* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
//
// Constant-time 96-byte (12-word) conditional move, branch-free:
// X12 is built as an all-ones mask when cond == 0 (PCMPEQL against zero),
// then res = (a AND NOT mask) XOR (b AND mask). Six 16-byte vector lanes
// cover the full 96 bytes.
// NOTE(review): 12 uint64 words look like three 4-word field elements
// (a Jacobian point X,Y,Z) — confirm against the Go callers.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+24(FP), x_ptr
	MOVQ b+48(FP), y_ptr
	MOVQ cond+72(FP), X12

	// Broadcast cond and compare with zero: X12 = all-ones iff cond == 0.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// For each lane: X0..X5 = a AND NOT mask (PANDN inverts its source mask).
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	// X6..X11 = b AND mask.
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// Combine the two disjoint halves; XOR acts as OR here.
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
131 /* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
// Branch-free conditional negation mod p: if cond != 0, val = p - val;
// otherwise val is left unchanged. The subtraction is always performed
// and CMOVQEQ (on the TESTQ result) restores the original when cond == 0,
// keeping the memory/instruction trace independent of cond.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+24(FP), t0
	// acc = poly (the prime p, little-endian words)
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1
	// Speculatively subtract: acc = p - val
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3
	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3
	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
164 /* ---------------------------------------*/
// func p256Sqr(res, in []uint64, n int)
// Montgomery squaring: res = in^2 * R^-1 mod p, iterated n times (after
// the first pass x_ptr is redirected at res, so each iteration squares
// the previous result). Values are 4-word little-endian field elements.
// The squaring computes the off-diagonal products once, doubles them,
// then adds the diagonal squares; the reduction exploits p's sparse form,
// computing each word's multiple of p with shifts plus one MULQ by
// p256const1.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

sqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2 — double the off-diagonal partial products
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products — the diagonal squares y[i]^2
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step: fold acc0*p into the accumulator.
	// acc0*p = (acc0<<32 as words 1..2) + acc0*p256const1 at word 3.
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc0
	ADCQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2
	// Last reduction step
	XORQ t0, t0
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc0
	ADCQ t1, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3
	// Add bits [511:256] of the sqr result
	// (the MOVQ above preserves flags, so the ADCQ chain continues the
	// carry from the reduction step)
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	// Final conditional subtraction of p, branch-free: keep the
	// pre-subtraction copy and CMOVQCS it back if the subtract borrowed.
	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256
	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
	SBBQ $0, acc2
	SBBQ p256const1<>(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	// Square the freshly written result on the next iteration.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE sqrLoop

	RET
328 /* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
// Montgomery multiplication: res = in1 * in2 * R^-1 mod p, for 4-word
// little-endian field elements. Schoolbook multiply interleaved with
// word-by-word Montgomery reduction: after each column of x*y[i] is
// accumulated, one reduction step folds the lowest live word's multiple
// of p (computed with shifts plus a single MULQ by p256const1) into the
// accumulator.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	XORQ acc0, acc0
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	XORQ acc1, acc1
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	XORQ acc2, acc2
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256, then branch-free restore if the subtraction borrowed.
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
526 /* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
// Converts a field element out of the Montgomery domain:
// res = in * R^-1 mod p. This is a plain Montgomery reduction of the
// 4-word value — four per-word reduction stages (no multiplications by
// a second operand are needed) followed by a final conditional
// subtraction of p.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage: fold acc0*p into the accumulator (shifts plus one
	// MULQ by p256const1, exploiting p's sparse form).
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5
	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	XORQ acc0, acc0
	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	XORQ acc1, acc1
	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1

	// Save the pre-subtraction result for the branch-free final reduction.
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
604 /* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point, table []uint64, idx int)
//
// Scans all 16 entries (96 bytes each) unconditionally; for each entry a
// PCMPEQL mask of (counter == idx) gates it into the XOR accumulator, so
// memory access and timing are independent of idx. If idx is 0 no entry
// matches and the all-zero result is returned.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15 // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15 // X15 = 1
	MOVL AX, X14 // broadcast idx into all four lanes
	PSHUFD $0, X14, X14

	// Zero the 96-byte accumulator.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX

	MOVOU X15, X13 // running 1-based entry counter

loop_select:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12 // X12 = all-ones iff counter == idx

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// Accumulate the (at most one) selected entry.
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET
669 /* ---------------------------------------*/
// Constant time point access to base point table.
// func p256SelectBase(point *[12]uint64, table string, idx int)
//
// Same masked-scan technique as p256Select, but for 64-byte entries
// (affine points): each of the 16 loop iterations reads and masks TWO
// consecutive table entries, so all 32 entries are touched regardless
// of idx — constant time. Index 0 again yields the all-zero result.
TEXT ·p256SelectBase(SB),NOSPLIT,$0
	MOVQ idx+24(FP),AX
	MOVQ table+8(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15 // X15 = 0
	PCMPEQL X14, X14 // X14 = -1
	PSUBL X14, X15 // X15 = 1
	MOVL AX, X14 // broadcast idx into all four lanes
	PSHUFD $0, X14, X14

	// Zero the 64-byte accumulator.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $16, AX

	MOVOU X15, X13 // running 1-based entry counter

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12 // mask for the first entry of this pair

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12 // mask for the second entry of this pair

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET
742 /* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
// Montgomery multiplication modulo the P-256 group order:
// res = in1 * in2 * R^-1 mod ord. Unlike the mod-p routines, ord has no
// exploitable sparse structure here, so each reduction step is a generic
// Montgomery step: m = acc_low * p256ordK0 mod 2^64, then m*ord is
// accumulated word by word.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step: m = acc0 * k0, then add m*ord.
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1
	// Subtract p256 (the group order), then branch-free restore on borrow.
	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
	SBBQ p256ord<>+0x10(SB), acc0
	SBBQ p256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
1029 /* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
// Montgomery squaring modulo the P-256 group order:
// res = in^2 * R^-1 mod ord, iterated n times (the result of each pass
// feeds the next). Off-diagonal products are computed once and doubled,
// diagonal squares are then added, and four Montgomery reduction steps
// fold in multiples of ord. The reduction exploits ord's upper words
// (ord[2] = 2^64-1, ord[3] = 0xffffffff00000000): m*ord[2] is added as
// (m<<64) - m via SUB/SBB, and m*ord[3] via 32-bit shifts of m.
//
// BUGFIX: the last reduction step previously contained two extra
// instructions (ADCQ $0, DX; MOVQ DX, t1) after "ADDQ AX, acc0" that are
// absent from the other three reduction steps. The spurious ADCQ altered
// the carry flag consumed by the following "ADCQ DX, acc1", corrupting
// the carry chain and producing wrong results for some inputs
// (golang.org/issue/29119). They are removed here so the step matches
// the structure of the first three.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// *2 — double the off-diagonal partial products
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// Missing products — the diagonal squares y[i]^2
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
	// First reduction step: m = acc0 * k0, then add m*ord.
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1

	// m*ord[2] = m*(2^64-1): add m at the next word (carried in t1),
	// subtract m here.
	MOVQ t0, t1
	ADCQ DX, acc2
	ADCQ $0, t1
	SUBQ t0, acc2
	SBBQ $0, t1

	// m*ord[3] = (m<<32 subtracted, m>>32 borrowed) + m at the top word.
	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc0
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc3
	ADCQ $0, acc0
	SUBQ AX, acc3
	SBBQ DX, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2

	MOVQ t0, t1
	ADCQ DX, acc3
	ADCQ $0, t1
	SUBQ t0, acc3
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc1
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc0
	ADCQ $0, acc1
	SUBQ AX, acc0
	SBBQ DX, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3

	MOVQ t0, t1
	ADCQ DX, acc0
	ADCQ $0, t1
	SUBQ t0, acc0
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc2
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc1
	ADCQ $0, acc2
	SUBQ AX, acc1
	SBBQ DX, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	// (carry from this add feeds directly into the ADCQ below, exactly
	// as in the three reduction steps above)

	MOVQ t0, t1
	ADCQ DX, acc1
	ADCQ $0, t1
	SUBQ t0, acc1
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc3
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc2
	ADCQ $0, acc3
	SUBQ AX, acc2
	SBBQ DX, acc3
	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	// Save the pre-subtraction result for the branch-free final reduction.
	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1
	// Subtract p256 (the group order)
	SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
	SBBQ p256ord<>+0x10(SB), acc2
	SBBQ p256ord<>+0x18(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	// Square the freshly written result on the next iteration.
	MOVQ res_ptr, x_ptr
	DECQ BX
	JNE ordSqrLoop

	RET
1283 /* ---------------------------------------*/
1284 #undef res_ptr
1285 #undef x_ptr
1286 #undef y_ptr
1287
1288 #undef acc0
1289 #undef acc1
1290 #undef acc2
1291 #undef acc3
1292 #undef acc4
1293 #undef acc5
1294 #undef t0
1295 #undef t1
1296 /* ---------------------------------------*/
1297 #define mul0 AX
1298 #define mul1 DX
1299 #define acc0 BX
1300 #define acc1 CX
1301 #define acc2 R8
1302 #define acc3 R9
1303 #define acc4 R10
1304 #define acc5 R11
1305 #define acc6 R12
1306 #define acc7 R13
1307 #define t0 R14
1308 #define t1 R15
1309 #define t2 DI
1310 #define t3 SI
1311 #define hlp BP
1312 /* ---------------------------------------*/
// p256SubInternal computes acc4..acc7 = (acc4..acc7 - t0..t3) mod p.
// In:   acc4..acc7 = x, t0..t3 = y (4-word little-endian field elements)
// Out:  acc4..acc7 = x - y mod p
// Clob: mul0, acc0..acc3, flags
// Branch-free: p is speculatively added back after the subtraction; the
// borrow bit (isolated by ANDQ $1) selects via CMOVQEQ whether to keep
// the raw difference (no borrow, ZF set) or the p-corrected value.
TEXT p256SubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0 // mul0 = 0 - borrow

	// Save the raw difference before adding p back.
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ANDQ $1, mul0 // ZF = 1 iff there was no borrow

	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
1338 /* ---------------------------------------*/
// p256MulInternal computes acc4..acc7 = acc4..acc7 * t0..t3 * R^-1 mod p
// (Montgomery multiplication on register-held operands).
// In:   acc4..acc7 = x, t0..t3 = y
// Out:  acc4..acc7 = x*y in the Montgomery domain
// Clob: mul0 (AX), mul1 (DX), acc0..acc3, hlp (BP), flags; t0..t3 preserved
// Full 256x256 schoolbook product into acc0..acc7, then four reduction
// steps fold the low half's multiples of p (shift trick plus one MULQ by
// p256const1 each), and a final conditional subtraction of p.
TEXT p256MulInternal(SB),NOSPLIT,$8
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Zero hlp (BP) with MOVQ: MOVQ does not touch flags, so the carry
	// from the reduction above survives into the ADCQ chain below.
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
1521 /* ---------------------------------------*/
// p256SqrInternal computes acc4..acc7 = (acc4..acc7)^2 * R^-1 mod p
// (Montgomery squaring on register-held operands).
// In:   acc4..acc7 = x
// Out:  acc4..acc7 = x^2 in the Montgomery domain
// Clob: mul0 (AX), mul1 (DX), acc0..acc3, t0..t3, hlp (BP), flags
// Off-diagonal products once, doubled, plus diagonal squares, then the
// same four sparse-p reduction steps as p256MulInternal and a final
// conditional subtraction of p.
TEXT p256SqrInternal(SB),NOSPLIT,$8

	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2 — double the off-diagonal partial products
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products — the diagonal squares x[i]^2
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Zero hlp (BP) with MOVQ: MOVQ does not touch flags, so the carry
	// from the reduction above survives into the ADCQ chain below.
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
1663 /* ---------------------------------------*/
// p256MulBy2Inline sets [t3:t2:t1:t0] = 2 * [acc7:acc6:acc5:acc4] mod p256.
// The doubling carry is collected in mul0, p256 is then speculatively
// subtracted, and if that subtraction borrows the unreduced value is kept
// via CMOV — branch-free, hence constant time. acc4..acc7 retain the
// unreduced doubled value; mul0 is clobbered. (Inline comments are not
// possible inside the macro: // would consume the \ line continuations.)
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
1684 /* ---------------------------------------*/
// p256AddInline sets [t3:t2:t1:t0] = ([acc7..acc4] + [t3..t0]) mod p256.
// Same reduction pattern as p256MulBy2Inline: add with carry into mul0,
// speculatively subtract p256, and restore the unreduced sum via CMOV if
// the subtraction borrowed (constant time). acc4..acc7 retain the
// unreduced sum; mul0 is clobbered.
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
1705 /* ---------------------------------------*/
// 256-bit move helpers. The src/dst argument is itself one of the stack
// offset macros below (e.g. x1in), so src(8*0) expands to the first
// 64-bit limb of that 256-bit stack slot.
// LDacc/LDt load a value into acc4..acc7 / t0..t3; ST/STt store them
// back; acc2t/t2acc copy between the two register quadruples.
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
1712 /* ---------------------------------------*/
// Stack layout for p256PointAddAffineAsm (within its 512-byte frame):
// fifteen 32-byte (256-bit) slots for inputs, outputs and temporaries,
// followed by the saved result pointer and the 32-bit sel/zero flags.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)
1731
// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
//
// res = in1 + in2, where in2 is an affine point (x2, y2). y2 is negated
// first when sign != 0. The final result is chosen in constant time with
// SSE masks: in1 is kept when sel == 0, and (x2, y2, one) is emitted when
// zero == 0 (see the selection code after the arithmetic).
TEXT ·p256PointAddAffineAsm(SB),0,$512-96
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX
	MOVQ sign+72(FP), DX
	MOVQ sel+80(FP), t1
	MOVQ zero+88(FP), t2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	// Store pointer to result (mul0 aliases AX, which still holds res)
	MOVQ mul0, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save
	// Negate y2in based on sign: compute p256 - y2 in [acc0..acc3]
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	XORQ mul0, mul0
	// Speculatively subtract
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	// Add in case the operand was > p256
	ADDQ $-1, acc0
	ADCQ p256const0<>(SB), acc1
	ADCQ $0, acc2
	ADCQ p256const1<>(SB), acc3
	ADCQ $0, mul0
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3
	// Store result
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)
	// Begin point add. The internal routines take one operand in
	// acc4..acc7 and (for mul/sub) the other in t0..t3, returning
	// the reduced result in acc4..acc7.
	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)

	LDt (x2in)
	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2

	LDt (x1in)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL p256MulInternal(SB)	// z3 = h * z1
	ST (zout)

	LDacc (z1sqr)
	CALL p256MulInternal(SB)	// z1ˆ3

	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
	ST (s2)

	LDt (y1in)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)

	CALL p256SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (y1in)
	CALL p256MulInternal(SB)	// y1 * hˆ3
	ST (s2)

	LDacc (x1in)
	LDt (hsqr)
	CALL p256MulInternal(SB)	// u1 * hˆ2
	ST (h)

	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (h)
	CALL p256SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r * (u1*hˆ2 - x3) - y1*hˆ3
	ST (yout)
	// Load stored values from stack
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	// Broadcast the 32-bit sel and zero flags into X6/X7.
	MOVL BX, X6
	MOVL CX, X7

	PXOR X8, X8
	PCMPEQL X9, X9	// X9 = all-ones

	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	// X6 = (sel == 0) mask, X7 = (zero == 0) mask
	PCMPEQL X8, X6
	PCMPEQL X8, X7

	MOVOU X6, X15
	PANDN X9, X15	// X15 = (sel != 0) mask

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	// Merge: computed point when sel != 0, in1 when sel == 0.
	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0, output (x2, y2, p256one) instead
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15	// X15 = (zero != 0) mask

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr	// erase the saved result pointer

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save
1988
// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1.
// Since the input may not be fully reduced, both representations of zero
// are accepted: the value 0 and the value P itself.
TEXT p256IsZero(SB),NOSPLIT,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	// (The third limb of P is 0, so acc6 needs no XOR.)
	XORQ $-1, acc4
	XORQ p256const0<>(SB), acc5
	XORQ p256const1<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET
2017
2018 /* ---------------------------------------*/
// Stack layout for p256PointAddAsm (within its 680-byte frame):
// twenty 32-byte (256-bit) slots for the two input points, the output
// point and the temporaries of the add-2007-bl formulas, followed by the
// saved result pointer and the "points were equal" flag.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)
2043
//func p256PointAddAsm(res, in1, in2 []uint64) int
//
// res = in1 + in2 in Jacobian coordinates. Returns 1 when both
// r = s2 - s1 and h = u2 - u1 reduce to zero (the two inputs represent
// the same affine point, for which this formula is degenerate, so the
// caller must fall back to doubling); returns 0 otherwise.
TEXT ·p256PointAddAsm(SB),0,$680-80
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point add
	LDacc (z2in)
	CALL p256SqrInternal(SB)	// z2ˆ2
	ST (z2sqr)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z2ˆ3
	LDt (y1in)
	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
	ST (s1)

	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1ˆ2
	ST (z1sqr)
	LDt (z1in)
	CALL p256MulInternal(SB)	// z1ˆ3
	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
	ST (s2)

	LDt (s1)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)
	CALL p256IsZero(SB)
	MOVQ AX, points_eq	// remember whether s1 == s2

	LDacc (z2sqr)
	LDt (x1in)
	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
	ST (u2)

	LDt (u1)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)
	CALL p256IsZero(SB)
	ANDQ points_eq, AX	// points_eq = (s1 == s2) && (u1 == u2)
	MOVQ AX, points_eq

	LDacc (r)
	CALL p256SqrInternal(SB)	// rsqr = rˆ2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = hˆ2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = hˆ3
	ST (hcub)

	LDt (s1)
	CALL p256MulInternal(SB)	// s1 * hˆ3
	ST (s2)

	LDacc (z1in)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z1 * z2
	LDt (h)
	CALL p256MulInternal(SB)	// z3 = z1 * z2 * h
	ST (zout)

	LDacc (hsqr)
	LDt (u1)
	CALL p256MulInternal(SB)	// hˆ2 * u1
	ST (u2)

	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = rˆ2 - 2*u1*hˆ2 - hˆ3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (u2)
	CALL p256SubInternal(SB)	// u1*hˆ2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1*hˆ2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r * (u1*hˆ2 - x3) - s1*hˆ3
	ST (yout)

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	MOVQ $0, rptr	// erase the saved result pointer
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+72(FP)

	RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
2213 /* ---------------------------------------*/
// Stack layout for p256PointDoubleAsm: the input point (x, y, z), four
// 256-bit temporaries and the saved result pointer.
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)
2223
//func p256PointDoubleAsm(res, in []uint64)
//
// res = 2*in in Jacobian coordinates, using the a = -3 doubling formulas:
//   m  = 3 * (x - zˆ2) * (x + zˆ2)
//   s  = 4 * x * yˆ2
//   x3 = mˆ2 - 2*s
//   y3 = m * (s - x3) - 8*yˆ4
//   z3 = 2 * y * z
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+24(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	LDacc (z)
	CALL p256SqrInternal(SB)	// zˆ2
	ST (zsqr)

	LDt (x)
	p256AddInline			// x + zˆ2
	STt (m)

	LDacc (z)
	LDt (y)
	CALL p256MulInternal(SB)	// y * z
	p256MulBy2Inline		// z3 = 2 * y * z
	MOVQ rptr, AX
	// Store z
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	LDacc (x)
	LDt (zsqr)
	CALL p256SubInternal(SB)	// x - zˆ2
	LDt (m)
	CALL p256MulInternal(SB)	// (x - zˆ2) * (x + zˆ2)
	ST (m)
	// Multiply by 3
	p256MulBy2Inline
	LDacc (m)
	p256AddInline			// m = 3 * (x - zˆ2) * (x + zˆ2)
	STt (m)
	////////////////////////
	LDacc (y)
	p256MulBy2Inline		// 2*y
	t2acc
	CALL p256SqrInternal(SB)	// 4*yˆ2
	ST (s)
	CALL p256SqrInternal(SB)	// 16*yˆ4
	// Divide by 2: modular halving — add p first when the value is odd,
	// keeping the extra carry bit in mul0, then shift right by one.
	XORQ mul0, mul0
	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	// If the value was even, drop the speculative addition of p.
	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	ANDQ t0, mul0	// keep the carry bit only if the value was odd

	// 1-bit right shift across the five words.
	SHRQ $1, acc5, acc4
	SHRQ $1, acc6, acc5
	SHRQ $1, acc7, acc6
	SHRQ $1, mul0, acc7
	ST (y)			// y slot now holds 8*yˆ4
	/////////////////////////
	LDacc (x)
	LDt (s)
	CALL p256MulInternal(SB)	// s = 4 * x * yˆ2
	ST (s)
	p256MulBy2Inline		// 2*s
	STt (tmp)

	LDacc (m)
	CALL p256SqrInternal(SB)	// mˆ2
	LDt (tmp)
	CALL p256SubInternal(SB)	// x3 = mˆ2 - 2*s

	MOVQ rptr, AX
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	acc2t
	LDacc (s)
	CALL p256SubInternal(SB)	// s - x3

	LDt (m)
	CALL p256MulInternal(SB)	// m * (s - x3)

	LDt (y)
	CALL p256SubInternal(SB)	// y3 = m * (s - x3) - 8*yˆ4
	MOVQ rptr, AX
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	///////////////////////
	MOVQ $0, rptr	// erase the saved result pointer

	RET
2347 /* ---------------------------------------*/
2348