1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6 #include "go_asm.h"
7
8
9 DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f // -p256ord^-1 mod 2^32
10 DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
11 DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
12 DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
13 DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
14 DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
15 DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
16 DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
17 DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
18 DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
19 DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
20 DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
21 DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
22 DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
23 DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
24 DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
25 DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
26 DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
27 DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
28 DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
29 DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
30 DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
31 DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
32 DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
33 DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
34 DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
35 DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
36 DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
37 DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
38 DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
39 DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
40 DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
41 DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
42 DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
43 DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
44 GLOBL p256ordK0<>(SB), 8, $4
45 GLOBL p256ord<>(SB), 8, $32
46 GLOBL p256<>(SB), 8, $80
47 GLOBL p256mul<>(SB), 8, $160
48
49 DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718
50 DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f
51 DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718
52 DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011
53 DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f
54 DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718
55 DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011
56 DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718
57 DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a
58 DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011
59 DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011
60 DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a
61 DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203
62 DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a
63 DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a
64 DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203
65 GLOBL p256vmsl<>(SB), 8, $128
66
67 // ---------------------------------------
68 // iff cond == 1 val <- -val
69 // func p256NegCond(val *p256Point, cond int)
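//
// The sign of the point is flipped by replacing its y coordinate with P-y.
// The code below does this without branching: it always computes P-y and
// then picks the original or the negated value with a VCEQG mask and VSEL.
// Illustrative (branching, so NOT constant-time) Go sketch of the intent:
//
//	// y is val's y coordinate as a big.Int, already < p256.P
//	if cond == 1 {
//		y = new(big.Int).Sub(p256.P, y) // y <- P - y
//	}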
70 #define P1ptr R1
71 #define CPOOL R4
72
73 #define Y1L V0
74 #define Y1H V1
75 #define T1L V2
76 #define T1H V3
77
78 #define PL V30
79 #define PH V31
80
81 #define ZER V4
82 #define SEL1 V5
83 #define CAR1 V6
84 TEXT ·p256NegCond(SB), NOSPLIT, $0
85 MOVD val+0(FP), P1ptr
86
87 MOVD $p256mul<>+0x00(SB), CPOOL
88 VL 16(CPOOL), PL
89 VL 0(CPOOL), PH
90
91 VL 32(P1ptr), Y1H
92 VL 48(P1ptr), Y1L
93
94 VLREPG cond+8(FP), SEL1
95 VZERO ZER
96 VCEQG SEL1, ZER, SEL1
97
98 VSCBIQ Y1L, PL, CAR1
99 VSQ Y1L, PL, T1L
100 VSBIQ PH, Y1H, CAR1, T1H
101
102 VSEL Y1L, T1L, SEL1, Y1L
103 VSEL Y1H, T1H, SEL1, Y1H
104
105 VST Y1H, 32(P1ptr)
106 VST Y1L, 48(P1ptr)
107 RET
108
109 #undef P1ptr
110 #undef CPOOL
111 #undef Y1L
112 #undef Y1H
113 #undef T1L
114 #undef T1H
115 #undef PL
116 #undef PH
117 #undef ZER
118 #undef SEL1
119 #undef CAR1
120
121 // ---------------------------------------
122 // if cond == 0 res <- b; else res <- a
123 // func p256MovCond(res, a, b *p256Point, cond int)
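//
// Constant-time conditional move: both points are always loaded and the
// result is chosen with a VCEQG-generated mask and VSEL, so the instruction
// and memory trace does not depend on cond. Illustrative (branching) Go
// sketch of the behaviour:
//
//	if cond == 0 {
//		*res = *b
//	} else {
//		*res = *a
//	}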
124 #define P3ptr R1
125 #define P1ptr R2
126 #define P2ptr R3
127
128 #define X1L V0
129 #define X1H V1
130 #define Y1L V2
131 #define Y1H V3
132 #define Z1L V4
133 #define Z1H V5
134 #define X2L V6
135 #define X2H V7
136 #define Y2L V8
137 #define Y2H V9
138 #define Z2L V10
139 #define Z2H V11
140
141 #define ZER V18
142 #define SEL1 V19
143 TEXT ·p256MovCond(SB), NOSPLIT, $0
144 MOVD res+0(FP), P3ptr
145 MOVD a+8(FP), P1ptr
146 MOVD b+16(FP), P2ptr
147 VLREPG cond+24(FP), SEL1
148 VZERO ZER
149 VCEQG SEL1, ZER, SEL1
150
151 VL 0(P1ptr), X1H
152 VL 16(P1ptr), X1L
153 VL 32(P1ptr), Y1H
154 VL 48(P1ptr), Y1L
155 VL 64(P1ptr), Z1H
156 VL 80(P1ptr), Z1L
157
158 VL 0(P2ptr), X2H
159 VL 16(P2ptr), X2L
160 VL 32(P2ptr), Y2H
161 VL 48(P2ptr), Y2L
162 VL 64(P2ptr), Z2H
163 VL 80(P2ptr), Z2L
164
165 VSEL X2L, X1L, SEL1, X1L
166 VSEL X2H, X1H, SEL1, X1H
167 VSEL Y2L, Y1L, SEL1, Y1L
168 VSEL Y2H, Y1H, SEL1, Y1H
169 VSEL Z2L, Z1L, SEL1, Z1L
170 VSEL Z2H, Z1H, SEL1, Z1H
171
172 VST X1H, 0(P3ptr)
173 VST X1L, 16(P3ptr)
174 VST Y1H, 32(P3ptr)
175 VST Y1L, 48(P3ptr)
176 VST Z1H, 64(P3ptr)
177 VST Z1L, 80(P3ptr)
178
179 RET
180
181 #undef P3ptr
182 #undef P1ptr
183 #undef P2ptr
184 #undef X1L
185 #undef X1H
186 #undef Y1L
187 #undef Y1H
188 #undef Z1L
189 #undef Z1H
190 #undef X2L
191 #undef X2H
192 #undef Y2L
193 #undef Y2H
194 #undef Z2L
195 #undef Z2H
196 #undef ZER
197 #undef SEL1
198
199 // ---------------------------------------
200 // Constant time table access
201 // Indexed from 1 to 16, with -1 offset
202 // (index 0 is implicitly point at infinity)
203 // func p256Select(point *p256Point, table []p256Point, idx int)
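//
// Every one of the 16 table entries is read exactly once and a VCEQG/VSEL
// mask keeps only the entry whose (1-based) position equals idx; if idx is 0
// nothing matches and the all-zero point (the implicit point at infinity)
// remains. Illustrative (branching) Go sketch:
//
//	var res p256Point // zero value plays the role of the point at infinity
//	for i := 1; i <= len(table); i++ {
//		if i == idx {
//			res = table[i-1]
//		}
//	}
//	*point = res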
204 #define P3ptr R1
205 #define P1ptr R2
206 #define COUNT R4
207
208 #define X1L V0
209 #define X1H V1
210 #define Y1L V2
211 #define Y1H V3
212 #define Z1L V4
213 #define Z1H V5
214 #define X2L V6
215 #define X2H V7
216 #define Y2L V8
217 #define Y2H V9
218 #define Z2L V10
219 #define Z2H V11
220
221 #define ONE V18
222 #define IDX V19
223 #define SEL1 V20
224 #define SEL2 V21
225 TEXT ·p256Select(SB), NOSPLIT, $0
226 MOVD point+0(FP), P3ptr
227 MOVD table+8(FP), P1ptr
228 VLREPB idx+(32+7)(FP), IDX
229 VREPIB $1, ONE
230 VREPIB $1, SEL2
231 MOVD $1, COUNT
232
233 VZERO X1H
234 VZERO X1L
235 VZERO Y1H
236 VZERO Y1L
237 VZERO Z1H
238 VZERO Z1L
239
240 loop_select:
241 VL 0(P1ptr), X2H
242 VL 16(P1ptr), X2L
243 VL 32(P1ptr), Y2H
244 VL 48(P1ptr), Y2L
245 VL 64(P1ptr), Z2H
246 VL 80(P1ptr), Z2L
247
248 VCEQG SEL2, IDX, SEL1
249
250 VSEL X2L, X1L, SEL1, X1L
251 VSEL X2H, X1H, SEL1, X1H
252 VSEL Y2L, Y1L, SEL1, Y1L
253 VSEL Y2H, Y1H, SEL1, Y1H
254 VSEL Z2L, Z1L, SEL1, Z1L
255 VSEL Z2H, Z1H, SEL1, Z1H
256
257 VAB SEL2, ONE, SEL2
258 ADDW $1, COUNT
259 ADD $96, P1ptr
260 CMPW COUNT, $17
261 BLT loop_select
262
263 VST X1H, 0(P3ptr)
264 VST X1L, 16(P3ptr)
265 VST Y1H, 32(P3ptr)
266 VST Y1L, 48(P3ptr)
267 VST Z1H, 64(P3ptr)
268 VST Z1L, 80(P3ptr)
269 RET
270
271 #undef P3ptr
272 #undef P1ptr
273 #undef COUNT
274 #undef X1L
275 #undef X1H
276 #undef Y1L
277 #undef Y1H
278 #undef Z1L
279 #undef Z1H
280 #undef X2L
281 #undef X2H
282 #undef Y2L
283 #undef Y2H
284 #undef Z2L
285 #undef Z2H
286 #undef ONE
287 #undef IDX
288 #undef SEL1
289 #undef SEL2
290
291 // ---------------------------------------
292 // Constant time table access
293 // Indexed from 1 to 64, with -1 offset
294 // (index 0 is implicitly point at infinity)
295 // func p256SelectBase(point *p256Point, table []p256Point, idx int)
296 #define P3ptr R1
297 #define P1ptr R2
298 #define COUNT R4
299
300 #define X1L V0
301 #define X1H V1
302 #define Y1L V2
303 #define Y1H V3
304 #define Z1L V4
305 #define Z1H V5
306 #define X2L V6
307 #define X2H V7
308 #define Y2L V8
309 #define Y2H V9
310 #define Z2L V10
311 #define Z2H V11
312
313 #define ONE V18
314 #define IDX V19
315 #define SEL1 V20
316 #define SEL2 V21
317 TEXT ·p256SelectBase(SB), NOSPLIT, $0
318 MOVD point+0(FP), P3ptr
319 MOVD table+8(FP), P1ptr
320 VLREPB idx+(32+7)(FP), IDX
321 VREPIB $1, ONE
322 VREPIB $1, SEL2
323 MOVD $1, COUNT
324
325 VZERO X1H
326 VZERO X1L
327 VZERO Y1H
328 VZERO Y1L
329 VZERO Z1H
330 VZERO Z1L
331
332 loop_select:
333 VL 0(P1ptr), X2H
334 VL 16(P1ptr), X2L
335 VL 32(P1ptr), Y2H
336 VL 48(P1ptr), Y2L
337 VL 64(P1ptr), Z2H
338 VL 80(P1ptr), Z2L
339
340 VCEQG SEL2, IDX, SEL1
341
342 VSEL X2L, X1L, SEL1, X1L
343 VSEL X2H, X1H, SEL1, X1H
344 VSEL Y2L, Y1L, SEL1, Y1L
345 VSEL Y2H, Y1H, SEL1, Y1H
346 VSEL Z2L, Z1L, SEL1, Z1L
347 VSEL Z2H, Z1H, SEL1, Z1H
348
349 VAB SEL2, ONE, SEL2
350 ADDW $1, COUNT
351 ADD $96, P1ptr
352 CMPW COUNT, $65
353 BLT loop_select
354
355 VST X1H, 0(P3ptr)
356 VST X1L, 16(P3ptr)
357 VST Y1H, 32(P3ptr)
358 VST Y1L, 48(P3ptr)
359 VST Z1H, 64(P3ptr)
360 VST Z1L, 80(P3ptr)
361 RET
362
363 #undef P3ptr
364 #undef P1ptr
365 #undef COUNT
366 #undef X1L
367 #undef X1H
368 #undef Y1L
369 #undef Y1H
370 #undef Z1L
371 #undef Z1H
372 #undef X2L
373 #undef X2H
374 #undef Y2L
375 #undef Y2H
376 #undef Z2L
377 #undef Z2H
378 #undef ONE
379 #undef IDX
380 #undef SEL1
381 #undef SEL2
382
383 // ---------------------------------------
384 // func p256FromMont(res, in []byte)
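//
// Converts out of Montgomery form: res = in * 2^-256 mod P256. Each of the
// four rounds below is one 64-bit Montgomery reduction step: it adds the
// multiple of P256 that clears the lowest 64-bit word and shifts right by 64
// bits; a final conditional subtraction brings the value below P256. Since
// P256 = -1 mod 2^64, the quotient digit is just the low word itself, and
// the multiple of P256 is assembled from that word with the SEL permute
// masks instead of a multiplication. Sketch with arbitrary-precision values
// (illustrative only):
//
//	for i := 0; i < 4; i++ {
//		k := t & (1<<64 - 1)   // k = t0 * (-P256^-1 mod 2^64) = t0 * 1
//		t = (t + k*P256) >> 64 // low word is now zero
//	}
//	if t >= P256 {
//		t -= P256
//	}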
385 #define res_ptr R1
386 #define x_ptr R2
387 #define CPOOL R4
388
389 #define T0 V0
390 #define T1 V1
391 #define T2 V2
392 #define TT0 V3
393 #define TT1 V4
394
395 #define ZER V6
396 #define SEL1 V7
397 #define SEL2 V8
398 #define CAR1 V9
399 #define CAR2 V10
400 #define RED1 V11
401 #define RED2 V12
402 #define PL V13
403 #define PH V14
404
405 TEXT ·p256FromMont(SB), NOSPLIT, $0
406 MOVD res+0(FP), res_ptr
407 MOVD in+24(FP), x_ptr
408
409 VZERO T2
410 VZERO ZER
411 MOVD $p256<>+0x00(SB), CPOOL
412 VL 16(CPOOL), PL
413 VL 0(CPOOL), PH
414 VL 48(CPOOL), SEL2
415 VL 64(CPOOL), SEL1
416
417 VL (1*16)(x_ptr), T0
418 VL (0*16)(x_ptr), T1
419
420 // First round
421 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
422 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
423 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
424
425 VSLDB $8, T1, T0, T0
426 VSLDB $8, T2, T1, T1
427
428 VACCQ T0, RED1, CAR1
429 VAQ T0, RED1, T0
430 VACCCQ T1, RED2, CAR1, CAR2
431 VACQ T1, RED2, CAR1, T1
432 VAQ T2, CAR2, T2
433
434 // Second round
435 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
436 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
437 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
438
439 VSLDB $8, T1, T0, T0
440 VSLDB $8, T2, T1, T1
441
442 VACCQ T0, RED1, CAR1
443 VAQ T0, RED1, T0
444 VACCCQ T1, RED2, CAR1, CAR2
445 VACQ T1, RED2, CAR1, T1
446 VAQ T2, CAR2, T2
447
448 // Third round
449 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
450 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
451 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
452
453 VSLDB $8, T1, T0, T0
454 VSLDB $8, T2, T1, T1
455
456 VACCQ T0, RED1, CAR1
457 VAQ T0, RED1, T0
458 VACCCQ T1, RED2, CAR1, CAR2
459 VACQ T1, RED2, CAR1, T1
460 VAQ T2, CAR2, T2
461
462 // Last round
463 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
464 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
465 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
466
467 VSLDB $8, T1, T0, T0
468 VSLDB $8, T2, T1, T1
469
470 VACCQ T0, RED1, CAR1
471 VAQ T0, RED1, T0
472 VACCCQ T1, RED2, CAR1, CAR2
473 VACQ T1, RED2, CAR1, T1
474 VAQ T2, CAR2, T2
475
476 // ---------------------------------------------------
477
478 VSCBIQ PL, T0, CAR1
479 VSQ PL, T0, TT0
480 VSBCBIQ T1, PH, CAR1, CAR2
481 VSBIQ T1, PH, CAR1, TT1
482 VSBIQ T2, ZER, CAR2, T2
483
484 // what output to use, TT1||TT0 or T1||T0?
485 VSEL T0, TT0, T2, T0
486 VSEL T1, TT1, T2, T1
487
488 VST T0, (1*16)(res_ptr)
489 VST T1, (0*16)(res_ptr)
490 RET
491
492 #undef res_ptr
493 #undef x_ptr
494 #undef CPOOL
495 #undef T0
496 #undef T1
497 #undef T2
498 #undef TT0
499 #undef TT1
500 #undef ZER
501 #undef SEL1
502 #undef SEL2
503 #undef CAR1
504 #undef CAR2
505 #undef RED1
506 #undef RED2
507 #undef PL
508 #undef PH
509
510 // ---------------------------------------
511 // func p256OrdMul(res, in1, in2 []byte)
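//
// Montgomery multiplication modulo the group order:
// res = in1 * in2 * 2^-256 mod p256ord, consuming one 32-bit digit of in2
// per round (eight rounds in total). p256ordK0 = -p256ord^-1 mod 2^32 is the
// usual Montgomery constant. Word-serial sketch with arbitrary-precision
// values (the vector code below interleaves the same steps; illustrative
// only):
//
//	t := 0
//	for each 32-bit digit y of in2, least significant first {
//		t += in1 * y
//		k := (t * K0) mod 1<<32
//		t = (t + k*ord) >> 32 // low 32 bits are now zero
//	}
//	if t >= ord { t -= ord }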
512 #define res_ptr R1
513 #define x_ptr R2
514 #define y_ptr R3
515 #define X0 V0
516 #define X1 V1
517 #define Y0 V2
518 #define Y1 V3
519 #define M0 V4
520 #define M1 V5
521 #define T0 V6
522 #define T1 V7
523 #define T2 V8
524 #define YDIG V9
525
526 #define ADD1 V16
527 #define ADD1H V17
528 #define ADD2 V18
529 #define ADD2H V19
530 #define RED1 V20
531 #define RED1H V21
532 #define RED2 V22
533 #define RED2H V23
534 #define CAR1 V24
535 #define CAR1M V25
536
537 #define MK0 V30
538 #define K0 V31
539 TEXT ·p256OrdMul(SB), NOSPLIT, $0
540 MOVD res+0(FP), res_ptr
541 MOVD in1+24(FP), x_ptr
542 MOVD in2+48(FP), y_ptr
543
544 VZERO T2
545 MOVD $p256ordK0<>+0x00(SB), R4
546
547 	// VLEF $3, 0(R4), K0 (hand-encoded below as WORD/BYTE)
548 WORD $0xE7F40000
549 BYTE $0x38
550 BYTE $0x03
551 MOVD $p256ord<>+0x00(SB), R4
552 VL 16(R4), M0
553 VL 0(R4), M1
554
555 VL (1*16)(x_ptr), X0
556 VL (0*16)(x_ptr), X1
557 VL (1*16)(y_ptr), Y0
558 VL (0*16)(y_ptr), Y1
559
560 // ---------------------------------------------------------------------------/
561 VREPF $3, Y0, YDIG
562 VMLF X0, YDIG, ADD1
563 VMLF ADD1, K0, MK0
564 VREPF $3, MK0, MK0
565
566 VMLF X1, YDIG, ADD2
567 VMLHF X0, YDIG, ADD1H
568 VMLHF X1, YDIG, ADD2H
569
570 VMALF M0, MK0, ADD1, RED1
571 VMALHF M0, MK0, ADD1, RED1H
572 VMALF M1, MK0, ADD2, RED2
573 VMALHF M1, MK0, ADD2, RED2H
574
575 VSLDB $12, RED2, RED1, RED1
576 VSLDB $12, T2, RED2, RED2
577
578 VACCQ RED1, ADD1H, CAR1
579 VAQ RED1, ADD1H, T0
580 VACCQ RED1H, T0, CAR1M
581 VAQ RED1H, T0, T0
582
583 // << ready for next MK0
584
585 VACQ RED2, ADD2H, CAR1, T1
586 VACCCQ RED2, ADD2H, CAR1, CAR1
587 VACCCQ RED2H, T1, CAR1M, T2
588 VACQ RED2H, T1, CAR1M, T1
589 VAQ CAR1, T2, T2
590
591 // ---------------------------------------------------
592 /* *
593 * ---+--------+--------+
594 * T2| T1 | T0 |
595 * ---+--------+--------+
596 * *(add)*
597 * +--------+--------+
598 * | X1 | X0 |
599 * +--------+--------+
600 * *(mul)*
601 * +--------+--------+
602 * | YDIG | YDIG |
603 * +--------+--------+
604 * *(add)*
605 * +--------+--------+
606 * | M1 | M0 |
607 * +--------+--------+
608 * *(mul)*
609 * +--------+--------+
610 * | MK0 | MK0 |
611 * +--------+--------+
612 *
613 * ---------------------
614 *
615 * +--------+--------+
616 * | ADD2 | ADD1 |
617 * +--------+--------+
618 * +--------+--------+
619 * | ADD2H | ADD1H |
620 * +--------+--------+
621 * +--------+--------+
622 * | RED2 | RED1 |
623 * +--------+--------+
624 * +--------+--------+
625 * | RED2H | RED1H |
626 * +--------+--------+
627 */
628 VREPF $2, Y0, YDIG
629 VMALF X0, YDIG, T0, ADD1
630 VMLF ADD1, K0, MK0
631 VREPF $3, MK0, MK0
632
633 VMALF X1, YDIG, T1, ADD2
634 VMALHF X0, YDIG, T0, ADD1H
635 VMALHF X1, YDIG, T1, ADD2H
636
637 VMALF M0, MK0, ADD1, RED1
638 VMALHF M0, MK0, ADD1, RED1H
639 VMALF M1, MK0, ADD2, RED2
640 VMALHF M1, MK0, ADD2, RED2H
641
642 VSLDB $12, RED2, RED1, RED1
643 VSLDB $12, T2, RED2, RED2
644
645 VACCQ RED1, ADD1H, CAR1
646 VAQ RED1, ADD1H, T0
647 VACCQ RED1H, T0, CAR1M
648 VAQ RED1H, T0, T0
649
650 // << ready for next MK0
651
652 VACQ RED2, ADD2H, CAR1, T1
653 VACCCQ RED2, ADD2H, CAR1, CAR1
654 VACCCQ RED2H, T1, CAR1M, T2
655 VACQ RED2H, T1, CAR1M, T1
656 VAQ CAR1, T2, T2
657
658 // ---------------------------------------------------
659 VREPF $1, Y0, YDIG
660 VMALF X0, YDIG, T0, ADD1
661 VMLF ADD1, K0, MK0
662 VREPF $3, MK0, MK0
663
664 VMALF X1, YDIG, T1, ADD2
665 VMALHF X0, YDIG, T0, ADD1H
666 VMALHF X1, YDIG, T1, ADD2H
667
668 VMALF M0, MK0, ADD1, RED1
669 VMALHF M0, MK0, ADD1, RED1H
670 VMALF M1, MK0, ADD2, RED2
671 VMALHF M1, MK0, ADD2, RED2H
672
673 VSLDB $12, RED2, RED1, RED1
674 VSLDB $12, T2, RED2, RED2
675
676 VACCQ RED1, ADD1H, CAR1
677 VAQ RED1, ADD1H, T0
678 VACCQ RED1H, T0, CAR1M
679 VAQ RED1H, T0, T0
680
681 // << ready for next MK0
682
683 VACQ RED2, ADD2H, CAR1, T1
684 VACCCQ RED2, ADD2H, CAR1, CAR1
685 VACCCQ RED2H, T1, CAR1M, T2
686 VACQ RED2H, T1, CAR1M, T1
687 VAQ CAR1, T2, T2
688
689 // ---------------------------------------------------
690 VREPF $0, Y0, YDIG
691 VMALF X0, YDIG, T0, ADD1
692 VMLF ADD1, K0, MK0
693 VREPF $3, MK0, MK0
694
695 VMALF X1, YDIG, T1, ADD2
696 VMALHF X0, YDIG, T0, ADD1H
697 VMALHF X1, YDIG, T1, ADD2H
698
699 VMALF M0, MK0, ADD1, RED1
700 VMALHF M0, MK0, ADD1, RED1H
701 VMALF M1, MK0, ADD2, RED2
702 VMALHF M1, MK0, ADD2, RED2H
703
704 VSLDB $12, RED2, RED1, RED1
705 VSLDB $12, T2, RED2, RED2
706
707 VACCQ RED1, ADD1H, CAR1
708 VAQ RED1, ADD1H, T0
709 VACCQ RED1H, T0, CAR1M
710 VAQ RED1H, T0, T0
711
712 // << ready for next MK0
713
714 VACQ RED2, ADD2H, CAR1, T1
715 VACCCQ RED2, ADD2H, CAR1, CAR1
716 VACCCQ RED2H, T1, CAR1M, T2
717 VACQ RED2H, T1, CAR1M, T1
718 VAQ CAR1, T2, T2
719
720 // ---------------------------------------------------
721 VREPF $3, Y1, YDIG
722 VMALF X0, YDIG, T0, ADD1
723 VMLF ADD1, K0, MK0
724 VREPF $3, MK0, MK0
725
726 VMALF X1, YDIG, T1, ADD2
727 VMALHF X0, YDIG, T0, ADD1H
728 VMALHF X1, YDIG, T1, ADD2H
729
730 VMALF M0, MK0, ADD1, RED1
731 VMALHF M0, MK0, ADD1, RED1H
732 VMALF M1, MK0, ADD2, RED2
733 VMALHF M1, MK0, ADD2, RED2H
734
735 VSLDB $12, RED2, RED1, RED1
736 VSLDB $12, T2, RED2, RED2
737
738 VACCQ RED1, ADD1H, CAR1
739 VAQ RED1, ADD1H, T0
740 VACCQ RED1H, T0, CAR1M
741 VAQ RED1H, T0, T0
742
743 // << ready for next MK0
744
745 VACQ RED2, ADD2H, CAR1, T1
746 VACCCQ RED2, ADD2H, CAR1, CAR1
747 VACCCQ RED2H, T1, CAR1M, T2
748 VACQ RED2H, T1, CAR1M, T1
749 VAQ CAR1, T2, T2
750
751 // ---------------------------------------------------
752 VREPF $2, Y1, YDIG
753 VMALF X0, YDIG, T0, ADD1
754 VMLF ADD1, K0, MK0
755 VREPF $3, MK0, MK0
756
757 VMALF X1, YDIG, T1, ADD2
758 VMALHF X0, YDIG, T0, ADD1H
759 VMALHF X1, YDIG, T1, ADD2H
760
761 VMALF M0, MK0, ADD1, RED1
762 VMALHF M0, MK0, ADD1, RED1H
763 VMALF M1, MK0, ADD2, RED2
764 VMALHF M1, MK0, ADD2, RED2H
765
766 VSLDB $12, RED2, RED1, RED1
767 VSLDB $12, T2, RED2, RED2
768
769 VACCQ RED1, ADD1H, CAR1
770 VAQ RED1, ADD1H, T0
771 VACCQ RED1H, T0, CAR1M
772 VAQ RED1H, T0, T0
773
774 // << ready for next MK0
775
776 VACQ RED2, ADD2H, CAR1, T1
777 VACCCQ RED2, ADD2H, CAR1, CAR1
778 VACCCQ RED2H, T1, CAR1M, T2
779 VACQ RED2H, T1, CAR1M, T1
780 VAQ CAR1, T2, T2
781
782 // ---------------------------------------------------
783 VREPF $1, Y1, YDIG
784 VMALF X0, YDIG, T0, ADD1
785 VMLF ADD1, K0, MK0
786 VREPF $3, MK0, MK0
787
788 VMALF X1, YDIG, T1, ADD2
789 VMALHF X0, YDIG, T0, ADD1H
790 VMALHF X1, YDIG, T1, ADD2H
791
792 VMALF M0, MK0, ADD1, RED1
793 VMALHF M0, MK0, ADD1, RED1H
794 VMALF M1, MK0, ADD2, RED2
795 VMALHF M1, MK0, ADD2, RED2H
796
797 VSLDB $12, RED2, RED1, RED1
798 VSLDB $12, T2, RED2, RED2
799
800 VACCQ RED1, ADD1H, CAR1
801 VAQ RED1, ADD1H, T0
802 VACCQ RED1H, T0, CAR1M
803 VAQ RED1H, T0, T0
804
805 // << ready for next MK0
806
807 VACQ RED2, ADD2H, CAR1, T1
808 VACCCQ RED2, ADD2H, CAR1, CAR1
809 VACCCQ RED2H, T1, CAR1M, T2
810 VACQ RED2H, T1, CAR1M, T1
811 VAQ CAR1, T2, T2
812
813 // ---------------------------------------------------
814 VREPF $0, Y1, YDIG
815 VMALF X0, YDIG, T0, ADD1
816 VMLF ADD1, K0, MK0
817 VREPF $3, MK0, MK0
818
819 VMALF X1, YDIG, T1, ADD2
820 VMALHF X0, YDIG, T0, ADD1H
821 VMALHF X1, YDIG, T1, ADD2H
822
823 VMALF M0, MK0, ADD1, RED1
824 VMALHF M0, MK0, ADD1, RED1H
825 VMALF M1, MK0, ADD2, RED2
826 VMALHF M1, MK0, ADD2, RED2H
827
828 VSLDB $12, RED2, RED1, RED1
829 VSLDB $12, T2, RED2, RED2
830
831 VACCQ RED1, ADD1H, CAR1
832 VAQ RED1, ADD1H, T0
833 VACCQ RED1H, T0, CAR1M
834 VAQ RED1H, T0, T0
835
836 // << ready for next MK0
837
838 VACQ RED2, ADD2H, CAR1, T1
839 VACCCQ RED2, ADD2H, CAR1, CAR1
840 VACCCQ RED2H, T1, CAR1M, T2
841 VACQ RED2H, T1, CAR1M, T1
842 VAQ CAR1, T2, T2
843
844 // ---------------------------------------------------
845
846 VZERO RED1
847 VSCBIQ M0, T0, CAR1
848 VSQ M0, T0, ADD1
849 VSBCBIQ T1, M1, CAR1, CAR1M
850 VSBIQ T1, M1, CAR1, ADD2
851 VSBIQ T2, RED1, CAR1M, T2
852
853 // what output to use, ADD2||ADD1 or T1||T0?
854 VSEL T0, ADD1, T2, T0
855 VSEL T1, ADD2, T2, T1
856
857 VST T0, (1*16)(res_ptr)
858 VST T1, (0*16)(res_ptr)
859 RET
860
861 #undef res_ptr
862 #undef x_ptr
863 #undef y_ptr
864 #undef X0
865 #undef X1
866 #undef Y0
867 #undef Y1
868 #undef M0
869 #undef M1
870 #undef T0
871 #undef T1
872 #undef T2
873 #undef YDIG
874
875 #undef ADD1
876 #undef ADD1H
877 #undef ADD2
878 #undef ADD2H
879 #undef RED1
880 #undef RED1H
881 #undef RED2
882 #undef RED2H
883 #undef CAR1
884 #undef CAR1M
885
886 #undef MK0
887 #undef K0
888
889 // ---------------------------------------
890 // p256MulInternalVX
891 // V0-V3,V30,V31 - Not Modified
892 // V4-V15 - Volatile
893
894 #define CPOOL R4
895
896 // Parameters
897 #define X0 V0 // Not modified
898 #define X1 V1 // Not modified
899 #define Y0 V2 // Not modified
900 #define Y1 V3 // Not modified
901 #define T0 V4
902 #define T1 V5
903 #define P0 V30 // Not modified
904 #define P1 V31 // Not modified
905
906 // Temporaries
907 #define YDIG V6 // Overloaded with CAR2, ZER
908 #define ADD1H V7 // Overloaded with ADD3H
909 #define ADD2H V8 // Overloaded with ADD4H
910 #define ADD3 V9 // Overloaded with SEL2,SEL5
911 #define ADD4 V10 // Overloaded with SEL3,SEL6
912 #define RED1 V11 // Overloaded with CAR2
913 #define RED2 V12
914 #define RED3 V13 // Overloaded with SEL1
915 #define T2 V14
916 // Overloaded temporaries
917 #define ADD1 V4 // Overloaded with T0
918 #define ADD2 V5 // Overloaded with T1
919 #define ADD3H V7 // Overloaded with ADD1H
920 #define ADD4H V8 // Overloaded with ADD2H
921 #define ZER V6 // Overloaded with YDIG, CAR2
922 #define CAR1 V6 // Overloaded with YDIG, ZER
923 #define CAR2 V11 // Overloaded with RED1
924 // Constant Selects
925 #define SEL1 V13 // Overloaded with RED3
926 #define SEL2 V9 // Overloaded with ADD3,SEL5
927 #define SEL3 V10 // Overloaded with ADD4,SEL6
928 #define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
929 #define SEL5 V9 // Overloaded with ADD3,SEL2
930 #define SEL6 V10 // Overloaded with ADD4,SEL3
931
932 /* *
933 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
934 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
935 * With you, SIMD be...
936 *
937 * +--------+--------+
938 * +--------| RED2 | RED1 |
939 * | +--------+--------+
940 * | ---+--------+--------+
941 * | +---- T2| T1 | T0 |--+
942 * | | ---+--------+--------+ |
943 * | | |
944 * | | ======================= |
945 * | | |
946 * | | +--------+--------+<-+
947 * | +-------| ADD2 | ADD1 |--|-----+
948 * | | +--------+--------+ | |
949 * | | +--------+--------+<---+ |
950 * | | | ADD2H | ADD1H |--+ |
951 * | | +--------+--------+ | |
952 * | | +--------+--------+<-+ |
953 * | | | ADD4 | ADD3 |--|-+ |
954 * | | +--------+--------+ | | |
955 * | | +--------+--------+<---+ | |
956 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
957 * | | +--------+--------+ | | V
958 * | | ------------------------ | | +--------+
959 * | | | | | RED3 | [d0 0 0 d0]
960 * | | | | +--------+
961 * | +---->+--------+--------+ | | |
962 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
963 * | +--------+--------+ | | |
964 * +---->---+--------+--------+ | | |
965 * T2| T1 | T0 |----+ | |
966 * ---+--------+--------+ | | |
967 * ---+--------+--------+<---+ | |
968 * +--- T2| T1 | T0 |----------+
969 * | ---+--------+--------+ | |
970 * | +--------+--------+<-------------+
971 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
972 * | +--------+--------+ | | |
973 * | +--------+<----------------------+
974 * | | RED3 |--------------+ | [0 0 d1 d0]
975 * | +--------+ | |
976 * +--->+--------+--------+ | |
977 * | T1 | T0 |--------+
978 * +--------+--------+ | |
979 * --------------------------- | |
980 * | |
981 * +--------+--------+<----+ |
982 * | RED2 | RED1 | |
983 * +--------+--------+ |
984 * ---+--------+--------+<-------+
985 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
986 * ---+--------+--------+
987 *
988  * *My 21st century work of art @vpaprots
989 *
990 *
991 * First group is special, doesn't get the two inputs:
992 * +--------+--------+<-+
993 * +-------| ADD2 | ADD1 |--|-----+
994 * | +--------+--------+ | |
995 * | +--------+--------+<---+ |
996 * | | ADD2H | ADD1H |--+ |
997 * | +--------+--------+ | |
998 * | +--------+--------+<-+ |
999 * | | ADD4 | ADD3 |--|-+ |
1000 * | +--------+--------+ | | |
1001 * | +--------+--------+<---+ | |
1002 * | | ADD4H | ADD3H |------|-+ |(+vzero)
1003 * | +--------+--------+ | | V
1004 * | ------------------------ | | +--------+
1005 * | | | | RED3 | [d0 0 0 d0]
1006 * | | | +--------+
1007 * +---->+--------+--------+ | | |
1008 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
1009 * +--------+--------+ | | |
1010 * ---+--------+--------+<---+ | |
1011 * +--- T2| T1 | T0 |----------+
1012 * | ---+--------+--------+ | |
1013 * | +--------+--------+<-------------+
1014 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
1015 * | +--------+--------+ | | |
1016 * | +--------+<----------------------+
1017 * | | RED3 |--------------+ | [0 0 d1 d0]
1018 * | +--------+ | |
1019 * +--->+--------+--------+ | |
1020 * | T1 | T0 |--------+
1021 * +--------+--------+ | |
1022 * --------------------------- | |
1023 * | |
1024 * +--------+--------+<----+ |
1025 * | RED2 | RED1 | |
1026 * +--------+--------+ |
1027 * ---+--------+--------+<-------+
1028 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
1029 * ---+--------+--------+
1030 *
1031 * Last 'group' needs to RED2||RED1 shifted less
1032 */
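//
// In short: p256MulInternalVX returns T1||T0 = X * Y * 2^-256 mod P256 (a
// Montgomery product), folding in one 32-bit digit of Y per half-round, two
// digits per group above. Because P256 = -1 mod 2^32, the Montgomery
// quotient digit equals the low 32-bit word of the running sum, so the
// multiple of P256 to add can be built from that word with the VPERM
// selectors ([d0 0 0 d0], [d0 0 d1 d0], [0 d1 d0 d1], [0 0 d1 d0]) rather
// than with extra multiplications.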
1033 TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0
1034 VL 32(CPOOL), SEL1
1035 VL 48(CPOOL), SEL2
1036 VL 64(CPOOL), SEL3
1037 VL 80(CPOOL), SEL4
1038
1039 // ---------------------------------------------------
1040
1041 VREPF $3, Y0, YDIG
1042 VMLHF X0, YDIG, ADD1H
1043 VMLHF X1, YDIG, ADD2H
1044 VMLF X0, YDIG, ADD1
1045 VMLF X1, YDIG, ADD2
1046
1047 VREPF $2, Y0, YDIG
1048 VMALF X0, YDIG, ADD1H, ADD3
1049 VMALF X1, YDIG, ADD2H, ADD4
1050 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
1051 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
1052
1053 VZERO ZER
1054 VL 32(CPOOL), SEL1
1055 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1056
1057 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
1058 VSLDB $12, ZER, ADD2, T1 // ADD2 Free
1059
1060 VACCQ T0, ADD3, CAR1
1061 VAQ T0, ADD3, T0 // ADD3 Free
1062 VACCCQ T1, ADD4, CAR1, T2
1063 VACQ T1, ADD4, CAR1, T1 // ADD4 Free
1064
1065 VL 48(CPOOL), SEL2
1066 VL 64(CPOOL), SEL3
1067 VL 80(CPOOL), SEL4
1068 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1069 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1070 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1071 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1072
1073 VSLDB $12, T1, T0, T0
1074 VSLDB $12, T2, T1, T1
1075
1076 VACCQ T0, ADD3H, CAR1
1077 VAQ T0, ADD3H, T0
1078 VACCCQ T1, ADD4H, CAR1, T2
1079 VACQ T1, ADD4H, CAR1, T1
1080
1081 // ---------------------------------------------------
1082
1083 VREPF $1, Y0, YDIG
1084 VMALHF X0, YDIG, T0, ADD1H
1085 VMALHF X1, YDIG, T1, ADD2H
1086 VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
1087 VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
1088
1089 VREPF $0, Y0, YDIG
1090 VMALF X0, YDIG, ADD1H, ADD3
1091 VMALF X1, YDIG, ADD2H, ADD4
1092 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
1093 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
1094
1095 VZERO ZER
1096 VL 32(CPOOL), SEL1
1097 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1098
1099 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
1100 VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free
1101
1102 VACCQ T0, RED1, CAR1
1103 VAQ T0, RED1, T0
1104 VACCCQ T1, RED2, CAR1, T2
1105 VACQ T1, RED2, CAR1, T1
1106
1107 VACCQ T0, ADD3, CAR1
1108 VAQ T0, ADD3, T0
1109 VACCCQ T1, ADD4, CAR1, CAR2
1110 VACQ T1, ADD4, CAR1, T1
1111 VAQ T2, CAR2, T2
1112
1113 VL 48(CPOOL), SEL2
1114 VL 64(CPOOL), SEL3
1115 VL 80(CPOOL), SEL4
1116 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1117 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1118 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1119 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1120
1121 VSLDB $12, T1, T0, T0
1122 VSLDB $12, T2, T1, T1
1123
1124 VACCQ T0, ADD3H, CAR1
1125 VAQ T0, ADD3H, T0
1126 VACCCQ T1, ADD4H, CAR1, T2
1127 VACQ T1, ADD4H, CAR1, T1
1128
1129 // ---------------------------------------------------
1130
1131 VREPF $3, Y1, YDIG
1132 VMALHF X0, YDIG, T0, ADD1H
1133 VMALHF X1, YDIG, T1, ADD2H
1134 VMALF X0, YDIG, T0, ADD1
1135 VMALF X1, YDIG, T1, ADD2
1136
1137 VREPF $2, Y1, YDIG
1138 VMALF X0, YDIG, ADD1H, ADD3
1139 VMALF X1, YDIG, ADD2H, ADD4
1140 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
1141 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
1142
1143 VZERO ZER
1144 VL 32(CPOOL), SEL1
1145 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1146
1147 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
1148 VSLDB $12, T2, ADD2, T1 // ADD2 Free
1149
1150 VACCQ T0, RED1, CAR1
1151 VAQ T0, RED1, T0
1152 VACCCQ T1, RED2, CAR1, T2
1153 VACQ T1, RED2, CAR1, T1
1154
1155 VACCQ T0, ADD3, CAR1
1156 VAQ T0, ADD3, T0
1157 VACCCQ T1, ADD4, CAR1, CAR2
1158 VACQ T1, ADD4, CAR1, T1
1159 VAQ T2, CAR2, T2
1160
1161 VL 48(CPOOL), SEL2
1162 VL 64(CPOOL), SEL3
1163 VL 80(CPOOL), SEL4
1164 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1165 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1166 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1167 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1168
1169 VSLDB $12, T1, T0, T0
1170 VSLDB $12, T2, T1, T1
1171
1172 VACCQ T0, ADD3H, CAR1
1173 VAQ T0, ADD3H, T0
1174 VACCCQ T1, ADD4H, CAR1, T2
1175 VACQ T1, ADD4H, CAR1, T1
1176
1177 // ---------------------------------------------------
1178
1179 VREPF $1, Y1, YDIG
1180 VMALHF X0, YDIG, T0, ADD1H
1181 VMALHF X1, YDIG, T1, ADD2H
1182 VMALF X0, YDIG, T0, ADD1
1183 VMALF X1, YDIG, T1, ADD2
1184
1185 VREPF $0, Y1, YDIG
1186 VMALF X0, YDIG, ADD1H, ADD3
1187 VMALF X1, YDIG, ADD2H, ADD4
1188 VMALHF X0, YDIG, ADD1H, ADD3H
1189 VMALHF X1, YDIG, ADD2H, ADD4H
1190
1191 VZERO ZER
1192 VL 32(CPOOL), SEL1
1193 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1194
1195 VSLDB $12, ADD2, ADD1, T0
1196 VSLDB $12, T2, ADD2, T1
1197
1198 VACCQ T0, RED1, CAR1
1199 VAQ T0, RED1, T0
1200 VACCCQ T1, RED2, CAR1, T2
1201 VACQ T1, RED2, CAR1, T1
1202
1203 VACCQ T0, ADD3, CAR1
1204 VAQ T0, ADD3, T0
1205 VACCCQ T1, ADD4, CAR1, CAR2
1206 VACQ T1, ADD4, CAR1, T1
1207 VAQ T2, CAR2, T2
1208
1209 VL 96(CPOOL), SEL5
1210 VL 112(CPOOL), SEL6
1211 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
1212 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
1213 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
1214
1215 VSLDB $12, T1, T0, T0
1216 VSLDB $12, T2, T1, T1
1217
1218 VACCQ T0, ADD3H, CAR1
1219 VAQ T0, ADD3H, T0
1220 VACCCQ T1, ADD4H, CAR1, T2
1221 VACQ T1, ADD4H, CAR1, T1
1222
1223 VACCQ T0, RED1, CAR1
1224 VAQ T0, RED1, T0
1225 VACCCQ T1, RED2, CAR1, CAR2
1226 VACQ T1, RED2, CAR1, T1
1227 VAQ T2, CAR2, T2
1228
1229 // ---------------------------------------------------
1230
1231 VZERO RED3
1232 VSCBIQ P0, T0, CAR1
1233 VSQ P0, T0, ADD1H
1234 VSBCBIQ T1, P1, CAR1, CAR2
1235 VSBIQ T1, P1, CAR1, ADD2H
1236 VSBIQ T2, RED3, CAR2, T2
1237
1238 // what output to use, ADD2H||ADD1H or T1||T0?
1239 VSEL T0, ADD1H, T2, T0
1240 VSEL T1, ADD2H, T2, T1
1241 RET
1242
1243 #undef CPOOL
1244
1245 #undef X0
1246 #undef X1
1247 #undef Y0
1248 #undef Y1
1249 #undef T0
1250 #undef T1
1251 #undef P0
1252 #undef P1
1253
1254 #undef SEL1
1255 #undef SEL2
1256 #undef SEL3
1257 #undef SEL4
1258 #undef SEL5
1259 #undef SEL6
1260
1261 #undef YDIG
1262 #undef ADD1H
1263 #undef ADD2H
1264 #undef ADD3
1265 #undef ADD4
1266 #undef RED1
1267 #undef RED2
1268 #undef RED3
1269 #undef T2
1270 #undef ADD1
1271 #undef ADD2
1272 #undef ADD3H
1273 #undef ADD4H
1274 #undef ZER
1275 #undef CAR1
1276 #undef CAR2
1277
1278 // ---------------------------------------
1279 // p256MulInternalVMSL
1280 // V0-V3,V30,V31 - Not Modified
1281 // V4-V14 - Volatile
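//
// Strategy, roughly: VMSLG multiplies the two 64-bit element pairs of its
// vector operands and sums both 128-bit products into an accumulator. To
// keep those sums from overflowing, each 256-bit input is first split into
// five limbs: four 7-byte (56-bit) limbs plus a 4-byte top limb. The 5x5
// schoolbook partial products are gathered into nine diagonal columns
// (column 1..9 in the comments below), the columns are recombined into the
// 512-bit product, and three reduction passes (96+96+64 bits, via the
// OBSERVATION3/OBSERVATION3A macros) fold it back to 256 bits before the
// final conditional subtraction of P256.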
1282
1283 #define CPOOL R4
1284 #define SCRATCH R9
1285
1286 // Parameters
1287 #define X0 V0 // Not modified
1288 #define X1 V1 // Not modified
1289 #define Y0 V2 // Not modified
1290 #define Y1 V3 // Not modified
1291 #define T0 V4
1292 #define T1 V5
1293 #define T2 V6
1294 #define P0 V30 // Not modified
1295 #define P1 V31 // Not modified
1296
1297 // input: d0
1298 // output: h0, h1
1299 // temp: TEMP, ZERO, BORROW
1300 #define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \
1301 VZERO ZERO \
1302 VSLDB $4, d0, ZERO, h0 \
1303 VLR h0, BORROW \
1304 VSLDB $12, ZERO, h0, TEMP \
1305 VSQ TEMP, h0, h0 \
1306 VSLDB $12, d0, BORROW, h1 \
1307 VSLDB $8, ZERO, BORROW, TEMP \
1308 VAQ TEMP, h0, h0 \
1309
1310 #define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \
1311 VZERO ZERO \
1312 VSLDB $8, d2, ZERO, TEMP \
1313 VSLDB $8, d2, TEMP, h0 \
1314 VSLDB $12, ZERO, TEMP, h1 \
1315 VSQ h1, h0, h0 \
1316
1317 TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
1318 VSTM V16, V19, (SCRATCH)
1319
1320 MOVD $p256vmsl<>+0x00(SB), CPOOL
1321
1322 	// Split each 256-bit input into five limbs: four 7-byte (56-bit) limbs plus a 4-byte top limb
1323 VGBM $0x007f, V14
1324 VZERO V12
1325 VSLDB $2, X1, X0, V13
1326 VSLDB $2, Y1, Y0, V8
1327 VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
1328 VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb
1329
1330 VN V14, X0, V5 // V5: first 7 bytes limb
1331 VN V14, Y0, V10 // V10: first 7 bytes limb
1332 VN V14, V13, V13 // v13: third 7 bytes limb
1333 VN V14, V8, V8 // V8: third 7 bytes limb
1334
1335 VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1)
1336 VMSLG V8, V5, V12, V8 // v8: l8 x l5
1337 VMSLG V6, V13, V12, V13 // v13: l6 x l3
1338 VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9)
1339 VMSLG V6, V5, V12, V6 // v6: l6 x l5
1340
1341 MOVD $p256vmsl<>+0x00(SB), CPOOL
1342 VGBM $0x7f7f, V14
1343
1344 VL 0(CPOOL), V4
1345 VL 16(CPOOL), V7
1346 VL 32(CPOOL), V9
1347 VL 48(CPOOL), V5
1348 VLM 64(CPOOL), V16, V19
1349
1350 VPERM V12, X0, V4, V4 // v4: limb4 | limb5
1351 VPERM Y1, Y0, V7, V7
1352 VPERM V12, Y0, V9, V9 // v9: limb10 | limb9
1353 VPERM X1, X0, V5, V5
1354 VPERM X1, X0, V16, V16
1355 VPERM Y1, Y0, V17, V17
1356 VPERM X1, V12, V18, V18 // v18: limb1 | limb2
1357 VPERM Y1, V12, V19, V19 // v19: limb7 | limb6
1358 VN V14, V7, V7 // v7: limb9 | limb8
1359 VN V14, V5, V5 // v5: limb3 | limb4
1360 VN V14, V16, V16 // v16: limb2 | limb3
1361 VN V14, V17, V17 // v17: limb8 | limb7
1362
1363 VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2)
1364 VMSLG V9, V5, V8, V8 // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3)
1365 VMSLG V9, V16, V12, V16 // v16: l10 x l9 + l2 x l3
1366 VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2
1367 VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2
1368 VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4)
1369 VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4
1370 VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2
1371 VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6)
1372 VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8)
1373 VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
1374 VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
1375
1376 VSLDB $9, V12, V10, V4
1377 VSLDB $9, V12, V7, V5
1378 VAQ V4, V14, V14
1379 VAQ V5, V13, V13
1380
1381 VSLDB $9, V12, V14, V4
1382 VSLDB $9, V12, V13, V5
1383 VAQ V4, V8, V8
1384 VAQ V5, V19, V19
1385
1386 VSLDB $9, V12, V8, V4
1387 VSLDB $9, V12, V19, V5
1388 VAQ V4, V16, V16
1389 VAQ V5, V11, V11
1390
1391 VSLDB $9, V12, V16, V4
1392 VAQ V4, V9, V17
1393
1394 VGBM $0x007f, V4
1395 VGBM $0x00ff, V5
1396
1397 VN V10, V4, V10
1398 VN V14, V4, V14
1399 VN V8, V4, V8
1400 VN V16, V4, V16
1401 VN V17, V4, V9
1402 VN V7, V4, V7
1403 VN V13, V4, V13
1404 VN V19, V4, V19
1405 VN V11, V5, V11
1406
1407 VSLDB $7, V14, V14, V14
1408 VSLDB $14, V8, V12, V4
1409 VSLDB $14, V12, V8, V8
1410 VSLDB $5, V16, V16, V16
1411 VSLDB $12, V9, V12, V5
1412
1413 VO V14, V10, V10
1414 VO V8, V16, V16
1415 VO V4, V10, V10 // first rightmost 128bits of the multiplication result
1416 VO V5, V16, V16 // second rightmost 128bits of the multiplication result
1417
1418 // adjust v7, v13, v19, v11
1419 VSLDB $7, V13, V13, V13
1420 VSLDB $14, V19, V12, V4
1421 VSLDB $14, V12, V19, V19
1422 VSLDB $5, V11, V12, V5
1423 VO V13, V7, V7
1424 VO V4, V7, V7
1425 VO V19, V5, V11
1426
1427 VSLDB $9, V12, V17, V14
1428 VSLDB $12, V12, V9, V9
1429 VACCQ V7, V14, V13
1430 VAQ V7, V14, V7
1431 VAQ V11, V13, V11
1432
1433 // First reduction, 96 bits
1434 VSLDB $4, V16, V10, T0
1435 VSLDB $4, V12, V16, T1
1436 VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
1437 VSLDB $3, V7, V12, V7
1438 OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2
1439 VO V7, V9, V7 // third rightmost 128bits of the multiplication result
1440 VACCQ T0, T2, V9
1441 VAQ T0, T2, T2
1442 VACQ T1, V8, V9, V8
1443
1444 // Second reduction 96 bits
1445 VSLDB $4, V8, T2, T0
1446 VSLDB $4, V12, V8, T1
1447 OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8
1448 VACCQ T0, V8, T2
1449 VAQ T0, V8, V8
1450 VACQ T1, V9, T2, V9
1451
1452 // Third reduction 64 bits
1453 VSLDB $8, V9, V8, T0
1454 VSLDB $8, V12, V9, T1
1455 OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
1456 VACCQ T0, V13, V12
1457 VAQ T0, V13, V13
1458 VACQ T1, V14, V12, V14
1459 VACCQ V13, V7, V12
1460 VAQ V13, V7, T0
1461 VACCCQ V14, V11, V12, T2
1462 VACQ V14, V11, V12, T1 // results T2 | T1 | T0
1463
1464 // ---------------------------------------------------
1465 MOVD $p256mul<>+0x00(SB), CPOOL
1466
1467 VZERO V12
1468 VSCBIQ P0, T0, V8
1469 VSQ P0, T0, V7
1470 VSBCBIQ T1, P1, V8, V10
1471 VSBIQ T1, P1, V8, V9
1472 VSBIQ T2, V12, V10, T2
1473
1474 // what output to use, V9||V7 or T1||T0?
1475 VSEL T0, V7, T2, T0
1476 VSEL T1, V9, T2, T1
1477
1478 VLM (SCRATCH), V16, V19
1479
1480 RET
1481
1482 // ---------------------------------------
1483 // p256SqrInternalVMSL
1484 // V0-V1,V30,V31 - Not Modified
1485 // V4-V14 - Volatile
1486
1487 TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
1488 VSTM V16, V18, (SCRATCH)
1489
1490 MOVD $p256vmsl<>+0x00(SB), CPOOL
1491 	// Divide the input into five limbs (four 56-bit limbs plus a 32-bit top limb)
1492 VGBM $0x007f, V14
1493 VZERO V12
1494 VSLDB $2, X1, X0, V13
1495 VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
1496
1497 VN V14, X0, V10 // V10: first 7 bytes limb
1498 VN V14, V13, V13 // v13: third 7 bytes limb
1499
1500 VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1)
1501 VMSLG V13, V13, V12, V13 // v13: l8 x l3
1502 VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9)
1503
1504 MOVD $p256vmsl<>+0x00(SB), CPOOL
1505 VGBM $0x7f7f, V14
1506
1507 VL 0(CPOOL), V4
1508 VL 16(CPOOL), V7
1509 VL 32(CPOOL), V9
1510 VL 48(CPOOL), V5
1511 VLM 64(CPOOL), V16, V18
1512 VL 112(CPOOL), V8
1513
1514 VPERM V12, X0, V4, V4 // v4: limb4 | limb5
1515 VPERM X1, X0, V7, V7
1516 VPERM V12, X0, V9, V9 // v9: limb10 | limb9
1517 VPERM X1, X0, V5, V5
1518 VPERM X1, X0, V16, V16
1519 VPERM X1, X0, V17, V17
1520 VPERM X1, V12, V18, V18 // v18: limb1 | limb2
1521 VPERM X1, V12, V8, V8 // v8: limb7 | limb6
1522 VN V14, V7, V7 // v7: limb9 | limb8
1523 VN V14, V5, V5 // v5: limb3 | limb4
1524 VN V14, V16, V16 // v16: limb2 | limb3
1525 VN V14, V17, V17 // v17: limb8 | limb7
1526
1527 VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
1528 VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2)
1529 VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4)
1530 VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6)
1531 VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
1532 VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8)
1533 VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3)
1534
1535 VSLDB $9, V12, V10, V4
1536 VSLDB $9, V12, V7, V5
1537 VAQ V4, V14, V14
1538 VAQ V5, V13, V13
1539
1540 VSLDB $9, V12, V14, V4
1541 VSLDB $9, V12, V13, V5
1542 VAQ V4, V18, V18
1543 VAQ V5, V8, V8
1544
1545 VSLDB $9, V12, V18, V4
1546 VSLDB $9, V12, V8, V5
1547 VAQ V4, V16, V16
1548 VAQ V5, V11, V11
1549
1550 VSLDB $9, V12, V16, V4
1551 VAQ V4, V6, V17
1552
1553 VGBM $0x007f, V4
1554 VGBM $0x00ff, V5
1555
1556 VN V10, V4, V10
1557 VN V14, V4, V14
1558 VN V18, V4, V18
1559 VN V16, V4, V16
1560 VN V17, V4, V9
1561 VN V7, V4, V7
1562 VN V13, V4, V13
1563 VN V8, V4, V8
1564 VN V11, V5, V11
1565
1566 VSLDB $7, V14, V14, V14
1567 VSLDB $14, V18, V12, V4
1568 VSLDB $14, V12, V18, V18
1569 VSLDB $5, V16, V16, V16
1570 VSLDB $12, V9, V12, V5
1571
1572 VO V14, V10, V10
1573 VO V18, V16, V16
1574 VO V4, V10, V10 // first rightmost 128bits of the multiplication result
1575 VO V5, V16, V16 // second rightmost 128bits of the multiplication result
1576
1577 // adjust v7, v13, v8, v11
1578 VSLDB $7, V13, V13, V13
1579 VSLDB $14, V8, V12, V4
1580 VSLDB $14, V12, V8, V8
1581 VSLDB $5, V11, V12, V5
1582 VO V13, V7, V7
1583 VO V4, V7, V7
1584 VO V8, V5, V11
1585
1586 VSLDB $9, V12, V17, V14
1587 VSLDB $12, V12, V9, V9
1588 VACCQ V7, V14, V13
1589 VAQ V7, V14, V7
1590 VAQ V11, V13, V11
1591
1592 // First reduction, 96 bits
1593 VSLDB $4, V16, V10, T0
1594 VSLDB $4, V12, V16, T1
1595 VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
1596 VSLDB $3, V7, V12, V7
1597 OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2
1598 VO V7, V9, V7 // third rightmost 128bits of the multiplication result
1599 VACCQ T0, T2, V9
1600 VAQ T0, T2, T2
1601 VACQ T1, V8, V9, V8
1602
1603 // Second reduction 96 bits
1604 VSLDB $4, V8, T2, T0
1605 VSLDB $4, V12, V8, T1
1606 OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8
1607 VACCQ T0, V8, T2
1608 VAQ T0, V8, V8
1609 VACQ T1, V9, T2, V9
1610
1611 // Third reduction 64 bits
1612 VSLDB $8, V9, V8, T0
1613 VSLDB $8, V12, V9, T1
1614 OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
1615 VACCQ T0, V13, V12
1616 VAQ T0, V13, V13
1617 VACQ T1, V14, V12, V14
1618 VACCQ V13, V7, V12
1619 VAQ V13, V7, T0
1620 VACCCQ V14, V11, V12, T2
1621 VACQ V14, V11, V12, T1 // results T2 | T1 | T0
1622
1623 // ---------------------------------------------------
1624 MOVD $p256mul<>+0x00(SB), CPOOL
1625
1626 VZERO V12
1627 VSCBIQ P0, T0, V8
1628 VSQ P0, T0, V7
1629 VSBCBIQ T1, P1, V8, V10
1630 VSBIQ T1, P1, V8, V9
1631 VSBIQ T2, V12, V10, T2
1632
1633 // what output to use, V9||V7 or T1||T0?
1634 VSEL T0, V7, T2, T0
1635 VSEL T1, V9, T2, T1
1636
1637 VLM (SCRATCH), V16, V18
1638 RET
1639
1640
1641
1642 #undef CPOOL
1643 #undef SCRATCH
1644 #undef X0
1645 #undef X1
1646 #undef Y0
1647 #undef Y1
1648 #undef T0
1649 #undef T1
1650 #undef T2
1651 #undef P0
1652 #undef P1
1653
1654 #define SCRATCH R9
1655
1656 TEXT p256MulInternal<>(SB),NOSPLIT,$64-0
1657 MOVD $scratch-64(SP), SCRATCH
1658 MOVD ·p256MulInternalFacility+0x00(SB),R7
1659 CALL (R7)
1660 RET
1661
1662 TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
1663 MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
1664 MOVD $·p256MulInternalFacility+0x00(SB), R7
1665 MOVD $·p256MulInternalVX(SB), R8
1666 	CMPBEQ R0, $0, novmsl // no VE1 facility (VMSL unsupported): keep the VX version
1667 MOVD $·p256MulInternalVMSL(SB), R8
1668 novmsl:
1669 MOVD R8, 0(R7)
1670 BR (R8)
1671
1672 GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8
1673 DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB)
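
// Dispatch pattern: p256MulInternalFacility starts out pointing at the
// trampoline above. The first call probes the CPU for vector-enhancements
// facility 1 (which provides VMSL), overwrites the function pointer with
// the VMSL or the VX implementation, and tail-calls it; every later call
// goes straight to the selected routine. Roughly equivalent Go, for
// illustration only:
//
//	var p256MulInternalFacility = p256MulInternalTrampolineSetup
//
//	func p256MulInternalTrampolineSetup() {
//		if cpu.S390X.HasVE1 {
//			p256MulInternalFacility = p256MulInternalVMSL
//		} else {
//			p256MulInternalFacility = p256MulInternalVX
//		}
//		p256MulInternalFacility()
//	}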
1674
1675 // Parameters
1676 #define X0 V0
1677 #define X1 V1
1678 #define Y0 V2
1679 #define Y1 V3
1680
1681 TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0
1682 VLR X0, Y0
1683 VLR X1, Y1
1684 BR ·p256MulInternalVX(SB)
1685
1686 #undef X0
1687 #undef X1
1688 #undef Y0
1689 #undef Y1
1690
1691
1692 TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0
1693 MOVD $scratch-48(SP), SCRATCH
1694 MOVD ·p256SqrInternalFacility+0x00(SB),R7
1695 CALL (R7)
1696 RET
1697
1698 TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
1699 MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
1700 MOVD $·p256SqrInternalFacility+0x00(SB), R7
1701 MOVD $·p256SqrInternalVX(SB), R8
1702 	CMPBEQ R0, $0, novmsl // no VE1 facility (VMSL unsupported): keep the VX version
1703 MOVD $·p256SqrInternalVMSL(SB), R8
1704 novmsl:
1705 MOVD R8, 0(R7)
1706 BR (R8)
1707
1708
1709 GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8
1710 DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB)
1711
1712 #undef SCRATCH
1713
1714
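// p256SubInternal: (T1||T0) = (X1||X0) - (Y1||Y0) mod P256. The difference
// is always computed, P256 is added back, and the corrected value is
// selected only when the subtraction borrowed - no branches, constant time.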
1715 #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
1716 VZERO ZER \
1717 VSCBIQ Y0, X0, CAR1 \
1718 VSQ Y0, X0, T0 \
1719 VSBCBIQ X1, Y1, CAR1, SEL1 \
1720 VSBIQ X1, Y1, CAR1, T1 \
1721 VSQ SEL1, ZER, SEL1 \
1722 \
1723 VACCQ T0, PL, CAR1 \
1724 VAQ T0, PL, TT0 \
1725 VACQ T1, PH, CAR1, TT1 \
1726 \
1727 VSEL T0, TT0, SEL1, T0 \
1728 VSEL T1, TT1, SEL1, T1 \
1729
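// p256AddInternal: (T1||T0) = (X1||X0) + (Y1||Y0) mod P256. The sum is
// computed with its carry, P256 is subtracted, and the reduced value is
// selected whenever the sum carried out or is >= P256 - again branch-free.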
1730 #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
1731 VACCQ X0, Y0, CAR1 \
1732 VAQ X0, Y0, T0 \
1733 VACCCQ X1, Y1, CAR1, T2 \
1734 VACQ X1, Y1, CAR1, T1 \
1735 \
1736 VZERO ZER \
1737 VSCBIQ PL, T0, CAR1 \
1738 VSQ PL, T0, TT0 \
1739 VSBCBIQ T1, PH, CAR1, CAR2 \
1740 VSBIQ T1, PH, CAR1, TT1 \
1741 VSBIQ T2, ZER, CAR2, SEL1 \
1742 \
1743 VSEL T0, TT0, SEL1, T0 \
1744 VSEL T1, TT1, SEL1, T1
1745
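// p256HalfInternal: (T1||T0) = (X1||X0) / 2 mod P256. If the input is odd,
// the odd prime P256 is added first so the sum is even, then the (up to
// 257-bit) value is shifted right by one; the odd/even mask comes from the
// low bit of X0, so this is constant time as well. Sketch:
//
//	if x&1 == 1 {
//		x += p // x/2 mod p == (x+p)/2 when x is odd
//	}
//	return x >> 1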
1746 #define p256HalfInternal(T1, T0, X1, X0) \
1747 VZERO ZER \
1748 VSBIQ ZER, ZER, X0, SEL1 \
1749 \
1750 VACCQ X0, PL, CAR1 \
1751 VAQ X0, PL, T0 \
1752 VACCCQ X1, PH, CAR1, T2 \
1753 VACQ X1, PH, CAR1, T1 \
1754 \
1755 VSEL X0, T0, SEL1, T0 \
1756 VSEL X1, T1, SEL1, T1 \
1757 VSEL ZER, T2, SEL1, T2 \
1758 \
1759 VSLDB $15, T2, ZER, TT1 \
1760 VSLDB $15, T1, ZER, TT0 \
1761 VREPIB $1, SEL1 \
1762 VSRL SEL1, T0, T0 \
1763 VSRL SEL1, T1, T1 \
1764 VREPIB $7, SEL1 \
1765 VSL SEL1, TT0, TT0 \
1766 VSL SEL1, TT1, TT1 \
1767 VO T0, TT0, T0 \
1768 VO T1, TT1, T1
1769
1770 // ---------------------------------------
1771 // func p256MulAsm(res, in1, in2 []byte)
1772 #define res_ptr R1
1773 #define x_ptr R2
1774 #define y_ptr R3
1775 #define CPOOL R4
1776
1777 // Parameters
1778 #define X0 V0
1779 #define X1 V1
1780 #define Y0 V2
1781 #define Y1 V3
1782 #define T0 V4
1783 #define T1 V5
1784
1785 // Constants
1786 #define P0 V30
1787 #define P1 V31
1788 TEXT ·p256MulAsm(SB), NOSPLIT, $0
1789 MOVD res+0(FP), res_ptr
1790 MOVD in1+24(FP), x_ptr
1791 MOVD in2+48(FP), y_ptr
1792
1793 VL (1*16)(x_ptr), X0
1794 VL (0*16)(x_ptr), X1
1795 VL (1*16)(y_ptr), Y0
1796 VL (0*16)(y_ptr), Y1
1797
1798 MOVD $p256mul<>+0x00(SB), CPOOL
1799 VL 16(CPOOL), P0
1800 VL 0(CPOOL), P1
1801
1802 CALL p256MulInternal<>(SB)
1803
1804 VST T0, (1*16)(res_ptr)
1805 VST T1, (0*16)(res_ptr)
1806 RET
1807
1808 #undef res_ptr
1809 #undef x_ptr
1810 #undef y_ptr
1811 #undef CPOOL
1812
1813 #undef X0
1814 #undef X1
1815 #undef Y0
1816 #undef Y1
1817 #undef T0
1818 #undef T1
1819 #undef P0
1820 #undef P1
1821
1822 // ---------------------------------------
1823 // func p256SqrAsm(res, in1 []byte)
1824 #define res_ptr R1
1825 #define x_ptr R2
1826 #define y_ptr R3
1827 #define CPOOL R4
1828
1829 // Parameters
1830 #define X0 V0
1831 #define X1 V1
1832 #define T0 V4
1833 #define T1 V5
1834
1835 // Constants
1836 #define P0 V30
1837 #define P1 V31
1838 TEXT ·p256SqrAsm(SB), NOSPLIT, $0
1839 MOVD res+0(FP), res_ptr
1840 MOVD in1+24(FP), x_ptr
1841
1842 VL (1*16)(x_ptr), X0
1843 VL (0*16)(x_ptr), X1
1844
1845 MOVD $p256mul<>+0x00(SB), CPOOL
1846 VL 16(CPOOL), P0
1847 VL 0(CPOOL), P1
1848
1849 CALL p256SqrInternal<>(SB)
1850
1851 VST T0, (1*16)(res_ptr)
1852 VST T1, (0*16)(res_ptr)
1853 RET
1854
1855 #undef res_ptr
1856 #undef x_ptr
1857 #undef y_ptr
1858 #undef CPOOL
1859
1860 #undef X0
1861 #undef X1
1862 #undef T0
1863 #undef T1
1864 #undef P0
1865 #undef P1
1866
1867
1868 // Point addition with P2 being an affine point
1869 // If sign == 1 -> P2 = -P2
1870 // If sel == 0 -> P3 = P1
1871 // If zero == 0 -> P3 = P2
1872 // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
1873 #define P3ptr R1
1874 #define P1ptr R2
1875 #define P2ptr R3
1876 #define CPOOL R4
1877
1878 // Temporaries in REGs
1879 #define Y2L V15
1880 #define Y2H V16
1881 #define T1L V17
1882 #define T1H V18
1883 #define T2L V19
1884 #define T2H V20
1885 #define T3L V21
1886 #define T3H V22
1887 #define T4L V23
1888 #define T4H V24
1889
1890 // Temps for Sub and Add
1891 #define TT0 V11
1892 #define TT1 V12
1893 #define T2 V13
1894
1895 // p256MulAsm Parameters
1896 #define X0 V0
1897 #define X1 V1
1898 #define Y0 V2
1899 #define Y1 V3
1900 #define T0 V4
1901 #define T1 V5
1902
1903 #define PL V30
1904 #define PH V31
1905
1906 // Names for zero/sel selects
1907 #define X1L V0
1908 #define X1H V1
1909 #define Y1L V2 // p256MulAsmParmY
1910 #define Y1H V3 // p256MulAsmParmY
1911 #define Z1L V4
1912 #define Z1H V5
1913 #define X2L V0
1914 #define X2H V1
1915 #define Z2L V4
1916 #define Z2H V5
1917 #define X3L V17 // T1L
1918 #define X3H V18 // T1H
1919 #define Y3L V21 // T3L
1920 #define Y3H V22 // T3H
1921 #define Z3L V28
1922 #define Z3H V29
1923
1924 #define ZER V6
1925 #define SEL1 V7
1926 #define CAR1 V8
1927 #define CAR2 V9
1928 /* *
1929 * Three operand formula:
1930 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1931 * T1 = Z1²
1932 * T2 = T1*Z1
1933 * T1 = T1*X2
1934 * T2 = T2*Y2
1935 * T1 = T1-X1
1936 * T2 = T2-Y1
1937 * Z3 = Z1*T1
1938 * T3 = T1²
1939 * T4 = T3*T1
1940 * T3 = T3*X1
1941 * T1 = 2*T3
1942 * X3 = T2²
1943 * X3 = X3-T1
1944 * X3 = X3-T4
1945 * T3 = T3-X3
1946 * T3 = T3*T2
1947 * T4 = T4*Y1
1948 * Y3 = T3-T4
1949
1950 * Three operand formulas, but with MulInternal X,Y used to store temps
1951 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1952 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1953 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1954 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1955 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1956 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1957 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1958 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1959 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1960 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1961 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1962 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1963 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1964 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1965 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1966 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1967 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1968 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1969
1970 */
1971 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
1972 MOVD P3+0(FP), P3ptr
1973 MOVD P1+8(FP), P1ptr
1974 MOVD P2+16(FP), P2ptr
1975
1976 MOVD $p256mul<>+0x00(SB), CPOOL
1977 VL 16(CPOOL), PL
1978 VL 0(CPOOL), PH
1979
1980 // if (sign == 1) {
1981 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
1982 // }
1983
1984 VL 32(P2ptr), Y2H
1985 VL 48(P2ptr), Y2L
1986
1987 VLREPG sign+24(FP), SEL1
1988 VZERO ZER
1989 VCEQG SEL1, ZER, SEL1
1990
1991 VSCBIQ Y2L, PL, CAR1
1992 VSQ Y2L, PL, T1L
1993 VSBIQ PH, Y2H, CAR1, T1H
1994
1995 VSEL Y2L, T1L, SEL1, Y2L
1996 VSEL Y2H, T1H, SEL1, Y2H
1997
1998 /* *
1999 * Three operand formula:
2000 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
2001 */
2002 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
2003 VL 64(P1ptr), X1 // Z1H
2004 VL 80(P1ptr), X0 // Z1L
2005 VLR X0, Y0
2006 VLR X1, Y1
2007 CALL p256SqrInternal<>(SB)
2008
2009 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
2010 VLR T0, X0
2011 VLR T1, X1
2012 CALL p256MulInternal<>(SB)
2013 VLR T0, T2L
2014 VLR T1, T2H
2015
2016 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
2017 VL 0(P2ptr), Y1 // X2H
2018 VL 16(P2ptr), Y0 // X2L
2019 CALL p256MulInternal<>(SB)
2020 VLR T0, T1L
2021 VLR T1, T1H
2022
2023 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
2024 VLR T2L, X0
2025 VLR T2H, X1
2026 VLR Y2L, Y0
2027 VLR Y2H, Y1
2028 CALL p256MulInternal<>(SB)
2029
2030 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
2031 VL 32(P1ptr), Y1H
2032 VL 48(P1ptr), Y1L
2033 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
2034
2035 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
2036 VL 0(P1ptr), X1H
2037 VL 16(P1ptr), X1L
2038 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
2039
2040 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
2041 VL 64(P1ptr), X1 // Z1H
2042 VL 80(P1ptr), X0 // Z1L
2043 CALL p256MulInternal<>(SB)
2044
2045 // VST T1, 64(P3ptr)
2046 // VST T0, 80(P3ptr)
2047 VLR T0, Z3L
2048 VLR T1, Z3H
2049
2050 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
2051 VLR Y0, X0
2052 VLR Y1, X1
2053 CALL p256SqrInternal<>(SB)
2054 VLR T0, X0
2055 VLR T1, X1
2056
2057 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
2058 CALL p256MulInternal<>(SB)
2059 VLR T0, T4L
2060 VLR T1, T4H
2061
2062 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
2063 VL 0(P1ptr), Y1 // X1H
2064 VL 16(P1ptr), Y0 // X1L
2065 CALL p256MulInternal<>(SB)
2066 VLR T0, T3L
2067 VLR T1, T3H
2068
2069 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
2070 p256AddInternal(T1H,T1L, T1,T0,T1,T0)
2071
2072 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
2073 VLR T2L, X0
2074 VLR T2H, X1
2075 VLR T2L, Y0
2076 VLR T2H, Y1
2077 CALL p256SqrInternal<>(SB)
2078
2079 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
2080 p256SubInternal(T1,T0,T1,T0,T1H,T1L)
2081
2082 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
2083 p256SubInternal(T1,T0,T1,T0,T4H,T4L)
2084 VLR T0, X3L
2085 VLR T1, X3H
2086
2087 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
2088 p256SubInternal(X1,X0,T3H,T3L,T1,T0)
2089
2090 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
2091 CALL p256MulInternal<>(SB)
2092 VLR T0, T3L
2093 VLR T1, T3H
2094
2095 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
2096 VLR T4L, X0
2097 VLR T4H, X1
2098 VL 32(P1ptr), Y1 // Y1H
2099 VL 48(P1ptr), Y0 // Y1L
2100 CALL p256MulInternal<>(SB)
2101
2102 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
2103 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
2104
2105 // if (sel == 0) {
2106 // copy(P3.x[:], X1)
2107 // copy(P3.y[:], Y1)
2108 // copy(P3.z[:], Z1)
2109 // }
2110
2111 VL 0(P1ptr), X1H
2112 VL 16(P1ptr), X1L
2113
2114 // Y1 already loaded, left over from addition
2115 VL 64(P1ptr), Z1H
2116 VL 80(P1ptr), Z1L
2117
2118 VLREPG sel+32(FP), SEL1
2119 VZERO ZER
2120 VCEQG SEL1, ZER, SEL1
2121
2122 VSEL X1L, X3L, SEL1, X3L
2123 VSEL X1H, X3H, SEL1, X3H
2124 VSEL Y1L, Y3L, SEL1, Y3L
2125 VSEL Y1H, Y3H, SEL1, Y3H
2126 VSEL Z1L, Z3L, SEL1, Z3L
2127 VSEL Z1H, Z3H, SEL1, Z3H
2128
2129 // if (zero == 0) {
2130 // copy(P3.x[:], X2)
2131 // copy(P3.y[:], Y2)
2132 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2133 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
2134 // }
2135 VL 0(P2ptr), X2H
2136 VL 16(P2ptr), X2L
2137
2138 // Y2 already loaded
2139 VL 128(CPOOL), Z2H
2140 VL 144(CPOOL), Z2L
2141
2142 VLREPG zero+40(FP), SEL1
2143 VZERO ZER
2144 VCEQG SEL1, ZER, SEL1
2145
2146 VSEL X2L, X3L, SEL1, X3L
2147 VSEL X2H, X3H, SEL1, X3H
2148 VSEL Y2L, Y3L, SEL1, Y3L
2149 VSEL Y2H, Y3H, SEL1, Y3H
2150 VSEL Z2L, Z3L, SEL1, Z3L
2151 VSEL Z2H, Z3H, SEL1, Z3H

	// All done, store out the result!!!
	VST X3H, 0(P3ptr)
	VST X3L, 16(P3ptr)
	VST Y3H, 32(P3ptr)
	VST Y3L, 48(P3ptr)
	VST Z3H, 64(P3ptr)
	VST Z3L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// p256PointDoubleAsm(P3, P1 *p256Point)
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define P3ptr R1
#define P1ptr R2
#define CPOOL R4

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
/*
 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
 * B = 2Y₁
 * Z₃ = B×Z₁
 * C = B²
 * D = C×X₁
 * X₃ = A²-2D
 * Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 * T1 = Z1²
 * T2 = X1-T1
 * T1 = X1+T1
 * T2 = T2*T1
 * T2 = 3*T2
 * Y3 = 2*Y1
 * Z3 = Y3*Z1
 * Y3 = Y3²
 * T3 = Y3*X1
 * Y3 = Y3²
 * Y3 = half*Y3
 * X3 = T2²
 * T1 = 2*T3
 * X3 = X3-T1
 * T1 = T3-X3
 * T1 = T1*T2
 * Y3 = T1-Y3
 */
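
// An illustrative math/big transcription of the three-operand schedule above
// (a sketch under stated assumptions, not the implementation: no Montgomery
// form, not constant time). p is the P-256 prime and "half" is (p+1)/2, the
// field element that the HAL step multiplies by; jacobianDouble is a
// hypothetical helper name.
//
//	func jacobianDouble(x1, y1, z1, p *big.Int) (x3, y3, z3 *big.Int) {
//		mod := func(v *big.Int) *big.Int { return v.Mod(v, p) }
//		half := new(big.Int).Rsh(new(big.Int).Add(p, big.NewInt(1)), 1)
//		t1 := mod(new(big.Int).Mul(z1, z1))  // T1 = Z1²
//		t2 := mod(new(big.Int).Sub(x1, t1))  // T2 = X1-T1
//		t1 = mod(t1.Add(x1, t1))             // T1 = X1+T1
//		t2 = mod(t2.Mul(t2, t1))             // T2 = T2*T1
//		t2 = mod(t2.Mul(t2, big.NewInt(3)))  // T2 = 3*T2
//		y3 = mod(new(big.Int).Lsh(y1, 1))    // Y3 = 2*Y1
//		z3 = mod(new(big.Int).Mul(y3, z1))   // Z3 = Y3*Z1
//		y3 = mod(y3.Mul(y3, y3))             // Y3 = Y3²
//		t3 := mod(new(big.Int).Mul(y3, x1))  // T3 = Y3*X1
//		y3 = mod(y3.Mul(y3, y3))             // Y3 = Y3²
//		y3 = mod(y3.Mul(y3, half))           // Y3 = half*Y3
//		x3 = mod(new(big.Int).Mul(t2, t2))   // X3 = T2²
//		t1 = mod(t1.Lsh(t3, 1))              // T1 = 2*T3
//		x3 = mod(x3.Sub(x3, t1))             // X3 = X3-T1
//		t1 = mod(t1.Sub(t3, x3))             // T1 = T3-X3
//		t1 = mod(t1.Mul(t1, t2))             // T1 = T1*T2
//		y3 = mod(y3.Sub(t1, y3))             // Y3 = T1-Y3
//		return
//	}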

TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	VL 64(P1ptr), Y1 // Z1H
	VL 80(P1ptr), Y0 // Z1L
	CALL p256MulInternal<>(SB)
	VST T1, 64(P3ptr)
	VST T0, 80(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VLR T0, X0
	VLR T1, X1
	VL 0(P1ptr), Y1
	VL 16(P1ptr), Y0
	CALL p256MulInternal<>(SB)
	VLR T0, T3L
	VLR T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VLR T2L, X0
	VLR T2H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256SqrInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
	VST X3H, 0(P3ptr)
	VST X3L, 16(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	VST Y3H, 32(P3ptr)
	VST Y3L, 48(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// p256PointAddAsm(P3, P1, P2 *p256Point) int
// Returns 1 (via ret+24(FP)) when the two input points are equal, as detected
// by the H and R zero checks below; returns 0 otherwise.
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3
#define CPOOL R4
#define ISZERO R5
#define TRUE R6

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H = X2*T1
 * H = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg; could overwrite Z1 if the slices share a backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R = Z1*T1
 * R = Y2*R
 * R = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	// X=X2; Y- ; MUL; H=T // H = X2*T1
	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	// SUB(H<H-T) // H = H-U1
	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	// X=T ; Y=H ; MUL; Z3:=T // Z3 = Z3*H << store-out Z3 result reg; could overwrite Z1 if the slices share a backing array
	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	// SUB(R<T-S1) // R = R-S1
	// X=H ; Y=H ; MUL; T- // T1 = H*H
	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	// X=R ; Y=R ; MUL; T- // X3 = R*R
	// SUB(T<T-T2) // X3 = X3-T2
	// ADD(X<U1+U1) // T1 = 2*U1
	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	// SUB(Y<U1-T) // Y3 = U1-X3
	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
 */
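
// An illustrative math/big transcription of the add-1998-cmo-2 schedule above
// (a sketch, not the implementation: no Montgomery form, not constant time,
// and it does not handle the degenerate equal-points case that the assembly
// reports through its return value). jacobianAdd is a hypothetical helper
// name; p is the P-256 prime.
//
//	func jacobianAdd(x1, y1, z1, x2, y2, z2, p *big.Int) (x3, y3, z3 *big.Int) {
//		mod := func(v *big.Int) *big.Int { return v.Mod(v, p) }
//		t1 := mod(new(big.Int).Mul(z1, z1))  // T1 = Z1*Z1
//		t2 := mod(new(big.Int).Mul(z2, z2))  // T2 = Z2*Z2
//		u1 := mod(new(big.Int).Mul(x1, t2))  // U1 = X1*T2
//		h := mod(new(big.Int).Mul(x2, t1))   // H = X2*T1
//		h = mod(h.Sub(h, u1))                // H = H-U1
//		z3 = mod(new(big.Int).Mul(z1, z2))   // Z3 = Z1*Z2
//		z3 = mod(z3.Mul(z3, h))              // Z3 = Z3*H
//		s1 := mod(new(big.Int).Mul(z2, t2))  // S1 = Z2*T2
//		s1 = mod(s1.Mul(y1, s1))             // S1 = Y1*S1
//		r := mod(new(big.Int).Mul(z1, t1))   // R = Z1*T1
//		r = mod(r.Mul(y2, r))                // R = Y2*R
//		r = mod(r.Sub(r, s1))                // R = R-S1
//		t1 = mod(t1.Mul(h, h))               // T1 = H*H
//		t2 = mod(t2.Mul(h, t1))              // T2 = H*T1
//		u1 = mod(u1.Mul(u1, t1))             // U1 = U1*T1
//		x3 = mod(new(big.Int).Mul(r, r))     // X3 = R*R
//		x3 = mod(x3.Sub(x3, t2))             // X3 = X3-T2
//		t1 = mod(t1.Lsh(u1, 1))              // T1 = 2*U1
//		x3 = mod(x3.Sub(x3, t1))             // X3 = X3-T1
//		t2 = mod(t2.Mul(s1, t2))             // T2 = S1*T2
//		y3 = mod(new(big.Int).Sub(u1, x3))   // Y3 = U1-X3
//		y3 = mod(y3.Mul(r, y3))              // Y3 = R*Y3
//		y3 = mod(y3.Sub(y3, t2))             // Y3 = Y3-T2
//		return
//	}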
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr
	MOVD P2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL 16(CPOOL), PL
	VL 0(CPOOL), PH

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, RL
	VLR T1, RH

	// X=X2; Y- ; MUL; H=T // H = X2*T1
	VL 0(P2ptr), X1 // X2H
	VL 16(P2ptr), X0 // X2L
	CALL p256MulInternal<>(SB)
	VLR T0, HL
	VLR T1, HH

	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	VL 64(P2ptr), X1 // Z2H
	VL 80(P2ptr), X0 // Z2L
	VLR X0, Y0
	VLR X1, Y1
	CALL p256SqrInternal<>(SB)

	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	VL 0(P1ptr), X1 // X1H
	VL 16(P1ptr), X0 // X1L
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// SUB(H<H-T) // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO HL, HH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX HL, PL, T1L
	VX HH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	MOVD ISZERO, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	VL 64(P1ptr), X1 // Z1H
	VL 80(P1ptr), X0 // Z1L
	VL 64(P2ptr), Y1 // Z2H
	VL 80(P2ptr), Y0 // Z2L
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T // Z3 = Z3*H
	VLR T0, X0
	VLR T1, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256MulInternal<>(SB)
	VST T1, 64(P3ptr)
	VST T0, 80(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	VL 32(P1ptr), X1
	VL 48(P1ptr), X0
	VLR S1L, Y0
	VLR S1H, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, S1L
	VLR T1, S1H

	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	VL 32(P2ptr), X1
	VL 48(P2ptr), X0
	VLR RL, Y0
	VLR RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1) // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	MOVD $0, ISZERO
	MOVD $1, TRUE
	VZERO ZER
	VO RL, RH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX RL, PL, T1L
	VX RH, PH, T1H
	VO T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	AND ret+24(FP), ISZERO
	MOVD ISZERO, ret+24(FP)
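
	// In Go terms (h, r, p as math/big integers, for illustration only): the
	// two masked checks above and the AND make ret 1 exactly when both H and
	// R reduce to zero mod P256, where either the value 0 or the value P
	// counts as zero. That is the degenerate case of the add-1998-cmo-2
	// formula, i.e. the two input points are equal, and the caller is
	// expected to fall back to doubling.
	//
	//	isZeroModP := func(v *big.Int) bool { return v.Sign() == 0 || v.Cmp(p) == 0 }
	//	ret := 0
	//	if isZeroModP(h) && isZeroModP(r) {
	//		ret = 1
	//	}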

	// X=H ; Y=H ; MUL; T- // T1 = H*H
	VLR HL, X0
	VLR HH, X1
	VLR HL, Y0
	VLR HH, Y1
	CALL p256SqrInternal<>(SB)

	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	VLR T0, Y0
	VLR T1, Y1
	CALL p256MulInternal<>(SB)
	VLR T0, T2L
	VLR T1, T2H

	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	VLR U1L, X0
	VLR U1H, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=R ; Y=R ; MUL; T- // X3 = R*R
	VLR RL, X0
	VLR RH, X1
	VLR RL, Y0
	VLR RH, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(T<T-T2) // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1) // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	VST T1, 0(P3ptr)
	VST T0, 16(P3ptr)

	// SUB(Y<U1-T) // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	VLR RL, X0
	VLR RH, X1
	CALL p256MulInternal<>(SB)
	VLR T0, U1L
	VLR T1, U1H

	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	VLR S1L, X0
	VLR S1H, X1
	VLR T2L, Y0
	VLR T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	VST T1, 32(P3ptr)
	VST T0, 48(P3ptr)

	RET
