1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6
// This is a port of the s390x asm implementation
// to ppc64le.
9
10 // Some changes were needed due to differences in
11 // the Go opcodes and/or available instructions
12 // between s390x and ppc64le.
13
14 // 1. There were operand order differences in the
15 // VSUBUQM, VSUBCUQ, and VSEL instructions.
16
17 // 2. ppc64 does not have a multiply high and low
18 // like s390x, so those were implemented using
19 // macros to compute the equivalent values.
20
21 // 3. The LVX, STVX instructions on ppc64 require
22 // 16 byte alignment of the data. To avoid that
23 // requirement, data is loaded using LXVD2X and
24 // STXVD2X with VPERM to reorder bytes correctly.
25
26 // I have identified some areas where I believe
27 // changes would be needed to make this work for big
28 // endian; however additional changes beyond what I
29 // have noted are most likely needed to make it work.
30 // - The string used with VPERM to swap the byte order
31 // for loads and stores.
32 // - The EXTRACT_HI and EXTRACT_LO strings.
33 // - The constants that are loaded from CPOOL.
34 //
35
36 // Permute string used by VPERM to reorder bytes
37 // loaded or stored using LXVD2X or STXVD2X
38 // on little endian.
DATA byteswap<>+0(SB)/8, $0x08090a0b0c0d0e0f // permute indexes for the high doubleword
DATA byteswap<>+8(SB)/8, $0x0001020304050607 // permute indexes for the low doubleword
41
42 // The following constants are defined in an order
43 // that is correct for use with LXVD2X/STXVD2X
44 // on little endian.
// p256<>: the P256 prime followed by VPERM selector strings used by
// the reduction rounds in p256FromMont.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
// p256mul<>: the P256 prime (both orderings), the VPERM selectors used
// by p256MulInternal, and (1*2^256) mod P256.
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
75
76 // The following are used with VPERM to extract the high and low
77 // values from the intermediate results of a vector multiply.
78 // They are used in the VMULTxxx macros. These have been tested
79 // only on little endian, I think they would have to be different
80 // for big endian.
// Byte indexes that pick the low (or high) 32-bit word out of each
// 64-bit even/odd product pair produced by VMULEUW/VMULOUW.
DATA p256permhilo<>+0x00(SB)/8, $0x0405060714151617 // least significant
DATA p256permhilo<>+0x08(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA p256permhilo<>+0x10(SB)/8, $0x0001020310111213 // most significant
DATA p256permhilo<>+0x18(SB)/8, $0x08090a0b18191A1B
85
// External declarations for constants.
// RODATA (== 8, see textflag.h) is used symbolically for consistency
// with the byteswap<> declaration below.
// NOTE(review): p256ord<> is declared here but its DATA values are not
// in this chunk — confirm they are defined elsewhere in the file.
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256<>(SB), RODATA, $80
GLOBL p256mul<>(SB), RODATA, $160
GLOBL p256permhilo<>(SB), RODATA, $32
GLOBL byteswap<>+0(SB), RODATA, $16
92
93 // The following macros are used to implement the ppc64le
94 // equivalent function from the corresponding s390x
95 // instruction for vector multiply high, low, and add,
96 // since there aren't exact equivalent instructions.
97 // The corresponding s390x instructions appear in the
98 // comments.
99 // Implementation for big endian would have to be
100 // investigated, I think it would be different.
101 //
102 // Vector multiply low word
103 //
104 // VMLF x0, x1, out_low
#define VMULT_LOW(x1, x2, out_low) \
VMULUWM x1, x2, out_low

//
// Vector multiply high word
// Clobbers TMP1, TMP2. EXTRACT_HI must already hold the permute
// string loaded from p256permhilo<>.
//
// VMLHF x0, x1, out_hi
#define VMULT_HI(x1, x2, out_hi) \
VMULEUW x1, x2, TMP1; \
VMULOUW x1, x2, TMP2; \
VPERM TMP1, TMP2, EXTRACT_HI, out_hi

//
// Vector multiply word
// Clobbers TMP1, TMP2; needs EXTRACT_LO/EXTRACT_HI loaded.
//
// VMLF x0, x1, out_low
// VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
VMULEUW x1, x2, TMP1; \
VMULOUW x1, x2, TMP2; \
VPERM TMP1, TMP2, EXTRACT_LO, out_low; \
VPERM TMP1, TMP2, EXTRACT_HI, out_hi

//
// Vector multiply add word: x1*x2 + y.
// y is widened to 64-bit lanes by multiplying with a vector of 1s
// (VSPLTISW $1) so it can be added to the doubleword products.
// Clobbers TMP1, TMP2; needs EXTRACT_LO/EXTRACT_HI loaded.
//
// VMALF x0, x1, y, out_low
// VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, out_low, out_hi) \
VSPLTISW $1, TMP1; \
VMULEUW y, TMP1, TMP2; \
VMULOUW y, TMP1, TMP1; \
VMULEUW x1, x2, out_low; \
VMULOUW x1, x2, out_hi; \
VADDUDM TMP1, out_hi, TMP1; \
VADDUDM TMP2, out_low, TMP2; \
VPERM TMP2, TMP1, EXTRACT_LO, out_low; \
VPERM TMP2, TMP1, EXTRACT_HI, out_hi

//
// Vector multiply add high word
// NOTE(review): out_low is only used as scratch here — only out_hi is
// produced (there is no EXTRACT_LO permute). Not referenced in this
// file chunk.
//
// VMALF x0, x1, y, out_low
// VMALHF x0, x1, y, out_hi
#define VMULT_ADD_HI(x1, x2, y, out_low, out_hi) \
VSPLTISW $1, TMP1; \
VMULOUW y, TMP1, TMP2; \
VMULEUW y, TMP1, TMP1; \
VMULEUW x1, x2, out_hi; \
VMULOUW x1, x2, out_low; \
VADDUDM TMP1, out_hi, TMP1; \
VADDUDM TMP2, out_low, TMP2; \
VPERM TMP2, TMP1, EXTRACT_HI, out_hi

//
// Vector multiply add low word: out_low = (x1*x2 mod 2^32) + y per lane.
//
// VMALF s0, x1, y, out_low
#define VMULT_ADD_LOW(x1, x2, y, out_low) \
VMULUWM x1, x2, out_low; \
VADDUWM out_low, y, out_low
166
// (Removed dead leftovers: res_ptr/a_ptr were defined and immediately
// undefined without any use in between.)
172
// func p256NegCond(val *p256Point, cond int)
#define P1ptr R3
#define CPOOL R7

#define Y1L V0
#define Y1L_ VS32
#define Y1H V1
#define Y1H_ VS33
#define T1L V2
#define T1L_ VS34
#define T1H V3
#define T1H_ VS35

#define SWAP V28
#define SWAP_ VS60

#define PL V30
#define PL_ VS62
#define PH V31
#define PH_ VS63

#define SEL1 V5
#define SEL1_ VS37
#define CAR1 V6
//
// iff cond == 1 val <- -val
// Only the y coordinate is touched: y <- P256 - y.
// (Offsets 32/48 assume a p256Point layout of x||y||z, 32 bytes each —
// TODO confirm against the Go-side struct.)
//
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
MOVD val+0(FP), P1ptr
MOVD $16, R16
MOVD $32, R17 // offset of the low half of y
MOVD $48, R18 // offset of the high half of y

MOVD cond+8(FP), R6
CMP $0, R6
BC 12, 2, LR // just return if cond == 0 (branch to LR on EQ)

MOVD $p256mul<>+0x00(SB), CPOOL

MOVD $byteswap<>+0x00(SB), R8
LXVD2X (R8)(R0), SWAP_

LXVD2X (P1ptr)(R17), Y1L_
LXVD2X (P1ptr)(R18), Y1H_

// Put the loaded doublewords into true little endian order.
VPERM Y1H, Y1H, SWAP, Y1H
VPERM Y1L, Y1L, SWAP, Y1L

// PL||PH = P256 from the p256mul constant pool.
LXVD2X (CPOOL)(R0), PL_
LXVD2X (CPOOL)(R16), PH_

VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry
VSUBUQM PL, Y1L, T1L // subtract part2 giving result
VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

// Swap back to the STXVD2X byte order before storing.
VPERM T1H, T1H, SWAP, T1H
VPERM T1L, T1L, SWAP, T1L

STXVD2X T1L_, (R17+P1ptr)
STXVD2X T1H_, (R18+P1ptr)
RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1L_
#undef Y1H
#undef Y1H_
#undef T1L
#undef T1L_
#undef T1H
#undef T1H_
#undef PL
#undef PL_
#undef PH
#undef PH_
#undef SEL1
#undef SEL1_
#undef CAR1
253
254 //
255 // if cond == 0 res <-b else res <-a
256 //
257 // func p256MovCond(res, a, b *p256Point, cond int)
258 #define P3ptr R3
259 #define P1ptr R4
260 #define P2ptr R5
261
262 #define FROMptr R7
263 #define X1L V0
264 #define X1H V1
265 #define Y1L V2
266 #define Y1H V3
267 #define Z1L V4
268 #define Z1H V5
269 #define X1L_ VS32
270 #define X1H_ VS33
271 #define Y1L_ VS34
272 #define Y1H_ VS35
273 #define Z1L_ VS36
274 #define Z1H_ VS37
275
276 // This function uses LXVD2X and STXVD2X to avoid the
277 // data alignment requirement for LVX, STVX. Since
278 // this code is just moving bytes and not doing arithmetic,
279 // order of the bytes doesn't matter.
280 //
281 TEXT ·p256MovCond(SB), NOSPLIT, $0-32
282 MOVD res+0(FP), P3ptr
283 MOVD a+8(FP), P1ptr
284 MOVD b+16(FP), P2ptr
285 MOVD cond+24(FP), R6
286 MOVD $16, R16
287 MOVD $32, R17
288 MOVD $48, R18
289 MOVD $56, R21
290 MOVD $64, R19
291 MOVD $80, R20
292
293 // Check the condition
294 CMP $0, R6
295
296 // If 0, use b as the source
297 BEQ FROMB
298
299 // Not 0, use a as the source
300 MOVD P1ptr, FROMptr
301 BR LOADVALS
302
303 FROMB:
304 MOVD P2ptr, FROMptr
305
306 LOADVALS:
307 // Load from a or b depending on the setting
308 // of FROMptr
309 LXVW4X (FROMptr+R0), X1H_
310 LXVW4X (FROMptr+R16), X1L_
311 LXVW4X (FROMptr+R17), Y1H_
312 LXVW4X (FROMptr+R18), Y1L_
313 LXVW4X (FROMptr+R19), Z1H_
314 LXVW4X (FROMptr+R20), Z1L_
315
316 STXVW4X X1H_, (P3ptr+R0)
317 STXVW4X X1L_, (P3ptr+R16)
318 STXVW4X Y1H_, (P3ptr+R17)
319 STXVW4X Y1L_, (P3ptr+R18)
320 STXVW4X Z1H_, (P3ptr+R19)
321 STXVW4X Z1L_, (P3ptr+R20)
322
323 RET
324
325 #undef P3ptr
326 #undef P1ptr
327 #undef P2ptr
328 #undef FROMptr
329 #undef X1L
330 #undef X1H
331 #undef Y1L
332 #undef Y1H
333 #undef Z1L
334 #undef Z1H
335 #undef X1L_
336 #undef X1H_
337 #undef Y1L_
338 #undef Y1H_
339 #undef Z1L_
340 #undef Z1H_
341 //
342 // Select the point from the table for idx
343 //
344 // func p256Select(point *p256Point, table []p256Point, idx int)
345 #define P3ptr R3
346 #define P1ptr R4
347 #define COUNT R5
348
349 #define X1L V0
350 #define X1H V1
351 #define Y1L V2
352 #define Y1H V3
353 #define Z1L V4
354 #define Z1H V5
355 #define X1L_ VS32
356 #define X1H_ VS33
357 #define Y1L_ VS34
358 #define Y1H_ VS35
359 #define Z1L_ VS36
360 #define Z1H_ VS37
361 #define X2L V6
362 #define X2H V7
363 #define Y2L V8
364 #define Y2H V9
365 #define Z2L V10
366 #define Z2H V11
367 #define X2L_ VS38
368 #define X2H_ VS39
369 #define Y2L_ VS40
370 #define Y2H_ VS41
371 #define Z2L_ VS42
372 #define Z2H_ VS43
373
374 #define ONE V18
375 #define IDX V19
376 #define SEL1 V20
377 #define SEL1_ VS52
378 #define SEL2 V21
379 //
380 TEXT ·p256Select(SB), NOSPLIT, $0-40
381 MOVD point+0(FP), P3ptr
382 MOVD table+8(FP), P1ptr
383 MOVD $16, R16
384 MOVD $32, R17
385 MOVD $48, R18
386 MOVD $64, R19
387 MOVD $80, R20
388
389 LXVDSX (R1)(R19), SEL1_ // VLREPG idx+32(FP), SEL1
390 VSPLTB $7, SEL1, IDX // splat byte
391 VSPLTISB $1, ONE // VREPIB $1, ONE
392 VSPLTISB $1, SEL2 // VREPIB $1, SEL2
393 MOVD $17, COUNT
394 MOVD COUNT, CTR // set up ctr
395
396 VSPLTISB $0, X1H // VZERO X1H
397 VSPLTISB $0, X1L // VZERO X1L
398 VSPLTISB $0, Y1H // VZERO Y1H
399 VSPLTISB $0, Y1L // VZERO Y1L
400 VSPLTISB $0, Z1H // VZERO Z1H
401 VSPLTISB $0, Z1L // VZERO Z1L
402
403 loop_select:
404
405 // LVXD2X is used here since data alignment doesn't
406 // matter.
407
408 LXVD2X (P1ptr+R0), X2H_
409 LXVD2X (P1ptr+R16), X2L_
410 LXVD2X (P1ptr+R17), Y2H_
411 LXVD2X (P1ptr+R18), Y2L_
412 LXVD2X (P1ptr+R19), Z2H_
413 LXVD2X (P1ptr+R20), Z2L_
414
415 VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK
416
417 // This will result in SEL1 being all 0s or 1s, meaning
418 // the result is either X1L or X2L, no individual byte
419 // selection.
420
421 VSEL X1L, X2L, SEL1, X1L
422 VSEL X1H, X2H, SEL1, X1H
423 VSEL Y1L, Y2L, SEL1, Y1L
424 VSEL Y1H, Y2H, SEL1, Y1H
425 VSEL Z1L, Z2L, SEL1, Z1L
426 VSEL Z1H, Z2H, SEL1, Z1H
427
428 // Add 1 to all bytes in SEL2
429 VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
430 ADD $96, P1ptr
431 BC 16, 0, loop_select
432
433 // STXVD2X is used here so that alignment doesn't
434 // need to be verified. Since values were loaded
435 // using LXVD2X this is OK.
436 STXVD2X X1H_, (P3ptr+R0)
437 STXVD2X X1L_, (P3ptr+R16)
438 STXVD2X Y1H_, (P3ptr+R17)
439 STXVD2X Y1L_, (P3ptr+R18)
440 STXVD2X Z1H_, (P3ptr+R19)
441 STXVD2X Z1L_, (P3ptr+R20)
442 RET
443
444 #undef P3ptr
445 #undef P1ptr
446 #undef COUNT
447 #undef X1L
448 #undef X1H
449 #undef Y1L
450 #undef Y1H
451 #undef Z1L
452 #undef Z1H
453 #undef X2L
454 #undef X2H
455 #undef Y2L
456 #undef Y2H
457 #undef Z2L
458 #undef Z2H
459 #undef X2L_
460 #undef X2H_
461 #undef Y2L_
462 #undef Y2H_
463 #undef Z2L_
464 #undef Z2H_
465 #undef ONE
466 #undef IDX
467 #undef SEL1
468 #undef SEL1_
469 #undef SEL2
470
// func p256SelectBase(point, table []uint64, idx int)
#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define X2L_ VS38
#define X2H_ VS39
#define Y2L_ VS40
#define Y2H_ VS41
#define Z2L_ VS42
#define Z2H_ VS43

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL1_ VS52
#define SEL2 V21
// Constant-time lookup into the 64-entry precomputed base table;
// structure mirrors p256Select above.
TEXT ·p256SelectBase(SB), NOSPLIT, $0-40
MOVD point+0(FP), P3ptr
MOVD table+8(FP), P1ptr
// (A dead "MOVD $56, R21" was removed; R21 was never used.)
MOVD $16, R16
MOVD $32, R17
MOVD $48, R18
MOVD $64, R19
MOVD $80, R20

LXVDSX (R1)(R19), SEL1_
VSPLTB $7, SEL1, IDX // splat byte

VSPLTISB $1, ONE // Vector with byte 1s
VSPLTISB $1, SEL2 // Vector with byte 1s
// FIX: was $65 — SEL2 counts 1..64, so 64 iterations cover every valid
// idx; a 65th iteration read 96 bytes past the 64-entry table.
MOVD $64, COUNT
MOVD COUNT, CTR // loop count

VSPLTISB $0, X1H // VZERO X1H
VSPLTISB $0, X1L // VZERO X1L
VSPLTISB $0, Y1H // VZERO Y1H
VSPLTISB $0, Y1L // VZERO Y1L
VSPLTISB $0, Z1H // VZERO Z1H
VSPLTISB $0, Z1L // VZERO Z1L

loop_select:
LXVD2X (P1ptr+R0), X2H_
LXVD2X (P1ptr+R16), X2L_
LXVD2X (P1ptr+R17), Y2H_
LXVD2X (P1ptr+R18), Y2L_
LXVD2X (P1ptr+R19), Z2H_
LXVD2X (P1ptr+R20), Z2L_

VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

VSEL X1L, X2L, SEL1, X1L // Select if idx matched
VSEL X1H, X2H, SEL1, X1H
VSEL Y1L, Y2L, SEL1, Y1L
VSEL Y1H, Y2H, SEL1, Y1H
VSEL Z1L, Z2L, SEL1, Z1L
VSEL Z1H, Z2H, SEL1, Z1H

VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
ADD $96, P1ptr // Next chunk
BC 16, 0, loop_select

STXVD2X X1H_, (P3ptr+R0)
STXVD2X X1L_, (P3ptr+R16)
STXVD2X Y1H_, (P3ptr+R17)
STXVD2X Y1L_, (P3ptr+R18)
STXVD2X Z1H_, (P3ptr+R19)
STXVD2X Z1L_, (P3ptr+R20)
RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef X1L_
#undef X1H_
#undef X2L_
#undef X2H_
#undef Y1L_
#undef Y1H_
#undef Y2L_
#undef Y2H_
#undef Z1L_
#undef Z1H_
#undef Z2L_
#undef Z2H_
#undef ONE
#undef IDX
#undef SEL1
#undef SEL1_
#undef SEL2
#undef SWAP
#undef SWAP_
588
589 // ---------------------------------------
590 // func p256FromMont(res, in []byte)
591 #define res_ptr R3
592 #define x_ptr R4
593 #define CPOOL R7
594
595 #define T0 V0
596 #define T0_ VS32
597 #define T1 V1
598 #define T1_ VS33
599 #define T2 V2
600 #define TT0 V3
601 #define TT1 V4
602 #define TT0_ VS35
603 #define TT1_ VS36
604
605 #define ZER V6
606 #define SEL1 V7
607 #define SEL1_ VS39
608 #define SEL2 V8
609 #define SEL2_ VS40
610 #define CAR1 V9
611 #define CAR2 V10
612 #define RED1 V11
613 #define RED2 V12
614 #define PL V13
615 #define PL_ VS45
616 #define PH V14
617 #define PH_ VS46
618 #define SWAP V28
619 #define SWAP_ VS57
620
621 TEXT ·p256FromMont(SB), NOSPLIT, $0-48
622 MOVD res+0(FP), res_ptr
623 MOVD in+24(FP), x_ptr
624
625 MOVD $16, R16
626 MOVD $32, R17
627 MOVD $48, R18
628 MOVD $64, R19
629 MOVD $p256<>+0x00(SB), CPOOL
630 MOVD $byteswap<>+0x00(SB), R15
631
632 VSPLTISB $0, T2 // VZERO T2
633 VSPLTISB $0, ZER // VZERO ZER
634
635 // Constants are defined so that the LXVD2X is correct
636 LXVD2X (CPOOL+R0), PH_
637 LXVD2X (CPOOL+R16), PL_
638
639 // VPERM byte selections
640 LXVD2X (CPOOL+R18), SEL2_
641 LXVD2X (CPOOL+R19), SEL1_
642
643 LXVD2X (R15)(R0), SWAP_
644
645 LXVD2X (R16)(x_ptr), T1_
646 LXVD2X (R0)(x_ptr), T0_
647
648 // Put in true little endian order
649 VPERM T0, T0, SWAP, T0
650 VPERM T1, T1, SWAP, T1
651
652 // First round
653 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
654 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
655 VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow
656
657 VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
658 VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
659
660 VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
661 VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
662 VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
663 VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
664 VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
665
666 // Second round
667 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
668 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
669 VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow
670
671 VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
672 VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
673
674 VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
675 VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
676 VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
677 VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
678 VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
679
680 // Third round
681 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
682 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
683 VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow
684
685 VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
686 VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
687
688 VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
689 VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
690 VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
691 VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
692 VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
693
694 // Last round
695 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
696 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
697 VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow
698
699 VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
700 VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
701
702 VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
703 VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
704 VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
705 VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
706 VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2
707
708 // ---------------------------------------------------
709
710 VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1
711 VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0
712 VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
713 VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1
714 VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2
715
716 VSEL TT0, T0, T2, T0
717 VSEL TT1, T1, T2, T1
718
719 // Reorder the bytes so STXVD2X can be used.
720 // TT0, TT1 used for VPERM result in case
721 // the caller expects T0, T1 to be good.
722 VPERM T0, T0, SWAP, TT0
723 VPERM T1, T1, SWAP, TT1
724
725 STXVD2X TT0_, (R0)(res_ptr)
726 STXVD2X TT1_, (R16)(res_ptr)
727 RET
728
729 #undef res_ptr
730 #undef x_ptr
731 #undef CPOOL
732 #undef T0
733 #undef T0_
734 #undef T1
735 #undef T1_
736 #undef T2
737 #undef TT0
738 #undef TT1
739 #undef ZER
740 #undef SEL1
741 #undef SEL1_
742 #undef SEL2
743 #undef SEL2_
744 #undef CAR1
745 #undef CAR2
746 #undef RED1
747 #undef RED2
748 #undef PL
749 #undef PL_
750 #undef PH
751 #undef PH_
752 #undef SWAP
753 #undef SWAP_
754
755 // ---------------------------------------
756 // p256MulInternal
757 // V0-V3 V30,V31 - Not Modified
758 // V4-V15 V27-V29 - Volatile
759
#define CPOOL R7

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4 // Result
#define T1 V5 // Result
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG V6 // Overloaded with CAR1, SEL4
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3 V9 // Overloaded with SEL2,SEL5
#define ADD4 V10 // Overloaded with SEL3,SEL6
#define RED1 V11 // Overloaded with CAR2
#define RED2 V12
#define RED3 V13 // Overloaded with SEL1
#define T2 V14
// Overloaded temporaries
#define ADD1 V4 // Overloaded with T0
#define ADD2 V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER V28 // Overloaded with EXTRACT_LO; reload EXTRACT_LO after zeroing
#define CAR1 V6 // Overloaded with YDIG
#define CAR2 V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR1
#define SEL5 V9 // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3
// VSX aliases of the selects above: VSR(n+32) == VR(n).
#define SEL1_ VS45
#define SEL2_ VS41
#define SEL3_ VS42
#define SEL4_ VS38
#define SEL5_ VS41
#define SEL6_ VS42

// TMP1, TMP2, EXTRACT_LO, EXTRACT_HI used in
// VMULT macros
#define TMP1 V13 // Overloaded with RED3, SEL1
#define TMP2 V27
#define EVENODD R5
#define EXTRACT_LO V28
#define EXTRACT_LO_ VS60
#define EXTRACT_HI V29
#define EXTRACT_HI_ VS61
813
814 /* *
815 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
816 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
817 * With you, SIMD be...
818 *
819 * +--------+--------+
820 * +--------| RED2 | RED1 |
821 * | +--------+--------+
822 * | ---+--------+--------+
823 * | +---- T2| T1 | T0 |--+
824 * | | ---+--------+--------+ |
825 * | | |
826 * | | ======================= |
827 * | | |
828 * | | +--------+--------+<-+
829 * | +-------| ADD2 | ADD1 |--|-----+
830 * | | +--------+--------+ | |
831 * | | +--------+--------+<---+ |
832 * | | | ADD2H | ADD1H |--+ |
833 * | | +--------+--------+ | |
834 * | | +--------+--------+<-+ |
835 * | | | ADD4 | ADD3 |--|-+ |
836 * | | +--------+--------+ | | |
837 * | | +--------+--------+<---+ | |
838 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
839 * | | +--------+--------+ | | V
840 * | | ------------------------ | | +--------+
841 * | | | | | RED3 | [d0 0 0 d0]
842 * | | | | +--------+
843 * | +---->+--------+--------+ | | |
844 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
845 * | +--------+--------+ | | |
846 * +---->---+--------+--------+ | | |
847 * T2| T1 | T0 |----+ | |
848 * ---+--------+--------+ | | |
849 * ---+--------+--------+<---+ | |
850 * +--- T2| T1 | T0 |----------+
851 * | ---+--------+--------+ | |
852 * | +--------+--------+<-------------+
853 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
854 * | +--------+--------+ | | |
855 * | +--------+<----------------------+
856 * | | RED3 |--------------+ | [0 0 d1 d0]
857 * | +--------+ | |
858 * +--->+--------+--------+ | |
859 * | T1 | T0 |--------+
860 * +--------+--------+ | |
861 * --------------------------- | |
862 * | |
863 * +--------+--------+<----+ |
864 * | RED2 | RED1 | |
865 * +--------+--------+ |
866 * ---+--------+--------+<-------+
867 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
868 * ---+--------+--------+
869 *
870 * *Mi obra de arte de siglo XXI @vpaprots
871 *
872 *
873 * First group is special, doesn't get the two inputs:
874 * +--------+--------+<-+
875 * +-------| ADD2 | ADD1 |--|-----+
876 * | +--------+--------+ | |
877 * | +--------+--------+<---+ |
878 * | | ADD2H | ADD1H |--+ |
879 * | +--------+--------+ | |
880 * | +--------+--------+<-+ |
881 * | | ADD4 | ADD3 |--|-+ |
882 * | +--------+--------+ | | |
883 * | +--------+--------+<---+ | |
884 * | | ADD4H | ADD3H |------|-+ |(+vzero)
885 * | +--------+--------+ | | V
886 * | ------------------------ | | +--------+
887 * | | | | RED3 | [d0 0 0 d0]
888 * | | | +--------+
889 * +---->+--------+--------+ | | |
890 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
891 * +--------+--------+ | | |
892 * ---+--------+--------+<---+ | |
893 * +--- T2| T1 | T0 |----------+
894 * | ---+--------+--------+ | |
895 * | +--------+--------+<-------------+
896 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
897 * | +--------+--------+ | | |
898 * | +--------+<----------------------+
899 * | | RED3 |--------------+ | [0 0 d1 d0]
900 * | +--------+ | |
901 * +--->+--------+--------+ | |
902 * | T1 | T0 |--------+
903 * +--------+--------+ | |
904 * --------------------------- | |
905 * | |
906 * +--------+--------+<----+ |
907 * | RED2 | RED1 | |
908 * +--------+--------+ |
909 * ---+--------+--------+<-------+
910 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
911 * ---+--------+--------+
912 *
913 * Last 'group' needs to RED2||RED1 shifted less
914 */
// p256MulInternal: T1||T0 = result of multiplying X1||X0 by Y1||Y0 with
// word-wise P256 reduction (NOTE(review): this mirrors the s390x
// Montgomery multiply — i.e. x*y*2^-256 mod P256 — confirm against the
// s390x original). CPOOL must already point at p256mul<>; P1||P0 hold
// the prime. X, Y, P are not modified.
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
// CPOOL loaded from caller
MOVD $16, R16
MOVD $32, R17
MOVD $48, R18
MOVD $64, R19
MOVD $80, R20
MOVD $96, R21
MOVD $112, R22

MOVD $p256permhilo<>+0x00(SB), EVENODD

// These values are used by the VMULTxxx macros to
// extract the high and low portions of the intermediate
// result.
LXVD2X (R0)(EVENODD), EXTRACT_LO_
LXVD2X (R16)(EVENODD), EXTRACT_HI_

// ---------------------------------------------------

VSPLTW $3, Y0, YDIG // VREPF Y0 is input

// VMLHF X0, YDIG, ADD1H
// VMLHF X1, YDIG, ADD2H
// VMLF X0, YDIG, ADD1
// VMLF X1, YDIG, ADD2
//
VMULT(X0, YDIG, ADD1, ADD1H)
VMULT(X1, YDIG, ADD2, ADD2H)

VSPLTW $2, Y0, YDIG // VREPF

// VMALF X0, YDIG, ADD1H, ADD3
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)

LXVD2X (R17)(CPOOL), SEL1_
VSPLTISB $0, ZER // VZERO ZER
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
VSLDOI $12, ZER, ADD2, T1 // ADD2 Free // VSLDB

VADDCUQ T0, ADD3, CAR1 // VACCQ
VADDUQM T0, ADD3, T0 // ADD3 Free // VAQ
VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

LXVD2X (R18)(CPOOL), SEL2_
LXVD2X (R19)(CPOOL), SEL3_
LXVD2X (R20)(CPOOL), SEL4_
VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

VSLDOI $12, T1, T0, T0 // VSLDB
VSLDOI $12, T2, T1, T1 // VSLDB

VADDCUQ T0, ADD3H, CAR1 // VACCQ
VADDUQM T0, ADD3H, T0 // VAQ
VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

// ---------------------------------------------------

VSPLTW $1, Y0, YDIG // VREPF
// Reload: ZER aliases EXTRACT_LO (V28), which was zeroed above.
LXVD2X (R0)(EVENODD), EXTRACT_LO_
LXVD2X (R16)(EVENODD), EXTRACT_HI_

// VMALHF X0, YDIG, T0, ADD1H
// VMALHF X1, YDIG, T1, ADD2H
// VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
// VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)

VSPLTW $0, Y0, YDIG // VREPF

// VMALF X0, YDIG, ADD1H, ADD3
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)

VSPLTISB $0, ZER // VZERO ZER
LXVD2X (R17)(CPOOL), SEL1_
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
VSLDOI $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free // VSLDB

VADDCUQ T0, RED1, CAR1 // VACCQ
VADDUQM T0, RED1, T0 // VAQ
VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
VADDEUQM T1, RED2, CAR1, T1 // VACQ

VADDCUQ T0, ADD3, CAR1 // VACCQ
VADDUQM T0, ADD3, T0 // VAQ
VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
VADDEUQM T1, ADD4, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ

LXVD2X (R18)(CPOOL), SEL2_
LXVD2X (R19)(CPOOL), SEL3_
LXVD2X (R20)(CPOOL), SEL4_
VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

VSLDOI $12, T1, T0, T0 // VSLDB
VSLDOI $12, T2, T1, T1 // VSLDB

VADDCUQ T0, ADD3H, CAR1 // VACCQ
VADDUQM T0, ADD3H, T0 // VAQ
VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

// ---------------------------------------------------

VSPLTW $3, Y1, YDIG // VREPF
// Reload: ZER aliases EXTRACT_LO (V28), which was zeroed above.
LXVD2X (R0)(EVENODD), EXTRACT_LO_
LXVD2X (R16)(EVENODD), EXTRACT_HI_

// VMALHF X0, YDIG, T0, ADD1H
// VMALHF X1, YDIG, T1, ADD2H
// VMALF X0, YDIG, T0, ADD1
// VMALF X1, YDIG, T1, ADD2
VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)

VSPLTW $2, Y1, YDIG // VREPF

// VMALF X0, YDIG, ADD1H, ADD3
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)

VSPLTISB $0, ZER // VZERO ZER
// Reload SEL1 (aliases TMP1, clobbered by the VMULT_ADDs above).
// A second, redundant identical load was removed here.
LXVD2X (R17)(CPOOL), SEL1_
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB

VADDCUQ T0, RED1, CAR1 // VACCQ
VADDUQM T0, RED1, T0 // VAQ
VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
VADDEUQM T1, RED2, CAR1, T1 // VACQ

VADDCUQ T0, ADD3, CAR1 // VACCQ
VADDUQM T0, ADD3, T0 // VAQ
VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
VADDEUQM T1, ADD4, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ

LXVD2X (R18)(CPOOL), SEL2_
LXVD2X (R19)(CPOOL), SEL3_
LXVD2X (R20)(CPOOL), SEL4_
VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

VSLDOI $12, T1, T0, T0 // VSLDB
VSLDOI $12, T2, T1, T1 // VSLDB

VADDCUQ T0, ADD3H, CAR1 // VACCQ
VADDUQM T0, ADD3H, T0 // VAQ
VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

// ---------------------------------------------------

VSPLTW $1, Y1, YDIG // VREPF
// Reload: ZER aliases EXTRACT_LO (V28), which was zeroed above.
LXVD2X (R0)(EVENODD), EXTRACT_LO_
LXVD2X (R16)(EVENODD), EXTRACT_HI_

// VMALHF X0, YDIG, T0, ADD1H
// VMALHF X1, YDIG, T1, ADD2H
// VMALF X0, YDIG, T0, ADD1
// VMALF X1, YDIG, T1, ADD2
VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)

VSPLTW $0, Y1, YDIG // VREPF

// VMALF X0, YDIG, ADD1H, ADD3
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H
// VMALHF X1, YDIG, ADD2H, ADD4H
VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)

VSPLTISB $0, ZER // VZERO ZER
LXVD2X (R17)(CPOOL), SEL1_
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

VSLDOI $12, ADD2, ADD1, T0 // VSLDB
VSLDOI $12, T2, ADD2, T1 // VSLDB

VADDCUQ T0, RED1, CAR1 // VACCQ
VADDUQM T0, RED1, T0 // VAQ
VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
VADDEUQM T1, RED2, CAR1, T1 // VACQ

VADDCUQ T0, ADD3, CAR1 // VACCQ
VADDUQM T0, ADD3, T0 // VAQ
VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
VADDEUQM T1, ADD4, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ

// Last 'group': RED2||RED1 is shifted less (see diagram above).
LXVD2X (R21)(CPOOL), SEL5_
LXVD2X (R22)(CPOOL), SEL6_
VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ

VSLDOI $12, T1, T0, T0 // VSLDB
VSLDOI $12, T2, T1, T1 // VSLDB

VADDCUQ T0, ADD3H, CAR1 // VACCQ
VADDUQM T0, ADD3H, T0 // VAQ
VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

VADDCUQ T0, RED1, CAR1 // VACCQ
VADDUQM T0, RED1, T0 // VAQ
VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
VADDEUQM T1, RED2, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ

// ---------------------------------------------------

// Final conditional subtraction of P1||P0, selected on the borrow in T2.
VSPLTISB $0, RED3 // VZERO RED3
VSUBCUQ T0, P0, CAR1 // VSCBIQ
VSUBUQM T0, P0, ADD1H // VSQ
VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ
VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ

// what output to use, ADD2H||ADD1H or T1||T0?
VSEL ADD1H, T0, T2, T0
VSEL ADD2H, T1, T2, T1
RET
1168
1169 #undef CPOOL
1170
1171 #undef X0
1172 #undef X1
1173 #undef Y0
1174 #undef Y1
1175 #undef T0
1176 #undef T1
1177 #undef P0
1178 #undef P1
1179
1180 #undef SEL1
1181 #undef SEL2
1182 #undef SEL3
1183 #undef SEL4
1184 #undef SEL5
1185 #undef SEL6
1186 #undef SEL1_
1187 #undef SEL2_
1188 #undef SEL3_
1189 #undef SEL4_
1190 #undef SEL5_
1191 #undef SEL6_
1192
1193 #undef YDIG
1194 #undef ADD1H
1195 #undef ADD2H
1196 #undef ADD3
1197 #undef ADD4
1198 #undef RED1
1199 #undef RED2
1200 #undef RED3
1201 #undef T2
1202 #undef ADD1
1203 #undef ADD2
1204 #undef ADD3H
1205 #undef ADD4H
1206 #undef ZER
1207 #undef CAR1
1208 #undef CAR2
1209
1210 #undef TMP1
1211 #undef TMP2
1212 #undef EVENODD
1213 #undef EXTRACT_HI
1214 #undef EXTRACT_HI_
1215 #undef EXTRACT_LO
1216 #undef EXTRACT_LO_
1217
// p256SubInternal: (T1||T0) = (X1||X0) - (Y1||Y0) mod P, constant time.
// The raw 256-bit subtraction is done with the quadword borrow chain
// (VSUBCUQ/VSUBECUQ); SEL1 is turned into an all-ones/all-zeros mask from
// the final borrow (0 - borrow), and P (PH||PL) is conditionally added
// back via VSEL when the subtraction underflowed.
// Clobbers: ZER, SEL1, CAR1, TT0, TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER \ // VZERO
	VSUBCUQ  X0, Y0, CAR1 \
	VSUBUQM  X0, Y0, T0 \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1 \
	VSUBUQM  ZER, SEL1, SEL1 \ // VSQ
	\
	VADDCUQ  T0, PL, CAR1 \ // VACCQ
	VADDUQM  T0, PL, TT0 \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1 \ // VACQ
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1 \
1232
// p256AddInternal: (T1||T0) = (X1||X0) + (Y1||Y0) mod P, constant time.
// The 256-bit sum (with carry-out in T2) is followed by an unconditional
// trial subtraction of P (PH||PL); SEL1 becomes the select mask from the
// final borrow-vs-carry, and VSEL picks the reduced (TT1||TT0) or the
// unreduced (T1||T0) result.
// Clobbers: ZER, SEL1, CAR1, CAR2, T2, TT0, TT1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1 \
	VADDUQM  X0, Y0, T0 \
	VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1 \
	\
	VSPLTISB $0, ZER \
	VSUBCUQ  T0, PL, CAR1 \ // VSCBIQ
	VSUBUQM  T0, PL, TT0 \
	VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1
1248
// p256HalfInternal: (T1||T0) = (X1||X0) / 2 mod P, constant time.
// If the input is odd (SEL1 mask derived from the low bit of X0 via the
// borrow of 0-0-X0), P is first added so the value becomes even; the
// 257-bit result (carry kept in T2) is then shifted right one bit.
// The shift is done per 128-bit half: VSR shifts each half right by 1,
// and the bit crossing the halves is recovered with VSLDOI $15 + VSL $7
// and OR-ed back in.
// Clobbers: ZER, SEL1, CAR1, T2, TT0, TT1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	\
	VADDCUQ  X0, PL, CAR1 \
	VADDUQM  X0, PL, T0 \
	VADDECUQ X1, PH, CAR1, T2 \
	VADDEUQM X1, PH, CAR1, T1 \
	\
	VSEL T0, X0, SEL1, T0 \
	VSEL T1, X1, SEL1, T1 \
	VSEL T2, ZER, SEL1, T2 \
	\
	VSLDOI $15, T2, ZER, TT1 \
	VSLDOI $15, T1, ZER, TT0 \
	VSPLTISB $1, SEL1 \
	VSR  T0, SEL1, T0 \ // VSRL
	VSR  T1, SEL1, T1 \
	VSPLTISB $7, SEL1 \ // VREPIB
	VSL  TT0, SEL1, TT0 \
	VSL  TT1, SEL1, TT1 \
	VOR  T0, TT0, T0 \
	VOR  T1, TT1, T1
1272
1273 // ---------------------------------------
1274 // func p256MulAsm(res, in1, in2 []byte)
1275 #define res_ptr R3
1276 #define x_ptr R4
1277 #define y_ptr R5
1278 #define CPOOL R7
1279 #define TEMP R8
1280
1281 // Parameters
1282 #define X0 V0
1283 #define X1 V1
1284 #define Y0 V2
1285 #define Y1 V3
1286 #define T0 V4
1287 #define T1 V5
1288 #define X0_ VS32
1289 #define X1_ VS33
1290 #define Y0_ VS34
1291 #define Y1_ VS35
1292 #define T0_ VS36
1293 #define T1_ VS37
1294 #define SWAP V28
1295 #define SWAP_ VS60
1296
1297 // Constants
1298 #define P0 V30
1299 #define P1 V31
1300 #define P0_ VS62
1301 #define P1_ VS63
1302 //
1303 // Montgomery multiplication modulo P256
1304 //
// func p256MulAsm(res, in1, in2 []byte)
// Montgomery multiplication modulo P256 (see the comment above): loads
// the two 256-bit operands, byte-swaps them with VPERM into the internal
// layout, loads P from the p256mul constant pool, calls p256MulInternal,
// and stores the byte-swapped product T1||T0 to res.
// NOTE(review): R8 is used directly for the byteswap pointer although
// TEMP is #defined as an alias for R8 above.
TEXT ·p256MulAsm(SB), NOSPLIT, $0-72
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr
	MOVD in2+48(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $byteswap<>+0x00(SB), R8

	// Permute string for LXVD2X/STXVD2X byte reordering on little endian.
	LXVD2X (R8)(R0), SWAP_

	LXVD2X (R0)(x_ptr), X0_
	LXVD2X (R16)(x_ptr), X1_

	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1

	LXVD2X (R0)(y_ptr), Y0_
	LXVD2X (R16)(y_ptr), Y1_

	VPERM Y0, Y0, SWAP, Y0
	VPERM Y1, Y1, SWAP, Y1

	// P256 modulus halves for p256MulInternal.
	LXVD2X (R16)(CPOOL), P1_
	LXVD2X (R0)(CPOOL), P0_

	CALL p256MulInternal<>(SB)

	// Reload pointers clobbered by the internal routine, then store the
	// result (T1||T0) back in external byte order.
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $byteswap<>+0x00(SB), R8

	LXVD2X (R8)(R0), SWAP_

	VPERM T0, T0, SWAP, T0
	VPERM T1, T1, SWAP, T1
	STXVD2X T0_, (R0)(res_ptr)
	STXVD2X T1_, (R16)(res_ptr)
	RET
1344
1345 #undef res_ptr
1346 #undef x_ptr
1347 #undef y_ptr
1348 #undef CPOOL
1349
1350 #undef X0
1351 #undef X1
1352 #undef Y0
1353 #undef Y1
1354 #undef T0
1355 #undef T1
1356 #undef P0
1357 #undef P1
1358 #undef X0_
1359 #undef X1_
1360 #undef Y0_
1361 #undef Y1_
1362 #undef T0_
1363 #undef T1_
1364 #undef P0_
1365 #undef P1_
1366
1367 // Point add with P2 being affine point
1368 // If sign == 1 -> P2 = -P2
1369 // If sel == 0 -> P3 = P1
1370 // if zero == 0 -> P3 = P2
1371 // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
1372 #define P3ptr R3
1373 #define P1ptr R4
1374 #define P2ptr R5
1375 #define CPOOL R7
1376
1377 // Temporaries in REGs
1378 #define Y2L V15
1379 #define Y2H V16
1380 #define Y2L_ VS47
1381 #define Y2H_ VS48
1382 #define T1L V17
1383 #define T1H V18
1384 #define T2L V19
1385 #define T2H V20
1386 #define T3L V21
1387 #define T3H V22
1388 #define T4L V23
1389 #define T4H V24
1390
1391 // Temps for Sub and Add
1392 #define TT0 V11
1393 #define TT1 V12
1394 #define T2 V13
1395
1396 // p256MulAsm Parameters
1397 #define X0 V0
1398 #define X1 V1
1399 #define X0_ VS32
1400 #define X1_ VS33
1401 #define Y0 V2
1402 #define Y1 V3
1403 #define Y0_ VS34
1404 #define Y1_ VS35
1405 #define T0 V4
1406 #define T1 V5
1407
1408 #define PL V30
1409 #define PH V31
1410 #define PL_ VS62
1411 #define PH_ VS63
1412
1413 // Names for zero/sel selects
1414 #define X1L V0
1415 #define X1H V1
1416 #define X1L_ VS32
1417 #define X1H_ VS33
1418 #define Y1L V2 // p256MulAsmParmY
1419 #define Y1H V3 // p256MulAsmParmY
1420 #define Y1L_ VS34
1421 #define Y1H_ VS35
1422 #define Z1L V4
1423 #define Z1H V5
1424 #define Z1L_ VS36
1425 #define Z1H_ VS37
1426 #define X2L V0
1427 #define X2H V1
1428 #define X2L_ VS32
1429 #define X2H_ VS33
1430 #define Z2L V4
1431 #define Z2H V5
1432 #define Z2L_ VS36
1433 #define Z2H_ VS37
1434 #define X3L V17 // T1L
1435 #define X3H V18 // T1H
1436 #define Y3L V21 // T3L
1437 #define Y3H V22 // T3H
1438 #define Z3L V25
1439 #define Z3H V26
1440 #define X3L_ VS49
1441 #define X3H_ VS50
1442 #define Y3L_ VS53
1443 #define Y3H_ VS54
1444 #define Z3L_ VS57
1445 #define Z3H_ VS58
1446
1447 #define ZER V6
1448 #define SEL1 V7
1449 #define SEL1_ VS39
1450 #define CAR1 V8
1451 #define CAR2 V9
1452 /* *
1453 * Three operand formula:
1454 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1455 * T1 = Z1²
1456 * T2 = T1*Z1
1457 * T1 = T1*X2
1458 * T2 = T2*Y2
1459 * T1 = T1-X1
1460 * T2 = T2-Y1
1461 * Z3 = Z1*T1
1462 * T3 = T1²
1463 * T4 = T3*T1
1464 * T3 = T3*X1
1465 * T1 = 2*T3
1466 * X3 = T2²
1467 * X3 = X3-T1
1468 * X3 = X3-T4
1469 * T3 = T3-X3
1470 * T3 = T3*T2
1471 * T4 = T4*Y1
1472 * Y3 = T3-T4
1473
1474 * Three operand formulas, but with MulInternal X,Y used to store temps
1475 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1476 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1477 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1478 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1479 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1480 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1481 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1482 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1483 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1484 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1485 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1486 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1487 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1488 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1489 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1490 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1491 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1492 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1493
1494 */
1495 //
1496 // V27 is clobbered by p256MulInternal so must be
1497 // saved in a temp.
1498 //
// func p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
// Mixed Jacobian+affine point addition (see the three-operand formula
// above). sign conditionally negates Y2; sel==0 copies P1 to P3; zero==0
// copies (X2, Y2, Z=R*1) to P3. All selects are constant time via
// VSEL on VCMPEQUD-derived masks.
// Fix: the byteswap address below used the offset expression "+0+00",
// now "+0x00" consistent with every other reference in this file.
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Small offsets kept in registers for indexed LXVD2X/STXVD2X.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP)

	MOVD $byteswap<>+0x00(SB), R8
	LXVD2X (R16)(CPOOL), PH_
	LXVD2X (R0)(CPOOL), PL_

	// if (sign == 1) {
	//     Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
	// }

	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R17)(P2ptr), Y2L_
	LXVD2X (R18)(P2ptr), Y2H_
	VPERM Y2H, Y2H, SWAP, Y2H
	VPERM Y2L, Y2L, SWAP, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX (R1)(R26), SEL1_
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	// T1 = P - Y2; select it into Y2 when sign != 0.
	VSUBCUQ  PL, Y2L, CAR1
	VSUBUQM  PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

	/* *
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P1ptr), X0_ // Z1H
	LXVD2X (R20)(P1ptr), X1_ // Z1L
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VOR T0, T0, X0
	VOR T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P2ptr), Y0_  // X2H
	LXVD2X (R16)(P2ptr), Y1_ // X2L
	VPERM Y0, Y0, SWAP, Y0
	VPERM Y1, Y1, SWAP, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T1L
	VOR T1, T1, T1H

	// X=T2; Y=Y2; MUL; T-  // T2 = T2*Y2    T1   T2
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR Y2L, Y2L, Y0
	VOR Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R17)(P1ptr), Y1L_
	LXVD2X (R18)(P1ptr), Y1H_
	VPERM Y1H, Y1H, SWAP, Y1H
	VPERM Y1L, Y1L, SWAP, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
	LXVD2X (R0)(P1ptr), X1L_
	LXVD2X (R16)(P1ptr), X1H_
	VPERM X1H, X1H, SWAP, X1H
	VPERM X1L, X1L, SWAP, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1         T2
	LXVD2X (R19)(P1ptr), X0_ // Z1H
	LXVD2X (R20)(P1ptr), X1_ // Z1L
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	CALL p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y;  Y- ; MUL; X=T  // T3 = T1*T1         T2
	VOR Y0, Y0, X0
	VOR Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, X0
	VOR T1, T1, X1

	// X- ; Y- ; MUL; T4=T  // T4 = T3*T1         T2        T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T4L
	VOR T1, T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	MOVD in1+8(FP), P1ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P1ptr), Y0_  // X1H
	LXVD2X (R16)(P1ptr), Y1_ // X1L
	VPERM Y1, Y1, SWAP, Y1
	VPERM Y0, Y0, SWAP, Y0
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T-  // X3 = T2*T2    T1   T2   T3   T4
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T  // T3 = T3*T2         T2   T3   T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X=T4; Y=Y1; MUL; T-  // T4 = T4*Y1              T3   T4
	VOR T4L, T4L, X0
	VOR T4H, T4H, X1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R17)(P1ptr), Y0_ // Y1H
	LXVD2X (R18)(P1ptr), Y1_ // Y1L
	VPERM Y0, Y0, SWAP, Y0
	VPERM Y1, Y1, SWAP, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	//     copy(P3.x[:], X1)
	//     copy(P3.y[:], Y1)
	//     copy(P3.z[:], Z1)
	// }

	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P1ptr), X1L_
	LXVD2X (R16)(P1ptr), X1H_
	VPERM X1H, X1H, SWAP, X1H
	VPERM X1L, X1L, SWAP, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L_
	LXVD2X (R20)(P1ptr), Z1H_
	VPERM Z1H, Z1H, SWAP, Z1H
	VPERM Z1L, Z1L, SWAP, Z1L

	MOVD $112, R26 // Get offset to sel+32
	LXVDSX (R1)(R26), SEL1_
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	// if (zero == 0) {
	//     copy(P3.x[:], X2)
	//     copy(P3.y[:], Y2)
	//     copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	//         0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
	// }
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L_
	LXVD2X (R16)(P2ptr), X2H_
	VPERM X2H, X2H, SWAP, X2H
	VPERM X2L, X2L, SWAP, X2L

	// Y2 already loaded
	LXVD2X (R23)(CPOOL), Z2L_
	LXVD2X (R24)(CPOOL), Z2H_

	MOVD $120, R26 // Get the value from zero+40(FP)
	LXVDSX (R1)(R26), SEL1_
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD res+0(FP), P3ptr
	VPERM X3H, X3H, SWAP, X3H
	VPERM X3L, X3L, SWAP, X3L
	VPERM Y3H, Y3H, SWAP, Y3H
	VPERM Y3L, Y3L, SWAP, Y3L
	VPERM Z3H, Z3H, SWAP, Z3H
	VPERM Z3L, Z3L, SWAP, Z3L
	STXVD2X X3L_, (R0)(P3ptr)
	STXVD2X X3H_, (R16)(P3ptr)
	STXVD2X Y3L_, (R17)(P3ptr)
	STXVD2X Y3H_, (R18)(P3ptr)
	STXVD2X Z3L_, (R19)(P3ptr)
	STXVD2X Z3H_, (R20)(P3ptr)

	RET
1746
1747 #undef P3ptr
1748 #undef P1ptr
1749 #undef P2ptr
1750 #undef CPOOL
1751 #undef SWAP
1752 #undef SWAP_
1753
1754 #undef Y2L
1755 #undef Y2H
1756 #undef Y2L_
1757 #undef Y2H_
1758 #undef T1L
1759 #undef T1H
1760 #undef T2L
1761 #undef T2H
1762 #undef T3L
1763 #undef T3H
1764 #undef T4L
1765 #undef T4H
1766
1767 #undef TT0
1768 #undef TT1
1769 #undef TT0_
1770 #undef TT1_
1771 #undef T2
1772
1773 #undef X0
1774 #undef X1
1775 #undef X0_
1776 #undef X1_
1777 #undef Y0
1778 #undef Y1
1779 #undef Y0_
1780 #undef Y1_
1781 #undef T0
1782 #undef T1
1783
1784 #undef PL
1785 #undef PH
1786 #undef PL_
1787 #undef PH_
1788
1789 #undef X1L
1790 #undef X1H
1791 #undef X1L_
1792 #undef X1H_
1793 #undef Y1L
1794 #undef Y1H
1795 #undef Y1L_
1796 #undef Y1H_
1797 #undef Z1L
1798 #undef Z1H
1799 #undef Z1L_
1800 #undef Z1H_
1801 #undef X2L
1802 #undef X2H
1803 #undef X2L_
1804 #undef X2H_
1805 #undef Z2L
1806 #undef Z2H
1807 #undef Z2L_
1808 #undef Z2H_
1809 #undef X3L
1810 #undef X3H
1811 #undef X3L_
1812 #undef X3H_
1813 #undef Y3L
1814 #undef Y3H
1815 #undef Y3L_
1816 #undef Y3H_
1817 #undef Z3L
1818 #undef Z3H
1819 #undef Z3L_
1820 #undef Z3H_
1821
1822 #undef ZER
1823 #undef SEL1
1824 #undef SEL1_
1825 #undef CAR1
1826 #undef CAR2
1827
1828 // p256PointDoubleAsm(P3, P1 *p256Point)
1829 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1830 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1831 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
1832 #define P3ptr R3
1833 #define P1ptr R4
1834 #define CPOOL R7
1835
1836 // Temporaries in REGs
1837 #define X3L V15
1838 #define X3H V16
1839 #define X3L_ VS47
1840 #define X3H_ VS48
1841 #define Y3L V17
1842 #define Y3H V18
1843 #define Y3L_ VS49
1844 #define Y3H_ VS50
1845 #define T1L V19
1846 #define T1H V20
1847 #define T2L V21
1848 #define T2H V22
1849 #define T3L V23
1850 #define T3H V24
1851
1852 #define X1L V6
1853 #define X1H V7
1854 #define X1L_ VS38
1855 #define X1H_ VS39
1856 #define Y1L V8
1857 #define Y1H V9
1858 #define Y1L_ VS40
1859 #define Y1H_ VS41
1860 #define Z1L V10
1861 #define Z1H V11
1862
1863 // Temps for Sub and Add
1864 #define TT0 V11
1865 #define TT1 V12
1866 #define TT0_ VS43
1867 #define TT1_ VS44
1868 #define T2 V13
1869
1870 // p256MulAsm Parameters
1871 #define X0 V0
1872 #define X1 V1
1873 #define X0_ VS32
1874 #define X1_ VS33
1875 #define Y0 V2
1876 #define Y1 V3
1877 #define Y0_ VS34
1878 #define Y1_ VS35
1879 #define T0 V4
1880 #define T1 V5
1881 #define T0_ VS36
1882 #define T1_ VS37
1883
1884 #define PL V30
1885 #define PH V31
1886 #define PL_ VS62
1887 #define PH_ VS63
1888
1889 #define Z3L V23
1890 #define Z3H V24
1891
1892 #define SWAP V25
1893 #define SWAP_ VS57
1894 #define ZER V26
1895 #define SEL1 V27
1896 #define CAR1 V28
1897 #define CAR2 V29
1898 /*
1899 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1900 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1901 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1902 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1903 * B = 2Y₁
1904 * Z₃ = B×Z₁
1905 * C = B²
1906 * D = C×X₁
1907 * X₃ = A²-2D
1908 * Y₃ = (D-X₃)×A-C²/2
1909 *
1910 * Three-operand formula:
1911 * T1 = Z1²
1912 * T2 = X1-T1
1913 * T1 = X1+T1
1914 * T2 = T2*T1
1915 * T2 = 3*T2
1916 * Y3 = 2*Y1
1917 * Z3 = Y3*Z1
1918 * Y3 = Y3²
1919 * T3 = Y3*X1
1920 * Y3 = Y3²
1921 * Y3 = half*Y3
1922 * X3 = T2²
1923 * T1 = 2*T3
1924 * X3 = X3-T1
1925 * T1 = T3-X3
1926 * T1 = T1*T2
1927 * Y3 = T1-Y3
1928 */
1929
// func p256PointDoubleAsm(P3, P1 *p256Point)
// Jacobian point doubling following the three-operand formula in the
// comment above (2004 Hankerson–Menezes–Vanstone). Intermediate results
// are written out to P3 as soon as they are final (Z3, then X3, then Y3),
// with VPERM byte reordering around every LXVD2X/STXVD2X.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $byteswap<>+0x00(SB), R15

	// Field element offsets within a p256Point.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH_
	LXVD2X (R0)(CPOOL), PL_

	LXVD2X (R15)(R0), SWAP_

	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0_ // Z1H
	LXVD2X (R20)(P1ptr), X1_ // Z1L

	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T)            // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L_
	LXVD2X (R16)(P1ptr), X1H_
	VPERM X1L, X1L, SWAP, X1L
	VPERM X1H, X1H, SWAP, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	LXVD2X (R15)(R0), SWAP_
	LXVD2X (R17)(P1ptr), Y1L_
	LXVD2X (R18)(P1ptr), Y1H_
	VPERM Y1L, Y1L, SWAP, Y1L
	VPERM Y1H, Y1H, SWAP, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R15)(R0), SWAP_
	LXVD2X (R19)(P1ptr), Y0_
	LXVD2X (R20)(P1ptr), Y1_
	VPERM Y0, Y0, SWAP, Y0
	VPERM Y1, Y1, SWAP, Y1

	CALL p256MulInternal<>(SB)

	LXVD2X (R15)(R0), SWAP_

	// Leave T0, T1 as is.
	// Store Z3 via temporaries so T0/T1 stay valid in internal order.
	VPERM T0, T0, SWAP, TT0
	VPERM T1, T1, SWAP, TT1
	STXVD2X TT0_, (R19)(P3ptr)
	STXVD2X TT1_, (R20)(P3ptr)

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VOR T0, T0, X0
	VOR T1, T1, X1
	LXVD2X (R15)(R0), SWAP_
	LXVD2X (R0)(P1ptr), Y0_
	LXVD2X (R16)(P1ptr), Y1_
	VPERM Y0, Y0, SWAP, Y0
	VPERM Y1, Y1, SWAP, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T-    // X3 = T2²
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	// Store X3 to the result point.
	LXVD2X (R15)(R0), SWAP_
	VPERM X3L, X3L, SWAP, TT0
	VPERM X3H, X3H, SWAP, TT1
	STXVD2X TT0_, (R0)(P3ptr)
	STXVD2X TT1_, (R16)(P3ptr)

	// SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	// Store Y3 to the result point.
	LXVD2X (R15)(R0), SWAP_
	VPERM Y3L, Y3L, SWAP, Y3L
	VPERM Y3H, Y3H, SWAP, Y3H
	STXVD2X Y3L_, (R17)(P3ptr)
	STXVD2X Y3H_, (R18)(P3ptr)
	RET
2062
2063 #undef P3ptr
2064 #undef P1ptr
2065 #undef CPOOL
2066 #undef X3L
2067 #undef X3H
2068 #undef X3L_
2069 #undef X3H_
2070 #undef Y3L
2071 #undef Y3H
2072 #undef Y3L_
2073 #undef Y3H_
2074 #undef T1L
2075 #undef T1H
2076 #undef T2L
2077 #undef T2H
2078 #undef T3L
2079 #undef T3H
2080 #undef X1L
2081 #undef X1H
2082 #undef X1L_
2083 #undef X1H_
2084 #undef Y1L
2085 #undef Y1H
2086 #undef Y1L_
2087 #undef Y1H_
2088 #undef Z1L
2089 #undef Z1H
2090 #undef TT0
2091 #undef TT1
2092 #undef TT0_
2093 #undef TT1_
2094 #undef T2
2095 #undef X0
2096 #undef X1
2097 #undef X0_
2098 #undef X1_
2099 #undef Y0
2100 #undef Y1
2101 #undef Y0_
2102 #undef Y1_
2103 #undef T0
2104 #undef T1
2105 #undef T0_
2106 #undef T1_
2107 #undef PL
2108 #undef PH
2109 #undef PL_
2110 #undef PH_
2111 #undef Z3L
2112 #undef Z3H
2113 #undef ZER
2114 #undef SEL1
2115 #undef CAR1
2116 #undef CAR2
2117 #undef SWAP
2118 #undef SWAP_
2119
2120 // p256PointAddAsm(P3, P1, P2 *p256Point)
2121 #define P3ptr R3
2122 #define P1ptr R4
2123 #define P2ptr R5
2124 #define CPOOL R7
2125 #define TRUE R14
2126 #define RES1 R9
2127 #define RES2 R10
2128
2129 // Temporaries in REGs
2130 #define T1L V16
2131 #define T1H V17
2132 #define T2L V18
2133 #define T2H V19
2134 #define U1L V20
2135 #define U1H V21
2136 #define S1L V22
2137 #define S1H V23
2138 #define HL V24
2139 #define HH V25
2140 #define RL V26
2141 #define RH V27
2142 #define RH_ VS59
2143
2144 // Temps for Sub and Add
2145 #define ZER V6
2146 #define SEL1 V7
2147 #define CAR1 V8
2148 #define CAR2 V9
2149 #define TT0 V11
2150 #define TT0_ VS43
2151 #define TT1 V12
2152 #define TT1_ VS44
2153 #define T2 V13
2154
2155 #define SWAP V28
2156 #define SWAP_ VS60
2157
2158 // p256MulAsm Parameters
2159 #define X0 V0
2160 #define X1 V1
2161 #define X0_ VS32
2162 #define X1_ VS33
2163 #define Y0 V2
2164 #define Y1 V3
2165 #define Y0_ VS34
2166 #define Y1_ VS35
2167 #define T0 V4
2168 #define T1 V5
2169 #define T0_ VS36
2170 #define T1_ VS37
2171
2172 #define PL V30
2173 #define PH V31
2174 #define PL_ VS62
2175 #define PH_ VS63
2176 /*
2177 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
2178 *
2179 * A = X₁×Z₂²
2180 * B = Y₁×Z₂³
2181 * C = X₂×Z₁²-A
2182 * D = Y₂×Z₁³-B
2183 * X₃ = D² - 2A×C² - C³
2184 * Y₃ = D×(A×C² - X₃) - B×C³
2185 * Z₃ = Z₁×Z₂×C
2186 *
2187 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
2188 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
2189 *
2190 * T1 = Z1*Z1
2191 * T2 = Z2*Z2
2192 * U1 = X1*T2
2193 * H = X2*T1
2194 * H = H-U1
2195 * Z3 = Z1*Z2
2196 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2197 *
2198 * S1 = Z2*T2
2199 * S1 = Y1*S1
2200 * R = Z1*T1
2201 * R = Y2*R
2202 * R = R-S1
2203 *
2204 * T1 = H*H
2205 * T2 = H*T1
2206 * U1 = U1*T1
2207 *
2208 * X3 = R*R
2209 * X3 = X3-T2
2210 * T1 = 2*U1
2211 * X3 = X3-T1 << store-out X3 result reg
2212 *
2213 * T2 = S1*T2
2214 * Y3 = U1-X3
2215 * Y3 = R*Y3
2216 * Y3 = Y3-T2 << store-out Y3 result reg
2217
2218 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2219 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2220 // X=X2; Y- ; MUL; H=T // H = X2*T1
2221 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2222 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2223 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2224 // SUB(H<H-T) // H = H-U1
2225 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2226 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2227 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2228 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2229 // SUB(R<T-S1) // R = R-S1
2230 // X=H ; Y=H ; MUL; T- // T1 = H*H
2231 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2232 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2233 // X=R ; Y=R ; MUL; T- // X3 = R*R
2234 // SUB(T<T-T2) // X3 = X3-T2
2235 // ADD(X<U1+U1) // T1 = 2*U1
2236 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2237 // SUB(Y<U1-T) // Y3 = U1-X3
2238 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2239 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2240 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2241 */
// func p256PointAddAsm(P3, P1, P2 *p256Point) int
// Full Jacobian point addition (formula in the comment above).
// Returns 1 in ret+24(FP) when the points must instead be doubled
// (H == 0 or H == P, and R == 0 or R == P), else 0 — the caller is
// expected to handle that case.
// V27 (RH) is live across p256MulInternal calls, which clobber it, so
// it is spilled to the 16-byte local stack slot at 32(R1).
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	MOVD $byteswap<>+0x00(SB), R8
	LXVD2X (R16)(CPOOL), PH_
	LXVD2X (R0)(CPOOL), PL_

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P1ptr), X0_ // Z1L
	LXVD2X (R20)(P1ptr), X1_ // Z1H
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, RL // SAVE: RL
	VOR T1, T1, RH // SAVE: RH

	STXVD2X RH_, (R1)(R17) // V27 has to be saved

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	MOVD in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P2ptr), X0_  // X2L
	LXVD2X (R16)(P2ptr), X1_ // X2H
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, HL // SAVE: HL
	VOR T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P2ptr), X0_ // Z2L
	LXVD2X (R20)(P2ptr), X1_ // Z2H
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L // SAVE: S1L
	VOR T1, T1, S1H // SAVE: S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P1ptr), X0_  // X1L
	LXVD2X (R16)(P1ptr), X1_ // X1H
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L // SAVE: U1L
	VOR T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR HL, PL, T1L // SAVE: T1L
	VXOR HH, PH, T1H // SAVE: T1H
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	MOVD $byteswap<>+0x00(SB), R8
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P1ptr), X0_ // Z1L
	LXVD2X (R20)(P1ptr), X1_ // Z1H
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	LXVD2X (R19)(P2ptr), Y0_ // Z2L
	LXVD2X (R20)(P2ptr), Y1_ // Z2H
	VPERM Y0, Y0, SWAP, Y0
	VPERM Y1, Y1, SWAP, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR T0, T0, X0
	VOR T1, T1, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)
	MOVD res+0(FP), P3ptr
	LXVD2X (R8)(R0), SWAP_
	VPERM T1, T1, SWAP, TT1
	VPERM T0, T0, SWAP, TT0
	STXVD2X TT0_, (R19)(P3ptr)
	STXVD2X TT1_, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0_
	LXVD2X (R18)(P1ptr), X1_
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	VOR S1L, S1L, Y0
	VOR S1H, S1H, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L
	VOR T1, T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	MOVD in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R17)(P2ptr), X0_
	LXVD2X (R18)(P2ptr), X1_
	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1
	VOR RL, RL, Y0

	// VOR RH, RH, Y1   RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1_
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH_, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR RL, PL, T1L
	VXOR RH, PH, T1H // SAVE: T1L
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VOR HL, HL, X0
	VOR HH, HH, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VOR U1L, U1L, X0
	VOR U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VOR RL, RL, X0

	// VOR RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1_
	VOR X1, X1, Y1

	// VOR RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD res+0(FP), P3ptr
	LXVD2X (R8)(R0), SWAP_
	VPERM T1, T1, SWAP, TT1
	VPERM T0, T0, SWAP, TT0
	STXVD2X TT0_, (R0)(P3ptr)
	STXVD2X TT1_, (R16)(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR RH, RH, X1
	LXVD2X (R1)(R17), X1_
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VOR S1L, S1L, X0
	VOR S1H, S1H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD res+0(FP), P3ptr
	LXVD2X (R8)(R0), SWAP_
	VPERM T1, T1, SWAP, TT1
	VPERM T0, T0, SWAP, TT0
	STXVD2X TT0_, (R17)(P3ptr)
	STXVD2X TT1_, (R18)(P3ptr)

	RET
2495
// End of file.