Text file
src/crypto/aes/gcm_ppc64le.s
1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Based on CRYPTOGAMS code with the following comment:
6 // # ====================================================================
7 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
8 // # project. The module is, however, dual licensed under OpenSSL and
9 // # CRYPTOGAMS licenses depending on where you obtain it. For further
10 // # details see http://www.openssl.org/~appro/cryptogams/.
11 // # ====================================================================
12
13 // This implementation is based on the ppc64 asm generated by the
14 // script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
15 // from commit d47afb3c.
16
17 // Changes were made due to differences in the ABI and some register usage.
18 // Some arguments were changed due to the way the Go code passes them.
19
20 #include "textflag.h"
21
// General-purpose registers that hold the pointer/length arguments.
//   XIP  - gcmInit: productTable (table being written);
//          gcmHash/gcmMul: output (the 16-byte value Xi, read and written back).
//   HTBL - gcmInit: h (source of the 16-byte H);
//          gcmHash/gcmMul: productTable (table being read).
//   INP  - gcmHash: current input pointer.
//   LEN  - gcmHash: remaining length (bytes; a block count in the 4x path).
22 #define XIP R3
23 #define HTBL R4
24 #define INP R5
25 #define LEN R6
26
// Vector registers.
//   XL/XM/XH   - low, middle and high partial products of a multiplication.
//   IN         - input block (also used for the twisted H in gcmInit).
//   ZERO       - all-zero vector, set with VXOR before use.
//   T0/T1/T2   - scratch.
//   XC2        - reduction constant (built in gcmInit, stored at table offset 0).
//   H/HH/HL    - table entries at offsets 0x20/0x30/0x10.
//   LEMASK     - VPERM control used to byte-swap data after LXVD2X and
//                before STXVD2X.
27 #define XL V0
28 #define XM V1
29 #define XH V2
30 #define IN V3
31 #define ZERO V4
32 #define T0 V5
33 #define T1 V6
34 #define T2 V7
35 #define XC2 V8
36 #define H V9
37 #define HH V10
38 #define HL V11
39 #define LEMASK V12
// Second set of partial products / input, and the H^2 table entries
// (offsets 0x50/0x60/0x40 for H2/H2H/H2L).
40 #define XL1 V13
41 #define XM1 V14
42 #define XH1 V15
43 #define IN1 V16
44 #define H2 V17
45 #define H2H V18
46 #define H2L V19
// Registers used only by the 4-blocks-per-iteration path of gcmHash:
// further partial products and inputs, plus the H^3 and H^4 table entries.
47 #define XL3 V20
48 #define XM2 V21
49 #define IN2 V22
50 #define H3L V23
51 #define H3 V24
52 #define H3H V25
53 #define XH3 V26
54 #define XM3 V27
55 #define IN3 V28
56 #define H4L V29
57 #define H4 V30
58 #define H4H V31
59
// Alternate names for registers that are reused in the 4x path:
// H21L/H21H overwrite HL/HH, and LOPERM/HIPERM overwrite H2L/H2H
// (which the 4x path never loads).
60 #define IN0 IN
61 #define H21L HL
62 #define H21H HH
63 #define LOPERM H2L
64 #define HIPERM H2H
65
// VSX names for the same registers (VS(32+n) is the VSX name of Vn).
// They are used only as operands of the LXVD2X/STXVD2X loads and stores.
66 #define VXL VS32
67 #define VIN VS35
68 #define VXC2 VS40
69 #define VH VS41
70 #define VHH VS42
71 #define VHL VS43
72 #define VIN1 VS48
73 #define VH2 VS49
74 #define VH2H VS50
75 #define VH2L VS51
76
77 #define VIN2 VS54
78 #define VH3L VS55
79 #define VH3 VS56
80 #define VH3H VS57
81 #define VIN3 VS60
82 #define VH4L VS61
83 #define VH4 VS62
84 #define VH4H VS63
85
86 #define VIN0 VIN
87
88 // func gcmInit(productTable *[256]byte, h []byte)
//
// gcmInit reads the 16 bytes at h and fills productTable with the constants
// that gcmHash and gcmMul load. Layout written here (byte offsets):
//
//   0x00        reduction constant XC2 (0xc2.0)
//   0x10..0x30  HL, H, HH      ("twisted" H, split into halves)
//   0x40..0x60  H2L, H2, H2H   (H^2)
//   0x70..0x90  H^3 (lo, full, hi)
//   0xa0..0xc0  H^4 (lo, full, hi)
//
// Only the first 0xd0 bytes of the 256-byte table are written.
// Frame: $0-32 = no locals, 8 bytes of pointer + 24 bytes of slice header.
89 TEXT ·gcmInit(SB), NOSPLIT, $0-32
90 MOVD productTable+0(FP), XIP
91 MOVD h+8(FP), HTBL // first word of the slice header, i.e. the data pointer of h
92
// R8/R9/R10 hold the byte offsets of the next three table slots; they are
// bumped between the stores below.
93 MOVD $0x10, R8
94 MOVD $0x20, R9
95 MOVD $0x30, R10
96 LXVD2X (HTBL)(R0), VH // Load H
97
// Build the reduction constant in XC2 and the "twisted" H in IN.
98 VSPLTISB $-16, XC2 // 0xf0
99 VSPLTISB $1, T0 // one
100 VADDUBM XC2, XC2, XC2 // 0xe0
101 VXOR ZERO, ZERO, ZERO
102 VOR XC2, T0, XC2 // 0xe1
103 VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
104 VSLDOI $1, ZERO, T0, T1 // ...1
105 VADDUBM XC2, XC2, XC2 // 0xc2...
106 VSPLTISB $7, T2
107 VOR XC2, T1, XC2 // 0xc2....01
108 VSPLTB $0, H, T1 // most significant byte
109 VSL H, T0, H // H<<=1
110 VSRAB T1, T2, T1 // broadcast carry bit
111 VAND T1, XC2, T1 // constant if the carry bit was set, else zero
112 VXOR H, T1, IN // twisted H
113
114 VSLDOI $8, IN, IN, H // twist even more ...
115 VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
116 VSLDOI $8, ZERO, H, HL // ... and split
117 VSLDOI $8, H, ZERO, HH
118
119 STXVD2X VXC2, (XIP+R0) // save pre-computed table
120 STXVD2X VHL, (XIP+R8) // offset 0x10
121 MOVD $0x40, R8
122 STXVD2X VH, (XIP+R9) // offset 0x20
123 MOVD $0x50, R9
124 STXVD2X VHH, (XIP+R10) // offset 0x30
125 MOVD $0x60, R10
126
// H^2 = H * H: three partial products followed by a two-phase reduction.
127 VPMSUMD IN, HL, XL // H.lo·H.lo
128 VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi
129 VPMSUMD IN, HH, XH // H.hi·H.hi
130
131 VPMSUMD XL, XC2, T2 // 1st reduction phase
132
// Split the middle product and fold its halves into XL and XH.
133 VSLDOI $8, XM, ZERO, T0
134 VSLDOI $8, ZERO, XM, T1
135 VXOR XL, T0, XL
136 VXOR XH, T1, XH
137
138 VSLDOI $8, XL, XL, XL
139 VXOR XL, T2, XL
140
141 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
142 VPMSUMD XL, XC2, XL
143 VXOR T1, XH, T1
144 VXOR XL, T1, IN1 // IN1 = H^2
145
// Same "twist and split" as was applied to H above.
146 VSLDOI $8, IN1, IN1, H2
147 VSLDOI $8, ZERO, H2, H2L
148 VSLDOI $8, H2, ZERO, H2H
149
150 STXVD2X VH2L, (XIP+R8) // save H^2
151 MOVD $0x70, R8
152 STXVD2X VH2, (XIP+R9) // offset 0x50
153 MOVD $0x80, R9
154 STXVD2X VH2H, (XIP+R10) // offset 0x60
155 MOVD $0x90, R10
156
// H^3 = H * H^2 (XL/XM/XH) and H^4 = H^2 * H^2 (XL1/XM1/XH1), computed
// side by side.
157 VPMSUMD IN, H2L, XL // H.lo·H^2.lo
158 VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
159 VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi
160 VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi
161 VPMSUMD IN, H2H, XH // H.hi·H^2.hi
162 VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi
163
// From here H, HL and HH are used as scratch for the second product; their
// earlier contents were already stored to the table above.
164 VPMSUMD XL, XC2, T2 // 1st reduction phase
165 VPMSUMD XL1, XC2, HH // 1st reduction phase
166
167 VSLDOI $8, XM, ZERO, T0
168 VSLDOI $8, ZERO, XM, T1
169 VSLDOI $8, XM1, ZERO, HL
170 VSLDOI $8, ZERO, XM1, H
171 VXOR XL, T0, XL
172 VXOR XH, T1, XH
173 VXOR XL1, HL, XL1
174 VXOR XH1, H, XH1
175
176 VSLDOI $8, XL, XL, XL
177 VSLDOI $8, XL1, XL1, XL1
178 VXOR XL, T2, XL
179 VXOR XL1, HH, XL1
180
181 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
182 VSLDOI $8, XL1, XL1, H // 2nd reduction phase
183 VPMSUMD XL, XC2, XL
184 VPMSUMD XL1, XC2, XL1
185 VXOR T1, XH, T1
186 VXOR H, XH1, H
187 VXOR XL, T1, XL // XL = H^3
188 VXOR XL1, H, XL1 // XL1 = H^4
189
// Twist and split both results: H/HL/HH now carry H^3, H2/H2L/H2H carry H^4.
190 VSLDOI $8, XL, XL, H
191 VSLDOI $8, XL1, XL1, H2
192 VSLDOI $8, ZERO, H, HL
193 VSLDOI $8, H, ZERO, HH
194 VSLDOI $8, ZERO, H2, H2L
195 VSLDOI $8, H2, ZERO, H2H
196
197 STXVD2X VHL, (XIP+R8) // save H^3
198 MOVD $0xa0, R8
199 STXVD2X VH, (XIP+R9) // offset 0x80
200 MOVD $0xb0, R9
201 STXVD2X VHH, (XIP+R10) // offset 0x90
202 MOVD $0xc0, R10
203 STXVD2X VH2L, (XIP+R8) // save H^4
204 STXVD2X VH2, (XIP+R9) // offset 0xb0
205 STXVD2X VH2H, (XIP+R10) // offset 0xc0
206
207 RET
208
209 // func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
//
// gcmHash folds len bytes starting at inp into the 16-byte value Xi stored at
// output, using the powers of H that gcmInit placed in productTable, and
// writes the updated Xi back to output.
//
// Two code paths:
//   - len < 64:  the first block is handled up front, then loop_2x consumes
//     two blocks per iteration (H^2 and H), with "short" handling a final
//     single block.
//   - len >= 64: gcm_ghash_p8_4x consumes four blocks per iteration (H^4..H),
//     with "three"/"two"/"one" handling the 1-3 trailing blocks.
//
// NOTE(review): a block is loaded unconditionally on both paths and the 4x path
// shifts len right by 4, so len is presumably a non-zero multiple of 16 -
// confirm against the Go caller.
// Frame: $0-64 = no locals; output slice (24) + pointer (8) + inp slice (24) +
// len (8).
210 TEXT ·gcmHash(SB), NOSPLIT, $0-64
211 MOVD output+0(FP), XIP // data pointer of output
212 MOVD productTable+24(FP), HTBL
213 MOVD inp+32(FP), INP // data pointer of inp
214 MOVD len+56(FP), LEN
215
// R8/R9/R10 are table offsets here; each is advanced by 0x30 right after its
// first use so it is ready for the H^2 entries.
216 MOVD $0x10, R8
217 MOVD $0x20, R9
218 MOVD $0x30, R10
219 LXVD2X (XIP)(R0), VXL // load Xi
220
221 LXVD2X (HTBL)(R8), VHL // load pre-computed table
222 MOVD $0x40, R8
223 LVSL (R0)(R0), LEMASK // bytes 0x00..0x0f
224 LXVD2X (HTBL)(R9), VH
225 MOVD $0x50, R9
226 VSPLTISB $0x07, T0
227 LXVD2X (HTBL)(R10), VHH
228 MOVD $0x60, R10
229 VXOR LEMASK, T0, LEMASK // index ^ 7: reverses the bytes inside each doubleword
230 LXVD2X (HTBL)(R0), VXC2 // reduction constant at table offset 0
231 VPERM XL, XL, LEMASK, XL // byte-swap Xi
232 VXOR ZERO, ZERO, ZERO
233
234 CMPU LEN, $64
235 BGE gcm_ghash_p8_4x // at least four blocks: use the 4x path
236
// Fewer than 64 bytes: take the first block now.
237 LXVD2X (INP)(R0), VIN
238 ADD $16, INP, INP
239 SUBCCC $16, LEN, LEN // sets CR0 for the BEQ below
240 VPERM IN, IN, LEMASK, IN
241 VXOR IN, XL, IN // IN = Xi ^ first block
242 BEQ short // that was the only block
243
244 LXVD2X (HTBL)(R8), VH2L // load H^2
245 MOVD $16, R8 // R8 becomes the offset of the look-ahead block
246 LXVD2X (HTBL)(R9), VH2
247 ADD LEN, INP, R9 // end of input
248 LXVD2X (HTBL)(R10), VH2H
249
// Each iteration multiplies the pending value IN by H^2 and the next block
// IN1 by H, reduces the sum, and pre-loads the following block into IN.
250 loop_2x:
251 LXVD2X (INP)(R0), VIN1
252 VPERM IN1, IN1, LEMASK, IN1
253
254 SUBC $32, LEN, LEN // carry records whether this borrowed
255 VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo
256 VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
257 SUBE R11, R11, R11 // borrow?-1:0
258 VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi
259 VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
260 AND LEN, R11, R11 // R11 = LEN (negative) on borrow, else 0
261 VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi
262 VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
263 ADD R11, INP, INP // on borrow, back INP up so the load at INP+16 below re-reads the block just loaded
264
265 VXOR XL, XL1, XL
266 VXOR XM, XM1, XM
267
268 VPMSUMD XL, XC2, T2 // 1st reduction phase
269
270 VSLDOI $8, XM, ZERO, T0
271 VSLDOI $8, ZERO, XM, T1
272 VXOR XH, XH1, XH
273 VXOR XL, T0, XL
274 VXOR XH, T1, XH
275
276 VSLDOI $8, XL, XL, XL
277 VXOR XL, T2, XL
278 LXVD2X (INP)(R8), VIN // look-ahead load of the block at INP+16
279 ADD $32, INP, INP
280
281 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
282 VPMSUMD XL, XC2, XL
283 VPERM IN, IN, LEMASK, IN
284 VXOR T1, XH, T1
285 VXOR IN, T1, IN
286 VXOR IN, XL, IN // IN = reduced value ^ look-ahead block
287 CMP R9, INP
288 BGT loop_2x // done yet?
289
// LEN != 0 here means the last SUBC borrowed: the look-ahead block was not
// new data, and XL/T1 already hold the result, so skip the extra multiply.
290 CMPWU LEN, $0
291 BNE even
292
// One pending value in IN: multiply by H and reduce.
293 short:
294 VPMSUMD IN, HL, XL // H.lo·Xi.lo
295 VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
296 VPMSUMD IN, HH, XH // H.hi·Xi.hi
297
298 VPMSUMD XL, XC2, T2 // 1st reduction phase
299
300 VSLDOI $8, XM, ZERO, T0
301 VSLDOI $8, ZERO, XM, T1
302 VXOR XL, T0, XL
303 VXOR XH, T1, XH
304
305 VSLDOI $8, XL, XL, XL
306 VXOR XL, T2, XL
307
308 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
309 VPMSUMD XL, XC2, XL
310 VXOR T1, XH, T1
311
// Common exit of the 2x path: combine the last two reduction terms.
312 even:
313 VXOR XL, T1, XL
314 VPERM XL, XL, LEMASK, XL // undo the byte swap
315 STXVD2X VXL, (XIP+R0) // write out Xi
316
317 OR R12, R12, R12 // R12 |= R12 leaves R12 unchanged. NOTE(review): purpose not evident from this file - confirm before removing
318 RET
319
// Four-blocks-per-iteration path (entered with LEN >= 64).
320 gcm_ghash_p8_4x:
321 LVSL (R8)(R0), T0 // 0x0001..0e0f
322 MOVD $0x70, R8
323 LXVD2X (HTBL)(R9), VH2 // R9 is still 0x50 here: H^2
324 MOVD $0x80, R9
325 VSPLTISB $8, T1 // 0x0808..0808
326 MOVD $0x90, R10
327 LXVD2X (HTBL)(R8), VH3L // load H^3
328 MOVD $0xa0, R8
329 LXVD2X (HTBL)(R9), VH3
330 MOVD $0xb0, R9
331 LXVD2X (HTBL)(R10), VH3H
332 MOVD $0xc0, R10
333 LXVD2X (HTBL)(R8), VH4L // load H^4
334 MOVD $0x10, R8 // from here R8/R9/R10 are input offsets 0x10/0x20/0x30
335 LXVD2X (HTBL)(R9), VH4
336 MOVD $0x20, R9
337 LXVD2X (HTBL)(R10), VH4H
338 MOVD $0x30, R10
339
// HIPERM/LOPERM are VPERM controls that pick one doubleword from each of
// two source vectors, so a single VPMSUMD can sum two products.
340 VSLDOI $8, ZERO, T1, T2 // 0x0000..0808
341 VADDUBM T0, T2, HIPERM // 0x0001..1617
342 VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f
343
344 SRD $4, LEN, LEN // this allows to use sign bit as carry
345
346 LXVD2X (INP)(R0), VIN0 // load input
347 LXVD2X (INP)(R8), VIN1
348 SUBCCC $8, LEN, LEN // LEN = blocks - 8; CR0 is consumed by the BLT before loop_4x
349 LXVD2X (INP)(R9), VIN2
350 LXVD2X (INP)(R10), VIN3
351 ADD $0x40, INP, INP
352 VPERM IN0, IN0, LEMASK, IN0
353 VPERM IN1, IN1, LEMASK, IN1
354 VPERM IN2, IN2, LEMASK, IN2
355 VPERM IN3, IN3, LEMASK, IN3
356
357 VXOR IN0, XL, XH // XH = Xi ^ first block; multiplied by H^4 later
358
// Partial products of blocks 1..3 with H^3, H^2 and H; they are kept in
// XL3/XM3/XH3 until the H^4 product is added.
359 VPMSUMD IN1, H3L, XL1
360 VPMSUMD IN1, H3, XM1
361 VPMSUMD IN1, H3H, XH1
362
363 VPERM H2, H, HIPERM, H21L // overwrites HL
364 VPERM IN2, IN3, LOPERM, T0
365 VPERM H2, H, LOPERM, H21H // overwrites HH
366 VPERM IN2, IN3, HIPERM, T1
367 VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
368 VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
369 VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
370 VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
371
372 VXOR XM2, XM1, XM2
373 VXOR XL3, XL1, XL3
374 VXOR XM3, XM2, XM3
375 VXOR XH3, XH1, XH3
376
377 BLT tail_4x // fewer than 8 blocks in total: no full group of four left
378
// Loads the next four blocks while finishing the previous group: XH·H^4 is
// added to the saved partial products and reduced, and the partial products
// of the new blocks 1..3 are started.
379 loop_4x:
380 LXVD2X (INP)(R0), VIN0
381 LXVD2X (INP)(R8), VIN1
382 SUBCCC $4, LEN, LEN // CR0 is consumed by the BGE at the bottom of the loop
383 LXVD2X (INP)(R9), VIN2
384 LXVD2X (INP)(R10), VIN3
385 ADD $0x40, INP, INP
386 VPERM IN1, IN1, LEMASK, IN1
387 VPERM IN2, IN2, LEMASK, IN2
388 VPERM IN3, IN3, LEMASK, IN3
389 VPERM IN0, IN0, LEMASK, IN0
390
391 VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
392 VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
393 VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
394 VPMSUMD IN1, H3L, XL1
395 VPMSUMD IN1, H3, XM1
396 VPMSUMD IN1, H3H, XH1
397
398 VXOR XL, XL3, XL
399 VXOR XM, XM3, XM
400 VXOR XH, XH3, XH
401 VPERM IN2, IN3, LOPERM, T0
402 VPERM IN2, IN3, HIPERM, T1
403
404 VPMSUMD XL, XC2, T2 // 1st reduction phase
405 VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
406 VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
407
408 VSLDOI $8, XM, ZERO, T0
409 VSLDOI $8, ZERO, XM, T1
410 VXOR XL, T0, XL
411 VXOR XH, T1, XH
412
413 VSLDOI $8, XL, XL, XL
414 VXOR XL, T2, XL
415
416 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
417 VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
418 VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
419 VPMSUMD XL, XC2, XL
420
421 VXOR XL3, XL1, XL3
422 VXOR XH3, XH1, XH3
423 VXOR XH, IN0, XH // fold the new first block into the running value
424 VXOR XM2, XM1, XM2
425 VXOR XH, T1, XH
426 VXOR XM3, XM2, XM3
427 VXOR XH, XL, XH
428 BGE loop_4x
429
// Finishes the pending group: XH times the power held in H4L/H4/H4H, plus the
// saved partial products XL3/XM3/XH3, then the two-phase reduction. The
// three/two/one handlers below jump back here after repointing H4L/H4/H4H.
430 tail_4x:
431 VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
432 VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
433 VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
434
435 VXOR XL, XL3, XL
436 VXOR XM, XM3, XM
437
438 VPMSUMD XL, XC2, T2 // 1st reduction phase
439
440 VSLDOI $8, XM, ZERO, T0
441 VSLDOI $8, ZERO, XM, T1
442 VXOR XH, XH3, XH
443 VXOR XL, T0, XL
444 VXOR XH, T1, XH
445
446 VSLDOI $8, XL, XL, XL
447 VXOR XL, T2, XL
448
449 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
450 VPMSUMD XL, XC2, XL
451 VXOR T1, XH, T1
452 VXOR XL, T1, XL
453
454 ADDCCC $4, LEN, LEN // LEN = number of blocks still unread (0..3)
455 BEQ done_4x
456
457 LXVD2X (INP)(R0), VIN0
458 CMPU LEN, $2
459 MOVD $-4, LEN // makes the ADDCCC above yield 0 on the next pass through tail_4x
460 BLT one
461 LXVD2X (INP)(R8), VIN1
462 BEQ two
463
// Three trailing blocks: use H^3 in place of H^4, plus H^2 and H.
464 three:
465 LXVD2X (INP)(R9), VIN2
466 VPERM IN0, IN0, LEMASK, IN0
467 VPERM IN1, IN1, LEMASK, IN1
468 VPERM IN2, IN2, LEMASK, IN2
469
470 VXOR IN0, XL, XH
471 VOR H3L, H3L, H4L // copy H^3 over the H^4 registers
472 VOR H3, H3, H4
473 VOR H3H, H3H, H4H
474
475 VPERM IN1, IN2, LOPERM, T0
476 VPERM IN1, IN2, HIPERM, T1
477 VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
478 VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi
479 VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
480 VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
481
482 VXOR XM3, XM2, XM3
483 JMP tail_4x
484
// Two trailing blocks: use H^2 in place of H^4, plus H.
485 two:
486 VPERM IN0, IN0, LEMASK, IN0
487 VPERM IN1, IN1, LEMASK, IN1
488
489 VXOR IN, XL, XH
490 VPERM ZERO, IN1, LOPERM, T0 // zero in the slot that pairs with the H^2 half of H21L
491 VPERM ZERO, IN1, HIPERM, T1 // zero in the slot that pairs with the H^2 half of H21H
492
493 VSLDOI $8, ZERO, H2, H4L // split H^2 into the H4L/H4/H4H registers
494 VOR H2, H2, H4
495 VSLDOI $8, H2, ZERO, H4H
496
497 VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
498 VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
499 VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi
500
501 JMP tail_4x
502
// One trailing block: use H in place of H^4 and clear the partial products.
503 one:
504 VPERM IN0, IN0, LEMASK, IN0
505
506 VSLDOI $8, ZERO, H, H4L // split H into the H4L/H4/H4H registers
507 VOR H, H, H4
508 VSLDOI $8, H, ZERO, H4H
509
510 VXOR IN0, XL, XH
511 VXOR XL3, XL3, XL3
512 VXOR XM3, XM3, XM3
513 VXOR XH3, XH3, XH3
514
515 JMP tail_4x
516
517 done_4x:
518 VPERM XL, XL, LEMASK, XL // undo the byte swap
519 STXVD2X VXL, (XIP+R0) // write out Xi
520 RET
521
522 // func gcmMul(output []byte, productTable *[256]byte)
//
// gcmMul multiplies the 16-byte value Xi stored at output by H (table entries
// at offsets 0x10/0x20/0x30 of productTable), reduces the product with the
// constant at table offset 0, and writes the result back to output.
// It is the same sequence as the "short" single-block step of gcmHash,
// without XORing in any input.
// Frame: $0-32 = no locals; output slice (24) + pointer (8).
523 TEXT ·gcmMul(SB), NOSPLIT, $0-32
524 MOVD output+0(FP), XIP // data pointer of output
525 MOVD productTable+24(FP), HTBL
526
// Table offsets of HL, H and HH.
527 MOVD $0x10, R8
528 MOVD $0x20, R9
529 MOVD $0x30, R10
530 LXVD2X (XIP)(R0), VIN // load Xi
531
532 LXVD2X (HTBL)(R8), VHL // Load pre-computed table
533 LVSL (R0)(R0), LEMASK // bytes 0x00..0x0f
534 LXVD2X (HTBL)(R9), VH
535 VSPLTISB $0x07, T0
536 LXVD2X (HTBL)(R10), VHH
537 VXOR LEMASK, T0, LEMASK // index ^ 7: reverses the bytes inside each doubleword
538 LXVD2X (HTBL)(R0), VXC2 // reduction constant at table offset 0
539 VPERM IN, IN, LEMASK, IN // byte-swap Xi
540 VXOR ZERO, ZERO, ZERO
541
542 VPMSUMD IN, HL, XL // H.lo·Xi.lo
543 VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
544 VPMSUMD IN, HH, XH // H.hi·Xi.hi
545
546 VPMSUMD XL, XC2, T2 // 1st reduction phase
547
// Split the middle product and fold its halves into XL and XH.
548 VSLDOI $8, XM, ZERO, T0
549 VSLDOI $8, ZERO, XM, T1
550 VXOR XL, T0, XL
551 VXOR XH, T1, XH
552
553 VSLDOI $8, XL, XL, XL
554 VXOR XL, T2, XL
555
556 VSLDOI $8, XL, XL, T1 // 2nd reduction phase
557 VPMSUMD XL, XC2, XL
558 VXOR T1, XH, T1
559 VXOR XL, T1, XL
560
561 VPERM XL, XL, LEMASK, XL // undo the byte swap
562 STXVD2X VXL, (XIP+R0) // write out Xi
563 RET
564
View as plain text