// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

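// Register allocation: V0-V7 hold data blocks, V8-V10 the GHASH
// accumulators, V11-V14 scratch, V15-V18 constants and the counter,
// and V19-V31 the AES round keys.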
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

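// reduce folds the unreduced 256-bit GHASH product (high half in ACC1,
// low half in ACC0, middle Karatsuba term in ACCM) back to 128 bits in
// ACC0, via two folding multiplications by POLY.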
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]
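	// B0 is the final GHASH block: the AAD bit length in D[0] and
	// the payload bit length in D[1].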

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
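	// NR is len(ks): 44, 52 or 60 words for AES-128/192/256. Bit 4
	// of NR is set only for AES-192/256, and bit 3 then singles out
	// AES-256, so two TBZ tests select the extra rounds.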
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
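	// The top bit of H (in D[0] in this layout), sign-extended by
	// ASR into a full doubleword, masks POLY for a branchless
	// conditional reduction; the bit crossing the 64-bit lane
	// boundary is carried over with VUSHR/VEXT.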
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl
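	// Each table entry holds a power of H together with its
	// Karatsuba sum (hi^lo). Entries are written from the end of the
	// table backwards, so the finished table reads H^8 .. H^1 and
	// octetsLoop can pair the oldest of eight blocks with H^8.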

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

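// mulRound multiplies one block X by the next (lower) power of H from
// the table and XORs the unreduced 256-bit product into the
// ACC0/ACC1/ACCM accumulators; the modular reduction is deferred to a
// single reduce() per eight blocks.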
#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

dataTLS:
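	// Fast path for the 13-byte AAD of TLS records: load exactly
	// 8+4+1 bytes into B0 and fall through to a single GHASH update.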
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
	CMP	$128, autLen
	BLT	startSinglesLoop
	SUB	$128, autLen

	VLD1.P	32(aut), [B0.B16, B1.B16]

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	VLD1.P	32(aut), [B2.B16, B3.B16]
	mulRound(B2)
	mulRound(B3)
	VLD1.P	32(aut), [B4.B16, B5.B16]
	mulRound(B4)
	mulRound(B5)
	VLD1.P	32(aut), [B6.B16, B7.B16]
	mulRound(B6)
	mulRound(B7)

	MOVD	pTblSave, pTbl
	reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

	CMP	$16, autLen
	BLT	dataEnd
	SUB	$16, autLen

	VLD1.P	16(aut), [B0.B16]
dataMul:
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

dataLoadLoop:
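	// Build the final partial block one byte at a time, walking
	// backwards from the end of the data: VEXT shifts B0 up a byte
	// and the next byte is inserted at B0.B[0], leaving the block
	// zero-padded.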
	MOVB.W	-1(aut), H0
	VEXT	$15, B0.B16, ZERO.B16, B0.B16
	VMOV	H0, B0.B[0]
	SUBS	$1, autLen
	BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

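// aesrndx8 applies one AES round (AESE+AESMC) to all eight blocks so
// the rounds pipeline across blocks; aesrndlastx8 omits the final
// MixColumns, and the closing AddRoundKey is done by XORing KLAST into
// each block afterwards.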
#define aesrndx8(K) \
	AESE	K.B16, B0.B16  \
	AESMC	B0.B16, B0.B16 \
	AESE	K.B16, B1.B16  \
	AESMC	B1.B16, B1.B16 \
	AESE	K.B16, B2.B16  \
	AESMC	B2.B16, B2.B16 \
	AESE	K.B16, B3.B16  \
	AESMC	B3.B16, B3.B16 \
	AESE	K.B16, B4.B16  \
	AESMC	B4.B16, B4.B16 \
	AESE	K.B16, B5.B16  \
	AESMC	B5.B16, B5.B16 \
	AESE	K.B16, B6.B16  \
	AESMC	B6.B16, B6.B16 \
	AESE	K.B16, B7.B16  \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16 \
	AESE	K.B16, B1.B16 \
	AESE	K.B16, B2.B16 \
	AESE	K.B16, B3.B16 \
	AESE	K.B16, B4.B16 \
	AESE	K.B16, B5.B16 \
	AESE	K.B16, B6.B16 \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks); the TBZ tests below derive the key size from it
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare the initial counter and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
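	// CTR is kept with its 32-bit words byte-swapped (VREV32) so the
	// big-endian GCM counter can be incremented with a plain VADD of
	// INC; each block is swapped back before encryption.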
	// Skip to the <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128, the round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192, the round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256, the round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK
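	// Only 13 round keys fit in K0-K11 and KLAST; AES-256 needs 15,
	// so two of them stay in memory and are reloaded through curK on
	// every pass of the 8-block loop.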

octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VREV32	B0.B16, B0.B16
	VADD	B1.S4, INC.S4, B2.S4
	VREV32	B1.B16, B1.B16
	VADD	B2.S4, INC.S4, B3.S4
	VREV32	B2.B16, B2.B16
	VADD	B3.S4, INC.S4, B4.S4
	VREV32	B3.B16, B3.B16
	VADD	B4.S4, INC.S4, B5.S4
	VREV32	B4.B16, B4.B16
	VADD	B5.S4, INC.S4, B6.S4
	VREV32	B5.B16, B5.B16
	VADD	B6.S4, INC.S4, B7.S4
	VREV32	B6.B16, B6.B16
	VADD	B7.S4, INC.S4, CTR.S4
	VREV32	B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ	$4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ	$3, NR, octetsFinish
	VLD1.P	32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD	H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR	KLAST.B16, B0.B16, B0.B16
	VEOR	KLAST.B16, B1.B16, B1.B16
	VEOR	KLAST.B16, B2.B16, B2.B16
	VEOR	KLAST.B16, B3.B16, B3.B16
	VEOR	KLAST.B16, B4.B16, B4.B16
	VEOR	KLAST.B16, B5.B16, B5.B16
	VEOR	KLAST.B16, B6.B16, B6.B16
	VEOR	KLAST.B16, B7.B16, B7.B16

	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B0.B16, T1.B16, B0.B16
	VEOR	B1.B16, T2.B16, B1.B16
	VST1.P	[B0.B16, B1.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B2.B16, T1.B16, B2.B16
	VEOR	B3.B16, T2.B16, B3.B16
	VST1.P	[B2.B16, B3.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B4.B16, T1.B16, B4.B16
	VEOR	B5.B16, T2.B16, B5.B16
	VST1.P	[B4.B16, B5.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B6.B16, T1.B16, B6.B16
	VEOR	B7.B16, T2.B16, B7.B16
	VST1.P	[B6.B16, B7.B16], 32(dstPtr)
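	// B0-B7 now hold the ciphertext blocks: hash all eight into the
	// accumulators, then reduce once.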

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VEOR	KLAST.B16, T0.B16, T0.B16

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16
encReduce:
	VST1.P	[B0.B16], 16(dstPtr)

	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr
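	// Gather the 1-15 trailing bytes into T0, driven by the bits of
	// srcPtrLen, while building an all-ones mask of the same length
	// in T3; the mask later truncates the final keystream block to
	// the message length.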

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks); the TBZ tests below derive the key size from it
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare the initial counter and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128, the round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to the <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192, the round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256, the round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VREV32	B0.B16, B0.B16
	VADD	B1.S4, INC.S4, B2.S4
	VREV32	B1.B16, B1.B16
	VADD	B2.S4, INC.S4, B3.S4
	VREV32	B2.B16, B2.B16
	VADD	B3.S4, INC.S4, B4.S4
	VREV32	B3.B16, B3.B16
	VADD	B4.S4, INC.S4, B5.S4
	VREV32	B4.B16, B4.B16
	VADD	B5.S4, INC.S4, B6.S4
	VREV32	B5.B16, B5.B16
	VADD	B6.S4, INC.S4, B7.S4
	VREV32	B6.B16, B6.B16
	VADD	B7.S4, INC.S4, CTR.S4
	VREV32	B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ	$4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ	$3, NR, octetsFinish
	VLD1.P	32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD	H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR	KLAST.B16, B0.B16, T1.B16
	VEOR	KLAST.B16, B1.B16, T2.B16
	VEOR	KLAST.B16, B2.B16, B2.B16
	VEOR	KLAST.B16, B3.B16, B3.B16
	VEOR	KLAST.B16, B4.B16, B4.B16
	VEOR	KLAST.B16, B5.B16, B5.B16
	VEOR	KLAST.B16, B6.B16, B6.B16
	VEOR	KLAST.B16, B7.B16, B7.B16

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B0.B16, T1.B16, T1.B16
	VEOR	B1.B16, T2.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B2.B16, B0.B16, T1.B16
	VEOR	B3.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B4.B16, B0.B16, T1.B16
	VEOR	B5.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B6.B16, B0.B16, T1.B16
	VEOR	B7.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
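	// GHASH is computed over the ciphertext, so capture the loaded
	// block (byte-reflected) in B5 before it is XORed with the
	// keystream.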
	VREV64	T0.B16, B5.B16
	VEOR	KLAST.B16, T0.B16, T0.B16

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16

	VST1.P	[B0.B16], 16(dstPtr)

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
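	// B0 holds a full decrypted block that may include bytes beyond
	// the message; write out only the valid bytes, while T3
	// accumulates a mask so that only real ciphertext bytes in B5
	// enter the final GHASH update.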

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET