1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
6 // The implementation uses several optimizations described in:
7 // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
8 // Instruction and its Usage for Computing the GCM Mode rev. 2.02
9 // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
10 // Hardware
11
12 #include "textflag.h"
13
// Register aliases shared by every routine in this file.
// B0-B7 name X0-X7; they hold the blocks currently being processed
// (gcmAesInit additionally uses B1-B4 as scratch for table generation).
14 #define B0 X0
15 #define B1 X1
16 #define B2 X2
17 #define B3 X3
18 #define B4 X4
19 #define B5 X5
20 #define B6 X6
21 #define B7 X7
22
// GHASH accumulators for the three carry-less partial products:
// ACC0 receives the PCLMULQDQ $0x00 (low-half) products, ACC1 the $0x11
// (high-half) products, and ACCM the products of the XOR-folded halves.
23 #define ACC0 X8
24 #define ACC1 X9
25 #define ACCM X10
26
// T0-T2 are scratch registers. POLY and BSWAP are loaded from gcmPoly<> and
// bswapMask<> near the start of each routine below.
27 #define T0 X11
28 #define T1 X12
29 #define T2 X13
30 #define POLY X14
31 #define BSWAP X15
32
// bswapMask is a PSHUFB control mask: destination byte i selects source byte
// 15-i, i.e. it reverses the byte order of a 16-byte register.
33 DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
34 DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
35
// gcmPoly is the constant loaded into POLY. It is used by the two-step
// PCLMULQDQ reduction and by the "H * 2" step in gcmAesInit.
36 DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
37 DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
38
// andMask is a table of fifteen 16-byte masks. The entry at byte offset
// 16*(k-1) has its k lowest bytes set to 0xff and the rest zero (k = 1..15).
// The tail code in gcmAesEnc/gcmAesDec indexes it with -16(base)(len*16) to
// keep only the first len bytes of a partial block.
39 DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
40 DATA andMask<>+0x08(SB)/8, $0x0000000000000000
41 DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
42 DATA andMask<>+0x18(SB)/8, $0x0000000000000000
43 DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
44 DATA andMask<>+0x28(SB)/8, $0x0000000000000000
45 DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
46 DATA andMask<>+0x38(SB)/8, $0x0000000000000000
47 DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
48 DATA andMask<>+0x48(SB)/8, $0x0000000000000000
49 DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
50 DATA andMask<>+0x58(SB)/8, $0x0000000000000000
51 DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
52 DATA andMask<>+0x68(SB)/8, $0x0000000000000000
53 DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
54 DATA andMask<>+0x78(SB)/8, $0x0000000000000000
55 DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
56 DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
57 DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
58 DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
59 DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
60 DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
61 DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
62 DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
63 DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
64 DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
65 DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
66 DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
67 DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
68 DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
69
// All three tables are read-only and pointer-free. andMask is 15*16 = 240 bytes.
70 GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
71 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
72 GLOBL andMask<>(SB), (NOPTR+RODATA), $240
73
74 // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// gcmAesFinish folds a final length block into the hash state stored at T,
// then byte-swaps the result, XORs it with tagMask and writes the 16 bytes
// back to T.
//   productTable: only entries 14 and 15 (byte offsets 16*14 and 16*15) are
//                 read; presumably the table filled in by gcmAesInit.
//   tagMask:      16 bytes XORed into the final value.
//   T:            in: current hash state; out: the finished value.
//   pLen, dLen:   byte counts; both are multiplied by 8 (bit counts) here.
75 TEXT ·gcmAesFinish(SB),NOSPLIT,$0
76 #define pTbl DI
77 #define tMsk SI
78 #define tPtr DX
79 #define plen AX
80 #define dlen CX
81
82 MOVQ productTable+0(FP), pTbl
83 MOVQ tagMask+8(FP), tMsk
84 MOVQ T+16(FP), tPtr
85 MOVQ pLen+24(FP), plen
86 MOVQ dLen+32(FP), dlen
87
88 MOVOU (tPtr), ACC0
89 MOVOU (tMsk), T2
90
91 MOVOU bswapMask<>(SB), BSWAP
92 MOVOU gcmPoly<>(SB), POLY
93
// Convert the byte lengths to bit lengths.
94 SHLQ $3, plen
95 SHLQ $3, dlen
96
// Build the length block: low qword = pLen*8, high qword = dLen*8.
97 MOVQ plen, B0
98 PINSRQ $1, dlen, B0
99
// Mix the length block into the running hash state.
100 PXOR ACC0, B0
101
102 MOVOU (16*14)(pTbl), ACC0
103 MOVOU (16*15)(pTbl), ACCM
104 MOVOU ACC0, ACC1
105
// Karatsuba-style multiply: low*low, high*high and (folded halves) products.
106 PCLMULQDQ $0x00, B0, ACC0
107 PCLMULQDQ $0x11, B0, ACC1
108 PSHUFD $78, B0, T0
109 PXOR B0, T0
110 PCLMULQDQ $0x00, T0, ACCM
111
// Recover the middle term and split it across the low (ACC0) and high (ACC1) halves.
112 PXOR ACC0, ACCM
113 PXOR ACC1, ACCM
114 MOVOU ACCM, T0
115 PSRLDQ $8, ACCM
116 PSLLDQ $8, T0
117 PXOR ACCM, ACC1
118 PXOR T0, ACC0
119
// First reduction step with POLY.
120 MOVOU POLY, T0
121 PCLMULQDQ $0x01, ACC0, T0
122 PSHUFD $78, ACC0, ACC0
123 PXOR T0, ACC0
124
// Second reduction step with POLY.
125 MOVOU POLY, T0
126 PCLMULQDQ $0x01, ACC0, T0
127 PSHUFD $78, ACC0, ACC0
128 PXOR T0, ACC0
129
130 PXOR ACC1, ACC0
131
// Byte-swap the result, apply tagMask and store it back to T.
132 PSHUFB BSWAP, ACC0
133 PXOR T2, ACC0
134 MOVOU ACC0, (tPtr)
135
136 RET
137 #undef pTbl
138 #undef tMsk
139 #undef tPtr
140 #undef plen
141 #undef dlen
142
143 // func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// gcmAesInit derives the hash key by running the AES rounds over round key 0
// (i.e. encrypting an all-zero block) and fills productTable with eight pairs
// of 16-byte entries. Each pair is (value, value with its two 64-bit halves
// XORed together); the second entry is the Karatsuba pre-computation.
// The first pair is stored at entries 14/15; each loop iteration multiplies
// the previous value by the entry-14 value and stores the new pair 32 bytes
// lower, ending at entries 0/1.
//   ks: expanded key schedule; len(ks)/4 - 1 is taken as the number of rounds.
144 TEXT ·gcmAesInit(SB),NOSPLIT,$0
145 #define dst DI
146 #define KS SI
147 #define NR DX
148
149 MOVQ productTable+0(FP), dst
150 MOVQ ks_base+8(FP), KS
151 MOVQ ks_len+16(FP), NR
152
// NR = len(ks)/4 - 1: number of 16-byte round keys minus one.
153 SHRQ $2, NR
154 DECQ NR
155
156 MOVOU bswapMask<>(SB), BSWAP
157 MOVOU gcmPoly<>(SB), POLY
158
159 // Encrypt block 0, with the AES key to generate the hash key H
160 MOVOU (16*0)(KS), B0
161 MOVOU (16*1)(KS), T0
162 AESENC T0, B0
163 MOVOU (16*2)(KS), T0
164 AESENC T0, B0
165 MOVOU (16*3)(KS), T0
166 AESENC T0, B0
167 MOVOU (16*4)(KS), T0
168 AESENC T0, B0
169 MOVOU (16*5)(KS), T0
170 AESENC T0, B0
171 MOVOU (16*6)(KS), T0
172 AESENC T0, B0
173 MOVOU (16*7)(KS), T0
174 AESENC T0, B0
175 MOVOU (16*8)(KS), T0
176 AESENC T0, B0
177 MOVOU (16*9)(KS), T0
178 AESENC T0, B0
179 MOVOU (16*10)(KS), T0
// NR < 12: round key 10 is the last one. The flags from this CMPQ are also
// consumed by the JE below (AESENC/MOVOU in between do not modify flags).
180 CMPQ NR, $12
181 JB initEncLast
182 AESENC T0, B0
183 MOVOU (16*11)(KS), T0
184 AESENC T0, B0
185 MOVOU (16*12)(KS), T0
// NR == 12: round key 12 is the last one; otherwise continue to round key 14.
186 JE initEncLast
187 AESENC T0, B0
188 MOVOU (16*13)(KS), T0
189 AESENC T0, B0
190 MOVOU (16*14)(KS), T0
191 initEncLast:
192 AESENCLAST T0, B0
193
194 PSHUFB BSWAP, B0
195 // H * 2
// Shift the 128-bit value left by one bit: T1 carries the bit shifted out of
// each dword into the next one, and T0 is POLY masked by the sign of the top
// dword (so POLY is XORed in only when the top bit was set).
196 PSHUFD $0xff, B0, T0
197 MOVOU B0, T1
198 PSRAL $31, T0
199 PAND POLY, T0
200 PSRLL $31, T1
201 PSLLDQ $4, T1
202 PSLLL $1, B0
203 PXOR T0, B0
204 PXOR T1, B0
205 // Karatsuba pre-computations
206 MOVOU B0, (16*14)(dst)
207 PSHUFD $78, B0, B1
208 PXOR B0, B1
209 MOVOU B1, (16*15)(dst)
210
// B2/B3 hold the most recently computed pair; B0/B1 stay fixed as the multiplier.
211 MOVOU B0, B2
212 MOVOU B1, B3
213 // Now prepare powers of H and pre-computations for them
214 MOVQ $7, AX
215
216 initLoop:
// Multiply the previous value (B2, folded form in B3) by B0/B1.
217 MOVOU B2, T0
218 MOVOU B2, T1
219 MOVOU B3, T2
220 PCLMULQDQ $0x00, B0, T0
221 PCLMULQDQ $0x11, B0, T1
222 PCLMULQDQ $0x00, B1, T2
223
// Combine the middle term into the low (T0) and high (T1) halves.
224 PXOR T0, T2
225 PXOR T1, T2
226 MOVOU T2, B4
227 PSLLDQ $8, B4
228 PSRLDQ $8, T2
229 PXOR B4, T0
230 PXOR T2, T1
231
// Two-step reduction with POLY; the reduced product ends up in B2.
232 MOVOU POLY, B2
233 PCLMULQDQ $0x01, T0, B2
234 PSHUFD $78, T0, T0
235 PXOR B2, T0
236 MOVOU POLY, B2
237 PCLMULQDQ $0x01, T0, B2
238 PSHUFD $78, T0, T0
239 PXOR T0, B2
240 PXOR T1, B2
241
// Store the new pair at entries 12/13 relative to the current dst.
242 MOVOU B2, (16*12)(dst)
243 PSHUFD $78, B2, B3
244 PXOR B2, B3
245 MOVOU B3, (16*13)(dst)
246
// LEAQ does not modify flags, so JNE tests the result of DECQ AX.
247 DECQ AX
248 LEAQ (-16*2)(dst), dst
249 JNE initLoop
250
251 RET
252 #undef NR
253 #undef KS
254 #undef dst
255
256 // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// gcmAesData hashes data with the pre-computed productTable and stores the
// 16-byte result at T. The accumulator starts from zero; the previous
// contents of T are not read.
// Processing order: 128-byte groups (eight blocks, one reduction per group),
// then single 16-byte blocks, then a final partial block that is zero-padded.
// A length of exactly 13 bytes takes a dedicated short path (dataTLS).
257 TEXT ·gcmAesData(SB),NOSPLIT,$0
258 #define pTbl DI
259 #define aut SI
260 #define tPtr CX
261 #define autLen DX
262
// reduceRound(a): one reduction step of register a using POLY; clobbers T0.
// This macro is not #undef'd and is also used by gcmAesEnc and gcmAesDec below.
263 #define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
// mulRoundAAD(X, i): multiply block register X by table pair i (entries 2i and
// 2i+1 of pTbl) and XOR the three partial products into ACC0, ACC1 and ACCM.
// Clobbers T1, T2 and X (X is replaced by X XOR its half-swapped copy).
264 #define mulRoundAAD(X ,i) \
265 MOVOU (16*(i*2))(pTbl), T1;\
266 MOVOU T1, T2;\
267 PCLMULQDQ $0x00, X, T1;\
268 PXOR T1, ACC0;\
269 PCLMULQDQ $0x11, X, T2;\
270 PXOR T2, ACC1;\
271 PSHUFD $78, X, T1;\
272 PXOR T1, X;\
273 MOVOU (16*(i*2+1))(pTbl), T1;\
274 PCLMULQDQ $0x00, X, T1;\
275 PXOR T1, ACCM
276
277 MOVQ productTable+0(FP), pTbl
278 MOVQ data_base+8(FP), aut
279 MOVQ data_len+16(FP), autLen
280 MOVQ T+32(FP), tPtr
281
// Start with a zero hash state.
282 PXOR ACC0, ACC0
283 MOVOU bswapMask<>(SB), BSWAP
284 MOVOU gcmPoly<>(SB), POLY
285
// Empty input: store the zero state and return.
286 TESTQ autLen, autLen
287 JEQ dataBail
288
289 CMPQ autLen, $13 // optimize the TLS case
290 JE dataTLS
291 CMPQ autLen, $128
292 JB startSinglesLoop
293 JMP dataOctaLoop
294
295 dataTLS:
// Exactly 13 bytes: load 8 + 4 + 1 bytes into a zeroed B0 without reading
// past the end of the input, then hash it as one block.
296 MOVOU (16*14)(pTbl), T1
297 MOVOU (16*15)(pTbl), T2
298 PXOR B0, B0
299 MOVQ (aut), B0
300 PINSRD $2, 8(aut), B0
301 PINSRB $12, 12(aut), B0
// autLen = 0 so that the loop re-entered after dataMul terminates at dataBail.
302 XORQ autLen, autLen
303 JMP dataMul
304
305 dataOctaLoop:
// Process eight blocks per iteration while at least 128 bytes remain.
306 CMPQ autLen, $128
307 JB startSinglesLoop
308 SUBQ $128, autLen
309
310 MOVOU (16*0)(aut), X0
311 MOVOU (16*1)(aut), X1
312 MOVOU (16*2)(aut), X2
313 MOVOU (16*3)(aut), X3
314 MOVOU (16*4)(aut), X4
315 MOVOU (16*5)(aut), X5
316 MOVOU (16*6)(aut), X6
317 MOVOU (16*7)(aut), X7
318 LEAQ (16*8)(aut), aut
319 PSHUFB BSWAP, X0
320 PSHUFB BSWAP, X1
321 PSHUFB BSWAP, X2
322 PSHUFB BSWAP, X3
323 PSHUFB BSWAP, X4
324 PSHUFB BSWAP, X5
325 PSHUFB BSWAP, X6
326 PSHUFB BSWAP, X7
// The running state is folded into the first block of the group.
327 PXOR ACC0, X0
328
// Block 0 is multiplied by table pair 0; this initialises the accumulators.
329 MOVOU (16*0)(pTbl), ACC0
330 MOVOU (16*1)(pTbl), ACCM
331 MOVOU ACC0, ACC1
332 PSHUFD $78, X0, T1
333 PXOR X0, T1
334 PCLMULQDQ $0x00, X0, ACC0
335 PCLMULQDQ $0x11, X0, ACC1
336 PCLMULQDQ $0x00, T1, ACCM
337
// Blocks 1..7 are multiplied by table pairs 1..7 and accumulated.
338 mulRoundAAD(X1, 1)
339 mulRoundAAD(X2, 2)
340 mulRoundAAD(X3, 3)
341 mulRoundAAD(X4, 4)
342 mulRoundAAD(X5, 5)
343 mulRoundAAD(X6, 6)
344 mulRoundAAD(X7, 7)
345
// Merge the middle term and perform a single reduction for the whole group.
346 PXOR ACC0, ACCM
347 PXOR ACC1, ACCM
348 MOVOU ACCM, T0
349 PSRLDQ $8, ACCM
350 PSLLDQ $8, T0
351 PXOR ACCM, ACC1
352 PXOR T0, ACC0
353 reduceRound(ACC0)
354 reduceRound(ACC0)
355 PXOR ACC1, ACC0
356 JMP dataOctaLoop
357
358 startSinglesLoop:
// T1/T2 cache table pair 7 (entries 14/15), used for every single-block multiply.
359 MOVOU (16*14)(pTbl), T1
360 MOVOU (16*15)(pTbl), T2
361
362 dataSinglesLoop:
363
364 CMPQ autLen, $16
365 JB dataEnd
366 SUBQ $16, autLen
367
368 MOVOU (aut), B0
369 dataMul:
// Entry point shared by the full-block loop, the 13-byte path and the
// partial-block path: B0 holds the (not yet byte-swapped) block to hash.
370 PSHUFB BSWAP, B0
371 PXOR ACC0, B0
372
373 MOVOU T1, ACC0
374 MOVOU T2, ACCM
375 MOVOU T1, ACC1
376
377 PSHUFD $78, B0, T0
378 PXOR B0, T0
379 PCLMULQDQ $0x00, B0, ACC0
380 PCLMULQDQ $0x11, B0, ACC1
381 PCLMULQDQ $0x00, T0, ACCM
382
383 PXOR ACC0, ACCM
384 PXOR ACC1, ACCM
385 MOVOU ACCM, T0
386 PSRLDQ $8, ACCM
387 PSLLDQ $8, T0
388 PXOR ACCM, ACC1
389 PXOR T0, ACC0
390
391 MOVOU POLY, T0
392 PCLMULQDQ $0x01, ACC0, T0
393 PSHUFD $78, ACC0, ACC0
394 PXOR T0, ACC0
395
396 MOVOU POLY, T0
397 PCLMULQDQ $0x01, ACC0, T0
398 PSHUFD $78, ACC0, ACC0
399 PXOR T0, ACC0
400 PXOR ACC1, ACC0
401
402 LEAQ 16(aut), aut
403
404 JMP dataSinglesLoop
405
406 dataEnd:
407
// Fewer than 16 bytes remain; nothing left means we are done.
408 TESTQ autLen, autLen
409 JEQ dataBail
410
// Build a zero-padded block: start at the last remaining byte and walk
// backwards, shifting B0 left one byte per step, so input byte k lands in
// byte k of B0. autLen reaches 0, which ends the loop after dataMul.
411 PXOR B0, B0
412 LEAQ -1(aut)(autLen*1), aut
413
414 dataLoadLoop:
415
416 PSLLDQ $1, B0
417 PINSRB $0, (aut), B0
418
419 LEAQ -1(aut), aut
420 DECQ autLen
421 JNE dataLoadLoop
422
423 JMP dataMul
424
425 dataBail:
426 MOVOU ACC0, (tPtr)
427 RET
428 #undef pTbl
429 #undef aut
430 #undef tPtr
431 #undef autLen
432
433 // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// gcmAesEnc encrypts src into dst in counter mode and folds the produced
// ciphertext into the hash state at T (read on entry, written on exit).
// ctr is read as the initial counter block; this routine does not store to it.
//
// Stack frame (256 bytes):
//   0*16 .. 7*16 (SP):            byte-swapped ciphertext of the previous
//                                 eight blocks, waiting to be hashed.
//   8*16 + 0*16 .. 8*16 + 7*16:   eight counter blocks, already XORed with
//                                 round key 0.
//
// Structure: eight blocks at a time (hashing of one group is interleaved with
// the AES rounds of the next), then single blocks, then a partial tail block.
// The register aliases and the aesRnd/aesRound/aesRndLast macros defined here
// are not #undef'd and are reused by gcmAesDec.
434 TEXT ·gcmAesEnc(SB),0,$256-96
435 #define pTbl DI
436 #define ctx DX
437 #define ctrPtr CX
438 #define ptx SI
439 #define ks AX
440 #define tPtr R8
441 #define ptxLen R9
442 #define aluCTR R10
443 #define aluTMP R11
444 #define aluK R12
445 #define NR R13
446
// increment(i): add 1 to the 32-bit counter in aluCTR, XOR it with aluK (the
// byte-swapped last dword of round key 0), swap back and store it into the
// last dword of counter slot i. Clobbers aluTMP and flags.
447 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
// aesRnd(k): one AES round on B0-B7 with the round key already in register k.
448 #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
// aesRound(i): load round key i into T0 and apply one AES round to B0-B7.
449 #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
// aesRndLast(k): final AES round on B0-B7 with the round key in register k.
450 #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
// combinedRound(i): AES round i on B0-B7 interleaved with multiplying the
// saved ciphertext block in stack slot i by table pair i, accumulating into
// ACC0/ACC1/ACCM. Clobbers T0, T1, T2.
451 #define combinedRound(i) \
452 MOVOU (16*i)(ks), T0;\
453 AESENC T0, B0;\
454 AESENC T0, B1;\
455 AESENC T0, B2;\
456 AESENC T0, B3;\
457 MOVOU (16*(i*2))(pTbl), T1;\
458 MOVOU T1, T2;\
459 AESENC T0, B4;\
460 AESENC T0, B5;\
461 AESENC T0, B6;\
462 AESENC T0, B7;\
463 MOVOU (16*i)(SP), T0;\
464 PCLMULQDQ $0x00, T0, T1;\
465 PXOR T1, ACC0;\
466 PSHUFD $78, T0, T1;\
467 PCLMULQDQ $0x11, T0, T2;\
468 PXOR T1, T0;\
469 PXOR T2, ACC1;\
470 MOVOU (16*(i*2+1))(pTbl), T2;\
471 PCLMULQDQ $0x00, T2, T0;\
472 PXOR T0, ACCM
// mulRound(i): hash-only counterpart of combinedRound: multiply the saved
// block in stack slot i by table pair i and accumulate. Clobbers T0, T1, T2.
473 #define mulRound(i) \
474 MOVOU (16*i)(SP), T0;\
475 MOVOU (16*(i*2))(pTbl), T1;\
476 MOVOU T1, T2;\
477 PCLMULQDQ $0x00, T0, T1;\
478 PXOR T1, ACC0;\
479 PCLMULQDQ $0x11, T0, T2;\
480 PXOR T2, ACC1;\
481 PSHUFD $78, T0, T1;\
482 PXOR T1, T0;\
483 MOVOU (16*(i*2+1))(pTbl), T1;\
484 PCLMULQDQ $0x00, T0, T1;\
485 PXOR T1, ACCM
486
// Note the naming: ctx (ciphertext pointer) is dst, ptx (plaintext pointer) is src.
487 MOVQ productTable+0(FP), pTbl
488 MOVQ dst+8(FP), ctx
489 MOVQ src_base+32(FP), ptx
490 MOVQ src_len+40(FP), ptxLen
491 MOVQ ctr+56(FP), ctrPtr
492 MOVQ T+64(FP), tPtr
493 MOVQ ks_base+72(FP), ks
494 MOVQ ks_len+80(FP), NR
495
// NR = len(ks)/4 - 1, compared against 12 below to pick the last round key.
496 SHRQ $2, NR
497 DECQ NR
498
499 MOVOU bswapMask<>(SB), BSWAP
500 MOVOU gcmPoly<>(SB), POLY
501
502 MOVOU (tPtr), ACC0
503 PXOR ACC1, ACC1
504 PXOR ACCM, ACCM
505 MOVOU (ctrPtr), B0
506 MOVL (3*4)(ctrPtr), aluCTR
507 MOVOU (ks), T0
508 MOVL (3*4)(ks), aluK
509 BSWAPL aluCTR
510 BSWAPL aluK
511
// T0 = counter block XOR round key 0; every counter slot starts from this
// value and then gets its last dword patched by increment().
512 PXOR B0, T0
513 MOVOU T0, (8*16 + 0*16)(SP)
514 increment(0)
515
516 CMPQ ptxLen, $128
517 JB gcmAesEncSingles
518 SUBQ $128, ptxLen
519
520 // We have at least 8 blocks to encrypt, prepare the rest of the counters
521 MOVOU T0, (8*16 + 1*16)(SP)
522 increment(1)
523 MOVOU T0, (8*16 + 2*16)(SP)
524 increment(2)
525 MOVOU T0, (8*16 + 3*16)(SP)
526 increment(3)
527 MOVOU T0, (8*16 + 4*16)(SP)
528 increment(4)
529 MOVOU T0, (8*16 + 5*16)(SP)
530 increment(5)
531 MOVOU T0, (8*16 + 6*16)(SP)
532 increment(6)
533 MOVOU T0, (8*16 + 7*16)(SP)
534 increment(7)
535
536 MOVOU (8*16 + 0*16)(SP), B0
537 MOVOU (8*16 + 1*16)(SP), B1
538 MOVOU (8*16 + 2*16)(SP), B2
539 MOVOU (8*16 + 3*16)(SP), B3
540 MOVOU (8*16 + 4*16)(SP), B4
541 MOVOU (8*16 + 5*16)(SP), B5
542 MOVOU (8*16 + 6*16)(SP), B6
543 MOVOU (8*16 + 7*16)(SP), B7
544
// First group: AES only (there is no earlier ciphertext to hash yet). The
// interleaved increment() calls refill the counter slots for the next group.
545 aesRound(1)
546 increment(0)
547 aesRound(2)
548 increment(1)
549 aesRound(3)
550 increment(2)
551 aesRound(4)
552 increment(3)
553 aesRound(5)
554 increment(4)
555 aesRound(6)
556 increment(5)
557 aesRound(7)
558 increment(6)
559 aesRound(8)
560 increment(7)
561 aesRound(9)
562 MOVOU (16*10)(ks), T0
// The flags from this CMPQ are reused by the JE below; the SSE/AES
// instructions in between do not modify them.
563 CMPQ NR, $12
564 JB encLast1
565 aesRnd(T0)
566 aesRound(11)
567 MOVOU (16*12)(ks), T0
568 JE encLast1
569 aesRnd(T0)
570 aesRound(13)
571 MOVOU (16*14)(ks), T0
572 encLast1:
573 aesRndLast(T0)
574
// XOR the keystream with the plaintext.
575 MOVOU (16*0)(ptx), T0
576 PXOR T0, B0
577 MOVOU (16*1)(ptx), T0
578 PXOR T0, B1
579 MOVOU (16*2)(ptx), T0
580 PXOR T0, B2
581 MOVOU (16*3)(ptx), T0
582 PXOR T0, B3
583 MOVOU (16*4)(ptx), T0
584 PXOR T0, B4
585 MOVOU (16*5)(ptx), T0
586 PXOR T0, B5
587 MOVOU (16*6)(ptx), T0
588 PXOR T0, B6
589 MOVOU (16*7)(ptx), T0
590 PXOR T0, B7
591
// Store the ciphertext, then byte-swap each block for hashing. The incoming
// hash state (ACC0) is folded into block 0.
592 MOVOU B0, (16*0)(ctx)
593 PSHUFB BSWAP, B0
594 PXOR ACC0, B0
595 MOVOU B1, (16*1)(ctx)
596 PSHUFB BSWAP, B1
597 MOVOU B2, (16*2)(ctx)
598 PSHUFB BSWAP, B2
599 MOVOU B3, (16*3)(ctx)
600 PSHUFB BSWAP, B3
601 MOVOU B4, (16*4)(ctx)
602 PSHUFB BSWAP, B4
603 MOVOU B5, (16*5)(ctx)
604 PSHUFB BSWAP, B5
605 MOVOU B6, (16*6)(ctx)
606 PSHUFB BSWAP, B6
607 MOVOU B7, (16*7)(ctx)
608 PSHUFB BSWAP, B7
609
// Park the swapped ciphertext in stack slots 0-7 until it is hashed.
610 MOVOU B0, (16*0)(SP)
611 MOVOU B1, (16*1)(SP)
612 MOVOU B2, (16*2)(SP)
613 MOVOU B3, (16*3)(SP)
614 MOVOU B4, (16*4)(SP)
615 MOVOU B5, (16*5)(SP)
616 MOVOU B6, (16*6)(SP)
617 MOVOU B7, (16*7)(SP)
618
619 LEAQ 128(ptx), ptx
620 LEAQ 128(ctx), ctx
621
622 gcmAesEncOctetsLoop:
623
// Each iteration encrypts the next eight blocks while hashing the eight
// ciphertext blocks saved on the stack by the previous group.
624 CMPQ ptxLen, $128
625 JB gcmAesEncOctetsEnd
626 SUBQ $128, ptxLen
627
628 MOVOU (8*16 + 0*16)(SP), B0
629 MOVOU (8*16 + 1*16)(SP), B1
630 MOVOU (8*16 + 2*16)(SP), B2
631 MOVOU (8*16 + 3*16)(SP), B3
632 MOVOU (8*16 + 4*16)(SP), B4
633 MOVOU (8*16 + 5*16)(SP), B5
634 MOVOU (8*16 + 6*16)(SP), B6
635 MOVOU (8*16 + 7*16)(SP), B7
636
// Saved block 0 times table pair 0 initialises the accumulators.
637 MOVOU (16*0)(SP), T0
638 PSHUFD $78, T0, T1
639 PXOR T0, T1
640
641 MOVOU (16*0)(pTbl), ACC0
642 MOVOU (16*1)(pTbl), ACCM
643 MOVOU ACC0, ACC1
644
645 PCLMULQDQ $0x00, T1, ACCM
646 PCLMULQDQ $0x00, T0, ACC0
647 PCLMULQDQ $0x11, T0, ACC1
648
649 combinedRound(1)
650 increment(0)
651 combinedRound(2)
652 increment(1)
653 combinedRound(3)
654 increment(2)
655 combinedRound(4)
656 increment(3)
657 combinedRound(5)
658 increment(4)
659 combinedRound(6)
660 increment(5)
661 combinedRound(7)
662 increment(6)
663
664 aesRound(8)
665 increment(7)
666
// Merge the middle term, then reduce (interleaved with AES round 9).
667 PXOR ACC0, ACCM
668 PXOR ACC1, ACCM
669 MOVOU ACCM, T0
670 PSRLDQ $8, ACCM
671 PSLLDQ $8, T0
672 PXOR ACCM, ACC1
673 PXOR T0, ACC0
674
675 reduceRound(ACC0)
676 aesRound(9)
677
678 reduceRound(ACC0)
679 PXOR ACC1, ACC0
680
681 MOVOU (16*10)(ks), T0
682 CMPQ NR, $12
683 JB encLast2
684 aesRnd(T0)
685 aesRound(11)
686 MOVOU (16*12)(ks), T0
687 JE encLast2
688 aesRnd(T0)
689 aesRound(13)
690 MOVOU (16*14)(ks), T0
691 encLast2:
692 aesRndLast(T0)
693
694 MOVOU (16*0)(ptx), T0
695 PXOR T0, B0
696 MOVOU (16*1)(ptx), T0
697 PXOR T0, B1
698 MOVOU (16*2)(ptx), T0
699 PXOR T0, B2
700 MOVOU (16*3)(ptx), T0
701 PXOR T0, B3
702 MOVOU (16*4)(ptx), T0
703 PXOR T0, B4
704 MOVOU (16*5)(ptx), T0
705 PXOR T0, B5
706 MOVOU (16*6)(ptx), T0
707 PXOR T0, B6
708 MOVOU (16*7)(ptx), T0
709 PXOR T0, B7
710
// Store ciphertext; fold the freshly reduced state into swapped block 0.
711 MOVOU B0, (16*0)(ctx)
712 PSHUFB BSWAP, B0
713 PXOR ACC0, B0
714 MOVOU B1, (16*1)(ctx)
715 PSHUFB BSWAP, B1
716 MOVOU B2, (16*2)(ctx)
717 PSHUFB BSWAP, B2
718 MOVOU B3, (16*3)(ctx)
719 PSHUFB BSWAP, B3
720 MOVOU B4, (16*4)(ctx)
721 PSHUFB BSWAP, B4
722 MOVOU B5, (16*5)(ctx)
723 PSHUFB BSWAP, B5
724 MOVOU B6, (16*6)(ctx)
725 PSHUFB BSWAP, B6
726 MOVOU B7, (16*7)(ctx)
727 PSHUFB BSWAP, B7
728
729 MOVOU B0, (16*0)(SP)
730 MOVOU B1, (16*1)(SP)
731 MOVOU B2, (16*2)(SP)
732 MOVOU B3, (16*3)(SP)
733 MOVOU B4, (16*4)(SP)
734 MOVOU B5, (16*5)(SP)
735 MOVOU B6, (16*6)(SP)
736 MOVOU B7, (16*7)(SP)
737
738 LEAQ 128(ptx), ptx
739 LEAQ 128(ctx), ctx
740
741 JMP gcmAesEncOctetsLoop
742
743 gcmAesEncOctetsEnd:
744
// Hash the last group of eight ciphertext blocks still parked on the stack.
745 MOVOU (16*0)(SP), T0
746 MOVOU (16*0)(pTbl), ACC0
747 MOVOU (16*1)(pTbl), ACCM
748 MOVOU ACC0, ACC1
749 PSHUFD $78, T0, T1
750 PXOR T0, T1
751 PCLMULQDQ $0x00, T0, ACC0
752 PCLMULQDQ $0x11, T0, ACC1
753 PCLMULQDQ $0x00, T1, ACCM
754
755 mulRound(1)
756 mulRound(2)
757 mulRound(3)
758 mulRound(4)
759 mulRound(5)
760 mulRound(6)
761 mulRound(7)
762
763 PXOR ACC0, ACCM
764 PXOR ACC1, ACCM
765 MOVOU ACCM, T0
766 PSRLDQ $8, ACCM
767 PSLLDQ $8, T0
768 PXOR ACCM, ACC1
769 PXOR T0, ACC0
770
771 reduceRound(ACC0)
772 reduceRound(ACC0)
773 PXOR ACC1, ACC0
774
775 TESTQ ptxLen, ptxLen
776 JE gcmAesEncDone
777
// Counter slots 0..7 were already advanced for a further group, so aluCTR is
// 7 ahead of slot 0. Rewind it so the single-block path, which consumes slot 0
// and then calls increment(0), continues from slot 0's value.
778 SUBQ $7, aluCTR
779
780 gcmAesEncSingles:
781
// Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the single-block
// and tail paths.
782 MOVOU (16*1)(ks), B1
783 MOVOU (16*2)(ks), B2
784 MOVOU (16*3)(ks), B3
785 MOVOU (16*4)(ks), B4
786 MOVOU (16*5)(ks), B5
787 MOVOU (16*6)(ks), B6
788 MOVOU (16*7)(ks), B7
789
790 MOVOU (16*14)(pTbl), T2
791
792 gcmAesEncSinglesLoop:
793
794 CMPQ ptxLen, $16
795 JB gcmAesEncTail
796 SUBQ $16, ptxLen
797
// Take the counter block from slot 0 and prepare the next one in its place.
798 MOVOU (8*16 + 0*16)(SP), B0
799 increment(0)
800
801 AESENC B1, B0
802 AESENC B2, B0
803 AESENC B3, B0
804 AESENC B4, B0
805 AESENC B5, B0
806 AESENC B6, B0
807 AESENC B7, B0
808 MOVOU (16*8)(ks), T0
809 AESENC T0, B0
810 MOVOU (16*9)(ks), T0
811 AESENC T0, B0
812 MOVOU (16*10)(ks), T0
813 CMPQ NR, $12
814 JB encLast3
815 AESENC T0, B0
816 MOVOU (16*11)(ks), T0
817 AESENC T0, B0
818 MOVOU (16*12)(ks), T0
819 JE encLast3
820 AESENC T0, B0
821 MOVOU (16*13)(ks), T0
822 AESENC T0, B0
823 MOVOU (16*14)(ks), T0
824 encLast3:
825 AESENCLAST T0, B0
826
827 MOVOU (ptx), T0
828 PXOR T0, B0
829 MOVOU B0, (ctx)
830
// Hash the ciphertext block just written (multiply by table pair 14/15).
831 PSHUFB BSWAP, B0
832 PXOR ACC0, B0
833
834 MOVOU T2, ACC0
835 MOVOU T2, ACC1
836 MOVOU (16*15)(pTbl), ACCM
837
838 PSHUFD $78, B0, T0
839 PXOR B0, T0
840 PCLMULQDQ $0x00, B0, ACC0
841 PCLMULQDQ $0x11, B0, ACC1
842 PCLMULQDQ $0x00, T0, ACCM
843
844 PXOR ACC0, ACCM
845 PXOR ACC1, ACCM
846 MOVOU ACCM, T0
847 PSRLDQ $8, ACCM
848 PSLLDQ $8, T0
849 PXOR ACCM, ACC1
850 PXOR T0, ACC0
851
852 reduceRound(ACC0)
853 reduceRound(ACC0)
854 PXOR ACC1, ACC0
855
856 LEAQ (16*1)(ptx), ptx
857 LEAQ (16*1)(ctx), ctx
858
859 JMP gcmAesEncSinglesLoop
860
861 gcmAesEncTail:
// 1..15 bytes remain (or none, in which case we are done).
862 TESTQ ptxLen, ptxLen
863 JE gcmAesEncDone
864
865 MOVOU (8*16 + 0*16)(SP), B0
866 AESENC B1, B0
867 AESENC B2, B0
868 AESENC B3, B0
869 AESENC B4, B0
870 AESENC B5, B0
871 AESENC B6, B0
872 AESENC B7, B0
873 MOVOU (16*8)(ks), T0
874 AESENC T0, B0
875 MOVOU (16*9)(ks), T0
876 AESENC T0, B0
877 MOVOU (16*10)(ks), T0
878 CMPQ NR, $12
879 JB encLast4
880 AESENC T0, B0
881 MOVOU (16*11)(ks), T0
882 AESENC T0, B0
883 MOVOU (16*12)(ks), T0
884 JE encLast4
885 AESENC T0, B0
886 MOVOU (16*13)(ks), T0
887 AESENC T0, B0
888 MOVOU (16*14)(ks), T0
889 encLast4:
890 AESENCLAST T0, B0
// T0 keeps the keystream block; B0 is reused to assemble the plaintext bytes.
891 MOVOU B0, T0
892
// Point ptx at the last remaining plaintext byte.
893 LEAQ -1(ptx)(ptxLen*1), ptx
894
// T1 = andMask entry for ptxLen bytes (entry at offset 16*(ptxLen-1)).
// aluCTR is reused as a plain address register from here on.
895 MOVQ ptxLen, aluTMP
896 SHLQ $4, aluTMP
897
898 LEAQ andMask<>(SB), aluCTR
899 MOVOU -16(aluCTR)(aluTMP*1), T1
900
// Load the remaining bytes one at a time, last byte first, so the input is
// never read past its end.
901 PXOR B0, B0
902 ptxLoadLoop:
903 PSLLDQ $1, B0
904 PINSRB $0, (ptx), B0
905 LEAQ -1(ptx), ptx
906 DECQ ptxLen
907 JNE ptxLoadLoop
908
// Encrypt and zero the bytes beyond the message. Note that the store below
// always writes a full 16 bytes to dst.
909 PXOR T0, B0
910 PAND T1, B0
911 MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT
912
// Hash the zero-padded final ciphertext block.
913 PSHUFB BSWAP, B0
914 PXOR ACC0, B0
915
916 MOVOU T2, ACC0
917 MOVOU T2, ACC1
918 MOVOU (16*15)(pTbl), ACCM
919
920 PSHUFD $78, B0, T0
921 PXOR B0, T0
922 PCLMULQDQ $0x00, B0, ACC0
923 PCLMULQDQ $0x11, B0, ACC1
924 PCLMULQDQ $0x00, T0, ACCM
925
926 PXOR ACC0, ACCM
927 PXOR ACC1, ACCM
928 MOVOU ACCM, T0
929 PSRLDQ $8, ACCM
930 PSLLDQ $8, T0
931 PXOR ACCM, ACC1
932 PXOR T0, ACC0
933
934 reduceRound(ACC0)
935 reduceRound(ACC0)
936 PXOR ACC1, ACC0
937
938 gcmAesEncDone:
// Write the updated hash state back to T.
939 MOVOU ACC0, (tPtr)
940 RET
941 #undef increment
942
943 // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// gcmAesDec decrypts src into dst in counter mode and folds the ciphertext
// (src) into the hash state at T (read on entry, written on exit).
// ctr is read as the initial counter block; this routine does not store to it.
//
// It relies on the register aliases (pTbl, ctx, ptx, ks, ...) and the
// aesRnd/aesRound/aesRndLast/reduceRound macros defined earlier in this file.
// Here ptx is dst and ctx is src.
//
// Stack frame (128 bytes): eight counter blocks at 0*16 .. 7*16 (SP), already
// XORed with round key 0. Unlike gcmAesEnc, the ciphertext is read directly
// from src for hashing, so no ciphertext copy is kept on the stack.
944 TEXT ·gcmAesDec(SB),0,$128-96
// increment(i): same as in gcmAesEnc, but the counter slots start at 0(SP).
945 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
// combinedDecRound(i): AES round i on B0-B7 interleaved with hashing
// ciphertext block i of the current group (loaded from ctx and byte-swapped)
// against table pair i. Clobbers T0, T1, T2.
946 #define combinedDecRound(i) \
947 MOVOU (16*i)(ks), T0;\
948 AESENC T0, B0;\
949 AESENC T0, B1;\
950 AESENC T0, B2;\
951 AESENC T0, B3;\
952 MOVOU (16*(i*2))(pTbl), T1;\
953 MOVOU T1, T2;\
954 AESENC T0, B4;\
955 AESENC T0, B5;\
956 AESENC T0, B6;\
957 AESENC T0, B7;\
958 MOVOU (16*i)(ctx), T0;\
959 PSHUFB BSWAP, T0;\
960 PCLMULQDQ $0x00, T0, T1;\
961 PXOR T1, ACC0;\
962 PSHUFD $78, T0, T1;\
963 PCLMULQDQ $0x11, T0, T2;\
964 PXOR T1, T0;\
965 PXOR T2, ACC1;\
966 MOVOU (16*(i*2+1))(pTbl), T2;\
967 PCLMULQDQ $0x00, T2, T0;\
968 PXOR T0, ACCM
969
970 MOVQ productTable+0(FP), pTbl
971 MOVQ dst+8(FP), ptx
972 MOVQ src_base+32(FP), ctx
973 MOVQ src_len+40(FP), ptxLen
974 MOVQ ctr+56(FP), ctrPtr
975 MOVQ T+64(FP), tPtr
976 MOVQ ks_base+72(FP), ks
977 MOVQ ks_len+80(FP), NR
978
// NR = len(ks)/4 - 1, compared against 12 below to pick the last round key.
979 SHRQ $2, NR
980 DECQ NR
981
982 MOVOU bswapMask<>(SB), BSWAP
983 MOVOU gcmPoly<>(SB), POLY
984
985 MOVOU (tPtr), ACC0
986 PXOR ACC1, ACC1
987 PXOR ACCM, ACCM
988 MOVOU (ctrPtr), B0
989 MOVL (3*4)(ctrPtr), aluCTR
990 MOVOU (ks), T0
991 MOVL (3*4)(ks), aluK
992 BSWAPL aluCTR
993 BSWAPL aluK
994
// T0 = counter block XOR round key 0; template for every counter slot.
995 PXOR B0, T0
996 MOVOU T0, (0*16)(SP)
997 increment(0)
998
999 CMPQ ptxLen, $128
1000 JB gcmAesDecSingles
1001
// At least eight blocks: prepare counter slots 1..7 as well.
1002 MOVOU T0, (1*16)(SP)
1003 increment(1)
1004 MOVOU T0, (2*16)(SP)
1005 increment(2)
1006 MOVOU T0, (3*16)(SP)
1007 increment(3)
1008 MOVOU T0, (4*16)(SP)
1009 increment(4)
1010 MOVOU T0, (5*16)(SP)
1011 increment(5)
1012 MOVOU T0, (6*16)(SP)
1013 increment(6)
1014 MOVOU T0, (7*16)(SP)
1015 increment(7)
1016
1017 gcmAesDecOctetsLoop:
1018
// Each iteration hashes and decrypts the same eight ciphertext blocks.
1019 CMPQ ptxLen, $128
1020 JB gcmAesDecEndOctets
1021 SUBQ $128, ptxLen
1022
1023 MOVOU (0*16)(SP), B0
1024 MOVOU (1*16)(SP), B1
1025 MOVOU (2*16)(SP), B2
1026 MOVOU (3*16)(SP), B3
1027 MOVOU (4*16)(SP), B4
1028 MOVOU (5*16)(SP), B5
1029 MOVOU (6*16)(SP), B6
1030 MOVOU (7*16)(SP), B7
1031
// Ciphertext block 0, with the running state folded in, times table pair 0
// initialises the accumulators.
1032 MOVOU (16*0)(ctx), T0
1033 PSHUFB BSWAP, T0
1034 PXOR ACC0, T0
1035 PSHUFD $78, T0, T1
1036 PXOR T0, T1
1037
1038 MOVOU (16*0)(pTbl), ACC0
1039 MOVOU (16*1)(pTbl), ACCM
1040 MOVOU ACC0, ACC1
1041
1042 PCLMULQDQ $0x00, T1, ACCM
1043 PCLMULQDQ $0x00, T0, ACC0
1044 PCLMULQDQ $0x11, T0, ACC1
1045
1046 combinedDecRound(1)
1047 increment(0)
1048 combinedDecRound(2)
1049 increment(1)
1050 combinedDecRound(3)
1051 increment(2)
1052 combinedDecRound(4)
1053 increment(3)
1054 combinedDecRound(5)
1055 increment(4)
1056 combinedDecRound(6)
1057 increment(5)
1058 combinedDecRound(7)
1059 increment(6)
1060
1061 aesRound(8)
1062 increment(7)
1063
// Merge the middle term, then reduce (interleaved with AES round 9).
1064 PXOR ACC0, ACCM
1065 PXOR ACC1, ACCM
1066 MOVOU ACCM, T0
1067 PSRLDQ $8, ACCM
1068 PSLLDQ $8, T0
1069 PXOR ACCM, ACC1
1070 PXOR T0, ACC0
1071
1072 reduceRound(ACC0)
1073 aesRound(9)
1074
1075 reduceRound(ACC0)
1076 PXOR ACC1, ACC0
1077
1078 MOVOU (16*10)(ks), T0
// The flags from this CMPQ are reused by the JE below.
1079 CMPQ NR, $12
1080 JB decLast1
1081 aesRnd(T0)
1082 aesRound(11)
1083 MOVOU (16*12)(ks), T0
1084 JE decLast1
1085 aesRnd(T0)
1086 aesRound(13)
1087 MOVOU (16*14)(ks), T0
1088 decLast1:
1089 aesRndLast(T0)
1090
// XOR the keystream with the ciphertext to recover the plaintext.
1091 MOVOU (16*0)(ctx), T0
1092 PXOR T0, B0
1093 MOVOU (16*1)(ctx), T0
1094 PXOR T0, B1
1095 MOVOU (16*2)(ctx), T0
1096 PXOR T0, B2
1097 MOVOU (16*3)(ctx), T0
1098 PXOR T0, B3
1099 MOVOU (16*4)(ctx), T0
1100 PXOR T0, B4
1101 MOVOU (16*5)(ctx), T0
1102 PXOR T0, B5
1103 MOVOU (16*6)(ctx), T0
1104 PXOR T0, B6
1105 MOVOU (16*7)(ctx), T0
1106 PXOR T0, B7
1107
1108 MOVOU B0, (16*0)(ptx)
1109 MOVOU B1, (16*1)(ptx)
1110 MOVOU B2, (16*2)(ptx)
1111 MOVOU B3, (16*3)(ptx)
1112 MOVOU B4, (16*4)(ptx)
1113 MOVOU B5, (16*5)(ptx)
1114 MOVOU B6, (16*6)(ptx)
1115 MOVOU B7, (16*7)(ptx)
1116
1117 LEAQ 128(ptx), ptx
1118 LEAQ 128(ctx), ctx
1119
1120 JMP gcmAesDecOctetsLoop
1121
1122 gcmAesDecEndOctets:
1123
// Counter slots 0..7 are already prepared, so aluCTR is 7 ahead of slot 0.
// Rewind it so the single-block path, which consumes slot 0 and then calls
// increment(0), continues from slot 0's value.
1124 SUBQ $7, aluCTR
1125
1126 gcmAesDecSingles:
1127
// Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the single-block path.
1128 MOVOU (16*1)(ks), B1
1129 MOVOU (16*2)(ks), B2
1130 MOVOU (16*3)(ks), B3
1131 MOVOU (16*4)(ks), B4
1132 MOVOU (16*5)(ks), B5
1133 MOVOU (16*6)(ks), B6
1134 MOVOU (16*7)(ks), B7
1135
1136 MOVOU (16*14)(pTbl), T2
1137
1138 gcmAesDecSinglesLoop:
1139
1140 CMPQ ptxLen, $16
1141 JB gcmAesDecTail
1142 SUBQ $16, ptxLen
1143
// T1 keeps the raw ciphertext block for the final XOR; B0 is hashed first.
1144 MOVOU (ctx), B0
1145 MOVOU B0, T1
1146 PSHUFB BSWAP, B0
1147 PXOR ACC0, B0
1148
1149 MOVOU T2, ACC0
1150 MOVOU T2, ACC1
1151 MOVOU (16*15)(pTbl), ACCM
1152
1153 PCLMULQDQ $0x00, B0, ACC0
1154 PCLMULQDQ $0x11, B0, ACC1
1155 PSHUFD $78, B0, T0
1156 PXOR B0, T0
1157 PCLMULQDQ $0x00, T0, ACCM
1158
1159 PXOR ACC0, ACCM
1160 PXOR ACC1, ACCM
1161 MOVOU ACCM, T0
1162 PSRLDQ $8, ACCM
1163 PSLLDQ $8, T0
1164 PXOR ACCM, ACC1
1165 PXOR T0, ACC0
1166
1167 reduceRound(ACC0)
1168 reduceRound(ACC0)
1169 PXOR ACC1, ACC0
1170
// Generate the keystream block from counter slot 0 and prepare the next counter.
1171 MOVOU (0*16)(SP), B0
1172 increment(0)
1173 AESENC B1, B0
1174 AESENC B2, B0
1175 AESENC B3, B0
1176 AESENC B4, B0
1177 AESENC B5, B0
1178 AESENC B6, B0
1179 AESENC B7, B0
1180 MOVOU (16*8)(ks), T0
1181 AESENC T0, B0
1182 MOVOU (16*9)(ks), T0
1183 AESENC T0, B0
1184 MOVOU (16*10)(ks), T0
1185 CMPQ NR, $12
1186 JB decLast2
1187 AESENC T0, B0
1188 MOVOU (16*11)(ks), T0
1189 AESENC T0, B0
1190 MOVOU (16*12)(ks), T0
1191 JE decLast2
1192 AESENC T0, B0
1193 MOVOU (16*13)(ks), T0
1194 AESENC T0, B0
1195 MOVOU (16*14)(ks), T0
1196 decLast2:
1197 AESENCLAST T0, B0
1198
1199 PXOR T1, B0
1200 MOVOU B0, (ptx)
1201
1202 LEAQ (16*1)(ptx), ptx
1203 LEAQ (16*1)(ctx), ctx
1204
1205 JMP gcmAesDecSinglesLoop
1206
1207 gcmAesDecTail:
1208
// 1..15 bytes remain (or none, in which case we are done).
1209 TESTQ ptxLen, ptxLen
1210 JE gcmAesDecDone
1211
// T1 = andMask entry for ptxLen bytes (entry at offset 16*(ptxLen-1)).
// aluCTR is reused as a plain address register from here on.
1212 MOVQ ptxLen, aluTMP
1213 SHLQ $4, aluTMP
1214 LEAQ andMask<>(SB), aluCTR
1215 MOVOU -16(aluCTR)(aluTMP*1), T1
1216
// Note: this is a full 16-byte load even though fewer bytes of ciphertext
// remain; the extra bytes are masked off immediately afterwards.
1217 MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow
1218 PAND T1, B0
1219
// T1 now keeps the masked ciphertext for the final XOR; B0 is hashed first.
1220 MOVOU B0, T1
1221 PSHUFB BSWAP, B0
1222 PXOR ACC0, B0
1223
1224 MOVOU (16*14)(pTbl), ACC0
1225 MOVOU (16*15)(pTbl), ACCM
1226 MOVOU ACC0, ACC1
1227
1228 PCLMULQDQ $0x00, B0, ACC0
1229 PCLMULQDQ $0x11, B0, ACC1
1230 PSHUFD $78, B0, T0
1231 PXOR B0, T0
1232 PCLMULQDQ $0x00, T0, ACCM
1233
1234 PXOR ACC0, ACCM
1235 PXOR ACC1, ACCM
1236 MOVOU ACCM, T0
1237 PSRLDQ $8, ACCM
1238 PSLLDQ $8, T0
1239 PXOR ACCM, ACC1
1240 PXOR T0, ACC0
1241
1242 reduceRound(ACC0)
1243 reduceRound(ACC0)
1244 PXOR ACC1, ACC0
1245
1246 MOVOU (0*16)(SP), B0
1247 increment(0)
1248 AESENC B1, B0
1249 AESENC B2, B0
1250 AESENC B3, B0
1251 AESENC B4, B0
1252 AESENC B5, B0
1253 AESENC B6, B0
1254 AESENC B7, B0
1255 MOVOU (16*8)(ks), T0
1256 AESENC T0, B0
1257 MOVOU (16*9)(ks), T0
1258 AESENC T0, B0
1259 MOVOU (16*10)(ks), T0
1260 CMPQ NR, $12
1261 JB decLast3
1262 AESENC T0, B0
1263 MOVOU (16*11)(ks), T0
1264 AESENC T0, B0
1265 MOVOU (16*12)(ks), T0
1266 JE decLast3
1267 AESENC T0, B0
1268 MOVOU (16*13)(ks), T0
1269 AESENC T0, B0
1270 MOVOU (16*14)(ks), T0
1271 decLast3:
1272 AESENCLAST T0, B0
1273 PXOR T1, B0
1274
// Store exactly ptxLen plaintext bytes, one at a time, so dst is never
// written past the end of the message.
1275 ptxStoreLoop:
1276 PEXTRB $0, B0, (ptx)
1277 PSRLDQ $1, B0
1278 LEAQ 1(ptx), ptx
1279 DECQ ptxLen
1280
1281 JNE ptxStoreLoop
1282
1283 gcmAesDecDone:
1284
// Write the updated hash state back to T.
1285 MOVOU ACC0, (tPtr)
1286 RET
1287
View as plain text