1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
6
7 //go:build gc && !purego
8 // +build gc,!purego
9
10 #include "textflag.h"
11 // General register allocation
12 #define oup DI
13 #define inp SI
14 #define inl BX
#define adp CX // free to reuse after we hash the additional data
#define keyp R8 // free to reuse once we copy the key to the stack
17 #define itr2 R9 // general iterator
18 #define itr1 CX // general iterator
19 #define acc0 R10
20 #define acc1 R11
21 #define acc2 R12
22 #define t0 R13
23 #define t1 R14
24 #define t2 R15
25 #define t3 R8
26 // Register and stack allocation for the SSE code
27 #define rStore (0*16)(BP)
28 #define sStore (1*16)(BP)
29 #define state1Store (2*16)(BP)
30 #define state2Store (3*16)(BP)
31 #define tmpStore (4*16)(BP)
32 #define ctr0Store (5*16)(BP)
33 #define ctr1Store (6*16)(BP)
34 #define ctr2Store (7*16)(BP)
35 #define ctr3Store (8*16)(BP)
36 #define A0 X0
37 #define A1 X1
38 #define A2 X2
39 #define B0 X3
40 #define B1 X4
41 #define B2 X5
42 #define C0 X6
43 #define C1 X7
44 #define C2 X8
45 #define D0 X9
46 #define D1 X10
47 #define D2 X11
48 #define T0 X12
49 #define T1 X13
50 #define T2 X14
51 #define T3 X15
52 #define A3 T0
53 #define B3 T1
54 #define C3 T2
55 #define D3 T3
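// Ax, Bx, Cx and Dx hold the four 16-byte rows of the ChaCha20 state matrix
// (the constants, the two key halves, and counter||nonce) for up to four blocks
// processed in parallel; T0-T3 are scratch registers that also double as the
// fourth block's state (A3-D3 above).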
56 // Register and stack allocation for the AVX2 code
57 #define rsStoreAVX2 (0*32)(BP)
58 #define state1StoreAVX2 (1*32)(BP)
59 #define state2StoreAVX2 (2*32)(BP)
60 #define ctr0StoreAVX2 (3*32)(BP)
61 #define ctr1StoreAVX2 (4*32)(BP)
62 #define ctr2StoreAVX2 (5*32)(BP)
63 #define ctr3StoreAVX2 (6*32)(BP)
64 #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
65 #define AA0 Y0
66 #define AA1 Y5
67 #define AA2 Y6
68 #define AA3 Y7
69 #define BB0 Y14
70 #define BB1 Y9
71 #define BB2 Y10
72 #define BB3 Y11
73 #define CC0 Y12
74 #define CC1 Y13
75 #define CC2 Y8
76 #define CC3 Y15
77 #define DD0 Y4
78 #define DD1 Y1
79 #define DD2 Y2
80 #define DD3 Y3
81 #define TT0 DD3
82 #define TT1 AA3
83 #define TT2 BB3
84 #define TT3 CC3
85 // ChaCha20 constants
86 DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
87 DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
88 DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
89 DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
90 DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
91 DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
92 DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
93 DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
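// The four words above spell "expa" "nd 3" "2-by" "te k" - the standard ChaCha20
// constant - and are stored twice so the same table can be loaded into both
// 128-bit lanes of a YMM register by the AVX2 code.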
94 // <<< 16 with PSHUFB
95 DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
96 DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
97 DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
98 DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
99 // <<< 8 with PSHUFB
100 DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
101 DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
102 DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
103 DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
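// Each 16-byte half of rol16/rol8 is a PSHUFB index table that rotates every
// 32-bit lane left by 16 or 8 bits; the tables are duplicated so they can also be
// used with the 256-bit VPSHUFB in the AVX2 code.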
104
105 DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
106 DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
107 DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
108 DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
109
110 DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
111 DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
112 DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
113 DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
114 // Poly1305 key clamp
115 DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
116 DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
117 DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
118 DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
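// The low 16 bytes clamp r as required by RFC 8439 (clear the top four bits of
// each 32-bit word and the bottom two bits of the upper three words); the high 16
// bytes are all ones so the 32-byte AVX2 VPAND leaves the "s" half of the key intact.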
119
120 DATA ·sseIncMask<>+0x00(SB)/8, $0x1
121 DATA ·sseIncMask<>+0x08(SB)/8, $0x0
122 // To load/store the last < 16 bytes in a buffer
123 DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
124 DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
125 DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
126 DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
127 DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
128 DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
129 DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
130 DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
131 DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
132 DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
133 DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
134 DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
135 DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
136 DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
137 DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
138 DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
139 DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
140 DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
141 DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
142 DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
143 DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
144 DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
145 DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
146 DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
147 DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
148 DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
149 DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
150 DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
151 DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
152 DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
153
154 GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
155 GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
156 GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
157 GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
158 GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
159 GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
160 GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
161 GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
162 // No PALIGNR in Go ASM yet (but VPALIGNR is present).
163 #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
164 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
165 #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
166 #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
167 #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
168 #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
169 #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
170 #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
171 #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
172 #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
173 #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
174 #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
175 #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
176 #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
177 #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
178 #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
179 #define shiftC0Right shiftC0Left
180 #define shiftC1Right shiftC1Left
181 #define shiftC2Right shiftC2Left
182 #define shiftC3Right shiftC3Left
183 #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
184 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
185 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
186 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
187 // Some macros
188 #define chachaQR(A, B, C, D, T) \
189 PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
190 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
191 PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
192 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
193
194 #define chachaQR_AVX2(A, B, C, D, T) \
195 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
196 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
197 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
198 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
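// Both macros implement the standard ChaCha quarter round, shown here only as a
// reference sketch:
//	a += b; d ^= a; d <<<= 16
//	c += d; b ^= c; b <<<= 12
//	a += b; d ^= a; d <<<= 8
//	c += d; b ^= c; b <<<= 7
// The 16- and 8-bit rotations use PSHUFB with the rol16/rol8 tables above; the
// 12- and 7-bit rotations are done with a shift-shift-xor through the scratch
// register T.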
199
200 #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
201 #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
202 #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
203 #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
204 #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
205
206 #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
207 #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
208 #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
209
210 #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
211 #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
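// Poly1305 sketch: the accumulator lives in acc2:acc1:acc0 (about 130 bits plus
// carry room) and r is kept at rStore (0(BP)/8(BP)). polyAdd folds in a 16-byte
// block together with the 2^128 pad bit (the final ADCQ $1). polyMul multiplies
// the accumulator by r into t3:t2:t1:t0, and polyMulReduceStage reduces modulo
// 2^130-5 using 2^130 = 5 (mod 2^130-5): the bits at and above 2^130 are added
// back once shifted down by two (x1) and once with their low two bits masked in
// place (x4), i.e. multiplied by 5.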
212 // ----------------------------------------------------------------------------
213 TEXT polyHashADInternal<>(SB), NOSPLIT, $0
214 // adp points to beginning of additional data
215 // itr2 holds ad length
216 XORQ acc0, acc0
217 XORQ acc1, acc1
218 XORQ acc2, acc2
219 CMPQ itr2, $13
220 JNE hashADLoop
221
222 openFastTLSAD:
223 // Special treatment for the TLS case of 13 bytes
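	// The 13 bytes are hashed as a single zero-padded Poly1305 block: acc0 takes
	// bytes 0-7, the load at offset 5 shifted right by 24 bits leaves bytes 8-12
	// in acc1 (its upper bytes are the zero padding), and acc2 = 1 sets the 2^128
	// pad bit.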
224 MOVQ (adp), acc0
225 MOVQ 5(adp), acc1
226 SHRQ $24, acc1
227 MOVQ $1, acc2
228 polyMul
229 RET
230
231 hashADLoop:
232 // Hash in 16 byte chunks
233 CMPQ itr2, $16
234 JB hashADTail
235 polyAdd(0(adp))
236 LEAQ (1*16)(adp), adp
237 SUBQ $16, itr2
238 polyMul
239 JMP hashADLoop
240
241 hashADTail:
242 CMPQ itr2, $0
243 JE hashADDone
244
245 // Hash last < 16 byte tail
246 XORQ t0, t0
247 XORQ t1, t1
248 XORQ t2, t2
249 ADDQ itr2, adp
250
251 hashADTailLoop:
252 SHLQ $8, t0, t1
253 SHLQ $8, t0
254 MOVB -1(adp), t2
255 XORQ t2, t0
256 DECQ adp
257 DECQ itr2
258 JNE hashADTailLoop
259
260 hashADTailFinish:
261 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
262 polyMul
263
264 // Finished AD
265 hashADDone:
266 RET
267
268 // ----------------------------------------------------------------------------
269 // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
270 TEXT ·chacha20Poly1305Open(SB), 0, $288-97
271 // For aligned stack access
272 MOVQ SP, BP
273 ADDQ $32, BP
274 ANDQ $-32, BP
275 MOVQ dst+0(FP), oup
276 MOVQ key+24(FP), keyp
277 MOVQ src+48(FP), inp
278 MOVQ src_len+56(FP), inl
279 MOVQ ad+72(FP), adp
280
281 // Check for AVX2 support
282 CMPB ·useAVX2(SB), $1
283 JE chacha20Poly1305Open_AVX2
284
	// Special optimization for very short buffers
286 CMPQ inl, $128
287 JBE openSSE128 // About 16% faster
288
289 // For long buffers, prepare the poly key first
290 MOVOU ·chacha20Constants<>(SB), A0
291 MOVOU (1*16)(keyp), B0
292 MOVOU (2*16)(keyp), C0
293 MOVOU (3*16)(keyp), D0
294 MOVO D0, T1
295
296 // Store state on stack for future use
297 MOVO B0, state1Store
298 MOVO C0, state2Store
299 MOVO D0, ctr3Store
300 MOVQ $10, itr2
301
302 openSSEPreparePolyKey:
303 chachaQR(A0, B0, C0, D0, T0)
304 shiftB0Left; shiftC0Left; shiftD0Left
305 chachaQR(A0, B0, C0, D0, T0)
306 shiftB0Right; shiftC0Right; shiftD0Right
307 DECQ itr2
308 JNE openSSEPreparePolyKey
309
310 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
311 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
312
313 // Clamp and store the key
314 PAND ·polyClampMask<>(SB), A0
315 MOVO A0, rStore; MOVO B0, sStore
316
317 // Hash AAD
318 MOVQ ad_len+80(FP), itr2
319 CALL polyHashADInternal<>(SB)
320
321 openSSEMainLoop:
322 CMPQ inl, $256
323 JB openSSEMainLoopDone
324
325 // Load state, increment counter blocks
326 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
327 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
328 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
329 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
330
331 // Store counters
332 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
333
	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 Poly1305 blocks
335 MOVQ $4, itr1
336 MOVQ inp, itr2
337
338 openSSEInternalLoop:
339 MOVO C3, tmpStore
340 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
341 MOVO tmpStore, C3
342 MOVO C1, tmpStore
343 chachaQR(A3, B3, C3, D3, C1)
344 MOVO tmpStore, C1
345 polyAdd(0(itr2))
346 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
347 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
348 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
349 polyMulStage1
350 polyMulStage2
351 LEAQ (2*8)(itr2), itr2
352 MOVO C3, tmpStore
353 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
354 MOVO tmpStore, C3
355 MOVO C1, tmpStore
356 polyMulStage3
357 chachaQR(A3, B3, C3, D3, C1)
358 MOVO tmpStore, C1
359 polyMulReduceStage
360 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
361 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
362 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
363 DECQ itr1
364 JGE openSSEInternalLoop
365
366 polyAdd(0(itr2))
367 polyMul
368 LEAQ (2*8)(itr2), itr2
369
370 CMPQ itr1, $-6
371 JG openSSEInternalLoop
372
373 // Add in the state
374 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
375 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
376 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
377 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
378
379 // Load - xor - store
380 MOVO D3, tmpStore
381 MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
382 MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
383 MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
384 MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
385 MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
386 MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
387 MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
388 MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
389 MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
390 MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
391 MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
392 MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
393 MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
394 MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
395 MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
396 MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
397 LEAQ 256(inp), inp
398 LEAQ 256(oup), oup
399 SUBQ $256, inl
400 JMP openSSEMainLoop
401
402 openSSEMainLoopDone:
403 // Handle the various tail sizes efficiently
404 TESTQ inl, inl
405 JE openSSEFinalize
406 CMPQ inl, $64
407 JBE openSSETail64
408 CMPQ inl, $128
409 JBE openSSETail128
410 CMPQ inl, $192
411 JBE openSSETail192
412 JMP openSSETail256
413
414 openSSEFinalize:
415 // Hash in the PT, AAD lengths
416 ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
417 polyMul
418
419 // Final reduce
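	// The accumulator is only partially reduced. The SUBQ/SBBQ sequence below
	// subtracts 2^130-5 (the immediates are the sign-extended limbs of the prime);
	// if that borrows, CMOVQCS restores the saved accumulator, leaving the
	// canonical residue mod 2^130-5.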
420 MOVQ acc0, t0
421 MOVQ acc1, t1
422 MOVQ acc2, t2
423 SUBQ $-5, acc0
424 SBBQ $-1, acc1
425 SBBQ $3, acc2
426 CMOVQCS t0, acc0
427 CMOVQCS t1, acc1
428 CMOVQCS t2, acc2
429
430 // Add in the "s" part of the key
431 ADDQ 0+sStore, acc0
432 ADCQ 8+sStore, acc1
433
434 // Finally, constant time compare to the tag at the end of the message
435 XORQ AX, AX
436 MOVQ $1, DX
437 XORQ (0*8)(inp), acc0
438 XORQ (1*8)(inp), acc1
439 ORQ acc1, acc0
440 CMOVQEQ DX, AX
441
442 // Return true iff tags are equal
443 MOVB AX, ret+96(FP)
444 RET
445
446 // ----------------------------------------------------------------------------
447 // Special optimization for buffers smaller than 129 bytes
448 openSSE128:
	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
450 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
451 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
452 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
453 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
454 MOVQ $10, itr2
455
456 openSSE128InnerCipherLoop:
457 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
458 shiftB0Left; shiftB1Left; shiftB2Left
459 shiftC0Left; shiftC1Left; shiftC2Left
460 shiftD0Left; shiftD1Left; shiftD2Left
461 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
462 shiftB0Right; shiftB1Right; shiftB2Right
463 shiftC0Right; shiftC1Right; shiftC2Right
464 shiftD0Right; shiftD1Right; shiftD2Right
465 DECQ itr2
466 JNE openSSE128InnerCipherLoop
467
468 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
469 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
470 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
471 PADDL T2, C1; PADDL T2, C2
472 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
473
474 // Clamp and store the key
475 PAND ·polyClampMask<>(SB), A0
476 MOVOU A0, rStore; MOVOU B0, sStore
477
478 // Hash
479 MOVQ ad_len+80(FP), itr2
480 CALL polyHashADInternal<>(SB)
481
482 openSSE128Open:
483 CMPQ inl, $16
484 JB openSSETail16
485 SUBQ $16, inl
486
487 // Load for hashing
488 polyAdd(0(inp))
489
490 // Load for decryption
491 MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
492 LEAQ (1*16)(inp), inp
493 LEAQ (1*16)(oup), oup
494 polyMul
495
496 // Shift the stream "left"
497 MOVO B1, A1
498 MOVO C1, B1
499 MOVO D1, C1
500 MOVO A2, D1
501 MOVO B2, A2
502 MOVO C2, B2
503 MOVO D2, C2
504 JMP openSSE128Open
505
506 openSSETail16:
507 TESTQ inl, inl
508 JE openSSEFinalize
509
510 // We can safely load the CT from the end, because it is padded with the MAC
511 MOVQ inl, itr2
512 SHLQ $4, itr2
513 LEAQ ·andMask<>(SB), t0
514 MOVOU (inp), T0
515 ADDQ inl, inp
516 PAND -16(t0)(itr2*1), T0
517 MOVO T0, 0+tmpStore
518 MOVQ T0, t0
519 MOVQ 8+tmpStore, t1
520 PXOR A1, T0
521
522 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
523 openSSETail16Store:
524 MOVQ T0, t3
525 MOVB t3, (oup)
526 PSRLDQ $1, T0
527 INCQ oup
528 DECQ inl
529 JNE openSSETail16Store
530 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
531 polyMul
532 JMP openSSEFinalize
533
534 // ----------------------------------------------------------------------------
535 // Special optimization for the last 64 bytes of ciphertext
536 openSSETail64:
	// Need to decrypt up to 64 bytes - prepare a single block
538 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
539 XORQ itr2, itr2
540 MOVQ inl, itr1
541 CMPQ itr1, $16
542 JB openSSETail64LoopB
543
544 openSSETail64LoopA:
545 // Perform ChaCha rounds, while hashing the remaining input
546 polyAdd(0(inp)(itr2*1))
547 polyMul
548 SUBQ $16, itr1
549
550 openSSETail64LoopB:
551 ADDQ $16, itr2
552 chachaQR(A0, B0, C0, D0, T0)
553 shiftB0Left; shiftC0Left; shiftD0Left
554 chachaQR(A0, B0, C0, D0, T0)
555 shiftB0Right; shiftC0Right; shiftD0Right
556
557 CMPQ itr1, $16
558 JAE openSSETail64LoopA
559
560 CMPQ itr2, $160
561 JNE openSSETail64LoopB
562
563 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
564
565 openSSETail64DecLoop:
566 CMPQ inl, $16
567 JB openSSETail64DecLoopDone
568 SUBQ $16, inl
569 MOVOU (inp), T0
570 PXOR T0, A0
571 MOVOU A0, (oup)
572 LEAQ 16(inp), inp
573 LEAQ 16(oup), oup
574 MOVO B0, A0
575 MOVO C0, B0
576 MOVO D0, C0
577 JMP openSSETail64DecLoop
578
579 openSSETail64DecLoopDone:
580 MOVO A0, A1
581 JMP openSSETail16
582
583 // ----------------------------------------------------------------------------
584 // Special optimization for the last 128 bytes of ciphertext
585 openSSETail128:
586 // Need to decrypt up to 128 bytes - prepare two blocks
587 MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
588 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
589 XORQ itr2, itr2
590 MOVQ inl, itr1
591 ANDQ $-16, itr1
592
593 openSSETail128LoopA:
594 // Perform ChaCha rounds, while hashing the remaining input
595 polyAdd(0(inp)(itr2*1))
596 polyMul
597
598 openSSETail128LoopB:
599 ADDQ $16, itr2
600 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
601 shiftB0Left; shiftC0Left; shiftD0Left
602 shiftB1Left; shiftC1Left; shiftD1Left
603 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
604 shiftB0Right; shiftC0Right; shiftD0Right
605 shiftB1Right; shiftC1Right; shiftD1Right
606
607 CMPQ itr2, itr1
608 JB openSSETail128LoopA
609
610 CMPQ itr2, $160
611 JNE openSSETail128LoopB
612
613 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
614 PADDL state1Store, B0; PADDL state1Store, B1
615 PADDL state2Store, C0; PADDL state2Store, C1
616 PADDL ctr1Store, D0; PADDL ctr0Store, D1
617
618 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
619 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
620 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
621
622 SUBQ $64, inl
623 LEAQ 64(inp), inp
624 LEAQ 64(oup), oup
625 JMP openSSETail64DecLoop
626
627 // ----------------------------------------------------------------------------
628 // Special optimization for the last 192 bytes of ciphertext
629 openSSETail192:
630 // Need to decrypt up to 192 bytes - prepare three blocks
631 MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
632 MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
633 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
634
635 MOVQ inl, itr1
636 MOVQ $160, itr2
637 CMPQ itr1, $160
638 CMOVQGT itr2, itr1
639 ANDQ $-16, itr1
640 XORQ itr2, itr2
641
642 openSSLTail192LoopA:
643 // Perform ChaCha rounds, while hashing the remaining input
644 polyAdd(0(inp)(itr2*1))
645 polyMul
646
647 openSSLTail192LoopB:
648 ADDQ $16, itr2
649 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
650 shiftB0Left; shiftC0Left; shiftD0Left
651 shiftB1Left; shiftC1Left; shiftD1Left
652 shiftB2Left; shiftC2Left; shiftD2Left
653
654 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
655 shiftB0Right; shiftC0Right; shiftD0Right
656 shiftB1Right; shiftC1Right; shiftD1Right
657 shiftB2Right; shiftC2Right; shiftD2Right
658
659 CMPQ itr2, itr1
660 JB openSSLTail192LoopA
661
662 CMPQ itr2, $160
663 JNE openSSLTail192LoopB
664
665 CMPQ inl, $176
666 JB openSSLTail192Store
667
668 polyAdd(160(inp))
669 polyMul
670
671 CMPQ inl, $192
672 JB openSSLTail192Store
673
674 polyAdd(176(inp))
675 polyMul
676
677 openSSLTail192Store:
678 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
679 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
680 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
681 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
682
683 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
684 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
685 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
686
687 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
688 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
689 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
690
691 SUBQ $128, inl
692 LEAQ 128(inp), inp
693 LEAQ 128(oup), oup
694 JMP openSSETail64DecLoop
695
696 // ----------------------------------------------------------------------------
697 // Special optimization for the last 256 bytes of ciphertext
698 openSSETail256:
699 // Need to decrypt up to 256 bytes - prepare four blocks
700 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
701 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
702 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
703 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
704
705 // Store counters
706 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
707 XORQ itr2, itr2
708
709 openSSETail256Loop:
	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
711 polyAdd(0(inp)(itr2*1))
712 MOVO C3, tmpStore
713 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
714 MOVO tmpStore, C3
715 MOVO C1, tmpStore
716 chachaQR(A3, B3, C3, D3, C1)
717 MOVO tmpStore, C1
718 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
719 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
720 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
721 polyMulStage1
722 polyMulStage2
723 MOVO C3, tmpStore
724 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
725 MOVO tmpStore, C3
726 MOVO C1, tmpStore
727 chachaQR(A3, B3, C3, D3, C1)
728 MOVO tmpStore, C1
729 polyMulStage3
730 polyMulReduceStage
731 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
732 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
733 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
734 ADDQ $2*8, itr2
735 CMPQ itr2, $160
736 JB openSSETail256Loop
737 MOVQ inl, itr1
738 ANDQ $-16, itr1
739
740 openSSETail256HashLoop:
741 polyAdd(0(inp)(itr2*1))
742 polyMul
743 ADDQ $2*8, itr2
744 CMPQ itr2, itr1
745 JB openSSETail256HashLoop
746
747 // Add in the state
748 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
749 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
750 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
751 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
752 MOVO D3, tmpStore
753
754 // Load - xor - store
755 MOVOU (0*16)(inp), D3; PXOR D3, A0
756 MOVOU (1*16)(inp), D3; PXOR D3, B0
757 MOVOU (2*16)(inp), D3; PXOR D3, C0
758 MOVOU (3*16)(inp), D3; PXOR D3, D0
759 MOVOU A0, (0*16)(oup)
760 MOVOU B0, (1*16)(oup)
761 MOVOU C0, (2*16)(oup)
762 MOVOU D0, (3*16)(oup)
763 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
764 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
765 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
766 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
767 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
768 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
769 LEAQ 192(inp), inp
770 LEAQ 192(oup), oup
771 SUBQ $192, inl
772 MOVO A3, A0
773 MOVO B3, B0
774 MOVO C3, C0
775 MOVO tmpStore, D0
776
777 JMP openSSETail64DecLoop
778
779 // ----------------------------------------------------------------------------
780 // ------------------------- AVX2 Code ----------------------------------------
781 chacha20Poly1305Open_AVX2:
782 VZEROUPPER
783 VMOVDQU ·chacha20Constants<>(SB), AA0
784 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
785 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
786 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
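	// The BYTE sequences above are hand-encoded VBROADCASTI128 instructions (their
	// decoded forms are in the comments); each replicates 16 bytes of key or
	// counter/nonce material into both 128-bit lanes of a YMM register. The VPADDD
	// with avx2InitMask below then bumps the block counter in the upper lane, so the
	// two halves of DD0 hold consecutive counters.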
787 VPADDD ·avx2InitMask<>(SB), DD0, DD0
788
	// Special optimization for very short buffers
790 CMPQ inl, $192
791 JBE openAVX2192
792 CMPQ inl, $320
793 JBE openAVX2320
794
	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
796 VMOVDQA BB0, state1StoreAVX2
797 VMOVDQA CC0, state2StoreAVX2
798 VMOVDQA DD0, ctr3StoreAVX2
799 MOVQ $10, itr2
800
801 openAVX2PreparePolyKey:
802 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
803 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
804 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
805 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
806 DECQ itr2
807 JNE openAVX2PreparePolyKey
808
809 VPADDD ·chacha20Constants<>(SB), AA0, AA0
810 VPADDD state1StoreAVX2, BB0, BB0
811 VPADDD state2StoreAVX2, CC0, CC0
812 VPADDD ctr3StoreAVX2, DD0, DD0
813
814 VPERM2I128 $0x02, AA0, BB0, TT0
815
816 // Clamp and store poly key
817 VPAND ·polyClampMask<>(SB), TT0, TT0
818 VMOVDQA TT0, rsStoreAVX2
819
820 // Stream for the first 64 bytes
821 VPERM2I128 $0x13, AA0, BB0, AA0
822 VPERM2I128 $0x13, CC0, DD0, BB0
823
824 // Hash AD + first 64 bytes
825 MOVQ ad_len+80(FP), itr2
826 CALL polyHashADInternal<>(SB)
827 XORQ itr1, itr1
828
829 openAVX2InitialHash64:
830 polyAdd(0(inp)(itr1*1))
831 polyMulAVX2
832 ADDQ $16, itr1
833 CMPQ itr1, $64
834 JNE openAVX2InitialHash64
835
836 // Decrypt the first 64 bytes
837 VPXOR (0*32)(inp), AA0, AA0
838 VPXOR (1*32)(inp), BB0, BB0
839 VMOVDQU AA0, (0*32)(oup)
840 VMOVDQU BB0, (1*32)(oup)
841 LEAQ (2*32)(inp), inp
842 LEAQ (2*32)(oup), oup
843 SUBQ $64, inl
844
845 openAVX2MainLoop:
846 CMPQ inl, $512
847 JB openAVX2MainLoopDone
848
849 // Load state, increment counter blocks, store the incremented counters
850 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
851 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
852 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
853 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
854 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
855 XORQ itr1, itr1
856
857 openAVX2InternalLoop:
	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
859 // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
860 polyAdd(0*8(inp)(itr1*1))
861 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
862 polyMulStage1_AVX2
863 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
864 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
865 polyMulStage2_AVX2
866 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
867 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
868 polyMulStage3_AVX2
869 VMOVDQA CC3, tmpStoreAVX2
870 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
871 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
872 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
873 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
874 VMOVDQA tmpStoreAVX2, CC3
875 polyMulReduceStage
876 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
877 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
878 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
879 polyAdd(2*8(inp)(itr1*1))
880 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
881 polyMulStage1_AVX2
882 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
883 VMOVDQA CC3, tmpStoreAVX2
884 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
885 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
886 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
887 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
888 VMOVDQA tmpStoreAVX2, CC3
889 polyMulStage2_AVX2
890 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
891 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
892 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
893 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
894 polyMulStage3_AVX2
895 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
896 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
897 polyMulReduceStage
898 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
899 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
900 polyAdd(4*8(inp)(itr1*1))
901 LEAQ (6*8)(itr1), itr1
902 VMOVDQA CC3, tmpStoreAVX2
903 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
904 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
905 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
906 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
907 VMOVDQA tmpStoreAVX2, CC3
908 polyMulStage1_AVX2
909 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
910 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
911 polyMulStage2_AVX2
912 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
913 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
914 polyMulStage3_AVX2
915 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
916 VMOVDQA CC3, tmpStoreAVX2
917 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
918 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
919 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
920 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
921 VMOVDQA tmpStoreAVX2, CC3
922 polyMulReduceStage
923 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
924 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
925 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
926 CMPQ itr1, $480
927 JNE openAVX2InternalLoop
928
929 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
930 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
931 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
932 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
933 VMOVDQA CC3, tmpStoreAVX2
934
935 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
936 polyAdd(480(inp))
937 polyMulAVX2
938 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
939 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
940 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
941 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
942 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
943 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
944
945 // and here
946 polyAdd(496(inp))
947 polyMulAVX2
948 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
949 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
950 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
951 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
952 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
953 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
954 LEAQ (32*16)(inp), inp
955 LEAQ (32*16)(oup), oup
956 SUBQ $(32*16), inl
957 JMP openAVX2MainLoop
958
959 openAVX2MainLoopDone:
960 // Handle the various tail sizes efficiently
961 TESTQ inl, inl
962 JE openSSEFinalize
963 CMPQ inl, $128
964 JBE openAVX2Tail128
965 CMPQ inl, $256
966 JBE openAVX2Tail256
967 CMPQ inl, $384
968 JBE openAVX2Tail384
969 JMP openAVX2Tail512
970
971 // ----------------------------------------------------------------------------
972 // Special optimization for buffers smaller than 193 bytes
973 openAVX2192:
974 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
975 VMOVDQA AA0, AA1
976 VMOVDQA BB0, BB1
977 VMOVDQA CC0, CC1
978 VPADDD ·avx2IncMask<>(SB), DD0, DD1
979 VMOVDQA AA0, AA2
980 VMOVDQA BB0, BB2
981 VMOVDQA CC0, CC2
982 VMOVDQA DD0, DD2
983 VMOVDQA DD1, TT3
984 MOVQ $10, itr2
985
986 openAVX2192InnerCipherLoop:
987 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
988 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
989 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
990 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
991 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
992 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
993 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
994 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
995 DECQ itr2
996 JNE openAVX2192InnerCipherLoop
997 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
998 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
999 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
1000 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
1001 VPERM2I128 $0x02, AA0, BB0, TT0
1002
1003 // Clamp and store poly key
1004 VPAND ·polyClampMask<>(SB), TT0, TT0
1005 VMOVDQA TT0, rsStoreAVX2
1006
1007 // Stream for up to 192 bytes
1008 VPERM2I128 $0x13, AA0, BB0, AA0
1009 VPERM2I128 $0x13, CC0, DD0, BB0
1010 VPERM2I128 $0x02, AA1, BB1, CC0
1011 VPERM2I128 $0x02, CC1, DD1, DD0
1012 VPERM2I128 $0x13, AA1, BB1, AA1
1013 VPERM2I128 $0x13, CC1, DD1, BB1
1014
1015 openAVX2ShortOpen:
1016 // Hash
1017 MOVQ ad_len+80(FP), itr2
1018 CALL polyHashADInternal<>(SB)
1019
1020 openAVX2ShortOpenLoop:
1021 CMPQ inl, $32
1022 JB openAVX2ShortTail32
1023 SUBQ $32, inl
1024
1025 // Load for hashing
1026 polyAdd(0*8(inp))
1027 polyMulAVX2
1028 polyAdd(2*8(inp))
1029 polyMulAVX2
1030
1031 // Load for decryption
1032 VPXOR (inp), AA0, AA0
1033 VMOVDQU AA0, (oup)
1034 LEAQ (1*32)(inp), inp
1035 LEAQ (1*32)(oup), oup
1036
1037 // Shift stream left
1038 VMOVDQA BB0, AA0
1039 VMOVDQA CC0, BB0
1040 VMOVDQA DD0, CC0
1041 VMOVDQA AA1, DD0
1042 VMOVDQA BB1, AA1
1043 VMOVDQA CC1, BB1
1044 VMOVDQA DD1, CC1
1045 VMOVDQA AA2, DD1
1046 VMOVDQA BB2, AA2
1047 JMP openAVX2ShortOpenLoop
1048
1049 openAVX2ShortTail32:
1050 CMPQ inl, $16
1051 VMOVDQA A0, A1
1052 JB openAVX2ShortDone
1053
1054 SUBQ $16, inl
1055
1056 // Load for hashing
1057 polyAdd(0*8(inp))
1058 polyMulAVX2
1059
1060 // Load for decryption
1061 VPXOR (inp), A0, T0
1062 VMOVDQU T0, (oup)
1063 LEAQ (1*16)(inp), inp
1064 LEAQ (1*16)(oup), oup
1065 VPERM2I128 $0x11, AA0, AA0, AA0
1066 VMOVDQA A0, A1
1067
1068 openAVX2ShortDone:
1069 VZEROUPPER
1070 JMP openSSETail16
1071
1072 // ----------------------------------------------------------------------------
1073 // Special optimization for buffers smaller than 321 bytes
1074 openAVX2320:
1075 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
1076 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
1077 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
1078 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
1079 MOVQ $10, itr2
1080
1081 openAVX2320InnerCipherLoop:
1082 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1083 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1084 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1085 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1086 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1087 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1088 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1089 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1090 DECQ itr2
1091 JNE openAVX2320InnerCipherLoop
1092
1093 VMOVDQA ·chacha20Constants<>(SB), TT0
1094 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
1095 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
1096 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
1097 VMOVDQA ·avx2IncMask<>(SB), TT0
1098 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
1099 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
1100 VPADDD TT3, DD2, DD2
1101
1102 // Clamp and store poly key
1103 VPERM2I128 $0x02, AA0, BB0, TT0
1104 VPAND ·polyClampMask<>(SB), TT0, TT0
1105 VMOVDQA TT0, rsStoreAVX2
1106
1107 // Stream for up to 320 bytes
1108 VPERM2I128 $0x13, AA0, BB0, AA0
1109 VPERM2I128 $0x13, CC0, DD0, BB0
1110 VPERM2I128 $0x02, AA1, BB1, CC0
1111 VPERM2I128 $0x02, CC1, DD1, DD0
1112 VPERM2I128 $0x13, AA1, BB1, AA1
1113 VPERM2I128 $0x13, CC1, DD1, BB1
1114 VPERM2I128 $0x02, AA2, BB2, CC1
1115 VPERM2I128 $0x02, CC2, DD2, DD1
1116 VPERM2I128 $0x13, AA2, BB2, AA2
1117 VPERM2I128 $0x13, CC2, DD2, BB2
1118 JMP openAVX2ShortOpen
1119
1120 // ----------------------------------------------------------------------------
1121 // Special optimization for the last 128 bytes of ciphertext
1122 openAVX2Tail128:
1123 // Need to decrypt up to 128 bytes - prepare two blocks
1124 VMOVDQA ·chacha20Constants<>(SB), AA1
1125 VMOVDQA state1StoreAVX2, BB1
1126 VMOVDQA state2StoreAVX2, CC1
1127 VMOVDQA ctr3StoreAVX2, DD1
1128 VPADDD ·avx2IncMask<>(SB), DD1, DD1
1129 VMOVDQA DD1, DD0
1130
1131 XORQ itr2, itr2
1132 MOVQ inl, itr1
1133 ANDQ $-16, itr1
1134 TESTQ itr1, itr1
1135 JE openAVX2Tail128LoopB
1136
1137 openAVX2Tail128LoopA:
1138 // Perform ChaCha rounds, while hashing the remaining input
1139 polyAdd(0(inp)(itr2*1))
1140 polyMulAVX2
1141
1142 openAVX2Tail128LoopB:
1143 ADDQ $16, itr2
1144 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1145 VPALIGNR $4, BB1, BB1, BB1
1146 VPALIGNR $8, CC1, CC1, CC1
1147 VPALIGNR $12, DD1, DD1, DD1
1148 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1149 VPALIGNR $12, BB1, BB1, BB1
1150 VPALIGNR $8, CC1, CC1, CC1
1151 VPALIGNR $4, DD1, DD1, DD1
1152 CMPQ itr2, itr1
1153 JB openAVX2Tail128LoopA
1154 CMPQ itr2, $160
1155 JNE openAVX2Tail128LoopB
1156
1157 VPADDD ·chacha20Constants<>(SB), AA1, AA1
1158 VPADDD state1StoreAVX2, BB1, BB1
1159 VPADDD state2StoreAVX2, CC1, CC1
1160 VPADDD DD0, DD1, DD1
1161 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1162
1163 openAVX2TailLoop:
1164 CMPQ inl, $32
1165 JB openAVX2Tail
1166 SUBQ $32, inl
1167
1168 // Load for decryption
1169 VPXOR (inp), AA0, AA0
1170 VMOVDQU AA0, (oup)
1171 LEAQ (1*32)(inp), inp
1172 LEAQ (1*32)(oup), oup
1173 VMOVDQA BB0, AA0
1174 VMOVDQA CC0, BB0
1175 VMOVDQA DD0, CC0
1176 JMP openAVX2TailLoop
1177
1178 openAVX2Tail:
1179 CMPQ inl, $16
1180 VMOVDQA A0, A1
1181 JB openAVX2TailDone
1182 SUBQ $16, inl
1183
1184 // Load for decryption
1185 VPXOR (inp), A0, T0
1186 VMOVDQU T0, (oup)
1187 LEAQ (1*16)(inp), inp
1188 LEAQ (1*16)(oup), oup
1189 VPERM2I128 $0x11, AA0, AA0, AA0
1190 VMOVDQA A0, A1
1191
1192 openAVX2TailDone:
1193 VZEROUPPER
1194 JMP openSSETail16
1195
1196 // ----------------------------------------------------------------------------
1197 // Special optimization for the last 256 bytes of ciphertext
1198 openAVX2Tail256:
1199 // Need to decrypt up to 256 bytes - prepare four blocks
1200 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
1201 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
1202 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
1203 VMOVDQA ctr3StoreAVX2, DD0
1204 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1205 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1206 VMOVDQA DD0, TT1
1207 VMOVDQA DD1, TT2
1208
1209 // Compute the number of iterations that will hash data
1210 MOVQ inl, tmpStoreAVX2
1211 MOVQ inl, itr1
1212 SUBQ $128, itr1
1213 SHRQ $4, itr1
1214 MOVQ $10, itr2
1215 CMPQ itr1, $10
1216 CMOVQGT itr2, itr1
1217 MOVQ inp, inl
1218 XORQ itr2, itr2
1219
1220 openAVX2Tail256LoopA:
1221 polyAdd(0(inl))
1222 polyMulAVX2
1223 LEAQ 16(inl), inl
1224
1225 // Perform ChaCha rounds, while hashing the remaining input
1226 openAVX2Tail256LoopB:
1227 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1228 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1229 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1230 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1231 INCQ itr2
1232 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1233 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1234 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1235 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1236 CMPQ itr2, itr1
1237 JB openAVX2Tail256LoopA
1238
1239 CMPQ itr2, $10
1240 JNE openAVX2Tail256LoopB
1241
1242 MOVQ inl, itr2
1243 SUBQ inp, inl
1244 MOVQ inl, itr1
1245 MOVQ tmpStoreAVX2, inl
1246
1247 // Hash the remainder of data (if any)
1248 openAVX2Tail256Hash:
1249 ADDQ $16, itr1
1250 CMPQ itr1, inl
1251 JGT openAVX2Tail256HashEnd
	polyAdd(0(itr2))
1253 polyMulAVX2
1254 LEAQ 16(itr2), itr2
1255 JMP openAVX2Tail256Hash
1256
1257 // Store 128 bytes safely, then go to store loop
1258 openAVX2Tail256HashEnd:
1259 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1260 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1261 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1262 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1263 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1264 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1265
1266 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1267 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1268 LEAQ (4*32)(inp), inp
1269 LEAQ (4*32)(oup), oup
1270 SUBQ $4*32, inl
1271
1272 JMP openAVX2TailLoop
1273
1274 // ----------------------------------------------------------------------------
1275 // Special optimization for the last 384 bytes of ciphertext
1276 openAVX2Tail384:
1277 // Need to decrypt up to 384 bytes - prepare six blocks
1278 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1279 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1280 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1281 VMOVDQA ctr3StoreAVX2, DD0
1282 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1283 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1284 VPADDD ·avx2IncMask<>(SB), DD1, DD2
1285 VMOVDQA DD0, ctr0StoreAVX2
1286 VMOVDQA DD1, ctr1StoreAVX2
1287 VMOVDQA DD2, ctr2StoreAVX2
1288
1289 // Compute the number of iterations that will hash two blocks of data
1290 MOVQ inl, tmpStoreAVX2
1291 MOVQ inl, itr1
1292 SUBQ $256, itr1
1293 SHRQ $4, itr1
1294 ADDQ $6, itr1
1295 MOVQ $10, itr2
1296 CMPQ itr1, $10
1297 CMOVQGT itr2, itr1
1298 MOVQ inp, inl
1299 XORQ itr2, itr2
1300
1301 // Perform ChaCha rounds, while hashing the remaining input
1302 openAVX2Tail384LoopB:
1303 polyAdd(0(inl))
1304 polyMulAVX2
1305 LEAQ 16(inl), inl
1306
1307 openAVX2Tail384LoopA:
1308 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1309 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1310 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1311 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1312 polyAdd(0(inl))
1313 polyMulAVX2
1314 LEAQ 16(inl), inl
1315 INCQ itr2
1316 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1317 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1318 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1319 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1320
1321 CMPQ itr2, itr1
1322 JB openAVX2Tail384LoopB
1323
1324 CMPQ itr2, $10
1325 JNE openAVX2Tail384LoopA
1326
1327 MOVQ inl, itr2
1328 SUBQ inp, inl
1329 MOVQ inl, itr1
1330 MOVQ tmpStoreAVX2, inl
1331
1332 openAVX2Tail384Hash:
1333 ADDQ $16, itr1
1334 CMPQ itr1, inl
1335 JGT openAVX2Tail384HashEnd
1336 polyAdd(0(itr2))
1337 polyMulAVX2
1338 LEAQ 16(itr2), itr2
1339 JMP openAVX2Tail384Hash
1340
1341 // Store 256 bytes safely, then go to store loop
1342 openAVX2Tail384HashEnd:
1343 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1344 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1345 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1346 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1347 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1348 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1349 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1350 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1351 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1352 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1353 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1354 LEAQ (8*32)(inp), inp
1355 LEAQ (8*32)(oup), oup
1356 SUBQ $8*32, inl
1357 JMP openAVX2TailLoop
1358
1359 // ----------------------------------------------------------------------------
1360 // Special optimization for the last 512 bytes of ciphertext
1361 openAVX2Tail512:
1362 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1363 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1364 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1365 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1366 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1367 XORQ itr1, itr1
1368 MOVQ inp, itr2
1369
1370 openAVX2Tail512LoopB:
1371 polyAdd(0(itr2))
1372 polyMulAVX2
1373 LEAQ (2*8)(itr2), itr2
1374
1375 openAVX2Tail512LoopA:
1376 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1377 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1378 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1379 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1380 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1381 VMOVDQA CC3, tmpStoreAVX2
1382 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1383 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1384 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1385 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1386 VMOVDQA tmpStoreAVX2, CC3
1387 polyAdd(0*8(itr2))
1388 polyMulAVX2
1389 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1390 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1391 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1392 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1393 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1394 VMOVDQA CC3, tmpStoreAVX2
1395 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1396 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1397 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1398 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1399 VMOVDQA tmpStoreAVX2, CC3
1400 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1401 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1402 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
1403 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1404 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1405 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1406 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1407 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1408 polyAdd(2*8(itr2))
1409 polyMulAVX2
1410 LEAQ (4*8)(itr2), itr2
1411 VMOVDQA CC3, tmpStoreAVX2
1412 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1413 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1414 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1415 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1416 VMOVDQA tmpStoreAVX2, CC3
1417 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1418 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1419 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1420 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1421 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1422 VMOVDQA CC3, tmpStoreAVX2
1423 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1424 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1425 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1426 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1427 VMOVDQA tmpStoreAVX2, CC3
1428 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
1429 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1430 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
1431 INCQ itr1
1432 CMPQ itr1, $4
1433 JLT openAVX2Tail512LoopB
1434
1435 CMPQ itr1, $10
1436 JNE openAVX2Tail512LoopA
1437
1438 MOVQ inl, itr1
1439 SUBQ $384, itr1
1440 ANDQ $-16, itr1
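// The interleaved rounds above hashed the first 384 bytes of this tail; itr1 now holds the number of remaining full 16-byte blocks to hash before the final XOR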
1441
1442 openAVX2Tail512HashLoop:
1443 TESTQ itr1, itr1
1444 JE openAVX2Tail512HashEnd
1445 polyAdd(0(itr2))
1446 polyMulAVX2
1447 LEAQ 16(itr2), itr2
1448 SUBQ $16, itr1
1449 JMP openAVX2Tail512HashLoop
1450
1451 openAVX2Tail512HashEnd:
1452 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
1453 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
1454 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
1455 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
1456 VMOVDQA CC3, tmpStoreAVX2
1457 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
1458 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
1459 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
1460 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1461 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
1462 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
1463 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1464 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
1465 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
1466 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
1467
1468 LEAQ (12*32)(inp), inp
1469 LEAQ (12*32)(oup), oup
1470 SUBQ $12*32, inl
1471
1472 JMP openAVX2TailLoop
1473
1474 // ----------------------------------------------------------------------------
1475 // ----------------------------------------------------------------------------
1476 // func chacha20Poly1305Seal(dst, key, src, ad []byte)
1477 TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
1478 // For aligned stack access
1479 MOVQ SP, BP
1480 ADDQ $32, BP
1481 ANDQ $-32, BP
1482 MOVQ dst+0(FP), oup
1483 MOVQ key+24(FP), keyp
1484 MOVQ src+48(FP), inp
1485 MOVQ src_len+56(FP), inl
1486 MOVQ ad+72(FP), adp
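// Slice arguments are (ptr, len, cap) triples on the frame: the data pointers live at dst+0, key+24, src+48 and ad+72, the lengths at src_len+56 and ad_len+80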
1487
1488 CMPB ·useAVX2(SB), $1
1489 JE chacha20Poly1305Seal_AVX2
1490
1491 // Special optimization for very short buffers
1492 CMPQ inl, $128
1493 JBE sealSSE128 // About 15% faster
1494
1495 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
1496 MOVOU ·chacha20Constants<>(SB), A0
1497 MOVOU (1*16)(keyp), B0
1498 MOVOU (2*16)(keyp), C0
1499 MOVOU (3*16)(keyp), D0
1500
1501 // Store state on stack for future use
1502 MOVO B0, state1Store
1503 MOVO C0, state2Store
1504
1505 // Load state, increment counter blocks
1506 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1507 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1508 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1509
1510 // Store counters
1511 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1512 MOVQ $10, itr2
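// 10 iterations of the double round below, i.e. 20 ChaCha20 rounds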
1513
1514 sealSSEIntroLoop:
1515 MOVO C3, tmpStore
1516 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1517 MOVO tmpStore, C3
1518 MOVO C1, tmpStore
1519 chachaQR(A3, B3, C3, D3, C1)
1520 MOVO tmpStore, C1
1521 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1522 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1523 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1524
1525 MOVO C3, tmpStore
1526 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1527 MOVO tmpStore, C3
1528 MOVO C1, tmpStore
1529 chachaQR(A3, B3, C3, D3, C1)
1530 MOVO tmpStore, C1
1531 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1532 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1533 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1534 DECQ itr2
1535 JNE sealSSEIntroLoop
1536
1537 // Add in the state
1538 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1539 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1540 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1541 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1542
1543 // Clamp and store the key
1544 PAND ·polyClampMask<>(SB), A0
1545 MOVO A0, rStore
1546 MOVO B0, sStore
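// A0 (clamped) is the r part and B0 the s part of the one-time Poly1305 key, taken from the first 32 bytes of the first ChaCha20 block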
1547
1548 // Hash AAD
1549 MOVQ ad_len+80(FP), itr2
1550 CALL polyHashADInternal<>(SB)
1551
1552 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1553 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1554 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
1555 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1556 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1557 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
1558
1559 MOVQ $128, itr1
1560 SUBQ $128, inl
1561 LEAQ 128(inp), inp
1562
1563 MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
1564
1565 CMPQ inl, $64
1566 JBE sealSSE128SealHash
1567
1568 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1569 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1570 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
1571
1572 ADDQ $64, itr1
1573 SUBQ $64, inl
1574 LEAQ 64(inp), inp
1575
1576 MOVQ $2, itr1
1577 MOVQ $8, itr2
1578
1579 CMPQ inl, $64
1580 JBE sealSSETail64
1581 CMPQ inl, $128
1582 JBE sealSSETail128
1583 CMPQ inl, $192
1584 JBE sealSSETail192
1585
1586 sealSSEMainLoop:
1587 // Load state, increment counter blocks
1588 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
1589 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1590 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1591 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1592
1593 // Store counters
1594 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1595
1596 sealSSEInnerLoop:
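// Each pass performs one ChaCha20 double round over the four blocks, interleaved with hashing one 16-byte block of ciphertext already written to oup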
1597 MOVO C3, tmpStore
1598 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1599 MOVO tmpStore, C3
1600 MOVO C1, tmpStore
1601 chachaQR(A3, B3, C3, D3, C1)
1602 MOVO tmpStore, C1
1603 polyAdd(0(oup))
1604 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1605 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1606 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1607 polyMulStage1
1608 polyMulStage2
1609 LEAQ (2*8)(oup), oup
1610 MOVO C3, tmpStore
1611 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1612 MOVO tmpStore, C3
1613 MOVO C1, tmpStore
1614 polyMulStage3
1615 chachaQR(A3, B3, C3, D3, C1)
1616 MOVO tmpStore, C1
1617 polyMulReduceStage
1618 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1619 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1620 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1621 DECQ itr2
1622 JGE sealSSEInnerLoop
1623 polyAdd(0(oup))
1624 polyMul
1625 LEAQ (2*8)(oup), oup
1626 DECQ itr1
1627 JG sealSSEInnerLoop
1628
1629 // Add in the state
1630 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1631 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1632 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1633 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1634 MOVO D3, tmpStore
1635
1636 // Load - xor - store
1637 MOVOU (0*16)(inp), D3; PXOR D3, A0
1638 MOVOU (1*16)(inp), D3; PXOR D3, B0
1639 MOVOU (2*16)(inp), D3; PXOR D3, C0
1640 MOVOU (3*16)(inp), D3; PXOR D3, D0
1641 MOVOU A0, (0*16)(oup)
1642 MOVOU B0, (1*16)(oup)
1643 MOVOU C0, (2*16)(oup)
1644 MOVOU D0, (3*16)(oup)
1645 MOVO tmpStore, D3
1646
1647 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1648 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1649 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1650 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
1651 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1652 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
1653 ADDQ $192, inp
1654 MOVQ $192, itr1
1655 SUBQ $192, inl
1656 MOVO A3, A1
1657 MOVO B3, B1
1658 MOVO C3, C1
1659 MOVO D3, D1
1660 CMPQ inl, $64
1661 JBE sealSSE128SealHash
1662 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1663 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1664 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
1665 LEAQ 64(inp), inp
1666 SUBQ $64, inl
1667 MOVQ $6, itr1
1668 MOVQ $4, itr2
1669 CMPQ inl, $192
1670 JG sealSSEMainLoop
1671
1672 MOVQ inl, itr1
1673 TESTQ inl, inl
1674 JE sealSSE128SealHash
1675 MOVQ $6, itr1
1676 CMPQ inl, $64
1677 JBE sealSSETail64
1678 CMPQ inl, $128
1679 JBE sealSSETail128
1680 JMP sealSSETail192
1681
1682 // ----------------------------------------------------------------------------
1683 // Special optimization for the last 64 bytes of plaintext
1684 sealSSETail64:
1685 // Need to encrypt up to 64 bytes - prepare a single block, hash 192 or 256 bytes
1686 MOVO ·chacha20Constants<>(SB), A1
1687 MOVO state1Store, B1
1688 MOVO state2Store, C1
1689 MOVO ctr3Store, D1
1690 PADDL ·sseIncMask<>(SB), D1
1691 MOVO D1, ctr0Store
1692
1693 sealSSETail64LoopA:
1694 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1695 polyAdd(0(oup))
1696 polyMul
1697 LEAQ 16(oup), oup
1698
1699 sealSSETail64LoopB:
1700 chachaQR(A1, B1, C1, D1, T1)
1701 shiftB1Left; shiftC1Left; shiftD1Left
1702 chachaQR(A1, B1, C1, D1, T1)
1703 shiftB1Right; shiftC1Right; shiftD1Right
1704 polyAdd(0(oup))
1705 polyMul
1706 LEAQ 16(oup), oup
1707
1708 DECQ itr1
1709 JG sealSSETail64LoopA
1710
1711 DECQ itr2
1712 JGE sealSSETail64LoopB
1713 PADDL ·chacha20Constants<>(SB), A1
1714 PADDL state1Store, B1
1715 PADDL state2Store, C1
1716 PADDL ctr0Store, D1
1717
1718 JMP sealSSE128Seal
1719
1720 // ----------------------------------------------------------------------------
1721 // Special optimization for the last 128 bytes of plaintext
1722 sealSSETail128:
1723 // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
1724 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1725 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1726
1727 sealSSETail128LoopA:
1728 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1729 polyAdd(0(oup))
1730 polyMul
1731 LEAQ 16(oup), oup
1732
1733 sealSSETail128LoopB:
1734 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1735 shiftB0Left; shiftC0Left; shiftD0Left
1736 shiftB1Left; shiftC1Left; shiftD1Left
1737 polyAdd(0(oup))
1738 polyMul
1739 LEAQ 16(oup), oup
1740 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1741 shiftB0Right; shiftC0Right; shiftD0Right
1742 shiftB1Right; shiftC1Right; shiftD1Right
1743
1744 DECQ itr1
1745 JG sealSSETail128LoopA
1746
1747 DECQ itr2
1748 JGE sealSSETail128LoopB
1749
1750 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
1751 PADDL state1Store, B0; PADDL state1Store, B1
1752 PADDL state2Store, C0; PADDL state2Store, C1
1753 PADDL ctr0Store, D0; PADDL ctr1Store, D1
1754
1755 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1756 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1757 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1758
1759 MOVQ $64, itr1
1760 LEAQ 64(inp), inp
1761 SUBQ $64, inl
1762
1763 JMP sealSSE128SealHash
1764
1765 // ----------------------------------------------------------------------------
1766 // Special optimization for the last 192 bytes of plaintext
1767 sealSSETail192:
1768 // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
1769 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1770 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1771 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
1772
1773 sealSSETail192LoopA:
1774 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1775 polyAdd(0(oup))
1776 polyMul
1777 LEAQ 16(oup), oup
1778
1779 sealSSETail192LoopB:
1780 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1781 shiftB0Left; shiftC0Left; shiftD0Left
1782 shiftB1Left; shiftC1Left; shiftD1Left
1783 shiftB2Left; shiftC2Left; shiftD2Left
1784
1785 polyAdd(0(oup))
1786 polyMul
1787 LEAQ 16(oup), oup
1788
1789 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1790 shiftB0Right; shiftC0Right; shiftD0Right
1791 shiftB1Right; shiftC1Right; shiftD1Right
1792 shiftB2Right; shiftC2Right; shiftD2Right
1793
1794 DECQ itr1
1795 JG sealSSETail192LoopA
1796
1797 DECQ itr2
1798 JGE sealSSETail192LoopB
1799
1800 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1801 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
1802 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
1803 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
1804
1805 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1806 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1807 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1808 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
1809 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
1810 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1811
1812 MOVO A2, A1
1813 MOVO B2, B1
1814 MOVO C2, C1
1815 MOVO D2, D1
1816 MOVQ $128, itr1
1817 LEAQ 128(inp), inp
1818 SUBQ $128, inl
1819
1820 JMP sealSSE128SealHash
1821
1822 // ----------------------------------------------------------------------------
1823 // Special seal optimization for buffers smaller than 129 bytes
1824 sealSSE128:
1825 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
1826 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
1827 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1828 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1829 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
1830 MOVQ $10, itr2
1831
1832 sealSSE128InnerCipherLoop:
1833 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1834 shiftB0Left; shiftB1Left; shiftB2Left
1835 shiftC0Left; shiftC1Left; shiftC2Left
1836 shiftD0Left; shiftD1Left; shiftD2Left
1837 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1838 shiftB0Right; shiftB1Right; shiftB2Right
1839 shiftC0Right; shiftC1Right; shiftC2Right
1840 shiftD0Right; shiftD1Right; shiftD2Right
1841 DECQ itr2
1842 JNE sealSSE128InnerCipherLoop
1843
1844 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1845 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1846 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
1847 PADDL T2, C1; PADDL T2, C2
1848 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
1849 PAND ·polyClampMask<>(SB), A0
1850 MOVOU A0, rStore
1851 MOVOU B0, sStore
1852
1853 // Hash
1854 MOVQ ad_len+80(FP), itr2
1855 CALL polyHashADInternal<>(SB)
1856 XORQ itr1, itr1
1857
1858 sealSSE128SealHash:
1859 // itr1 holds the number of bytes encrypted but not yet hashed
1860 CMPQ itr1, $16
1861 JB sealSSE128Seal
1862 polyAdd(0(oup))
1863 polyMul
1864
1865 SUBQ $16, itr1
1866 ADDQ $16, oup
1867
1868 JMP sealSSE128SealHash
1869
1870 sealSSE128Seal:
1871 CMPQ inl, $16
1872 JB sealSSETail
1873 SUBQ $16, inl
1874
1875 // Load for encryption
1876 MOVOU (inp), T0
1877 PXOR T0, A1
1878 MOVOU A1, (oup)
1879 LEAQ (1*16)(inp), inp
1880 LEAQ (1*16)(oup), oup
1881
1882 // Extract for hashing
1883 MOVQ A1, t0
1884 PSRLDQ $8, A1
1885 MOVQ A1, t1
1886 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
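// The ADCQ $1 into acc2 sets the 2^128 bit - the padding Poly1305 appends to every full 16-byte block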
1887 polyMul
1888
1889 // Shift the stream "left"
1890 MOVO B1, A1
1891 MOVO C1, B1
1892 MOVO D1, C1
1893 MOVO A2, D1
1894 MOVO B2, A2
1895 MOVO C2, B2
1896 MOVO D2, C2
1897 JMP sealSSE128Seal
1898
1899 sealSSETail:
1900 TESTQ inl, inl
1901 JE sealSSEFinalize
1902
1903 // We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
1904 MOVQ inl, itr2
1905 SHLQ $4, itr2
1906 LEAQ ·andMask<>(SB), t0
1907 MOVQ inl, itr1
1908 LEAQ -1(inp)(inl*1), inp
1909 XORQ t2, t2
1910 XORQ t3, t3
1911 XORQ AX, AX
1912
1913 sealSSETailLoadLoop:
1914 SHLQ $8, t2, t3
1915 SHLQ $8, t2
1916 MOVB (inp), AX
1917 XORQ AX, t2
1918 LEAQ -1(inp), inp
1919 DECQ itr1
1920 JNE sealSSETailLoadLoop
1921 MOVQ t2, 0+tmpStore
1922 MOVQ t3, 8+tmpStore
1923 PXOR 0+tmpStore, A1
1924 MOVOU A1, (oup)
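// A full 16-byte store is fine here: the destination always reserves at least 16 trailing bytes for the tag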
1925 MOVOU -16(t0)(itr2*1), T0
1926 PAND T0, A1
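// Mask the ciphertext block down to inl bytes so only real ciphertext enters the Poly1305 accumulator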
1927 MOVQ A1, t0
1928 PSRLDQ $8, A1
1929 MOVQ A1, t1
1930 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1931 polyMul
1932
1933 ADDQ inl, oup
1934
1935 sealSSEFinalize:
1936 // Hash in the buffer lengths
1937 ADDQ ad_len+80(FP), acc0
1938 ADCQ src_len+56(FP), acc1
1939 ADCQ $1, acc2
1940 polyMul
1941
1942 // Final reduce
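// Compute acc - p, where p = 2^130 - 5; in 64-bit limbs p is (0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3),
// hence the -5/-1/3 constants below. If the subtraction borrows, acc was already below p and the
// original value is restored via CMOVQCS.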
1943 MOVQ acc0, t0
1944 MOVQ acc1, t1
1945 MOVQ acc2, t2
1946 SUBQ $-5, acc0
1947 SBBQ $-1, acc1
1948 SBBQ $3, acc2
1949 CMOVQCS t0, acc0
1950 CMOVQCS t1, acc1
1951 CMOVQCS t2, acc2
1952
1953 // Add in the "s" part of the key
1954 ADDQ 0+sStore, acc0
1955 ADCQ 8+sStore, acc1
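// The tag is (acc + s) mod 2^128 - the carry out of this 128-bit addition is deliberately dropped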
1956
1957 // Finally store the tag at the end of the message
1958 MOVQ acc0, (0*8)(oup)
1959 MOVQ acc1, (1*8)(oup)
1960 RET
1961
1962 // ----------------------------------------------------------------------------
1963 // ------------------------- AVX2 Code ----------------------------------------
1964 chacha20Poly1305Seal_AVX2:
1965 VZEROUPPER
1966 VMOVDQU ·chacha20Constants<>(SB), AA0
1967 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
1968 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
1969 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
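// The BYTE sequences above are hand-encoded VBROADCASTI128 loads, copying the two 16-byte key halves and the
// counter/nonce into both lanes of BB0 (Y14), CC0 (Y12) and DD0 (Y4), presumably because the assembler lacked the mnemonic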
1970 VPADDD ·avx2InitMask<>(SB), DD0, DD0
1971
1972 // Special optimizations for very short buffers
1973 CMPQ inl, $192
1974 JBE seal192AVX2 // 33% faster
1975 CMPQ inl, $320
1976 JBE seal320AVX2 // 17% faster
1977
1978 // For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
1979 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1980 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
1981 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
1982 VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
1983 VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
1984 VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
1985 VMOVDQA DD3, ctr3StoreAVX2
1986 MOVQ $10, itr2
1987
1988 sealAVX2IntroLoop:
1989 VMOVDQA CC3, tmpStoreAVX2
1990 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
1991 VMOVDQA tmpStoreAVX2, CC3
1992 VMOVDQA CC1, tmpStoreAVX2
1993 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
1994 VMOVDQA tmpStoreAVX2, CC1
1995
1996 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
1997 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
1998 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
1999 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2000
2001 VMOVDQA CC3, tmpStoreAVX2
2002 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2003 VMOVDQA tmpStoreAVX2, CC3
2004 VMOVDQA CC1, tmpStoreAVX2
2005 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2006 VMOVDQA tmpStoreAVX2, CC1
2007
2008 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2009 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2010 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2011 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2012 DECQ itr2
2013 JNE sealAVX2IntroLoop
2014
2015 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2016 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2017 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2018 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2019
2020 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
2021 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
2022 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
2023
2024 // Clamp and store poly key
2025 VPAND ·polyClampMask<>(SB), DD0, DD0
2026 VMOVDQA DD0, rsStoreAVX2
2027
2028 // Hash AD
2029 MOVQ ad_len+80(FP), itr2
2030 CALL polyHashADInternal<>(SB)
2031
2032 // Can store at least 320 bytes
2033 VPXOR (0*32)(inp), AA0, AA0
2034 VPXOR (1*32)(inp), CC0, CC0
2035 VMOVDQU AA0, (0*32)(oup)
2036 VMOVDQU CC0, (1*32)(oup)
2037
2038 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2039 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
2040 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
2041 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2042 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
2043 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
2044
2045 MOVQ $320, itr1
2046 SUBQ $320, inl
2047 LEAQ 320(inp), inp
2048
2049 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
2050 CMPQ inl, $128
2051 JBE sealAVX2SealHash
2052
2053 VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
2054 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
2055 SUBQ $128, inl
2056 LEAQ 128(inp), inp
2057
2058 MOVQ $8, itr1
2059 MOVQ $2, itr2
2060
2061 CMPQ inl, $128
2062 JBE sealAVX2Tail128
2063 CMPQ inl, $256
2064 JBE sealAVX2Tail256
2065 CMPQ inl, $384
2066 JBE sealAVX2Tail384
2067 CMPQ inl, $512
2068 JBE sealAVX2Tail512
2069
2070 // We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
2071 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2072 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2073 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2074 VMOVDQA ctr3StoreAVX2, DD0
2075 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2076 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2077
2078 VMOVDQA CC3, tmpStoreAVX2
2079 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2080 VMOVDQA tmpStoreAVX2, CC3
2081 VMOVDQA CC1, tmpStoreAVX2
2082 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2083 VMOVDQA tmpStoreAVX2, CC1
2084
2085 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2086 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2087 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2088 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2089
2090 VMOVDQA CC3, tmpStoreAVX2
2091 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2092 VMOVDQA tmpStoreAVX2, CC3
2093 VMOVDQA CC1, tmpStoreAVX2
2094 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2095 VMOVDQA tmpStoreAVX2, CC1
2096
2097 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2098 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2099 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2100 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2101 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2102 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2103 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2104 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2105 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2106 VMOVDQA CC3, tmpStoreAVX2
2107 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2108 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2109 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2110 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2111 VMOVDQA tmpStoreAVX2, CC3
2112
2113 SUBQ $16, oup // Adjust the pointer
2114 MOVQ $9, itr1
2115 JMP sealAVX2InternalLoopStart
2116
2117 sealAVX2MainLoop:
2118 // Load state, increment counter blocks, store the incremented counters
2119 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2120 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2121 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2122 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2123 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2124 MOVQ $10, itr1
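// 10 double rounds; each iteration hashes 48 bytes of previously written ciphertext (480 bytes total, the remainder is hashed after the loop)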
2125
2126 sealAVX2InternalLoop:
2127 polyAdd(0*8(oup))
2128 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2129 polyMulStage1_AVX2
2130 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2131 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2132 polyMulStage2_AVX2
2133 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2134 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2135 polyMulStage3_AVX2
2136 VMOVDQA CC3, tmpStoreAVX2
2137 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2138 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2139 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2140 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2141 VMOVDQA tmpStoreAVX2, CC3
2142 polyMulReduceStage
2143
2144 sealAVX2InternalLoopStart:
2145 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2146 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2147 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2148 polyAdd(2*8(oup))
2149 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2150 polyMulStage1_AVX2
2151 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2152 VMOVDQA CC3, tmpStoreAVX2
2153 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2154 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2155 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2156 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2157 VMOVDQA tmpStoreAVX2, CC3
2158 polyMulStage2_AVX2
2159 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2160 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2161 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2162 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2163 polyMulStage3_AVX2
2164 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2165 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2166 polyMulReduceStage
2167 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2168 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2169 polyAdd(4*8(oup))
2170 LEAQ (6*8)(oup), oup
2171 VMOVDQA CC3, tmpStoreAVX2
2172 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2173 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2174 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2175 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2176 VMOVDQA tmpStoreAVX2, CC3
2177 polyMulStage1_AVX2
2178 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2179 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2180 polyMulStage2_AVX2
2181 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2182 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2183 polyMulStage3_AVX2
2184 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2185 VMOVDQA CC3, tmpStoreAVX2
2186 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2187 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2188 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2189 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2190 VMOVDQA tmpStoreAVX2, CC3
2191 polyMulReduceStage
2192 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2193 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2194 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2195 DECQ itr1
2196 JNE sealAVX2InternalLoop
2197
2198 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2199 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2200 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2201 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2202 VMOVDQA CC3, tmpStoreAVX2
2203
2204 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
2205 polyAdd(0*8(oup))
2206 polyMulAVX2
2207 LEAQ (4*8)(oup), oup
2208 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
2209 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
2210 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
2211 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2212 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2213 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2214
2215 // and here
2216 polyAdd(-2*8(oup))
2217 polyMulAVX2
2218 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2219 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2220 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2221 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2222 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
2223 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
2224 LEAQ (32*16)(inp), inp
2225 SUBQ $(32*16), inl
2226 CMPQ inl, $512
2227 JG sealAVX2MainLoop
2228
2229 // Tail can only hash 480 bytes
2230 polyAdd(0*8(oup))
2231 polyMulAVX2
2232 polyAdd(2*8(oup))
2233 polyMulAVX2
2234 LEAQ 32(oup), oup
2235
2236 MOVQ $10, itr1
2237 MOVQ $0, itr2
2238 CMPQ inl, $128
2239 JBE sealAVX2Tail128
2240 CMPQ inl, $256
2241 JBE sealAVX2Tail256
2242 CMPQ inl, $384
2243 JBE sealAVX2Tail384
2244 JMP sealAVX2Tail512
2245
2246 // ----------------------------------------------------------------------------
2247 // Special optimization for buffers smaller than 193 bytes
2248 seal192AVX2:
2249 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
2250 VMOVDQA AA0, AA1
2251 VMOVDQA BB0, BB1
2252 VMOVDQA CC0, CC1
2253 VPADDD ·avx2IncMask<>(SB), DD0, DD1
2254 VMOVDQA AA0, AA2
2255 VMOVDQA BB0, BB2
2256 VMOVDQA CC0, CC2
2257 VMOVDQA DD0, DD2
2258 VMOVDQA DD1, TT3
2259 MOVQ $10, itr2
2260
2261 sealAVX2192InnerCipherLoop:
2262 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2263 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2264 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2265 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2266 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2267 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2268 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2269 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2270 DECQ itr2
2271 JNE sealAVX2192InnerCipherLoop
2272 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
2273 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
2274 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
2275 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
2276 VPERM2I128 $0x02, AA0, BB0, TT0
2277
2278 // Clamp and store poly key
2279 VPAND ·polyClampMask<>(SB), TT0, TT0
2280 VMOVDQA TT0, rsStoreAVX2
2281
2282 // Stream for up to 192 bytes
2283 VPERM2I128 $0x13, AA0, BB0, AA0
2284 VPERM2I128 $0x13, CC0, DD0, BB0
2285 VPERM2I128 $0x02, AA1, BB1, CC0
2286 VPERM2I128 $0x02, CC1, DD1, DD0
2287 VPERM2I128 $0x13, AA1, BB1, AA1
2288 VPERM2I128 $0x13, CC1, DD1, BB1
2289
2290 sealAVX2ShortSeal:
2291 // Hash aad
2292 MOVQ ad_len+80(FP), itr2
2293 CALL polyHashADInternal<>(SB)
2294 XORQ itr1, itr1
2295
2296 sealAVX2SealHash:
2297 // itr1 holds the number of bytes encrypted but not yet hashed
2298 CMPQ itr1, $16
2299 JB sealAVX2ShortSealLoop
2300 polyAdd(0(oup))
2301 polyMul
2302 SUBQ $16, itr1
2303 ADDQ $16, oup
2304 JMP sealAVX2SealHash
2305
2306 sealAVX2ShortSealLoop:
2307 CMPQ inl, $32
2308 JB sealAVX2ShortTail32
2309 SUBQ $32, inl
2310
2311 // Load for encryption
2312 VPXOR (inp), AA0, AA0
2313 VMOVDQU AA0, (oup)
2314 LEAQ (1*32)(inp), inp
2315
2316 // Now we can hash
2317 polyAdd(0*8(oup))
2318 polyMulAVX2
2319 polyAdd(2*8(oup))
2320 polyMulAVX2
2321 LEAQ (1*32)(oup), oup
2322
2323 // Shift stream left
2324 VMOVDQA BB0, AA0
2325 VMOVDQA CC0, BB0
2326 VMOVDQA DD0, CC0
2327 VMOVDQA AA1, DD0
2328 VMOVDQA BB1, AA1
2329 VMOVDQA CC1, BB1
2330 VMOVDQA DD1, CC1
2331 VMOVDQA AA2, DD1
2332 VMOVDQA BB2, AA2
2333 JMP sealAVX2ShortSealLoop
2334
2335 sealAVX2ShortTail32:
2336 CMPQ inl, $16
2337 VMOVDQA A0, A1
2338 JB sealAVX2ShortDone
2339
2340 SUBQ $16, inl
2341
2342 // Load for encryption
2343 VPXOR (inp), A0, T0
2344 VMOVDQU T0, (oup)
2345 LEAQ (1*16)(inp), inp
2346
2347 // Hash
2348 polyAdd(0*8(oup))
2349 polyMulAVX2
2350 LEAQ (1*16)(oup), oup
2351 VPERM2I128 $0x11, AA0, AA0, AA0
2352 VMOVDQA A0, A1
2353
2354 sealAVX2ShortDone:
2355 VZEROUPPER
2356 JMP sealSSETail
2357
2358 // ----------------------------------------------------------------------------
2359 // Special optimization for buffers smaller than 321 bytes
2360 seal320AVX2:
2361 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
2362 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
2363 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2364 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
2365 MOVQ $10, itr2
2366
2367 sealAVX2320InnerCipherLoop:
2368 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2369 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2370 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2371 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2372 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2373 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2374 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2375 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2376 DECQ itr2
2377 JNE sealAVX2320InnerCipherLoop
2378
2379 VMOVDQA ·chacha20Constants<>(SB), TT0
2380 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
2381 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
2382 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
2383 VMOVDQA ·avx2IncMask<>(SB), TT0
2384 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
2385 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
2386 VPADDD TT3, DD2, DD2
2387
2388 // Clamp and store poly key
2389 VPERM2I128 $0x02, AA0, BB0, TT0
2390 VPAND ·polyClampMask<>(SB), TT0, TT0
2391 VMOVDQA TT0, rsStoreAVX2
2392
2393 // Stream for up to 320 bytes
2394 VPERM2I128 $0x13, AA0, BB0, AA0
2395 VPERM2I128 $0x13, CC0, DD0, BB0
2396 VPERM2I128 $0x02, AA1, BB1, CC0
2397 VPERM2I128 $0x02, CC1, DD1, DD0
2398 VPERM2I128 $0x13, AA1, BB1, AA1
2399 VPERM2I128 $0x13, CC1, DD1, BB1
2400 VPERM2I128 $0x02, AA2, BB2, CC1
2401 VPERM2I128 $0x02, CC2, DD2, DD1
2402 VPERM2I128 $0x13, AA2, BB2, AA2
2403 VPERM2I128 $0x13, CC2, DD2, BB2
2404 JMP sealAVX2ShortSeal
2405
2406 // ----------------------------------------------------------------------------
2407 // Special optimization for the last 128 bytes of plaintext
2408 sealAVX2Tail128:
2409 // Need to encrypt up to 128 bytes - prepare two blocks
2410 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2411 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2412 VMOVDQA ·chacha20Constants<>(SB), AA0
2413 VMOVDQA state1StoreAVX2, BB0
2414 VMOVDQA state2StoreAVX2, CC0
2415 VMOVDQA ctr3StoreAVX2, DD0
2416 VPADDD ·avx2IncMask<>(SB), DD0, DD0
2417 VMOVDQA DD0, DD1
2418
2419 sealAVX2Tail128LoopA:
2420 polyAdd(0(oup))
2421 polyMul
2422 LEAQ 16(oup), oup
2423
2424 sealAVX2Tail128LoopB:
2425 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2426 polyAdd(0(oup))
2427 polyMul
2428 VPALIGNR $4, BB0, BB0, BB0
2429 VPALIGNR $8, CC0, CC0, CC0
2430 VPALIGNR $12, DD0, DD0, DD0
2431 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2432 polyAdd(16(oup))
2433 polyMul
2434 LEAQ 32(oup), oup
2435 VPALIGNR $12, BB0, BB0, BB0
2436 VPALIGNR $8, CC0, CC0, CC0
2437 VPALIGNR $4, DD0, DD0, DD0
2438 DECQ itr1
2439 JG sealAVX2Tail128LoopA
2440 DECQ itr2
2441 JGE sealAVX2Tail128LoopB
2442
2443 VPADDD ·chacha20Constants<>(SB), AA0, AA1
2444 VPADDD state1StoreAVX2, BB0, BB1
2445 VPADDD state2StoreAVX2, CC0, CC1
2446 VPADDD DD1, DD0, DD1
2447
2448 VPERM2I128 $0x02, AA1, BB1, AA0
2449 VPERM2I128 $0x02, CC1, DD1, BB0
2450 VPERM2I128 $0x13, AA1, BB1, CC0
2451 VPERM2I128 $0x13, CC1, DD1, DD0
2452 JMP sealAVX2ShortSealLoop
2453
2454 // ----------------------------------------------------------------------------
2455 // Special optimization for the last 256 bytes of plaintext
2456 sealAVX2Tail256:
2457 // Need to encrypt up to 256 bytes - prepare two blocks
2458 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2459 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2460 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
2461 VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
2462 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
2463 VMOVDQA ctr3StoreAVX2, DD0
2464 VPADDD ·avx2IncMask<>(SB), DD0, DD0
2465 VPADDD ·avx2IncMask<>(SB), DD0, DD1
2466 VMOVDQA DD0, TT1
2467 VMOVDQA DD1, TT2
2468
2469 sealAVX2Tail256LoopA:
2470 polyAdd(0(oup))
2471 polyMul
2472 LEAQ 16(oup), oup
2473
2474 sealAVX2Tail256LoopB:
2475 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2476 polyAdd(0(oup))
2477 polyMul
2478 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2479 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2480 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2481 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2482 polyAdd(16(oup))
2483 polyMul
2484 LEAQ 32(oup), oup
2485 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2486 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2487 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2488 DECQ itr1
2489 JG sealAVX2Tail256LoopA
2490 DECQ itr2
2491 JGE sealAVX2Tail256LoopB
2492
2493 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
2494 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
2495 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
2496 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
2497 VPERM2I128 $0x02, AA0, BB0, TT0
2498 VPERM2I128 $0x02, CC0, DD0, TT1
2499 VPERM2I128 $0x13, AA0, BB0, TT2
2500 VPERM2I128 $0x13, CC0, DD0, TT3
2501 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2502 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2503 MOVQ $128, itr1
2504 LEAQ 128(inp), inp
2505 SUBQ $128, inl
2506 VPERM2I128 $0x02, AA1, BB1, AA0
2507 VPERM2I128 $0x02, CC1, DD1, BB0
2508 VPERM2I128 $0x13, AA1, BB1, CC0
2509 VPERM2I128 $0x13, CC1, DD1, DD0
2510
2511 JMP sealAVX2SealHash
2512
2513 // ----------------------------------------------------------------------------
2514 // Special optimization for the last 384 bytes of plaintext
2515 sealAVX2Tail384:
2516 // Need to encrypt up to 384 bytes - prepare two blocks
2517 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2518 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2519 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
2520 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
2521 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
2522 VMOVDQA ctr3StoreAVX2, DD0
2523 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2524 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
2525
2526 sealAVX2Tail384LoopA:
2527 polyAdd(0(oup))
2528 polyMul
2529 LEAQ 16(oup), oup
2530
2531 sealAVX2Tail384LoopB:
2532 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2533 polyAdd(0(oup))
2534 polyMul
2535 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2536 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2537 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2538 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2539 polyAdd(16(oup))
2540 polyMul
2541 LEAQ 32(oup), oup
2542 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2543 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2544 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2545 DECQ itr1
2546 JG sealAVX2Tail384LoopA
2547 DECQ itr2
2548 JGE sealAVX2Tail384LoopB
2549
2550 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
2551 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
2552 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
2553 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
2554 VPERM2I128 $0x02, AA0, BB0, TT0
2555 VPERM2I128 $0x02, CC0, DD0, TT1
2556 VPERM2I128 $0x13, AA0, BB0, TT2
2557 VPERM2I128 $0x13, CC0, DD0, TT3
2558 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2559 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2560 VPERM2I128 $0x02, AA1, BB1, TT0
2561 VPERM2I128 $0x02, CC1, DD1, TT1
2562 VPERM2I128 $0x13, AA1, BB1, TT2
2563 VPERM2I128 $0x13, CC1, DD1, TT3
2564 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
2565 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
2566 MOVQ $256, itr1
2567 LEAQ 256(inp), inp
2568 SUBQ $256, inl
2569 VPERM2I128 $0x02, AA2, BB2, AA0
2570 VPERM2I128 $0x02, CC2, DD2, BB0
2571 VPERM2I128 $0x13, AA2, BB2, CC0
2572 VPERM2I128 $0x13, CC2, DD2, DD0
2573
2574 JMP sealAVX2SealHash
2575
2576 // ----------------------------------------------------------------------------
2577 // Special optimization for the last 512 bytes of plaintext
2578 sealAVX2Tail512:
2579 // Need to encrypt up to 512 bytes - prepare two blocks
2580 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2581 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2582 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2583 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2584 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2585 VMOVDQA ctr3StoreAVX2, DD0
2586 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2587 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2588
2589 sealAVX2Tail512LoopA:
2590 polyAdd(0(oup))
2591 polyMul
2592 LEAQ 16(oup), oup
2593
2594 sealAVX2Tail512LoopB:
2595 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2596 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2597 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2598 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2599 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2600 VMOVDQA CC3, tmpStoreAVX2
2601 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2602 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2603 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2604 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2605 VMOVDQA tmpStoreAVX2, CC3
2606 polyAdd(0*8(oup))
2607 polyMulAVX2
2608 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2609 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2610 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2611 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2612 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2613 VMOVDQA CC3, tmpStoreAVX2
2614 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2615 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2616 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2617 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2618 VMOVDQA tmpStoreAVX2, CC3
2619 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2620 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2621 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2622 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2623 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2624 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2625 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2626 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2627 polyAdd(2*8(oup))
2628 polyMulAVX2
2629 LEAQ (4*8)(oup), oup
2630 VMOVDQA CC3, tmpStoreAVX2
2631 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2632 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2633 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2634 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2635 VMOVDQA tmpStoreAVX2, CC3
2636 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2637 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2638 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2639 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2640 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2641 VMOVDQA CC3, tmpStoreAVX2
2642 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2643 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2644 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2645 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2646 VMOVDQA tmpStoreAVX2, CC3
2647 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2648 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2649 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2650
2651 DECQ itr1
2652 JG sealAVX2Tail512LoopA
2653 DECQ itr2
2654 JGE sealAVX2Tail512LoopB
2655
2656 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2657 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2658 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2659 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2660 VMOVDQA CC3, tmpStoreAVX2
2661 VPERM2I128 $0x02, AA0, BB0, CC3
2662 VPXOR (0*32)(inp), CC3, CC3
2663 VMOVDQU CC3, (0*32)(oup)
2664 VPERM2I128 $0x02, CC0, DD0, CC3
2665 VPXOR (1*32)(inp), CC3, CC3
2666 VMOVDQU CC3, (1*32)(oup)
2667 VPERM2I128 $0x13, AA0, BB0, CC3
2668 VPXOR (2*32)(inp), CC3, CC3
2669 VMOVDQU CC3, (2*32)(oup)
2670 VPERM2I128 $0x13, CC0, DD0, CC3
2671 VPXOR (3*32)(inp), CC3, CC3
2672 VMOVDQU CC3, (3*32)(oup)
2673
2674 VPERM2I128 $0x02, AA1, BB1, AA0
2675 VPERM2I128 $0x02, CC1, DD1, BB0
2676 VPERM2I128 $0x13, AA1, BB1, CC0
2677 VPERM2I128 $0x13, CC1, DD1, DD0
2678 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2679 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2680
2681 VPERM2I128 $0x02, AA2, BB2, AA0
2682 VPERM2I128 $0x02, CC2, DD2, BB0
2683 VPERM2I128 $0x13, AA2, BB2, CC0
2684 VPERM2I128 $0x13, CC2, DD2, DD0
2685 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2686 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2687
2688 MOVQ $384, itr1
2689 LEAQ 384(inp), inp
2690 SUBQ $384, inl
2691 VPERM2I128 $0x02, AA3, BB3, AA0
2692 VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
2693 VPERM2I128 $0x13, AA3, BB3, CC0
2694 VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2695
2696 JMP sealAVX2SealHash
2697