Text file src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
     6  
     7  //go:build gc && !purego
     8  // +build gc,!purego
     9  
    10  #include "textflag.h"
    11  // General register allocation
    12  #define oup DI
    13  #define inp SI
    14  #define inl BX
     15  #define adp CX // free to reuse after we hash the additional data
     16  #define keyp R8 // free to reuse once we copy the key to the stack
    17  #define itr2 R9 // general iterator
    18  #define itr1 CX // general iterator
    19  #define acc0 R10
    20  #define acc1 R11
    21  #define acc2 R12
    22  #define t0 R13
    23  #define t1 R14
    24  #define t2 R15
    25  #define t3 R8
    26  // Register and stack allocation for the SSE code
    27  #define rStore (0*16)(BP)
    28  #define sStore (1*16)(BP)
    29  #define state1Store (2*16)(BP)
    30  #define state2Store (3*16)(BP)
    31  #define tmpStore (4*16)(BP)
    32  #define ctr0Store (5*16)(BP)
    33  #define ctr1Store (6*16)(BP)
    34  #define ctr2Store (7*16)(BP)
    35  #define ctr3Store (8*16)(BP)
    36  #define A0 X0
    37  #define A1 X1
    38  #define A2 X2
    39  #define B0 X3
    40  #define B1 X4
    41  #define B2 X5
    42  #define C0 X6
    43  #define C1 X7
    44  #define C2 X8
    45  #define D0 X9
    46  #define D1 X10
    47  #define D2 X11
    48  #define T0 X12
    49  #define T1 X13
    50  #define T2 X14
    51  #define T3 X15
    52  #define A3 T0
    53  #define B3 T1
    54  #define C3 T2
    55  #define D3 T3
    56  // Register and stack allocation for the AVX2 code
    57  #define rsStoreAVX2 (0*32)(BP)
    58  #define state1StoreAVX2 (1*32)(BP)
    59  #define state2StoreAVX2 (2*32)(BP)
    60  #define ctr0StoreAVX2 (3*32)(BP)
    61  #define ctr1StoreAVX2 (4*32)(BP)
    62  #define ctr2StoreAVX2 (5*32)(BP)
    63  #define ctr3StoreAVX2 (6*32)(BP)
    64  #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
    65  #define AA0 Y0
    66  #define AA1 Y5
    67  #define AA2 Y6
    68  #define AA3 Y7
    69  #define BB0 Y14
    70  #define BB1 Y9
    71  #define BB2 Y10
    72  #define BB3 Y11
    73  #define CC0 Y12
    74  #define CC1 Y13
    75  #define CC2 Y8
    76  #define CC3 Y15
    77  #define DD0 Y4
    78  #define DD1 Y1
    79  #define DD2 Y2
    80  #define DD3 Y3
    81  #define TT0 DD3
    82  #define TT1 AA3
    83  #define TT2 BB3
    84  #define TT3 CC3
    85  // ChaCha20 constants
    86  DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
    87  DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
    88  DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
    89  DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
    90  DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
    91  DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
    92  DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
    93  DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
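// The four constant words are the ASCII string "expand 32-byte k" read as
// little-endian 32-bit words. A quick Go check of that fact (illustrative,
// not part of this file):
//
//	package main
//
//	import (
//		"encoding/binary"
//		"fmt"
//	)
//
//	func main() {
//		words := []uint32{0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}
//		buf := make([]byte, 16)
//		for i, w := range words {
//			binary.LittleEndian.PutUint32(buf[4*i:], w)
//		}
//		fmt.Printf("%s\n", buf) // prints: expand 32-byte k
//	}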
    94  // <<< 16 with PSHUFB
    95  DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
    96  DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
    97  DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
    98  DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
    99  // <<< 8 with PSHUFB
   100  DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
   101  DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   102  DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
   103  DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
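// The rol16/rol8 tables are PSHUFB byte-shuffle masks that rotate every
// 32-bit lane left by 16 or 8 bits. A small Go sketch showing that the rol8
// mask is equivalent to a plain rotate of each lane (helper name is
// illustrative):
//
//	import "encoding/binary"
//
//	func rol8ViaShuffle(x uint32) uint32 {
//		mask := [4]byte{3, 0, 1, 2} // per-lane byte sources from ·rol8<>
//		var in, out [4]byte
//		binary.LittleEndian.PutUint32(in[:], x)
//		for i, src := range mask {
//			out[i] = in[src]
//		}
//		return binary.LittleEndian.Uint32(out[:]) // == bits.RotateLeft32(x, 8)
//	}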
   104  
   105  DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
   106  DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
   107  DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
   108  DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
   109  
   110  DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
   111  DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
   112  DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
   113  DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
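// Each YMM register holds two consecutive ChaCha20 blocks, one per 128-bit
// lane. avx2InitMask puts the high lane one block counter ahead of the low
// lane; each avx2IncMask addition then advances both lanes by two blocks.
// In scalar terms (illustrative helper):
//
//	// laneCounters: block counters in the low/high lanes after the init mask
//	// and n applications of the inc mask, starting from counter c.
//	func laneCounters(c, n uint32) (low, high uint32) {
//		return c + 2*n, c + 1 + 2*n
//	}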
   114  // Poly1305 key clamp
   115  DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
   116  DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
   117  DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   118  DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
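// The low 16 bytes clamp the Poly1305 "r" value as in RFC 8439; the all-ones
// high 16 bytes leave the "s" half untouched in the AVX2 path, where r and s
// share one YMM register. On two 64-bit limbs the clamp is simply (sketch):
//
//	func clampR(r0, r1 uint64) (uint64, uint64) {
//		return r0 & 0x0ffffffc0fffffff, r1 & 0x0ffffffc0ffffffc
//	}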
   119  
   120  DATA ·sseIncMask<>+0x00(SB)/8, $0x1
   121  DATA ·sseIncMask<>+0x08(SB)/8, $0x0
   122  // To load/store the last < 16 bytes in a buffer
   123  DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
   124  DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
   125  DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
   126  DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
   127  DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
   128  DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
   129  DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
   130  DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
   131  DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
   132  DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
   133  DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
   134  DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
   135  DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
   136  DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
   137  DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
   138  DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
   139  DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
   140  DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
   141  DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
   142  DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
   143  DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
   144  DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
   145  DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
   146  DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
   147  DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
   148  DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
   149  DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
   150  DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
   151  DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
   152  DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
   153  
   154  GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
   155  GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
   156  GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
   157  GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
   158  GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
   159  GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
   160  GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
   161  GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
   162  // No PALIGNR in Go ASM yet (but VPALIGNR is present).
   163  #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
   164  #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
   165  #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
   166  #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
   167  #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
   168  #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
   169  #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
   170  #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
   171  #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
   172  #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
   173  #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
   174  #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
   175  #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
   176  #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
   177  #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
   178  #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
   179  #define shiftC0Right shiftC0Left
   180  #define shiftC1Right shiftC1Left
   181  #define shiftC2Right shiftC2Left
   182  #define shiftC3Right shiftC3Left
   183  #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
   184  #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
   185  #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
   186  #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
   187  // Some macros
   188  #define chachaQR(A, B, C, D, T) \
   189  	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
   190  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
   191  	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
   192  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
   193  
   194  #define chachaQR_AVX2(A, B, C, D, T) \
   195  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
   196  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
   197  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
   198  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
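// Both macros compute the standard ChaCha20 quarter round (RFC 8439). A
// scalar Go sketch of one 32-bit lane, without SIMD (illustrative):
//
//	import "math/bits"
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}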
   199  
   200  #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
   201  #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
   202  #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
   203  #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
   204  #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
   205  
   206  #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
   207  #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
   208  #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
   209  
   210  #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
   211  #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
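// Together, polyAdd and polyMul perform one Poly1305 step: fold a 16-byte
// block (plus the 2^128 padding bit) into the 130-bit accumulator held in
// acc0/acc1/acc2, then multiply by the clamped key r and reduce modulo
// 2^130 - 5 using the identity 2^130 ≡ 5. A Go sketch of the same arithmetic
// (helper name and limb layout are illustrative):
//
//	import (
//		"encoding/binary"
//		"math/bits"
//	)
//
//	func polyStep(h0, h1, h2, r0, r1 uint64, block []byte) (uint64, uint64, uint64) {
//		// polyAdd: h += block, plus the padding bit at 2^128.
//		var c uint64
//		h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(block[0:8]), 0)
//		h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(block[8:16]), c)
//		h2 += c + 1
//
//		// polyMulStage1-3: schoolbook product of (h2:h1:h0) by (r1:r0).
//		h0r0hi, h0r0lo := bits.Mul64(h0, r0)
//		h1r0hi, h1r0lo := bits.Mul64(h1, r0)
//		h0r1hi, h0r1lo := bits.Mul64(h0, r1)
//		h1r1hi, h1r1lo := bits.Mul64(h1, r1)
//		t0 := h0r0lo
//		t1, c := bits.Add64(h0r0hi, h1r0lo, 0)
//		t2, _ := bits.Add64(h1r0hi, h2*r0, c) // no carry out: h2 is small and r is clamped
//		t1, c = bits.Add64(t1, h0r1lo, 0)
//		t2, c = bits.Add64(t2, h0r1hi, c)
//		t3 := h2*r1 + c
//		t2, c = bits.Add64(t2, h1r1lo, 0)
//		t3 += h1r1hi + c
//
//		// polyMulReduceStage: fold the part above 2^130 back in as 5*cc
//		// (added once as 4*cc and once as cc).
//		h0, h1, h2 = t0, t1, t2&3
//		h0, c = bits.Add64(h0, t2&^3, 0)
//		h1, c = bits.Add64(h1, t3, c)
//		h2 += c
//		h0, c = bits.Add64(h0, t2>>2|t3<<62, 0)
//		h1, c = bits.Add64(h1, t3>>2, c)
//		h2 += c
//		return h0, h1, h2
//	}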
   212  // ----------------------------------------------------------------------------
   213  TEXT polyHashADInternal<>(SB), NOSPLIT, $0
   214  	// adp points to beginning of additional data
   215  	// itr2 holds ad length
   216  	XORQ acc0, acc0
   217  	XORQ acc1, acc1
   218  	XORQ acc2, acc2
   219  	CMPQ itr2, $13
   220  	JNE  hashADLoop
   221  
   222  openFastTLSAD:
   223  	// Special treatment for the TLS case of 13 bytes
   224  	MOVQ (adp), acc0
   225  	MOVQ 5(adp), acc1
   226  	SHRQ $24, acc1
   227  	MOVQ $1, acc2
   228  	polyMul
   229  	RET
   230  
   231  hashADLoop:
   232  	// Hash in 16 byte chunks
   233  	CMPQ itr2, $16
   234  	JB   hashADTail
   235  	polyAdd(0(adp))
   236  	LEAQ (1*16)(adp), adp
   237  	SUBQ $16, itr2
   238  	polyMul
   239  	JMP  hashADLoop
   240  
   241  hashADTail:
   242  	CMPQ itr2, $0
   243  	JE   hashADDone
   244  
   245  	// Hash last < 16 byte tail
   246  	XORQ t0, t0
   247  	XORQ t1, t1
   248  	XORQ t2, t2
   249  	ADDQ itr2, adp
   250  
   251  hashADTailLoop:
   252  	SHLQ $8, t0, t1
   253  	SHLQ $8, t0
   254  	MOVB -1(adp), t2
   255  	XORQ t2, t0
   256  	DECQ adp
   257  	DECQ itr2
   258  	JNE  hashADTailLoop
   259  
   260  hashADTailFinish:
   261  	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   262  	polyMul
   263  
   264  	// Finished AD
   265  hashADDone:
   266  	RET
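// In Go terms, this routine absorbs the additional data in 16-byte blocks and
// zero-pads the final partial block, with a shortcut for the 13-byte AD used
// by TLS records. A sketch in terms of the illustrative polyStep helper
// described earlier:
//
//	func hashAD(h0, h1, h2, r0, r1 uint64, ad []byte) (uint64, uint64, uint64) {
//		for len(ad) >= 16 {
//			h0, h1, h2 = polyStep(h0, h1, h2, r0, r1, ad[:16])
//			ad = ad[16:]
//		}
//		if len(ad) > 0 {
//			var block [16]byte // zero-padded tail
//			copy(block[:], ad)
//			h0, h1, h2 = polyStep(h0, h1, h2, r0, r1, block[:])
//		}
//		return h0, h1, h2
//	}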
   267  
   268  // ----------------------------------------------------------------------------
   269  // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
   270  TEXT ·chacha20Poly1305Open(SB), 0, $288-97
   271  	// For aligned stack access
   272  	MOVQ SP, BP
   273  	ADDQ $32, BP
   274  	ANDQ $-32, BP
   275  	MOVQ dst+0(FP), oup
   276  	MOVQ key+24(FP), keyp
   277  	MOVQ src+48(FP), inp
   278  	MOVQ src_len+56(FP), inl
   279  	MOVQ ad+72(FP), adp
   280  
   281  	// Check for AVX2 support
   282  	CMPB ·useAVX2(SB), $1
   283  	JE   chacha20Poly1305Open_AVX2
   284  
    285  	// Special optimization for very short buffers
   286  	CMPQ inl, $128
   287  	JBE  openSSE128 // About 16% faster
   288  
   289  	// For long buffers, prepare the poly key first
   290  	MOVOU ·chacha20Constants<>(SB), A0
   291  	MOVOU (1*16)(keyp), B0
   292  	MOVOU (2*16)(keyp), C0
   293  	MOVOU (3*16)(keyp), D0
   294  	MOVO  D0, T1
   295  
   296  	// Store state on stack for future use
   297  	MOVO B0, state1Store
   298  	MOVO C0, state2Store
   299  	MOVO D0, ctr3Store
   300  	MOVQ $10, itr2
   301  
   302  openSSEPreparePolyKey:
   303  	chachaQR(A0, B0, C0, D0, T0)
   304  	shiftB0Left;  shiftC0Left; shiftD0Left
   305  	chachaQR(A0, B0, C0, D0, T0)
   306  	shiftB0Right; shiftC0Right; shiftD0Right
   307  	DECQ          itr2
   308  	JNE           openSSEPreparePolyKey
   309  
   310  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   311  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
   312  
   313  	// Clamp and store the key
   314  	PAND ·polyClampMask<>(SB), A0
   315  	MOVO A0, rStore; MOVO B0, sStore
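// At this point the first ChaCha20 block (run with the initial counter) has
// been reduced to the 32-byte one-time Poly1305 key: A0 holds the clamped "r"
// half and B0 the "s" half. A high-level sketch of the same derivation, using
// the golang.org/x/crypto/chacha20 package with a 32-byte key and 12-byte
// nonce (illustrative; the clamp itself is applied separately, as above):
//
//	import "golang.org/x/crypto/chacha20"
//
//	func polyKey(key, nonce []byte) ([32]byte, error) {
//		var pk [32]byte
//		s, err := chacha20.NewUnauthenticatedCipher(key, nonce)
//		if err != nil {
//			return pk, err
//		}
//		s.XORKeyStream(pk[:], pk[:]) // keystream of block 0 over 32 zero bytes
//		return pk, nil
//	}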
   316  
   317  	// Hash AAD
   318  	MOVQ ad_len+80(FP), itr2
   319  	CALL polyHashADInternal<>(SB)
   320  
   321  openSSEMainLoop:
   322  	CMPQ inl, $256
   323  	JB   openSSEMainLoopDone
   324  
   325  	// Load state, increment counter blocks
   326  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   327  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   328  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   329  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   330  
   331  	// Store counters
   332  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   333  
   334  	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
   335  	MOVQ $4, itr1
   336  	MOVQ inp, itr2
   337  
   338  openSSEInternalLoop:
   339  	MOVO          C3, tmpStore
   340  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   341  	MOVO          tmpStore, C3
   342  	MOVO          C1, tmpStore
   343  	chachaQR(A3, B3, C3, D3, C1)
   344  	MOVO          tmpStore, C1
   345  	polyAdd(0(itr2))
   346  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   347  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   348  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   349  	polyMulStage1
   350  	polyMulStage2
   351  	LEAQ          (2*8)(itr2), itr2
   352  	MOVO          C3, tmpStore
   353  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   354  	MOVO          tmpStore, C3
   355  	MOVO          C1, tmpStore
   356  	polyMulStage3
   357  	chachaQR(A3, B3, C3, D3, C1)
   358  	MOVO          tmpStore, C1
   359  	polyMulReduceStage
   360  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   361  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   362  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   363  	DECQ          itr1
   364  	JGE           openSSEInternalLoop
   365  
   366  	polyAdd(0(itr2))
   367  	polyMul
   368  	LEAQ (2*8)(itr2), itr2
   369  
   370  	CMPQ itr1, $-6
   371  	JG   openSSEInternalLoop
   372  
   373  	// Add in the state
   374  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   375  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   376  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   377  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   378  
   379  	// Load - xor - store
   380  	MOVO  D3, tmpStore
   381  	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
   382  	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
   383  	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
   384  	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
   385  	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
   386  	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
   387  	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
   388  	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
   389  	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
   390  	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
   391  	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
   392  	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
   393  	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
   394  	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
   395  	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
   396  	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
   397  	LEAQ  256(inp), inp
   398  	LEAQ  256(oup), oup
   399  	SUBQ  $256, inl
   400  	JMP   openSSEMainLoop
   401  
   402  openSSEMainLoopDone:
   403  	// Handle the various tail sizes efficiently
   404  	TESTQ inl, inl
   405  	JE    openSSEFinalize
   406  	CMPQ  inl, $64
   407  	JBE   openSSETail64
   408  	CMPQ  inl, $128
   409  	JBE   openSSETail128
   410  	CMPQ  inl, $192
   411  	JBE   openSSETail192
   412  	JMP   openSSETail256
   413  
   414  openSSEFinalize:
   415  	// Hash in the PT, AAD lengths
   416  	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
   417  	polyMul
   418  
   419  	// Final reduce
   420  	MOVQ    acc0, t0
   421  	MOVQ    acc1, t1
   422  	MOVQ    acc2, t2
   423  	SUBQ    $-5, acc0
   424  	SBBQ    $-1, acc1
   425  	SBBQ    $3, acc2
   426  	CMOVQCS t0, acc0
   427  	CMOVQCS t1, acc1
   428  	CMOVQCS t2, acc2
   429  
   430  	// Add in the "s" part of the key
   431  	ADDQ 0+sStore, acc0
   432  	ADCQ 8+sStore, acc1
   433  
   434  	// Finally, constant time compare to the tag at the end of the message
   435  	XORQ    AX, AX
   436  	MOVQ    $1, DX
   437  	XORQ    (0*8)(inp), acc0
   438  	XORQ    (1*8)(inp), acc1
   439  	ORQ     acc1, acc0
   440  	CMOVQEQ DX, AX
   441  
   442  	// Return true iff tags are equal
   443  	MOVB AX, ret+96(FP)
   444  	RET
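// The finalize step above hashes the length block (AD length and plaintext
// length as two little-endian uint64s), reduces the accumulator modulo
// 2^130 - 5 (the SUBQ/SBBQ/CMOVQCS sequence keeps h - p only when the
// subtraction does not borrow), adds the "s" half of the key mod 2^128, and
// compares against the received tag in constant time. A Go sketch of that
// tail, using the illustrative limb layout from the polyStep helper:
//
//	import (
//		"crypto/subtle"
//		"encoding/binary"
//		"math/bits"
//	)
//
//	func finalize(h0, h1, h2, s0, s1 uint64, tag []byte) bool {
//		// Conditionally subtract p = 2^130 - 5 without branching.
//		t0, b := bits.Sub64(h0, 0xFFFFFFFFFFFFFFFB, 0)
//		t1, b := bits.Sub64(h1, 0xFFFFFFFFFFFFFFFF, b)
//		_, b = bits.Sub64(h2, 3, b)
//		mask := b - 1 // all ones if h >= p, else zero
//		h0 = h0&^mask | t0&mask
//		h1 = h1&^mask | t1&mask
//
//		// Add the "s" half of the one-time key (mod 2^128).
//		var c uint64
//		h0, c = bits.Add64(h0, s0, 0)
//		h1, _ = bits.Add64(h1, s1, c)
//
//		// Constant-time comparison with the 16-byte tag.
//		var sum [16]byte
//		binary.LittleEndian.PutUint64(sum[0:8], h0)
//		binary.LittleEndian.PutUint64(sum[8:16], h1)
//		return subtle.ConstantTimeCompare(sum[:], tag[:16]) == 1
//	}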
   445  
   446  // ----------------------------------------------------------------------------
   447  // Special optimization for buffers smaller than 129 bytes
   448  openSSE128:
    449  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
   450  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
   451  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   452  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   453  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
   454  	MOVQ  $10, itr2
   455  
   456  openSSE128InnerCipherLoop:
   457  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   458  	shiftB0Left;  shiftB1Left; shiftB2Left
   459  	shiftC0Left;  shiftC1Left; shiftC2Left
   460  	shiftD0Left;  shiftD1Left; shiftD2Left
   461  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   462  	shiftB0Right; shiftB1Right; shiftB2Right
   463  	shiftC0Right; shiftC1Right; shiftC2Right
   464  	shiftD0Right; shiftD1Right; shiftD2Right
   465  	DECQ          itr2
   466  	JNE           openSSE128InnerCipherLoop
   467  
   468  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   469  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   470  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
   471  	PADDL T2, C1; PADDL T2, C2
   472  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
   473  
   474  	// Clamp and store the key
   475  	PAND  ·polyClampMask<>(SB), A0
   476  	MOVOU A0, rStore; MOVOU B0, sStore
   477  
   478  	// Hash
   479  	MOVQ ad_len+80(FP), itr2
   480  	CALL polyHashADInternal<>(SB)
   481  
   482  openSSE128Open:
   483  	CMPQ inl, $16
   484  	JB   openSSETail16
   485  	SUBQ $16, inl
   486  
   487  	// Load for hashing
   488  	polyAdd(0(inp))
   489  
   490  	// Load for decryption
   491  	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
   492  	LEAQ  (1*16)(inp), inp
   493  	LEAQ  (1*16)(oup), oup
   494  	polyMul
   495  
   496  	// Shift the stream "left"
   497  	MOVO B1, A1
   498  	MOVO C1, B1
   499  	MOVO D1, C1
   500  	MOVO A2, D1
   501  	MOVO B2, A2
   502  	MOVO C2, B2
   503  	MOVO D2, C2
   504  	JMP  openSSE128Open
   505  
   506  openSSETail16:
   507  	TESTQ inl, inl
   508  	JE    openSSEFinalize
   509  
   510  	// We can safely load the CT from the end, because it is padded with the MAC
   511  	MOVQ   inl, itr2
   512  	SHLQ   $4, itr2
   513  	LEAQ   ·andMask<>(SB), t0
   514  	MOVOU  (inp), T0
   515  	ADDQ   inl, inp
   516  	PAND   -16(t0)(itr2*1), T0
   517  	MOVO   T0, 0+tmpStore
   518  	MOVQ   T0, t0
   519  	MOVQ   8+tmpStore, t1
   520  	PXOR   A1, T0
   521  
   522  	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
   523  openSSETail16Store:
   524  	MOVQ T0, t3
   525  	MOVB t3, (oup)
   526  	PSRLDQ $1, T0
   527  	INCQ   oup
   528  	DECQ   inl
   529  	JNE    openSSETail16Store
   530  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   531  	polyMul
   532  	JMP    openSSEFinalize
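// The andMask entry picked above keeps only the first inl bytes of the final
// over-read (safe because the 16-byte tag follows the ciphertext). A Go
// formulation of the same tail handling, written with an explicit zero-padded
// copy instead of the masked load (helper name is illustrative):
//
//	import "encoding/binary"
//
//	// openTail16 hashes the last 1..15 ciphertext bytes and decrypts them
//	// with the remaining keystream; lo/hi then enter the accumulator with
//	// the usual 2^128 padding bit, as in the code above.
//	func openTail16(dst, ct, keystream []byte) (lo, hi uint64) {
//		var block [16]byte
//		copy(block[:], ct) // zero padded, like PAND with the andMask entry
//		for i := range ct {
//			dst[i] = block[i] ^ keystream[i]
//		}
//		return binary.LittleEndian.Uint64(block[0:8]), binary.LittleEndian.Uint64(block[8:16])
//	}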
   533  
   534  // ----------------------------------------------------------------------------
   535  // Special optimization for the last 64 bytes of ciphertext
   536  openSSETail64:
   537  	// Need to decrypt up to 64 bytes - prepare single block
   538  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   539  	XORQ itr2, itr2
   540  	MOVQ inl, itr1
   541  	CMPQ itr1, $16
   542  	JB   openSSETail64LoopB
   543  
   544  openSSETail64LoopA:
   545  	// Perform ChaCha rounds, while hashing the remaining input
   546  	polyAdd(0(inp)(itr2*1))
   547  	polyMul
   548  	SUBQ $16, itr1
   549  
   550  openSSETail64LoopB:
   551  	ADDQ          $16, itr2
   552  	chachaQR(A0, B0, C0, D0, T0)
   553  	shiftB0Left;  shiftC0Left; shiftD0Left
   554  	chachaQR(A0, B0, C0, D0, T0)
   555  	shiftB0Right; shiftC0Right; shiftD0Right
   556  
   557  	CMPQ itr1, $16
   558  	JAE  openSSETail64LoopA
   559  
   560  	CMPQ itr2, $160
   561  	JNE  openSSETail64LoopB
   562  
   563  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
   564  
   565  openSSETail64DecLoop:
   566  	CMPQ  inl, $16
   567  	JB    openSSETail64DecLoopDone
   568  	SUBQ  $16, inl
   569  	MOVOU (inp), T0
   570  	PXOR  T0, A0
   571  	MOVOU A0, (oup)
   572  	LEAQ  16(inp), inp
   573  	LEAQ  16(oup), oup
   574  	MOVO  B0, A0
   575  	MOVO  C0, B0
   576  	MOVO  D0, C0
   577  	JMP   openSSETail64DecLoop
   578  
   579  openSSETail64DecLoopDone:
   580  	MOVO A0, A1
   581  	JMP  openSSETail16
   582  
   583  // ----------------------------------------------------------------------------
   584  // Special optimization for the last 128 bytes of ciphertext
   585  openSSETail128:
   586  	// Need to decrypt up to 128 bytes - prepare two blocks
   587  	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
   588  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
   589  	XORQ itr2, itr2
   590  	MOVQ inl, itr1
   591  	ANDQ $-16, itr1
   592  
   593  openSSETail128LoopA:
   594  	// Perform ChaCha rounds, while hashing the remaining input
   595  	polyAdd(0(inp)(itr2*1))
   596  	polyMul
   597  
   598  openSSETail128LoopB:
   599  	ADDQ          $16, itr2
   600  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   601  	shiftB0Left;  shiftC0Left; shiftD0Left
   602  	shiftB1Left;  shiftC1Left; shiftD1Left
   603  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   604  	shiftB0Right; shiftC0Right; shiftD0Right
   605  	shiftB1Right; shiftC1Right; shiftD1Right
   606  
   607  	CMPQ itr2, itr1
   608  	JB   openSSETail128LoopA
   609  
   610  	CMPQ itr2, $160
   611  	JNE  openSSETail128LoopB
   612  
   613  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
   614  	PADDL state1Store, B0; PADDL state1Store, B1
   615  	PADDL state2Store, C0; PADDL state2Store, C1
   616  	PADDL ctr1Store, D0; PADDL ctr0Store, D1
   617  
   618  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   619  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   620  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
   621  
   622  	SUBQ $64, inl
   623  	LEAQ 64(inp), inp
   624  	LEAQ 64(oup), oup
   625  	JMP  openSSETail64DecLoop
   626  
   627  // ----------------------------------------------------------------------------
   628  // Special optimization for the last 192 bytes of ciphertext
   629  openSSETail192:
   630  	// Need to decrypt up to 192 bytes - prepare three blocks
   631  	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
   632  	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   633  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
   634  
   635  	MOVQ    inl, itr1
   636  	MOVQ    $160, itr2
   637  	CMPQ    itr1, $160
   638  	CMOVQGT itr2, itr1
   639  	ANDQ    $-16, itr1
   640  	XORQ    itr2, itr2
   641  
   642  openSSLTail192LoopA:
   643  	// Perform ChaCha rounds, while hashing the remaining input
   644  	polyAdd(0(inp)(itr2*1))
   645  	polyMul
   646  
   647  openSSLTail192LoopB:
   648  	ADDQ         $16, itr2
   649  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   650  	shiftB0Left; shiftC0Left; shiftD0Left
   651  	shiftB1Left; shiftC1Left; shiftD1Left
   652  	shiftB2Left; shiftC2Left; shiftD2Left
   653  
   654  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   655  	shiftB0Right; shiftC0Right; shiftD0Right
   656  	shiftB1Right; shiftC1Right; shiftD1Right
   657  	shiftB2Right; shiftC2Right; shiftD2Right
   658  
   659  	CMPQ itr2, itr1
   660  	JB   openSSLTail192LoopA
   661  
   662  	CMPQ itr2, $160
   663  	JNE  openSSLTail192LoopB
   664  
   665  	CMPQ inl, $176
   666  	JB   openSSLTail192Store
   667  
   668  	polyAdd(160(inp))
   669  	polyMul
   670  
   671  	CMPQ inl, $192
   672  	JB   openSSLTail192Store
   673  
   674  	polyAdd(176(inp))
   675  	polyMul
   676  
   677  openSSLTail192Store:
   678  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   679  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
   680  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
   681  	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
   682  
   683  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   684  	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
   685  	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
   686  
   687  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
   688  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   689  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   690  
   691  	SUBQ $128, inl
   692  	LEAQ 128(inp), inp
   693  	LEAQ 128(oup), oup
   694  	JMP  openSSETail64DecLoop
   695  
   696  // ----------------------------------------------------------------------------
   697  // Special optimization for the last 256 bytes of ciphertext
   698  openSSETail256:
   699  	// Need to decrypt up to 256 bytes - prepare four blocks
   700  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   701  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   702  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   703  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   704  
   705  	// Store counters
   706  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   707  	XORQ itr2, itr2
   708  
   709  openSSETail256Loop:
    710  	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
   711  	polyAdd(0(inp)(itr2*1))
   712  	MOVO          C3, tmpStore
   713  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   714  	MOVO          tmpStore, C3
   715  	MOVO          C1, tmpStore
   716  	chachaQR(A3, B3, C3, D3, C1)
   717  	MOVO          tmpStore, C1
   718  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   719  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   720  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   721  	polyMulStage1
   722  	polyMulStage2
   723  	MOVO          C3, tmpStore
   724  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   725  	MOVO          tmpStore, C3
   726  	MOVO          C1, tmpStore
   727  	chachaQR(A3, B3, C3, D3, C1)
   728  	MOVO          tmpStore, C1
   729  	polyMulStage3
   730  	polyMulReduceStage
   731  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   732  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   733  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   734  	ADDQ          $2*8, itr2
   735  	CMPQ          itr2, $160
   736  	JB            openSSETail256Loop
   737  	MOVQ          inl, itr1
   738  	ANDQ          $-16, itr1
   739  
   740  openSSETail256HashLoop:
   741  	polyAdd(0(inp)(itr2*1))
   742  	polyMul
   743  	ADDQ $2*8, itr2
   744  	CMPQ itr2, itr1
   745  	JB   openSSETail256HashLoop
   746  
   747  	// Add in the state
   748  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   749  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   750  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   751  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   752  	MOVO  D3, tmpStore
   753  
   754  	// Load - xor - store
   755  	MOVOU (0*16)(inp), D3; PXOR D3, A0
   756  	MOVOU (1*16)(inp), D3; PXOR D3, B0
   757  	MOVOU (2*16)(inp), D3; PXOR D3, C0
   758  	MOVOU (3*16)(inp), D3; PXOR D3, D0
   759  	MOVOU A0, (0*16)(oup)
   760  	MOVOU B0, (1*16)(oup)
   761  	MOVOU C0, (2*16)(oup)
   762  	MOVOU D0, (3*16)(oup)
   763  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   764  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   765  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   766  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
   767  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   768  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
   769  	LEAQ  192(inp), inp
   770  	LEAQ  192(oup), oup
   771  	SUBQ  $192, inl
   772  	MOVO  A3, A0
   773  	MOVO  B3, B0
   774  	MOVO  C3, C0
   775  	MOVO  tmpStore, D0
   776  
   777  	JMP openSSETail64DecLoop
   778  
   779  // ----------------------------------------------------------------------------
   780  // ------------------------- AVX2 Code ----------------------------------------
   781  chacha20Poly1305Open_AVX2:
   782  	VZEROUPPER
   783  	VMOVDQU ·chacha20Constants<>(SB), AA0
   784  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
   785  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
   786  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
   787  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
   788  
    789  	// Special optimization for very short buffers
   790  	CMPQ inl, $192
   791  	JBE  openAVX2192
   792  	CMPQ inl, $320
   793  	JBE  openAVX2320
   794  
    795  	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
   796  	VMOVDQA BB0, state1StoreAVX2
   797  	VMOVDQA CC0, state2StoreAVX2
   798  	VMOVDQA DD0, ctr3StoreAVX2
   799  	MOVQ    $10, itr2
   800  
   801  openAVX2PreparePolyKey:
   802  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   803  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   804  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   805  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   806  	DECQ     itr2
   807  	JNE      openAVX2PreparePolyKey
   808  
   809  	VPADDD ·chacha20Constants<>(SB), AA0, AA0
   810  	VPADDD state1StoreAVX2, BB0, BB0
   811  	VPADDD state2StoreAVX2, CC0, CC0
   812  	VPADDD ctr3StoreAVX2, DD0, DD0
   813  
   814  	VPERM2I128 $0x02, AA0, BB0, TT0
   815  
   816  	// Clamp and store poly key
   817  	VPAND   ·polyClampMask<>(SB), TT0, TT0
   818  	VMOVDQA TT0, rsStoreAVX2
   819  
   820  	// Stream for the first 64 bytes
   821  	VPERM2I128 $0x13, AA0, BB0, AA0
   822  	VPERM2I128 $0x13, CC0, DD0, BB0
   823  
   824  	// Hash AD + first 64 bytes
   825  	MOVQ ad_len+80(FP), itr2
   826  	CALL polyHashADInternal<>(SB)
   827  	XORQ itr1, itr1
   828  
   829  openAVX2InitialHash64:
   830  	polyAdd(0(inp)(itr1*1))
   831  	polyMulAVX2
   832  	ADDQ $16, itr1
   833  	CMPQ itr1, $64
   834  	JNE  openAVX2InitialHash64
   835  
   836  	// Decrypt the first 64 bytes
   837  	VPXOR   (0*32)(inp), AA0, AA0
   838  	VPXOR   (1*32)(inp), BB0, BB0
   839  	VMOVDQU AA0, (0*32)(oup)
   840  	VMOVDQU BB0, (1*32)(oup)
   841  	LEAQ    (2*32)(inp), inp
   842  	LEAQ    (2*32)(oup), oup
   843  	SUBQ    $64, inl
   844  
   845  openAVX2MainLoop:
   846  	CMPQ inl, $512
   847  	JB   openAVX2MainLoopDone
   848  
   849  	// Load state, increment counter blocks, store the incremented counters
   850  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   851  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   852  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   853  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
   854  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   855  	XORQ    itr1, itr1
   856  
   857  openAVX2InternalLoop:
    858  	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
   859  	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
   860  	polyAdd(0*8(inp)(itr1*1))
   861  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   862  	polyMulStage1_AVX2
   863  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   864  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   865  	polyMulStage2_AVX2
   866  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   867  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   868  	polyMulStage3_AVX2
   869  	VMOVDQA  CC3, tmpStoreAVX2
   870  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   871  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   872  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   873  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   874  	VMOVDQA  tmpStoreAVX2, CC3
   875  	polyMulReduceStage
   876  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   877  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   878  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   879  	polyAdd(2*8(inp)(itr1*1))
   880  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   881  	polyMulStage1_AVX2
   882  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   883  	VMOVDQA  CC3, tmpStoreAVX2
   884  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   885  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   886  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   887  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   888  	VMOVDQA  tmpStoreAVX2, CC3
   889  	polyMulStage2_AVX2
   890  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   891  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   892  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   893  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   894  	polyMulStage3_AVX2
   895  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   896  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   897  	polyMulReduceStage
   898  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   899  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   900  	polyAdd(4*8(inp)(itr1*1))
   901  	LEAQ     (6*8)(itr1), itr1
   902  	VMOVDQA  CC3, tmpStoreAVX2
   903  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   904  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   905  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   906  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   907  	VMOVDQA  tmpStoreAVX2, CC3
   908  	polyMulStage1_AVX2
   909  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   910  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   911  	polyMulStage2_AVX2
   912  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   913  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   914  	polyMulStage3_AVX2
   915  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   916  	VMOVDQA  CC3, tmpStoreAVX2
   917  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   918  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   919  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   920  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   921  	VMOVDQA  tmpStoreAVX2, CC3
   922  	polyMulReduceStage
   923  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   924  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   925  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   926  	CMPQ     itr1, $480
   927  	JNE      openAVX2InternalLoop
   928  
   929  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
   930  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   931  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   932  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   933  	VMOVDQA CC3, tmpStoreAVX2
   934  
   935  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
   936  	polyAdd(480(inp))
   937  	polyMulAVX2
   938  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
   939  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
   940  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
   941  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   942  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   943  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   944  
   945  	// and here
   946  	polyAdd(496(inp))
   947  	polyMulAVX2
   948  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   949  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   950  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   951  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   952  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
   953  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
   954  	LEAQ       (32*16)(inp), inp
   955  	LEAQ       (32*16)(oup), oup
   956  	SUBQ       $(32*16), inl
   957  	JMP        openAVX2MainLoop
   958  
   959  openAVX2MainLoopDone:
   960  	// Handle the various tail sizes efficiently
   961  	TESTQ inl, inl
   962  	JE    openSSEFinalize
   963  	CMPQ  inl, $128
   964  	JBE   openAVX2Tail128
   965  	CMPQ  inl, $256
   966  	JBE   openAVX2Tail256
   967  	CMPQ  inl, $384
   968  	JBE   openAVX2Tail384
   969  	JMP   openAVX2Tail512
   970  
   971  // ----------------------------------------------------------------------------
   972  // Special optimization for buffers smaller than 193 bytes
   973  openAVX2192:
   974  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
   975  	VMOVDQA AA0, AA1
   976  	VMOVDQA BB0, BB1
   977  	VMOVDQA CC0, CC1
   978  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
   979  	VMOVDQA AA0, AA2
   980  	VMOVDQA BB0, BB2
   981  	VMOVDQA CC0, CC2
   982  	VMOVDQA DD0, DD2
   983  	VMOVDQA DD1, TT3
   984  	MOVQ    $10, itr2
   985  
   986  openAVX2192InnerCipherLoop:
   987  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   988  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   989  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   990  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   991  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   992  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   993  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   994  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   995  	DECQ       itr2
   996  	JNE        openAVX2192InnerCipherLoop
   997  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
   998  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
   999  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  1000  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  1001  	VPERM2I128 $0x02, AA0, BB0, TT0
  1002  
  1003  	// Clamp and store poly key
  1004  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  1005  	VMOVDQA TT0, rsStoreAVX2
  1006  
  1007  	// Stream for up to 192 bytes
  1008  	VPERM2I128 $0x13, AA0, BB0, AA0
  1009  	VPERM2I128 $0x13, CC0, DD0, BB0
  1010  	VPERM2I128 $0x02, AA1, BB1, CC0
  1011  	VPERM2I128 $0x02, CC1, DD1, DD0
  1012  	VPERM2I128 $0x13, AA1, BB1, AA1
  1013  	VPERM2I128 $0x13, CC1, DD1, BB1
  1014  
  1015  openAVX2ShortOpen:
  1016  	// Hash
  1017  	MOVQ ad_len+80(FP), itr2
  1018  	CALL polyHashADInternal<>(SB)
  1019  
  1020  openAVX2ShortOpenLoop:
  1021  	CMPQ inl, $32
  1022  	JB   openAVX2ShortTail32
  1023  	SUBQ $32, inl
  1024  
  1025  	// Load for hashing
  1026  	polyAdd(0*8(inp))
  1027  	polyMulAVX2
  1028  	polyAdd(2*8(inp))
  1029  	polyMulAVX2
  1030  
  1031  	// Load for decryption
  1032  	VPXOR   (inp), AA0, AA0
  1033  	VMOVDQU AA0, (oup)
  1034  	LEAQ    (1*32)(inp), inp
  1035  	LEAQ    (1*32)(oup), oup
  1036  
  1037  	// Shift stream left
  1038  	VMOVDQA BB0, AA0
  1039  	VMOVDQA CC0, BB0
  1040  	VMOVDQA DD0, CC0
  1041  	VMOVDQA AA1, DD0
  1042  	VMOVDQA BB1, AA1
  1043  	VMOVDQA CC1, BB1
  1044  	VMOVDQA DD1, CC1
  1045  	VMOVDQA AA2, DD1
  1046  	VMOVDQA BB2, AA2
  1047  	JMP     openAVX2ShortOpenLoop
  1048  
  1049  openAVX2ShortTail32:
  1050  	CMPQ    inl, $16
  1051  	VMOVDQA A0, A1
  1052  	JB      openAVX2ShortDone
  1053  
  1054  	SUBQ $16, inl
  1055  
  1056  	// Load for hashing
  1057  	polyAdd(0*8(inp))
  1058  	polyMulAVX2
  1059  
  1060  	// Load for decryption
  1061  	VPXOR      (inp), A0, T0
  1062  	VMOVDQU    T0, (oup)
  1063  	LEAQ       (1*16)(inp), inp
  1064  	LEAQ       (1*16)(oup), oup
  1065  	VPERM2I128 $0x11, AA0, AA0, AA0
  1066  	VMOVDQA    A0, A1
  1067  
  1068  openAVX2ShortDone:
  1069  	VZEROUPPER
  1070  	JMP openSSETail16
  1071  
  1072  // ----------------------------------------------------------------------------
  1073  // Special optimization for buffers smaller than 321 bytes
  1074  openAVX2320:
  1075  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  1076  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1077  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1078  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  1079  	MOVQ    $10, itr2
  1080  
  1081  openAVX2320InnerCipherLoop:
  1082  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1083  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1084  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1085  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1086  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1087  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1088  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1089  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1090  	DECQ     itr2
  1091  	JNE      openAVX2320InnerCipherLoop
  1092  
  1093  	VMOVDQA ·chacha20Constants<>(SB), TT0
  1094  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  1095  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  1096  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  1097  	VMOVDQA ·avx2IncMask<>(SB), TT0
  1098  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  1099  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  1100  	VPADDD  TT3, DD2, DD2
  1101  
  1102  	// Clamp and store poly key
  1103  	VPERM2I128 $0x02, AA0, BB0, TT0
  1104  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  1105  	VMOVDQA    TT0, rsStoreAVX2
  1106  
  1107  	// Stream for up to 320 bytes
  1108  	VPERM2I128 $0x13, AA0, BB0, AA0
  1109  	VPERM2I128 $0x13, CC0, DD0, BB0
  1110  	VPERM2I128 $0x02, AA1, BB1, CC0
  1111  	VPERM2I128 $0x02, CC1, DD1, DD0
  1112  	VPERM2I128 $0x13, AA1, BB1, AA1
  1113  	VPERM2I128 $0x13, CC1, DD1, BB1
  1114  	VPERM2I128 $0x02, AA2, BB2, CC1
  1115  	VPERM2I128 $0x02, CC2, DD2, DD1
  1116  	VPERM2I128 $0x13, AA2, BB2, AA2
  1117  	VPERM2I128 $0x13, CC2, DD2, BB2
  1118  	JMP        openAVX2ShortOpen
  1119  
  1120  // ----------------------------------------------------------------------------
  1121  // Special optimization for the last 128 bytes of ciphertext
  1122  openAVX2Tail128:
  1123  	// Need to decrypt up to 128 bytes - prepare two blocks
  1124  	VMOVDQA ·chacha20Constants<>(SB), AA1
  1125  	VMOVDQA state1StoreAVX2, BB1
  1126  	VMOVDQA state2StoreAVX2, CC1
  1127  	VMOVDQA ctr3StoreAVX2, DD1
  1128  	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
  1129  	VMOVDQA DD1, DD0
  1130  
  1131  	XORQ  itr2, itr2
  1132  	MOVQ  inl, itr1
  1133  	ANDQ  $-16, itr1
  1134  	TESTQ itr1, itr1
  1135  	JE    openAVX2Tail128LoopB
  1136  
  1137  openAVX2Tail128LoopA:
  1138  	// Perform ChaCha rounds, while hashing the remaining input
  1139  	polyAdd(0(inp)(itr2*1))
  1140  	polyMulAVX2
  1141  
  1142  openAVX2Tail128LoopB:
  1143  	ADDQ     $16, itr2
  1144  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1145  	VPALIGNR $4, BB1, BB1, BB1
  1146  	VPALIGNR $8, CC1, CC1, CC1
  1147  	VPALIGNR $12, DD1, DD1, DD1
  1148  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1149  	VPALIGNR $12, BB1, BB1, BB1
  1150  	VPALIGNR $8, CC1, CC1, CC1
  1151  	VPALIGNR $4, DD1, DD1, DD1
  1152  	CMPQ     itr2, itr1
  1153  	JB       openAVX2Tail128LoopA
  1154  	CMPQ     itr2, $160
  1155  	JNE      openAVX2Tail128LoopB
  1156  
  1157  	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
  1158  	VPADDD     state1StoreAVX2, BB1, BB1
  1159  	VPADDD     state2StoreAVX2, CC1, CC1
  1160  	VPADDD     DD0, DD1, DD1
  1161  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1162  
  1163  openAVX2TailLoop:
  1164  	CMPQ inl, $32
  1165  	JB   openAVX2Tail
  1166  	SUBQ $32, inl
  1167  
  1168  	// Load for decryption
  1169  	VPXOR   (inp), AA0, AA0
  1170  	VMOVDQU AA0, (oup)
  1171  	LEAQ    (1*32)(inp), inp
  1172  	LEAQ    (1*32)(oup), oup
  1173  	VMOVDQA BB0, AA0
  1174  	VMOVDQA CC0, BB0
  1175  	VMOVDQA DD0, CC0
  1176  	JMP     openAVX2TailLoop
  1177  
  1178  openAVX2Tail:
  1179  	CMPQ    inl, $16
  1180  	VMOVDQA A0, A1
  1181  	JB      openAVX2TailDone
  1182  	SUBQ    $16, inl
  1183  
  1184  	// Load for decryption
  1185  	VPXOR      (inp), A0, T0
  1186  	VMOVDQU    T0, (oup)
  1187  	LEAQ       (1*16)(inp), inp
  1188  	LEAQ       (1*16)(oup), oup
  1189  	VPERM2I128 $0x11, AA0, AA0, AA0
  1190  	VMOVDQA    A0, A1
  1191  
  1192  openAVX2TailDone:
  1193  	VZEROUPPER
  1194  	JMP openSSETail16
  1195  
  1196  // ----------------------------------------------------------------------------
  1197  // Special optimization for the last 256 bytes of ciphertext
  1198  openAVX2Tail256:
  1199  	// Need to decrypt up to 256 bytes - prepare four blocks
  1200  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1201  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1202  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1203  	VMOVDQA ctr3StoreAVX2, DD0
  1204  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1205  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1206  	VMOVDQA DD0, TT1
  1207  	VMOVDQA DD1, TT2
  1208  
  1209  	// Compute the number of iterations that will hash data
  1210  	MOVQ    inl, tmpStoreAVX2
  1211  	MOVQ    inl, itr1
  1212  	SUBQ    $128, itr1
  1213  	SHRQ    $4, itr1
  1214  	MOVQ    $10, itr2
  1215  	CMPQ    itr1, $10
  1216  	CMOVQGT itr2, itr1
  1217  	MOVQ    inp, inl
  1218  	XORQ    itr2, itr2
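// The setup above computes how many of the 10 ChaCha double-round iterations
// should also absorb a 16-byte block of ciphertext: the number of blocks
// beyond the first 128 bytes, capped at 10. In Go terms (illustrative):
//
//	func hashIters(inl int) int {
//		n := (inl - 128) / 16
//		if n > 10 {
//			n = 10
//		}
//		return n
//	}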
  1219  
  1220  openAVX2Tail256LoopA:
  1221  	polyAdd(0(inl))
  1222  	polyMulAVX2
  1223  	LEAQ 16(inl), inl
  1224  
  1225  	// Perform ChaCha rounds, while hashing the remaining input
  1226  openAVX2Tail256LoopB:
  1227  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1228  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1229  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1230  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1231  	INCQ     itr2
  1232  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1233  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1234  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1235  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1236  	CMPQ     itr2, itr1
  1237  	JB       openAVX2Tail256LoopA
  1238  
  1239  	CMPQ itr2, $10
  1240  	JNE  openAVX2Tail256LoopB
  1241  
  1242  	MOVQ inl, itr2
  1243  	SUBQ inp, inl
  1244  	MOVQ inl, itr1
  1245  	MOVQ tmpStoreAVX2, inl
  1246  
  1247  	// Hash the remainder of data (if any)
  1248  openAVX2Tail256Hash:
  1249  	ADDQ $16, itr1
  1250  	CMPQ itr1, inl
  1251  	JGT  openAVX2Tail256HashEnd
   1252  	polyAdd(0(itr2))
  1253  	polyMulAVX2
  1254  	LEAQ 16(itr2), itr2
  1255  	JMP  openAVX2Tail256Hash
  1256  
  1257  // Store 128 bytes safely, then go to store loop
  1258  openAVX2Tail256HashEnd:
  1259  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1260  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1261  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1262  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1263  	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1264  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1265  
  1266  	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1267  	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1268  	LEAQ    (4*32)(inp), inp
  1269  	LEAQ    (4*32)(oup), oup
  1270  	SUBQ    $4*32, inl
  1271  
  1272  	JMP openAVX2TailLoop
  1273  
  1274  // ----------------------------------------------------------------------------
  1275  // Special optimization for the last 384 bytes of ciphertext
  1276  openAVX2Tail384:
  1277  	// Need to decrypt up to 384 bytes - prepare six blocks
  1278  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1279  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1280  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1281  	VMOVDQA ctr3StoreAVX2, DD0
  1282  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1283  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1284  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
  1285  	VMOVDQA DD0, ctr0StoreAVX2
  1286  	VMOVDQA DD1, ctr1StoreAVX2
  1287  	VMOVDQA DD2, ctr2StoreAVX2
  1288  
  1289  	// Compute the number of iterations that will hash two blocks of data
  1290  	MOVQ    inl, tmpStoreAVX2
  1291  	MOVQ    inl, itr1
  1292  	SUBQ    $256, itr1
  1293  	SHRQ    $4, itr1
  1294  	ADDQ    $6, itr1
  1295  	MOVQ    $10, itr2
  1296  	CMPQ    itr1, $10
  1297  	CMOVQGT itr2, itr1
  1298  	MOVQ    inp, inl
  1299  	XORQ    itr2, itr2
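        	// itr1 = min((inl-256)/16 + 6, 10): how many double-round iterations absorb an extra
        	// 16-byte block via openAVX2Tail384LoopB (every pass through openAVX2Tail384LoopA
        	// absorbs one block regardless); the leftovers are hashed in openAVX2Tail384Hash.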
  1300  
  1301  	// Perform ChaCha rounds, while hashing the remaining input
  1302  openAVX2Tail384LoopB:
  1303  	polyAdd(0(inl))
  1304  	polyMulAVX2
  1305  	LEAQ 16(inl), inl
  1306  
  1307  openAVX2Tail384LoopA:
  1308  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1309  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1310  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1311  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1312  	polyAdd(0(inl))
  1313  	polyMulAVX2
  1314  	LEAQ     16(inl), inl
  1315  	INCQ     itr2
  1316  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1317  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1318  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1319  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1320  
  1321  	CMPQ itr2, itr1
  1322  	JB   openAVX2Tail384LoopB
  1323  
  1324  	CMPQ itr2, $10
  1325  	JNE  openAVX2Tail384LoopA
  1326  
  1327  	MOVQ inl, itr2
  1328  	SUBQ inp, inl
  1329  	MOVQ inl, itr1
  1330  	MOVQ tmpStoreAVX2, inl
  1331  
  1332  openAVX2Tail384Hash:
  1333  	ADDQ $16, itr1
  1334  	CMPQ itr1, inl
  1335  	JGT  openAVX2Tail384HashEnd
  1336  	polyAdd(0(itr2))
  1337  	polyMulAVX2
  1338  	LEAQ 16(itr2), itr2
  1339  	JMP  openAVX2Tail384Hash
  1340  
  1341  // Store 256 bytes safely, then go to store loop
  1342  openAVX2Tail384HashEnd:
  1343  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1344  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1345  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1346  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1347  	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1348  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1349  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1350  	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1351  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1352  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1353  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1354  	LEAQ       (8*32)(inp), inp
  1355  	LEAQ       (8*32)(oup), oup
  1356  	SUBQ       $8*32, inl
  1357  	JMP        openAVX2TailLoop
  1358  
  1359  // ----------------------------------------------------------------------------
  1360  // Special optimization for the last 512 bytes of ciphertext
  1361  openAVX2Tail512:
  1362  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1363  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1364  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1365  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1366  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1367  	XORQ    itr1, itr1
  1368  	MOVQ    inp, itr2
  1369  
  1370  openAVX2Tail512LoopB:
  1371  	polyAdd(0(itr2))
  1372  	polyMulAVX2
  1373  	LEAQ (2*8)(itr2), itr2
  1374  
  1375  openAVX2Tail512LoopA:
  1376  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1377  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1378  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1379  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1380  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1381  	VMOVDQA  CC3, tmpStoreAVX2
  1382  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1383  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1384  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1385  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1386  	VMOVDQA  tmpStoreAVX2, CC3
  1387  	polyAdd(0*8(itr2))
  1388  	polyMulAVX2
  1389  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1390  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1391  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1392  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1393  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1394  	VMOVDQA  CC3, tmpStoreAVX2
  1395  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1396  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1397  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1398  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1399  	VMOVDQA  tmpStoreAVX2, CC3
  1400  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1401  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1402  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1403  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1404  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1405  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1406  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1407  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1408  	polyAdd(2*8(itr2))
  1409  	polyMulAVX2
  1410  	LEAQ     (4*8)(itr2), itr2
  1411  	VMOVDQA  CC3, tmpStoreAVX2
  1412  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1413  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1414  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1415  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1416  	VMOVDQA  tmpStoreAVX2, CC3
  1417  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1418  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1419  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1420  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1421  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1422  	VMOVDQA  CC3, tmpStoreAVX2
  1423  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1424  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1425  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1426  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1427  	VMOVDQA  tmpStoreAVX2, CC3
  1428  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1429  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1430  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1431  	INCQ     itr1
  1432  	CMPQ     itr1, $4
  1433  	JLT      openAVX2Tail512LoopB
  1434  
  1435  	CMPQ itr1, $10
  1436  	JNE  openAVX2Tail512LoopA
  1437  
  1438  	MOVQ inl, itr1
  1439  	SUBQ $384, itr1
  1440  	ANDQ $-16, itr1
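        	// The rounds above already absorbed the first 384 bytes of the remaining ciphertext
        	// (4 iterations hash 48 bytes each, the other 6 hash 32 bytes each); itr1 is what is
        	// left, rounded down to whole 16-byte blocks, and is hashed in the loop below.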
  1441  
  1442  openAVX2Tail512HashLoop:
  1443  	TESTQ itr1, itr1
  1444  	JE    openAVX2Tail512HashEnd
  1445  	polyAdd(0(itr2))
  1446  	polyMulAVX2
  1447  	LEAQ  16(itr2), itr2
  1448  	SUBQ  $16, itr1
  1449  	JMP   openAVX2Tail512HashLoop
  1450  
  1451  openAVX2Tail512HashEnd:
  1452  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1453  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1454  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1455  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1456  	VMOVDQA    CC3, tmpStoreAVX2
  1457  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1458  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1459  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1460  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1461  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1462  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1463  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1464  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1465  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1466  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1467  
  1468  	LEAQ (12*32)(inp), inp
  1469  	LEAQ (12*32)(oup), oup
  1470  	SUBQ $12*32, inl
  1471  
  1472  	JMP openAVX2TailLoop
  1473  
  1474  // ----------------------------------------------------------------------------
  1475  // ----------------------------------------------------------------------------
  1476  // func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1477  TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1478  	// For aligned stack access
  1479  	MOVQ SP, BP
  1480  	ADDQ $32, BP
  1481  	ANDQ $-32, BP
  1482  	MOVQ dst+0(FP), oup
  1483  	MOVQ key+24(FP), keyp
  1484  	MOVQ src+48(FP), inp
  1485  	MOVQ src_len+56(FP), inl
  1486  	MOVQ ad+72(FP), adp
  1487  
  1488  	CMPB ·useAVX2(SB), $1
  1489  	JE   chacha20Poly1305Seal_AVX2
  1490  
  1491  	// Special optimization for very short buffers
  1492  	CMPQ inl, $128
  1493  	JBE  sealSSE128 // About 15% faster
  1494  
  1495  	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  1496  	MOVOU ·chacha20Constants<>(SB), A0
  1497  	MOVOU (1*16)(keyp), B0
  1498  	MOVOU (2*16)(keyp), C0
  1499  	MOVOU (3*16)(keyp), D0
  1500  
  1501  	// Store state on stack for future use
  1502  	MOVO B0, state1Store
  1503  	MOVO C0, state2Store
  1504  
  1505  	// Load state, increment counter blocks
  1506  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1507  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1508  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1509  
  1510  	// Store counters
  1511  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1512  	MOVQ $10, itr2
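        	// Each iteration below is one ChaCha double round on all four blocks: a column round,
        	// a rotation of the B/C/D rows into diagonal position (shift*Left), a diagonal round,
        	// and the inverse rotation (shift*Right).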
  1513  
  1514  sealSSEIntroLoop:
  1515  	MOVO         C3, tmpStore
  1516  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1517  	MOVO         tmpStore, C3
  1518  	MOVO         C1, tmpStore
  1519  	chachaQR(A3, B3, C3, D3, C1)
  1520  	MOVO         tmpStore, C1
  1521  	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1522  	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1523  	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1524  
  1525  	MOVO          C3, tmpStore
  1526  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1527  	MOVO          tmpStore, C3
  1528  	MOVO          C1, tmpStore
  1529  	chachaQR(A3, B3, C3, D3, C1)
  1530  	MOVO          tmpStore, C1
  1531  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1532  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1533  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1534  	DECQ          itr2
  1535  	JNE           sealSSEIntroLoop
  1536  
  1537  	// Add in the state
  1538  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1539  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1540  	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1541  	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1542  
  1543  	// Clamp and store the key
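        	// A0 holds "r" and B0 holds "s" of the Poly1305 key; the clamp clears the bits of r
        	// that Poly1305 requires to be zero (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff).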
  1544  	PAND ·polyClampMask<>(SB), A0
  1545  	MOVO A0, rStore
  1546  	MOVO B0, sStore
  1547  
  1548  	// Hash AAD
  1549  	MOVQ ad_len+80(FP), itr2
  1550  	CALL polyHashADInternal<>(SB)
  1551  
  1552  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1553  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1554  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1555  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1556  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1557  	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1558  
  1559  	MOVQ $128, itr1
  1560  	SUBQ $128, inl
  1561  	LEAQ 128(inp), inp
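        	// itr1 now tracks ciphertext that has been written but not yet hashed; it is absorbed
        	// later by sealSSE128SealHash.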
  1562  
  1563  	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1564  
  1565  	CMPQ inl, $64
  1566  	JBE  sealSSE128SealHash
  1567  
  1568  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1569  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1570  	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1571  
  1572  	ADDQ $64, itr1
  1573  	SUBQ $64, inl
  1574  	LEAQ 64(inp), inp
  1575  
  1576  	MOVQ $2, itr1
  1577  	MOVQ $8, itr2
  1578  
  1579  	CMPQ inl, $64
  1580  	JBE  sealSSETail64
  1581  	CMPQ inl, $128
  1582  	JBE  sealSSETail128
  1583  	CMPQ inl, $192
  1584  	JBE  sealSSETail192
  1585  
  1586  sealSSEMainLoop:
  1587  	// Load state, increment counter blocks
  1588  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1589  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1590  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1591  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1592  
  1593  	// Store counters
  1594  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1595  
  1596  sealSSEInnerLoop:
  1597  	MOVO          C3, tmpStore
  1598  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1599  	MOVO          tmpStore, C3
  1600  	MOVO          C1, tmpStore
  1601  	chachaQR(A3, B3, C3, D3, C1)
  1602  	MOVO          tmpStore, C1
  1603  	polyAdd(0(oup))
  1604  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
  1605  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
  1606  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
  1607  	polyMulStage1
  1608  	polyMulStage2
  1609  	LEAQ          (2*8)(oup), oup
  1610  	MOVO          C3, tmpStore
  1611  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1612  	MOVO          tmpStore, C3
  1613  	MOVO          C1, tmpStore
  1614  	polyMulStage3
  1615  	chachaQR(A3, B3, C3, D3, C1)
  1616  	MOVO          tmpStore, C1
  1617  	polyMulReduceStage
  1618  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1619  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1620  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1621  	DECQ          itr2
  1622  	JGE           sealSSEInnerLoop
  1623  	polyAdd(0(oup))
  1624  	polyMul
  1625  	LEAQ          (2*8)(oup), oup
  1626  	DECQ          itr1
  1627  	JG            sealSSEInnerLoop
  1628  
  1629  	// Add in the state
  1630  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1631  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1632  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1633  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1634  	MOVO  D3, tmpStore
  1635  
  1636  	// Load - xor - store
  1637  	MOVOU (0*16)(inp), D3; PXOR D3, A0
  1638  	MOVOU (1*16)(inp), D3; PXOR D3, B0
  1639  	MOVOU (2*16)(inp), D3; PXOR D3, C0
  1640  	MOVOU (3*16)(inp), D3; PXOR D3, D0
  1641  	MOVOU A0, (0*16)(oup)
  1642  	MOVOU B0, (1*16)(oup)
  1643  	MOVOU C0, (2*16)(oup)
  1644  	MOVOU D0, (3*16)(oup)
  1645  	MOVO  tmpStore, D3
  1646  
  1647  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1648  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1649  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1650  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1651  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1652  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1653  	ADDQ  $192, inp
  1654  	MOVQ  $192, itr1
  1655  	SUBQ  $192, inl
  1656  	MOVO  A3, A1
  1657  	MOVO  B3, B1
  1658  	MOVO  C3, C1
  1659  	MOVO  D3, D1
  1660  	CMPQ  inl, $64
  1661  	JBE   sealSSE128SealHash
  1662  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1663  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1664  	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1665  	LEAQ  64(inp), inp
  1666  	SUBQ  $64, inl
  1667  	MOVQ  $6, itr1
  1668  	MOVQ  $4, itr2
  1669  	CMPQ  inl, $192
  1670  	JG    sealSSEMainLoop
  1671  
  1672  	MOVQ  inl, itr1
  1673  	TESTQ inl, inl
  1674  	JE    sealSSE128SealHash
  1675  	MOVQ  $6, itr1
  1676  	CMPQ  inl, $64
  1677  	JBE   sealSSETail64
  1678  	CMPQ  inl, $128
  1679  	JBE   sealSSETail128
  1680  	JMP   sealSSETail192
  1681  
  1682  // ----------------------------------------------------------------------------
  1683  // Special optimization for the last 64 bytes of plaintext
  1684  sealSSETail64:
  1685  	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1686  	MOVO  ·chacha20Constants<>(SB), A1
  1687  	MOVO  state1Store, B1
  1688  	MOVO  state2Store, C1
  1689  	MOVO  ctr3Store, D1
  1690  	PADDL ·sseIncMask<>(SB), D1
  1691  	MOVO  D1, ctr0Store
  1692  
  1693  sealSSETail64LoopA:
  1694  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1695  	polyAdd(0(oup))
  1696  	polyMul
  1697  	LEAQ 16(oup), oup
  1698  
  1699  sealSSETail64LoopB:
  1700  	chachaQR(A1, B1, C1, D1, T1)
  1701  	shiftB1Left;  shiftC1Left; shiftD1Left
  1702  	chachaQR(A1, B1, C1, D1, T1)
  1703  	shiftB1Right; shiftC1Right; shiftD1Right
  1704  	polyAdd(0(oup))
  1705  	polyMul
  1706  	LEAQ          16(oup), oup
  1707  
  1708  	DECQ itr1
  1709  	JG   sealSSETail64LoopA
  1710  
  1711  	DECQ  itr2
  1712  	JGE   sealSSETail64LoopB
  1713  	PADDL ·chacha20Constants<>(SB), A1
  1714  	PADDL state1Store, B1
  1715  	PADDL state2Store, C1
  1716  	PADDL ctr0Store, D1
  1717  
  1718  	JMP sealSSE128Seal
  1719  
  1720  // ----------------------------------------------------------------------------
  1721  // Special optimization for the last 128 bytes of plaintext
  1722  sealSSETail128:
  1723  	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1724  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1725  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1726  
  1727  sealSSETail128LoopA:
  1728  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1729  	polyAdd(0(oup))
  1730  	polyMul
  1731  	LEAQ 16(oup), oup
  1732  
  1733  sealSSETail128LoopB:
  1734  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1735  	shiftB0Left;  shiftC0Left; shiftD0Left
  1736  	shiftB1Left;  shiftC1Left; shiftD1Left
  1737  	polyAdd(0(oup))
  1738  	polyMul
  1739  	LEAQ          16(oup), oup
  1740  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1741  	shiftB0Right; shiftC0Right; shiftD0Right
  1742  	shiftB1Right; shiftC1Right; shiftD1Right
  1743  
  1744  	DECQ itr1
  1745  	JG   sealSSETail128LoopA
  1746  
  1747  	DECQ itr2
  1748  	JGE  sealSSETail128LoopB
  1749  
  1750  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1751  	PADDL state1Store, B0; PADDL state1Store, B1
  1752  	PADDL state2Store, C0; PADDL state2Store, C1
  1753  	PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1754  
  1755  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1756  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1757  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1758  
  1759  	MOVQ $64, itr1
  1760  	LEAQ 64(inp), inp
  1761  	SUBQ $64, inl
  1762  
  1763  	JMP sealSSE128SealHash
  1764  
  1765  // ----------------------------------------------------------------------------
  1766  // Special optimization for the last 192 bytes of plaintext
  1767  sealSSETail192:
  1768  	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1769  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1770  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1771  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1772  
  1773  sealSSETail192LoopA:
  1774  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1775  	polyAdd(0(oup))
  1776  	polyMul
  1777  	LEAQ 16(oup), oup
  1778  
  1779  sealSSETail192LoopB:
  1780  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1781  	shiftB0Left; shiftC0Left; shiftD0Left
  1782  	shiftB1Left; shiftC1Left; shiftD1Left
  1783  	shiftB2Left; shiftC2Left; shiftD2Left
  1784  
  1785  	polyAdd(0(oup))
  1786  	polyMul
  1787  	LEAQ 16(oup), oup
  1788  
  1789  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1790  	shiftB0Right; shiftC0Right; shiftD0Right
  1791  	shiftB1Right; shiftC1Right; shiftD1Right
  1792  	shiftB2Right; shiftC2Right; shiftD2Right
  1793  
  1794  	DECQ itr1
  1795  	JG   sealSSETail192LoopA
  1796  
  1797  	DECQ itr2
  1798  	JGE  sealSSETail192LoopB
  1799  
  1800  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1801  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1802  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1803  	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1804  
  1805  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1806  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1807  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1808  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1809  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1810  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1811  
  1812  	MOVO A2, A1
  1813  	MOVO B2, B1
  1814  	MOVO C2, C1
  1815  	MOVO D2, D1
  1816  	MOVQ $128, itr1
  1817  	LEAQ 128(inp), inp
  1818  	SUBQ $128, inl
  1819  
  1820  	JMP sealSSE128SealHash
  1821  
  1822  // ----------------------------------------------------------------------------
  1823  // Special seal optimization for buffers smaller than 129 bytes
  1824  sealSSE128:
  1825  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  1826  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1827  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1828  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
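        	// Save the initial row values: T1/T2 stand in for state1Store/state2Store, and T3
        	// records D1 so the counters can be rebuilt after the rounds.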
  1829  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
  1830  	MOVQ  $10, itr2
  1831  
  1832  sealSSE128InnerCipherLoop:
  1833  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1834  	shiftB0Left;  shiftB1Left; shiftB2Left
  1835  	shiftC0Left;  shiftC1Left; shiftC2Left
  1836  	shiftD0Left;  shiftD1Left; shiftD2Left
  1837  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1838  	shiftB0Right; shiftB1Right; shiftB2Right
  1839  	shiftC0Right; shiftC1Right; shiftC2Right
  1840  	shiftD0Right; shiftD1Right; shiftD2Right
  1841  	DECQ          itr2
  1842  	JNE           sealSSE128InnerCipherLoop
  1843  
  1844  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1845  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1846  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1847  	PADDL T2, C1; PADDL T2, C2
  1848  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1849  	PAND  ·polyClampMask<>(SB), A0
  1850  	MOVOU A0, rStore
  1851  	MOVOU B0, sStore
  1852  
  1853  	// Hash AAD
  1854  	MOVQ ad_len+80(FP), itr2
  1855  	CALL polyHashADInternal<>(SB)
  1856  	XORQ itr1, itr1
  1857  
  1858  sealSSE128SealHash:
  1859  	// itr1 holds the number of bytes encrypted but not yet hashed
  1860  	CMPQ itr1, $16
  1861  	JB   sealSSE128Seal
  1862  	polyAdd(0(oup))
  1863  	polyMul
  1864  
  1865  	SUBQ $16, itr1
  1866  	ADDQ $16, oup
  1867  
  1868  	JMP sealSSE128SealHash
  1869  
  1870  sealSSE128Seal:
  1871  	CMPQ inl, $16
  1872  	JB   sealSSETail
  1873  	SUBQ $16, inl
  1874  
  1875  	// Load for encryption
  1876  	MOVOU (inp), T0
  1877  	PXOR  T0, A1
  1878  	MOVOU A1, (oup)
  1879  	LEAQ  (1*16)(inp), inp
  1880  	LEAQ  (1*16)(oup), oup
  1881  
  1882  	// Extract for hashing
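        	// The 16 ciphertext bytes just written are still in A1: split them into two 64-bit
        	// halves and add them to the accumulator, with ADCQ $1 supplying the 2^128 padding
        	// bit - the in-register equivalent of polyAdd.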
  1883  	MOVQ   A1, t0
  1884  	PSRLDQ $8, A1
  1885  	MOVQ   A1, t1
  1886  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1887  	polyMul
  1888  
  1889  	// Shift the stream "left"
  1890  	MOVO B1, A1
  1891  	MOVO C1, B1
  1892  	MOVO D1, C1
  1893  	MOVO A2, D1
  1894  	MOVO B2, A2
  1895  	MOVO C2, B2
  1896  	MOVO D2, C2
  1897  	JMP  sealSSE128Seal
  1898  
  1899  sealSSETail:
  1900  	TESTQ inl, inl
  1901  	JE    sealSSEFinalize
  1902  
  1903  	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
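        	// The loop below gathers those bytes backwards into t3:t2, leaving the partial block
        	// in little-endian order with its first byte in the low byte of t2. It is XORed with
        	// the keystream and written out in full (the extra bytes land in the space reserved
        	// for the tag and are overwritten below). itr2 = inl*16, so that -16(t0)(itr2*1)
        	// selects the ·andMask<> entry which keeps only the inl real ciphertext bytes before
        	// they are hashed.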
  1904  	MOVQ inl, itr2
  1905  	SHLQ $4, itr2
  1906  	LEAQ ·andMask<>(SB), t0
  1907  	MOVQ inl, itr1
  1908  	LEAQ -1(inp)(inl*1), inp
  1909  	XORQ t2, t2
  1910  	XORQ t3, t3
  1911  	XORQ AX, AX
  1912  
  1913  sealSSETailLoadLoop:
  1914  	SHLQ $8, t2, t3
  1915  	SHLQ $8, t2
  1916  	MOVB (inp), AX
  1917  	XORQ AX, t2
  1918  	LEAQ   -1(inp), inp
  1919  	DECQ   itr1
  1920  	JNE    sealSSETailLoadLoop
  1921  	MOVQ t2, 0+tmpStore
  1922  	MOVQ t3, 8+tmpStore
  1923  	PXOR 0+tmpStore, A1
  1924  	MOVOU  A1, (oup)
  1925  	MOVOU  -16(t0)(itr2*1), T0
  1926  	PAND   T0, A1
  1927  	MOVQ   A1, t0
  1928  	PSRLDQ $8, A1
  1929  	MOVQ   A1, t1
  1930  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1931  	polyMul
  1932  
  1933  	ADDQ inl, oup
  1934  
  1935  sealSSEFinalize:
  1936  	// Hash in the buffer lengths
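        	// The final Poly1305 block of the AEAD construction is len(AD) || len(plaintext), both
        	// as little-endian 64-bit values; ADCQ $1 again adds the 2^128 padding bit.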
  1937  	ADDQ ad_len+80(FP), acc0
  1938  	ADCQ src_len+56(FP), acc1
  1939  	ADCQ $1, acc2
  1940  	polyMul
  1941  
  1942  	// Final reduce
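        	// The accumulator acc2:acc1:acc0 holds h; reduce it modulo p = 2^130 - 5 by computing
        	// h - p. The three subtractions below subtract the 192-bit constant
        	// 0x3_ffffffffffffffff_fffffffffffffffb = 2^130 - 5 (SUBQ $-5 subtracts the
        	// sign-extended 0xff...fb). If that borrows, h was already below p, and the CMOVQCS
        	// instructions restore the original limbs.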
  1943  	MOVQ    acc0, t0
  1944  	MOVQ    acc1, t1
  1945  	MOVQ    acc2, t2
  1946  	SUBQ    $-5, acc0
  1947  	SBBQ    $-1, acc1
  1948  	SBBQ    $3, acc2
  1949  	CMOVQCS t0, acc0
  1950  	CMOVQCS t1, acc1
  1951  	CMOVQCS t2, acc2
  1952  
  1953  	// Add in the "s" part of the key
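        	// The tag is (h + s) mod 2^128, so only the two low limbs are added and the carry out
        	// of acc1 is dropped.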
  1954  	ADDQ 0+sStore, acc0
  1955  	ADCQ 8+sStore, acc1
  1956  
  1957  	// Finally store the tag at the end of the message
  1958  	MOVQ acc0, (0*8)(oup)
  1959  	MOVQ acc1, (1*8)(oup)
  1960  	RET
  1961  
  1962  // ----------------------------------------------------------------------------
  1963  // ------------------------- AVX2 Code ----------------------------------------
  1964  chacha20Poly1305Seal_AVX2:
  1965  	VZEROUPPER
  1966  	VMOVDQU ·chacha20Constants<>(SB), AA0
  1967  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1968  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1969  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
  1970  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
  1971  
  1972  	// Special optimizations for very short buffers
  1973  	CMPQ inl, $192
  1974  	JBE  seal192AVX2 // 33% faster
  1975  	CMPQ inl, $320
  1976  	JBE  seal320AVX2 // 17% faster
  1977  
  1978  	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
  1979  	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1980  	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  1981  	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  1982  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  1983  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  1984  	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  1985  	VMOVDQA DD3, ctr3StoreAVX2
  1986  	MOVQ    $10, itr2
  1987  
  1988  sealAVX2IntroLoop:
  1989  	VMOVDQA CC3, tmpStoreAVX2
  1990  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1991  	VMOVDQA tmpStoreAVX2, CC3
  1992  	VMOVDQA CC1, tmpStoreAVX2
  1993  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1994  	VMOVDQA tmpStoreAVX2, CC1
  1995  
  1996  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1997  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1998  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1999  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2000  
  2001  	VMOVDQA CC3, tmpStoreAVX2
  2002  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2003  	VMOVDQA tmpStoreAVX2, CC3
  2004  	VMOVDQA CC1, tmpStoreAVX2
  2005  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2006  	VMOVDQA tmpStoreAVX2, CC1
  2007  
  2008  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2009  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2010  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2011  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2012  	DECQ     itr2
  2013  	JNE      sealAVX2IntroLoop
  2014  
  2015  	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2016  	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2017  	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2018  	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2019  
  2020  	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  2021  	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  2022  	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
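        	// Each YMM register holds one row of two consecutive ChaCha blocks: the low 128-bit
        	// lane belongs to the first block, the high lane to the second. VPERM2I128 $0x02
        	// gathers the two low lanes (first block) and $0x13 the two high lanes (second block).
        	// Only the first 32 bytes of the first block are needed - the Poly1305 key - and its
        	// other 32 bytes are discarded, as the AEAD construction requires.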
  2023  
  2024  	// Clamp and store poly key
  2025  	VPAND   ·polyClampMask<>(SB), DD0, DD0
  2026  	VMOVDQA DD0, rsStoreAVX2
  2027  
  2028  	// Hash AD
  2029  	MOVQ ad_len+80(FP), itr2
  2030  	CALL polyHashADInternal<>(SB)
  2031  
  2032  	// Can store at least 320 bytes
  2033  	VPXOR   (0*32)(inp), AA0, AA0
  2034  	VPXOR   (1*32)(inp), CC0, CC0
  2035  	VMOVDQU AA0, (0*32)(oup)
  2036  	VMOVDQU CC0, (1*32)(oup)
  2037  
  2038  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2039  	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  2040  	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  2041  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2042  	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  2043  	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  2044  
  2045  	MOVQ $320, itr1
  2046  	SUBQ $320, inl
  2047  	LEAQ 320(inp), inp
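        	// itr1 records the 320 bytes of ciphertext written above but not yet hashed; they are
        	// absorbed by sealAVX2SealHash if we branch there below.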
  2048  
  2049  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  2050  	CMPQ       inl, $128
  2051  	JBE        sealAVX2SealHash
  2052  
  2053  	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  2054  	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  2055  	SUBQ    $128, inl
  2056  	LEAQ    128(inp), inp
  2057  
  2058  	MOVQ $8, itr1
  2059  	MOVQ $2, itr2
  2060  
  2061  	CMPQ inl, $128
  2062  	JBE  sealAVX2Tail128
  2063  	CMPQ inl, $256
  2064  	JBE  sealAVX2Tail256
  2065  	CMPQ inl, $384
  2066  	JBE  sealAVX2Tail384
  2067  	CMPQ inl, $512
  2068  	JBE  sealAVX2Tail512
  2069  
  2070  	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
  2071  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2072  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2073  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2074  	VMOVDQA ctr3StoreAVX2, DD0
  2075  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2076  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2077  
  2078  	VMOVDQA CC3, tmpStoreAVX2
  2079  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2080  	VMOVDQA tmpStoreAVX2, CC3
  2081  	VMOVDQA CC1, tmpStoreAVX2
  2082  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2083  	VMOVDQA tmpStoreAVX2, CC1
  2084  
  2085  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2086  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2087  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2088  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2089  
  2090  	VMOVDQA CC3, tmpStoreAVX2
  2091  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2092  	VMOVDQA tmpStoreAVX2, CC3
  2093  	VMOVDQA CC1, tmpStoreAVX2
  2094  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2095  	VMOVDQA tmpStoreAVX2, CC1
  2096  
  2097  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2098  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2099  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2100  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2101  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2102  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2103  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2104  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2105  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2106  	VMOVDQA  CC3, tmpStoreAVX2
  2107  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2108  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2109  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2110  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2111  	VMOVDQA  tmpStoreAVX2, CC3
  2112  
  2113  	SUBQ $16, oup                  // Adjust the pointer, so the polyAdd offsets at the loop entry point line up with the next unhashed ciphertext
  2114  	MOVQ $9, itr1
  2115  	JMP  sealAVX2InternalLoopStart
  2116  
  2117  sealAVX2MainLoop:
  2118  	// Load state, increment counter blocks, store the incremented counters
  2119  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2120  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2121  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2122  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2123  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2124  	MOVQ    $10, itr1
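        	// Each pass through the loop below generates 512 bytes of keystream (four register
        	// groups of two blocks each) while absorbing 480 bytes of previously written
        	// ciphertext into Poly1305 (three 16-byte blocks per double round, ten double rounds);
        	// the remaining 32 bytes are hashed during the store phase after the loop.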
  2125  
  2126  sealAVX2InternalLoop:
  2127  	polyAdd(0*8(oup))
  2128  	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2129  	polyMulStage1_AVX2
  2130  	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2131  	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2132  	polyMulStage2_AVX2
  2133  	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2134  	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2135  	polyMulStage3_AVX2
  2136  	VMOVDQA CC3, tmpStoreAVX2
  2137  	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2138  	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2139  	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2140  	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2141  	VMOVDQA tmpStoreAVX2, CC3
  2142  	polyMulReduceStage
  2143  
  2144  sealAVX2InternalLoopStart:
  2145  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2146  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2147  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2148  	polyAdd(2*8(oup))
  2149  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2150  	polyMulStage1_AVX2
  2151  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2152  	VMOVDQA  CC3, tmpStoreAVX2
  2153  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2154  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2155  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2156  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2157  	VMOVDQA  tmpStoreAVX2, CC3
  2158  	polyMulStage2_AVX2
  2159  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2160  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2161  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2162  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2163  	polyMulStage3_AVX2
  2164  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2165  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2166  	polyMulReduceStage
  2167  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2168  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2169  	polyAdd(4*8(oup))
  2170  	LEAQ     (6*8)(oup), oup
  2171  	VMOVDQA  CC3, tmpStoreAVX2
  2172  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2173  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2174  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2175  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2176  	VMOVDQA  tmpStoreAVX2, CC3
  2177  	polyMulStage1_AVX2
  2178  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2179  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2180  	polyMulStage2_AVX2
  2181  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2182  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2183  	polyMulStage3_AVX2
  2184  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2185  	VMOVDQA  CC3, tmpStoreAVX2
  2186  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2187  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2188  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2189  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2190  	VMOVDQA  tmpStoreAVX2, CC3
  2191  	polyMulReduceStage
  2192  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2193  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2194  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2195  	DECQ     itr1
  2196  	JNE      sealAVX2InternalLoop
  2197  
  2198  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2199  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2200  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2201  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2202  	VMOVDQA CC3, tmpStoreAVX2
  2203  
  2204  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  2205  	polyAdd(0*8(oup))
  2206  	polyMulAVX2
  2207  	LEAQ       (4*8)(oup), oup
  2208  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  2209  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  2210  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  2211  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2212  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2213  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2214  
  2215  	// and here
  2216  	polyAdd(-2*8(oup))
  2217  	polyMulAVX2
  2218  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2219  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2220  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2221  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2222  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  2223  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  2224  	LEAQ       (32*16)(inp), inp
  2225  	SUBQ       $(32*16), inl
  2226  	CMPQ       inl, $512
  2227  	JG         sealAVX2MainLoop
  2228  
  2229  	// The tail routines below can only hash 480 bytes - hash the remaining 32 bytes of pending ciphertext here
  2230  	polyAdd(0*8(oup))
  2231  	polyMulAVX2
  2232  	polyAdd(2*8(oup))
  2233  	polyMulAVX2
  2234  	LEAQ 32(oup), oup
  2235  
  2236  	MOVQ $10, itr1
  2237  	MOVQ $0, itr2
  2238  	CMPQ inl, $128
  2239  	JBE  sealAVX2Tail128
  2240  	CMPQ inl, $256
  2241  	JBE  sealAVX2Tail256
  2242  	CMPQ inl, $384
  2243  	JBE  sealAVX2Tail384
  2244  	JMP  sealAVX2Tail512
  2245  
  2246  // ----------------------------------------------------------------------------
  2247  // Special optimization for buffers smaller than 193 bytes
  2248  seal192AVX2:
  2249  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  2250  	VMOVDQA AA0, AA1
  2251  	VMOVDQA BB0, BB1
  2252  	VMOVDQA CC0, CC1
  2253  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2254  	VMOVDQA AA0, AA2
  2255  	VMOVDQA BB0, BB2
  2256  	VMOVDQA CC0, CC2
  2257  	VMOVDQA DD0, DD2
  2258  	VMOVDQA DD1, TT3
  2259  	MOVQ    $10, itr2
  2260  
  2261  sealAVX2192InnerCipherLoop:
  2262  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2263  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2264  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2265  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2266  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2267  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2268  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2269  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2270  	DECQ       itr2
  2271  	JNE        sealAVX2192InnerCipherLoop
  2272  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  2273  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2274  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2275  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2276  	VPERM2I128 $0x02, AA0, BB0, TT0
  2277  
  2278  	// Clamp and store poly key
  2279  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  2280  	VMOVDQA TT0, rsStoreAVX2
  2281  
  2282  	// Stream for up to 192 bytes
  2283  	VPERM2I128 $0x13, AA0, BB0, AA0
  2284  	VPERM2I128 $0x13, CC0, DD0, BB0
  2285  	VPERM2I128 $0x02, AA1, BB1, CC0
  2286  	VPERM2I128 $0x02, CC1, DD1, DD0
  2287  	VPERM2I128 $0x13, AA1, BB1, AA1
  2288  	VPERM2I128 $0x13, CC1, DD1, BB1
  2289  
  2290  sealAVX2ShortSeal:
  2291  	// Hash AAD
  2292  	MOVQ ad_len+80(FP), itr2
  2293  	CALL polyHashADInternal<>(SB)
  2294  	XORQ itr1, itr1
  2295  
  2296  sealAVX2SealHash:
  2297  	// itr1 holds the number of bytes encrypted but not yet hashed
  2298  	CMPQ itr1, $16
  2299  	JB   sealAVX2ShortSealLoop
  2300  	polyAdd(0(oup))
  2301  	polyMul
  2302  	SUBQ $16, itr1
  2303  	ADDQ $16, oup
  2304  	JMP  sealAVX2SealHash
  2305  
  2306  sealAVX2ShortSealLoop:
  2307  	CMPQ inl, $32
  2308  	JB   sealAVX2ShortTail32
  2309  	SUBQ $32, inl
  2310  
  2311  	// Load for encryption
  2312  	VPXOR   (inp), AA0, AA0
  2313  	VMOVDQU AA0, (oup)
  2314  	LEAQ    (1*32)(inp), inp
  2315  
  2316  	// Now we can hash
  2317  	polyAdd(0*8(oup))
  2318  	polyMulAVX2
  2319  	polyAdd(2*8(oup))
  2320  	polyMulAVX2
  2321  	LEAQ (1*32)(oup), oup
  2322  
  2323  	// Shift stream left
  2324  	VMOVDQA BB0, AA0
  2325  	VMOVDQA CC0, BB0
  2326  	VMOVDQA DD0, CC0
  2327  	VMOVDQA AA1, DD0
  2328  	VMOVDQA BB1, AA1
  2329  	VMOVDQA CC1, BB1
  2330  	VMOVDQA DD1, CC1
  2331  	VMOVDQA AA2, DD1
  2332  	VMOVDQA BB2, AA2
  2333  	JMP     sealAVX2ShortSealLoop
  2334  
  2335  sealAVX2ShortTail32:
  2336  	CMPQ    inl, $16
  2337  	VMOVDQA A0, A1
  2338  	JB      sealAVX2ShortDone
  2339  
  2340  	SUBQ $16, inl
  2341  
  2342  	// Load for encryption
  2343  	VPXOR   (inp), A0, T0
  2344  	VMOVDQU T0, (oup)
  2345  	LEAQ    (1*16)(inp), inp
  2346  
  2347  	// Hash
  2348  	polyAdd(0*8(oup))
  2349  	polyMulAVX2
  2350  	LEAQ       (1*16)(oup), oup
  2351  	VPERM2I128 $0x11, AA0, AA0, AA0
  2352  	VMOVDQA    A0, A1
  2353  
  2354  sealAVX2ShortDone:
  2355  	VZEROUPPER
  2356  	JMP sealSSETail
  2357  
  2358  // ----------------------------------------------------------------------------
  2359  // Special optimization for buffers smaller than 321 bytes
  2360  seal320AVX2:
  2361  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  2362  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2363  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2364  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2365  	MOVQ    $10, itr2
  2366  
  2367  sealAVX2320InnerCipherLoop:
  2368  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2369  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2370  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2371  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2372  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2373  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2374  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2375  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2376  	DECQ     itr2
  2377  	JNE      sealAVX2320InnerCipherLoop
  2378  
  2379  	VMOVDQA ·chacha20Constants<>(SB), TT0
  2380  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2381  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2382  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2383  	VMOVDQA ·avx2IncMask<>(SB), TT0
  2384  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2385  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2386  	VPADDD  TT3, DD2, DD2
  2387  
  2388  	// Clamp and store poly key
  2389  	VPERM2I128 $0x02, AA0, BB0, TT0
  2390  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  2391  	VMOVDQA    TT0, rsStoreAVX2
  2392  
  2393  	// Stream for up to 320 bytes
  2394  	VPERM2I128 $0x13, AA0, BB0, AA0
  2395  	VPERM2I128 $0x13, CC0, DD0, BB0
  2396  	VPERM2I128 $0x02, AA1, BB1, CC0
  2397  	VPERM2I128 $0x02, CC1, DD1, DD0
  2398  	VPERM2I128 $0x13, AA1, BB1, AA1
  2399  	VPERM2I128 $0x13, CC1, DD1, BB1
  2400  	VPERM2I128 $0x02, AA2, BB2, CC1
  2401  	VPERM2I128 $0x02, CC2, DD2, DD1
  2402  	VPERM2I128 $0x13, AA2, BB2, AA2
  2403  	VPERM2I128 $0x13, CC2, DD2, BB2
  2404  	JMP        sealAVX2ShortSeal
  2405  
  2406  // ----------------------------------------------------------------------------
  2407  // Special optimization for the last 128 bytes of plaintext
  2408  sealAVX2Tail128:
  2409  	// Need to encrypt up to 128 bytes - prepare two blocks
  2410  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2411  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2412  	VMOVDQA ·chacha20Constants<>(SB), AA0
  2413  	VMOVDQA state1StoreAVX2, BB0
  2414  	VMOVDQA state2StoreAVX2, CC0
  2415  	VMOVDQA ctr3StoreAVX2, DD0
  2416  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2417  	VMOVDQA DD0, DD1
  2418  
  2419  sealAVX2Tail128LoopA:
  2420  	polyAdd(0(oup))
  2421  	polyMul
  2422  	LEAQ 16(oup), oup
  2423  
  2424  sealAVX2Tail128LoopB:
  2425  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2426  	polyAdd(0(oup))
  2427  	polyMul
  2428  	VPALIGNR $4, BB0, BB0, BB0
  2429  	VPALIGNR $8, CC0, CC0, CC0
  2430  	VPALIGNR $12, DD0, DD0, DD0
  2431  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2432  	polyAdd(16(oup))
  2433  	polyMul
  2434  	LEAQ     32(oup), oup
  2435  	VPALIGNR $12, BB0, BB0, BB0
  2436  	VPALIGNR $8, CC0, CC0, CC0
  2437  	VPALIGNR $4, DD0, DD0, DD0
  2438  	DECQ     itr1
  2439  	JG       sealAVX2Tail128LoopA
  2440  	DECQ     itr2
  2441  	JGE      sealAVX2Tail128LoopB
  2442  
  2443  	VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2444  	VPADDD state1StoreAVX2, BB0, BB1
  2445  	VPADDD state2StoreAVX2, CC0, CC1
  2446  	VPADDD DD1, DD0, DD1
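        	// Feed-forward addition of the saved initial state completes the final 128 bytes
        	// of keystream; the shuffles below reorder it into sequential blocks for
        	// sealAVX2ShortSealLoop.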
  2447  
  2448  	VPERM2I128 $0x02, AA1, BB1, AA0
  2449  	VPERM2I128 $0x02, CC1, DD1, BB0
  2450  	VPERM2I128 $0x13, AA1, BB1, CC0
  2451  	VPERM2I128 $0x13, CC1, DD1, DD0
  2452  	JMP        sealAVX2ShortSealLoop
  2453  
  2454  // ----------------------------------------------------------------------------
  2455  // Special optimization for the last 256 bytes of ciphertext
  2456  sealAVX2Tail256:
  2457  	// Need to encrypt up to 256 bytes - prepare four blocks
  2458  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2459  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2460  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2461  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2462  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2463  	VMOVDQA ctr3StoreAVX2, DD0
  2464  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2465  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2466  	VMOVDQA DD0, TT1
  2467  	VMOVDQA DD1, TT2
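        	// TT1/TT2 save the two counter rows for the feed-forward addition; the hash/round
        	// interleaving below mirrors the 128-byte tail, applied to both register sets.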
  2468  
  2469  sealAVX2Tail256LoopA:
  2470  	polyAdd(0(oup))
  2471  	polyMul
  2472  	LEAQ 16(oup), oup
  2473  
  2474  sealAVX2Tail256LoopB:
  2475  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2476  	polyAdd(0(oup))
  2477  	polyMul
  2478  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2479  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2480  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2481  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2482  	polyAdd(16(oup))
  2483  	polyMul
  2484  	LEAQ     32(oup), oup
  2485  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2486  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2487  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2488  	DECQ     itr1
  2489  	JG       sealAVX2Tail256LoopA
  2490  	DECQ     itr2
  2491  	JGE      sealAVX2Tail256LoopB
  2492  
  2493  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2494  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2495  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2496  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2497  	VPERM2I128 $0x02, AA0, BB0, TT0
  2498  	VPERM2I128 $0x02, CC0, DD0, TT1
  2499  	VPERM2I128 $0x13, AA0, BB0, TT2
  2500  	VPERM2I128 $0x13, CC0, DD0, TT3
  2501  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2502  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
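        	// The first 128 bytes of the tail are now encrypted and written; itr1 records that
        	// they still need to be absorbed by the sealAVX2SealHash loop, and the shuffles
        	// below stage the remaining keystream in AA0-DD0 for the final bytes.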
  2503  	MOVQ       $128, itr1
  2504  	LEAQ       128(inp), inp
  2505  	SUBQ       $128, inl
  2506  	VPERM2I128 $0x02, AA1, BB1, AA0
  2507  	VPERM2I128 $0x02, CC1, DD1, BB0
  2508  	VPERM2I128 $0x13, AA1, BB1, CC0
  2509  	VPERM2I128 $0x13, CC1, DD1, DD0
  2510  
  2511  	JMP sealAVX2SealHash
  2512  
  2513  // ----------------------------------------------------------------------------
  2514  // Special optimization for the last 384 bytes of ciphertext
  2515  sealAVX2Tail384:
  2516  	// Need to encrypt up to 384 bytes - prepare six blocks
  2517  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2518  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2519  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2520  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2521  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2522  	VMOVDQA ctr3StoreAVX2, DD0
  2523  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2524  	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
  2525  
  2526  sealAVX2Tail384LoopA:
  2527  	polyAdd(0(oup))
  2528  	polyMul
  2529  	LEAQ 16(oup), oup
  2530  
  2531  sealAVX2Tail384LoopB:
  2532  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2533  	polyAdd(0(oup))
  2534  	polyMul
  2535  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2536  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2537  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2538  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2539  	polyAdd(16(oup))
  2540  	polyMul
  2541  	LEAQ     32(oup), oup
  2542  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2543  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2544  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2545  	DECQ     itr1
  2546  	JG       sealAVX2Tail384LoopA
  2547  	DECQ     itr2
  2548  	JGE      sealAVX2Tail384LoopB
  2549  
  2550  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2551  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2552  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2553  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2554  	VPERM2I128 $0x02, AA0, BB0, TT0
  2555  	VPERM2I128 $0x02, CC0, DD0, TT1
  2556  	VPERM2I128 $0x13, AA0, BB0, TT2
  2557  	VPERM2I128 $0x13, CC0, DD0, TT3
  2558  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2559  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2560  	VPERM2I128 $0x02, AA1, BB1, TT0
  2561  	VPERM2I128 $0x02, CC1, DD1, TT1
  2562  	VPERM2I128 $0x13, AA1, BB1, TT2
  2563  	VPERM2I128 $0x13, CC1, DD1, TT3
  2564  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2565  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2566  	MOVQ       $256, itr1
  2567  	LEAQ       256(inp), inp
  2568  	SUBQ       $256, inl
  2569  	VPERM2I128 $0x02, AA2, BB2, AA0
  2570  	VPERM2I128 $0x02, CC2, DD2, BB0
  2571  	VPERM2I128 $0x13, AA2, BB2, CC0
  2572  	VPERM2I128 $0x13, CC2, DD2, DD0
  2573  
  2574  	JMP sealAVX2SealHash
  2575  
  2576  // ----------------------------------------------------------------------------
  2577  // Special optimization for the last 512 bytes of ciphertext
  2578  sealAVX2Tail512:
  2579  	// Need to encrypt up to 512 bytes - prepare eight blocks
  2580  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2581  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2582  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2583  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2584  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2585  	VMOVDQA ctr3StoreAVX2, DD0
  2586  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2587  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
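        	// With all 16 Y registers holding state, the four counter rows are spilled to the
        	// stack for the feed-forward addition. The loop below interleaves one double round
        	// with absorbing 32 bytes of pending ciphertext into Poly1305 per iteration.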
  2588  
  2589  sealAVX2Tail512LoopA:
  2590  	polyAdd(0(oup))
  2591  	polyMul
  2592  	LEAQ 16(oup), oup
  2593  
  2594  sealAVX2Tail512LoopB:
  2595  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2596  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2597  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
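        	// VPSHUFB with the ·rol16<> table rotates every 32-bit lane left by 16 via a byte
        	// shuffle; the ·rol8<> table below does the same for the rotate by 8.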
  2598  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2599  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
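        	// No register is free for the 12-bit and 7-bit rotations, so CC3 is spilled to the
        	// stack and reused as scratch; AVX2 has no 32-bit rotate, hence the
        	// shift/shift/XOR sequences.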
  2600  	VMOVDQA  CC3, tmpStoreAVX2
  2601  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2602  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2603  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2604  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2605  	VMOVDQA  tmpStoreAVX2, CC3
  2606  	polyAdd(0*8(oup))
  2607  	polyMulAVX2
  2608  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2609  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2610  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2611  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2612  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2613  	VMOVDQA  CC3, tmpStoreAVX2
  2614  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2615  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2616  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2617  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2618  	VMOVDQA  tmpStoreAVX2, CC3
  2619  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2620  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2621  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2622  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2623  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2624  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2625  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2626  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2627  	polyAdd(2*8(oup))
  2628  	polyMulAVX2
  2629  	LEAQ     (4*8)(oup), oup
  2630  	VMOVDQA  CC3, tmpStoreAVX2
  2631  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2632  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2633  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2634  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2635  	VMOVDQA  tmpStoreAVX2, CC3
  2636  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2637  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2638  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2639  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2640  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2641  	VMOVDQA  CC3, tmpStoreAVX2
  2642  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2643  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2644  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2645  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2646  	VMOVDQA  tmpStoreAVX2, CC3
  2647  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2648  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2649  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2650  
  2651  	DECQ itr1
  2652  	JG   sealAVX2Tail512LoopA
  2653  	DECQ itr2
  2654  	JGE  sealAVX2Tail512LoopB
  2655  
  2656  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2657  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2658  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2659  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2660  	VMOVDQA    CC3, tmpStoreAVX2
  2661  	VPERM2I128 $0x02, AA0, BB0, CC3
  2662  	VPXOR      (0*32)(inp), CC3, CC3
  2663  	VMOVDQU    CC3, (0*32)(oup)
  2664  	VPERM2I128 $0x02, CC0, DD0, CC3
  2665  	VPXOR      (1*32)(inp), CC3, CC3
  2666  	VMOVDQU    CC3, (1*32)(oup)
  2667  	VPERM2I128 $0x13, AA0, BB0, CC3
  2668  	VPXOR      (2*32)(inp), CC3, CC3
  2669  	VMOVDQU    CC3, (2*32)(oup)
  2670  	VPERM2I128 $0x13, CC0, DD0, CC3
  2671  	VPXOR      (3*32)(inp), CC3, CC3
  2672  	VMOVDQU    CC3, (3*32)(oup)
  2673  
  2674  	VPERM2I128 $0x02, AA1, BB1, AA0
  2675  	VPERM2I128 $0x02, CC1, DD1, BB0
  2676  	VPERM2I128 $0x13, AA1, BB1, CC0
  2677  	VPERM2I128 $0x13, CC1, DD1, DD0
  2678  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2679  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2680  
  2681  	VPERM2I128 $0x02, AA2, BB2, AA0
  2682  	VPERM2I128 $0x02, CC2, DD2, BB0
  2683  	VPERM2I128 $0x13, AA2, BB2, CC0
  2684  	VPERM2I128 $0x13, CC2, DD2, DD0
  2685  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2686  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
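        	// 384 bytes of the tail are written; itr1 marks them for hashing, and the fourth
        	// block's keystream (with its C row read back from the tmpStoreAVX2 spill) is
        	// staged in AA0-DD0 for sealAVX2SealHash and the final bytes.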
  2687  
  2688  	MOVQ       $384, itr1
  2689  	LEAQ       384(inp), inp
  2690  	SUBQ       $384, inl
  2691  	VPERM2I128 $0x02, AA3, BB3, AA0
  2692  	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
  2693  	VPERM2I128 $0x13, AA3, BB3, CC0
  2694  	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2695  
  2696  	JMP sealAVX2SealHash
  2697  
