Text file src/crypto/aes/gcm_amd64.s

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     6  // The implementation uses some of the optimizations described in:
     7  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10  //     Hardware
    11  
    12  #include "textflag.h"
    13  
    14  #define B0 X0
    15  #define B1 X1
    16  #define B2 X2
    17  #define B3 X3
    18  #define B4 X4
    19  #define B5 X5
    20  #define B6 X6
    21  #define B7 X7
    22  
    23  #define ACC0 X8
    24  #define ACC1 X9
    25  #define ACCM X10
    26  
    27  #define T0 X11
    28  #define T1 X12
    29  #define T2 X13
    30  #define POLY X14
    31  #define BSWAP X15
    32  
    33  DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34  DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35  
    36  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38  
    39  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69  
    70  GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73  
    74  // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
    75  TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    76  #define pTbl DI
    77  #define tMsk SI
    78  #define tPtr DX
    79  #define plen AX
    80  #define dlen CX
    81  
    82  	MOVQ productTable+0(FP), pTbl
    83  	MOVQ tagMask+8(FP), tMsk
    84  	MOVQ T+16(FP), tPtr
    85  	MOVQ pLen+24(FP), plen
    86  	MOVQ dLen+32(FP), dlen
    87  
    88  	MOVOU (tPtr), ACC0
    89  	MOVOU (tMsk), T2
    90  
    91  	MOVOU bswapMask<>(SB), BSWAP
    92  	MOVOU gcmPoly<>(SB), POLY
    93  
    94  	SHLQ $3, plen
    95  	SHLQ $3, dlen
    96  
    97  	MOVQ plen, B0
    98  	PINSRQ $1, dlen, B0
    99  
   100  	PXOR ACC0, B0
   101  
   102  	MOVOU (16*14)(pTbl), ACC0
   103  	MOVOU (16*15)(pTbl), ACCM
   104  	MOVOU ACC0, ACC1
   105  
   106  	PCLMULQDQ $0x00, B0, ACC0
   107  	PCLMULQDQ $0x11, B0, ACC1
   108  	PSHUFD $78, B0, T0
   109  	PXOR B0, T0
   110  	PCLMULQDQ $0x00, T0, ACCM
   111  
   112  	PXOR ACC0, ACCM
   113  	PXOR ACC1, ACCM
   114  	MOVOU ACCM, T0
   115  	PSRLDQ $8, ACCM
   116  	PSLLDQ $8, T0
   117  	PXOR ACCM, ACC1
   118  	PXOR T0, ACC0
   119  
   120  	MOVOU POLY, T0
   121  	PCLMULQDQ $0x01, ACC0, T0
   122  	PSHUFD $78, ACC0, ACC0
   123  	PXOR T0, ACC0
   124  
   125  	MOVOU POLY, T0
   126  	PCLMULQDQ $0x01, ACC0, T0
   127  	PSHUFD $78, ACC0, ACC0
   128  	PXOR T0, ACC0
   129  
   130  	PXOR ACC1, ACC0
   131  
   132  	PSHUFB BSWAP, ACC0
   133  	PXOR T2, ACC0
   134  	MOVOU ACC0, (tPtr)
   135  
   136  	RET
   137  #undef pTbl
   138  #undef tMsk
   139  #undef tPtr
   140  #undef plen
   141  #undef dlen
   142  
   143  // func gcmAesInit(productTable *[256]byte, ks []uint32)
   144  TEXT ·gcmAesInit(SB),NOSPLIT,$0
   145  #define dst DI
   146  #define KS SI
   147  #define NR DX
   148  
   149  	MOVQ productTable+0(FP), dst
   150  	MOVQ ks_base+8(FP), KS
   151  	MOVQ ks_len+16(FP), NR
   152  
   153  	SHRQ $2, NR
   154  	DECQ NR
   155  
   156  	MOVOU bswapMask<>(SB), BSWAP
   157  	MOVOU gcmPoly<>(SB), POLY
   158  
   159  	// Encrypt block 0, with the AES key to generate the hash key H
   160  	MOVOU (16*0)(KS), B0
   161  	MOVOU (16*1)(KS), T0
   162  	AESENC T0, B0
   163  	MOVOU (16*2)(KS), T0
   164  	AESENC T0, B0
   165  	MOVOU (16*3)(KS), T0
   166  	AESENC T0, B0
   167  	MOVOU (16*4)(KS), T0
   168  	AESENC T0, B0
   169  	MOVOU (16*5)(KS), T0
   170  	AESENC T0, B0
   171  	MOVOU (16*6)(KS), T0
   172  	AESENC T0, B0
   173  	MOVOU (16*7)(KS), T0
   174  	AESENC T0, B0
   175  	MOVOU (16*8)(KS), T0
   176  	AESENC T0, B0
   177  	MOVOU (16*9)(KS), T0
   178  	AESENC T0, B0
   179  	MOVOU (16*10)(KS), T0
   180  	CMPQ NR, $12
   181  	JB initEncLast
   182  	AESENC T0, B0
   183  	MOVOU (16*11)(KS), T0
   184  	AESENC T0, B0
   185  	MOVOU (16*12)(KS), T0
   186  	JE initEncLast
   187  	AESENC T0, B0
   188  	MOVOU (16*13)(KS), T0
   189  	AESENC T0, B0
   190  	MOVOU (16*14)(KS), T0
   191  initEncLast:
   192  	AESENCLAST T0, B0
   193  
   194  	PSHUFB BSWAP, B0
   195  	// H * 2
   196  	PSHUFD $0xff, B0, T0
   197  	MOVOU B0, T1
   198  	PSRAL $31, T0
   199  	PAND POLY, T0
   200  	PSRLL $31, T1
   201  	PSLLDQ $4, T1
   202  	PSLLL $1, B0
   203  	PXOR T0, B0
   204  	PXOR T1, B0
   205  	// Karatsuba pre-computations
   206  	MOVOU B0, (16*14)(dst)
   207  	PSHUFD $78, B0, B1
   208  	PXOR B0, B1
   209  	MOVOU B1, (16*15)(dst)
   210  
   211  	MOVOU B0, B2
   212  	MOVOU B1, B3
   213  	// Now prepare powers of H and pre-computations for them
   214  	MOVQ $7, AX
   215  
   216  initLoop:
   217  		MOVOU B2, T0
   218  		MOVOU B2, T1
   219  		MOVOU B3, T2
   220  		PCLMULQDQ $0x00, B0, T0
   221  		PCLMULQDQ $0x11, B0, T1
   222  		PCLMULQDQ $0x00, B1, T2
   223  
   224  		PXOR T0, T2
   225  		PXOR T1, T2
   226  		MOVOU T2, B4
   227  		PSLLDQ $8, B4
   228  		PSRLDQ $8, T2
   229  		PXOR B4, T0
   230  		PXOR T2, T1
   231  
   232  		MOVOU POLY, B2
   233  		PCLMULQDQ $0x01, T0, B2
   234  		PSHUFD $78, T0, T0
   235  		PXOR B2, T0
   236  		MOVOU POLY, B2
   237  		PCLMULQDQ $0x01, T0, B2
   238  		PSHUFD $78, T0, T0
   239  		PXOR T0, B2
   240  		PXOR T1, B2
   241  
   242  		MOVOU B2, (16*12)(dst)
   243  		PSHUFD $78, B2, B3
   244  		PXOR B2, B3
   245  		MOVOU B3, (16*13)(dst)
   246  
   247  		DECQ AX
   248  		LEAQ (-16*2)(dst), dst
   249  	JNE initLoop
   250  
   251  	RET
   252  #undef NR
   253  #undef KS
   254  #undef dst
   255  
   256  // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
   257  TEXT ·gcmAesData(SB),NOSPLIT,$0
   258  #define pTbl DI
   259  #define aut SI
   260  #define tPtr CX
   261  #define autLen DX
   262  
   263  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   264  #define mulRoundAAD(X ,i) \
   265  	MOVOU (16*(i*2))(pTbl), T1;\
   266  	MOVOU T1, T2;\
   267  	PCLMULQDQ $0x00, X, T1;\
   268  	PXOR T1, ACC0;\
   269  	PCLMULQDQ $0x11, X, T2;\
   270  	PXOR T2, ACC1;\
   271  	PSHUFD $78, X, T1;\
   272  	PXOR T1, X;\
   273  	MOVOU (16*(i*2+1))(pTbl), T1;\
   274  	PCLMULQDQ $0x00, X, T1;\
   275  	PXOR T1, ACCM
   276  
   277  	MOVQ productTable+0(FP), pTbl
   278  	MOVQ data_base+8(FP), aut
   279  	MOVQ data_len+16(FP), autLen
   280  	MOVQ T+32(FP), tPtr
   281  
   282  	PXOR ACC0, ACC0
   283  	MOVOU bswapMask<>(SB), BSWAP
   284  	MOVOU gcmPoly<>(SB), POLY
   285  
   286  	TESTQ autLen, autLen
   287  	JEQ dataBail
   288  
   289  	CMPQ autLen, $13	// optimize the TLS case
   290  	JE dataTLS
   291  	CMPQ autLen, $128
   292  	JB startSinglesLoop
   293  	JMP dataOctaLoop
   294  
   295  dataTLS:
   296  	MOVOU (16*14)(pTbl), T1
   297  	MOVOU (16*15)(pTbl), T2
   298  	PXOR B0, B0
   299  	MOVQ (aut), B0
   300  	PINSRD $2, 8(aut), B0
   301  	PINSRB $12, 12(aut), B0
   302  	XORQ autLen, autLen
   303  	JMP dataMul
   304  
   305  dataOctaLoop:
   306  		CMPQ autLen, $128
   307  		JB startSinglesLoop
   308  		SUBQ $128, autLen
   309  
   310  		MOVOU (16*0)(aut), X0
   311  		MOVOU (16*1)(aut), X1
   312  		MOVOU (16*2)(aut), X2
   313  		MOVOU (16*3)(aut), X3
   314  		MOVOU (16*4)(aut), X4
   315  		MOVOU (16*5)(aut), X5
   316  		MOVOU (16*6)(aut), X6
   317  		MOVOU (16*7)(aut), X7
   318  		LEAQ (16*8)(aut), aut
   319  		PSHUFB BSWAP, X0
   320  		PSHUFB BSWAP, X1
   321  		PSHUFB BSWAP, X2
   322  		PSHUFB BSWAP, X3
   323  		PSHUFB BSWAP, X4
   324  		PSHUFB BSWAP, X5
   325  		PSHUFB BSWAP, X6
   326  		PSHUFB BSWAP, X7
   327  		PXOR ACC0, X0
   328  
   329  		MOVOU (16*0)(pTbl), ACC0
   330  		MOVOU (16*1)(pTbl), ACCM
   331  		MOVOU ACC0, ACC1
   332  		PSHUFD $78, X0, T1
   333  		PXOR X0, T1
   334  		PCLMULQDQ $0x00, X0, ACC0
   335  		PCLMULQDQ $0x11, X0, ACC1
   336  		PCLMULQDQ $0x00, T1, ACCM
   337  
   338  		mulRoundAAD(X1, 1)
   339  		mulRoundAAD(X2, 2)
   340  		mulRoundAAD(X3, 3)
   341  		mulRoundAAD(X4, 4)
   342  		mulRoundAAD(X5, 5)
   343  		mulRoundAAD(X6, 6)
   344  		mulRoundAAD(X7, 7)
   345  
   346  		PXOR ACC0, ACCM
   347  		PXOR ACC1, ACCM
   348  		MOVOU ACCM, T0
   349  		PSRLDQ $8, ACCM
   350  		PSLLDQ $8, T0
   351  		PXOR ACCM, ACC1
   352  		PXOR T0, ACC0
   353  		reduceRound(ACC0)
   354  		reduceRound(ACC0)
   355  		PXOR ACC1, ACC0
   356  	JMP dataOctaLoop
   357  
   358  startSinglesLoop:
   359  	MOVOU (16*14)(pTbl), T1
   360  	MOVOU (16*15)(pTbl), T2
   361  
   362  dataSinglesLoop:
   363  
   364  		CMPQ autLen, $16
   365  		JB dataEnd
   366  		SUBQ $16, autLen
   367  
   368  		MOVOU (aut), B0
   369  dataMul:
   370  		PSHUFB BSWAP, B0
   371  		PXOR ACC0, B0
   372  
   373  		MOVOU T1, ACC0
   374  		MOVOU T2, ACCM
   375  		MOVOU T1, ACC1
   376  
   377  		PSHUFD $78, B0, T0
   378  		PXOR B0, T0
   379  		PCLMULQDQ $0x00, B0, ACC0
   380  		PCLMULQDQ $0x11, B0, ACC1
   381  		PCLMULQDQ $0x00, T0, ACCM
   382  
   383  		PXOR ACC0, ACCM
   384  		PXOR ACC1, ACCM
   385  		MOVOU ACCM, T0
   386  		PSRLDQ $8, ACCM
   387  		PSLLDQ $8, T0
   388  		PXOR ACCM, ACC1
   389  		PXOR T0, ACC0
   390  
   391  		MOVOU POLY, T0
   392  		PCLMULQDQ $0x01, ACC0, T0
   393  		PSHUFD $78, ACC0, ACC0
   394  		PXOR T0, ACC0
   395  
   396  		MOVOU POLY, T0
   397  		PCLMULQDQ $0x01, ACC0, T0
   398  		PSHUFD $78, ACC0, ACC0
   399  		PXOR T0, ACC0
   400  		PXOR ACC1, ACC0
   401  
   402  		LEAQ 16(aut), aut
   403  
   404  	JMP dataSinglesLoop
   405  
   406  dataEnd:
   407  
   408  	TESTQ autLen, autLen
   409  	JEQ dataBail
   410  
   411  	PXOR B0, B0
   412  	LEAQ -1(aut)(autLen*1), aut
   413  
   414  dataLoadLoop:
   415  
   416  		PSLLDQ $1, B0
   417  		PINSRB $0, (aut), B0
   418  
   419  		LEAQ -1(aut), aut
   420  		DECQ autLen
   421  		JNE dataLoadLoop
   422  
   423  	JMP dataMul
   424  
   425  dataBail:
   426  	MOVOU ACC0, (tPtr)
   427  	RET
   428  #undef pTbl
   429  #undef aut
   430  #undef tPtr
   431  #undef autLen
   432  
   433  // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   434  TEXT ·gcmAesEnc(SB),0,$256-96
   435  #define pTbl DI
   436  #define ctx DX
   437  #define ctrPtr CX
   438  #define ptx SI
   439  #define ks AX
   440  #define tPtr R8
   441  #define ptxLen R9
   442  #define aluCTR R10
   443  #define aluTMP R11
   444  #define aluK R12
   445  #define NR R13
   446  
   447  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   448  #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   449  #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   450  #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
   451  #define combinedRound(i) \
   452  	MOVOU (16*i)(ks), T0;\
   453  	AESENC T0, B0;\
   454  	AESENC T0, B1;\
   455  	AESENC T0, B2;\
   456  	AESENC T0, B3;\
   457  	 MOVOU (16*(i*2))(pTbl), T1;\
   458  	 MOVOU T1, T2;\
   459  	AESENC T0, B4;\
   460  	AESENC T0, B5;\
   461  	AESENC T0, B6;\
   462  	AESENC T0, B7;\
   463  	 MOVOU (16*i)(SP), T0;\
   464  	 PCLMULQDQ $0x00, T0, T1;\
   465  	 PXOR T1, ACC0;\
   466  	 PSHUFD $78, T0, T1;\
   467  	 PCLMULQDQ $0x11, T0, T2;\
   468  	 PXOR T1, T0;\
   469  	 PXOR T2, ACC1;\
   470  	 MOVOU (16*(i*2+1))(pTbl), T2;\
   471  	 PCLMULQDQ $0x00, T2, T0;\
   472  	 PXOR T0, ACCM
   473  #define mulRound(i) \
   474  	MOVOU (16*i)(SP), T0;\
   475  	MOVOU (16*(i*2))(pTbl), T1;\
   476  	MOVOU T1, T2;\
   477  	PCLMULQDQ $0x00, T0, T1;\
   478  	PXOR T1, ACC0;\
   479  	PCLMULQDQ $0x11, T0, T2;\
   480  	PXOR T2, ACC1;\
   481  	PSHUFD $78, T0, T1;\
   482  	PXOR T1, T0;\
   483  	MOVOU (16*(i*2+1))(pTbl), T1;\
   484  	PCLMULQDQ $0x00, T0, T1;\
   485  	PXOR T1, ACCM
   486  
   487  	MOVQ productTable+0(FP), pTbl
   488  	MOVQ dst+8(FP), ctx
   489  	MOVQ src_base+32(FP), ptx
   490  	MOVQ src_len+40(FP), ptxLen
   491  	MOVQ ctr+56(FP), ctrPtr
   492  	MOVQ T+64(FP), tPtr
   493  	MOVQ ks_base+72(FP), ks
   494  	MOVQ ks_len+80(FP), NR
   495  
   496  	SHRQ $2, NR
   497  	DECQ NR
   498  
   499  	MOVOU bswapMask<>(SB), BSWAP
   500  	MOVOU gcmPoly<>(SB), POLY
   501  
   502  	MOVOU (tPtr), ACC0
   503  	PXOR ACC1, ACC1
   504  	PXOR ACCM, ACCM
   505  	MOVOU (ctrPtr), B0
   506  	MOVL (3*4)(ctrPtr), aluCTR
   507  	MOVOU (ks), T0
   508  	MOVL (3*4)(ks), aluK
   509  	BSWAPL aluCTR
   510  	BSWAPL aluK
   511  
   512  	PXOR B0, T0
   513  	MOVOU T0, (8*16 + 0*16)(SP)
   514  	increment(0)
   515  
   516  	CMPQ ptxLen, $128
   517  	JB gcmAesEncSingles
   518  	SUBQ $128, ptxLen
   519  
   520  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   521  	MOVOU T0, (8*16 + 1*16)(SP)
   522  	increment(1)
   523  	MOVOU T0, (8*16 + 2*16)(SP)
   524  	increment(2)
   525  	MOVOU T0, (8*16 + 3*16)(SP)
   526  	increment(3)
   527  	MOVOU T0, (8*16 + 4*16)(SP)
   528  	increment(4)
   529  	MOVOU T0, (8*16 + 5*16)(SP)
   530  	increment(5)
   531  	MOVOU T0, (8*16 + 6*16)(SP)
   532  	increment(6)
   533  	MOVOU T0, (8*16 + 7*16)(SP)
   534  	increment(7)
   535  
   536  	MOVOU (8*16 + 0*16)(SP), B0
   537  	MOVOU (8*16 + 1*16)(SP), B1
   538  	MOVOU (8*16 + 2*16)(SP), B2
   539  	MOVOU (8*16 + 3*16)(SP), B3
   540  	MOVOU (8*16 + 4*16)(SP), B4
   541  	MOVOU (8*16 + 5*16)(SP), B5
   542  	MOVOU (8*16 + 6*16)(SP), B6
   543  	MOVOU (8*16 + 7*16)(SP), B7
   544  
   545  	aesRound(1)
   546  	increment(0)
   547  	aesRound(2)
   548  	increment(1)
   549  	aesRound(3)
   550  	increment(2)
   551  	aesRound(4)
   552  	increment(3)
   553  	aesRound(5)
   554  	increment(4)
   555  	aesRound(6)
   556  	increment(5)
   557  	aesRound(7)
   558  	increment(6)
   559  	aesRound(8)
   560  	increment(7)
   561  	aesRound(9)
   562  	MOVOU (16*10)(ks), T0
   563  	CMPQ NR, $12
   564  	JB encLast1
   565  	aesRnd(T0)
   566  	aesRound(11)
   567  	MOVOU (16*12)(ks), T0
   568  	JE encLast1
   569  	aesRnd(T0)
   570  	aesRound(13)
   571  	MOVOU (16*14)(ks), T0
   572  encLast1:
   573  	aesRndLast(T0)
   574  
   575  	MOVOU (16*0)(ptx), T0
   576  	PXOR T0, B0
   577  	MOVOU (16*1)(ptx), T0
   578  	PXOR T0, B1
   579  	MOVOU (16*2)(ptx), T0
   580  	PXOR T0, B2
   581  	MOVOU (16*3)(ptx), T0
   582  	PXOR T0, B3
   583  	MOVOU (16*4)(ptx), T0
   584  	PXOR T0, B4
   585  	MOVOU (16*5)(ptx), T0
   586  	PXOR T0, B5
   587  	MOVOU (16*6)(ptx), T0
   588  	PXOR T0, B6
   589  	MOVOU (16*7)(ptx), T0
   590  	PXOR T0, B7
   591  
   592  	MOVOU B0, (16*0)(ctx)
   593  	PSHUFB BSWAP, B0
   594  	PXOR ACC0, B0
   595  	MOVOU B1, (16*1)(ctx)
   596  	PSHUFB BSWAP, B1
   597  	MOVOU B2, (16*2)(ctx)
   598  	PSHUFB BSWAP, B2
   599  	MOVOU B3, (16*3)(ctx)
   600  	PSHUFB BSWAP, B3
   601  	MOVOU B4, (16*4)(ctx)
   602  	PSHUFB BSWAP, B4
   603  	MOVOU B5, (16*5)(ctx)
   604  	PSHUFB BSWAP, B5
   605  	MOVOU B6, (16*6)(ctx)
   606  	PSHUFB BSWAP, B6
   607  	MOVOU B7, (16*7)(ctx)
   608  	PSHUFB BSWAP, B7
   609  
   610  	MOVOU B0, (16*0)(SP)
   611  	MOVOU B1, (16*1)(SP)
   612  	MOVOU B2, (16*2)(SP)
   613  	MOVOU B3, (16*3)(SP)
   614  	MOVOU B4, (16*4)(SP)
   615  	MOVOU B5, (16*5)(SP)
   616  	MOVOU B6, (16*6)(SP)
   617  	MOVOU B7, (16*7)(SP)
   618  
   619  	LEAQ 128(ptx), ptx
   620  	LEAQ 128(ctx), ctx
   621  
   622  gcmAesEncOctetsLoop:
   623  
   624  		CMPQ ptxLen, $128
   625  		JB gcmAesEncOctetsEnd
   626  		SUBQ $128, ptxLen
   627  
   628  		MOVOU (8*16 + 0*16)(SP), B0
   629  		MOVOU (8*16 + 1*16)(SP), B1
   630  		MOVOU (8*16 + 2*16)(SP), B2
   631  		MOVOU (8*16 + 3*16)(SP), B3
   632  		MOVOU (8*16 + 4*16)(SP), B4
   633  		MOVOU (8*16 + 5*16)(SP), B5
   634  		MOVOU (8*16 + 6*16)(SP), B6
   635  		MOVOU (8*16 + 7*16)(SP), B7
   636  
   637  		MOVOU (16*0)(SP), T0
   638  		PSHUFD $78, T0, T1
   639  		PXOR T0, T1
   640  
   641  		MOVOU (16*0)(pTbl), ACC0
   642  		MOVOU (16*1)(pTbl), ACCM
   643  		MOVOU ACC0, ACC1
   644  
   645  		PCLMULQDQ $0x00, T1, ACCM
   646  		PCLMULQDQ $0x00, T0, ACC0
   647  		PCLMULQDQ $0x11, T0, ACC1
   648  
   649  		combinedRound(1)
   650  		increment(0)
   651  		combinedRound(2)
   652  		increment(1)
   653  		combinedRound(3)
   654  		increment(2)
   655  		combinedRound(4)
   656  		increment(3)
   657  		combinedRound(5)
   658  		increment(4)
   659  		combinedRound(6)
   660  		increment(5)
   661  		combinedRound(7)
   662  		increment(6)
   663  
   664  		aesRound(8)
   665  		increment(7)
   666  
   667  		PXOR ACC0, ACCM
   668  		PXOR ACC1, ACCM
   669  		MOVOU ACCM, T0
   670  		PSRLDQ $8, ACCM
   671  		PSLLDQ $8, T0
   672  		PXOR ACCM, ACC1
   673  		PXOR T0, ACC0
   674  
   675  		reduceRound(ACC0)
   676  		aesRound(9)
   677  
   678  		reduceRound(ACC0)
   679  		PXOR ACC1, ACC0
   680  
   681  		MOVOU (16*10)(ks), T0
   682  		CMPQ NR, $12
   683  		JB encLast2
   684  		aesRnd(T0)
   685  		aesRound(11)
   686  		MOVOU (16*12)(ks), T0
   687  		JE encLast2
   688  		aesRnd(T0)
   689  		aesRound(13)
   690  		MOVOU (16*14)(ks), T0
   691  encLast2:
   692  		aesRndLast(T0)
   693  
   694  		MOVOU (16*0)(ptx), T0
   695  		PXOR T0, B0
   696  		MOVOU (16*1)(ptx), T0
   697  		PXOR T0, B1
   698  		MOVOU (16*2)(ptx), T0
   699  		PXOR T0, B2
   700  		MOVOU (16*3)(ptx), T0
   701  		PXOR T0, B3
   702  		MOVOU (16*4)(ptx), T0
   703  		PXOR T0, B4
   704  		MOVOU (16*5)(ptx), T0
   705  		PXOR T0, B5
   706  		MOVOU (16*6)(ptx), T0
   707  		PXOR T0, B6
   708  		MOVOU (16*7)(ptx), T0
   709  		PXOR T0, B7
   710  
   711  		MOVOU B0, (16*0)(ctx)
   712  		PSHUFB BSWAP, B0
   713  		PXOR ACC0, B0
   714  		MOVOU B1, (16*1)(ctx)
   715  		PSHUFB BSWAP, B1
   716  		MOVOU B2, (16*2)(ctx)
   717  		PSHUFB BSWAP, B2
   718  		MOVOU B3, (16*3)(ctx)
   719  		PSHUFB BSWAP, B3
   720  		MOVOU B4, (16*4)(ctx)
   721  		PSHUFB BSWAP, B4
   722  		MOVOU B5, (16*5)(ctx)
   723  		PSHUFB BSWAP, B5
   724  		MOVOU B6, (16*6)(ctx)
   725  		PSHUFB BSWAP, B6
   726  		MOVOU B7, (16*7)(ctx)
   727  		PSHUFB BSWAP, B7
   728  
   729  		MOVOU B0, (16*0)(SP)
   730  		MOVOU B1, (16*1)(SP)
   731  		MOVOU B2, (16*2)(SP)
   732  		MOVOU B3, (16*3)(SP)
   733  		MOVOU B4, (16*4)(SP)
   734  		MOVOU B5, (16*5)(SP)
   735  		MOVOU B6, (16*6)(SP)
   736  		MOVOU B7, (16*7)(SP)
   737  
   738  		LEAQ 128(ptx), ptx
   739  		LEAQ 128(ctx), ctx
   740  
   741  		JMP gcmAesEncOctetsLoop
   742  
   743  gcmAesEncOctetsEnd:
   744  
   745  	MOVOU (16*0)(SP), T0
   746  	MOVOU (16*0)(pTbl), ACC0
   747  	MOVOU (16*1)(pTbl), ACCM
   748  	MOVOU ACC0, ACC1
   749  	PSHUFD $78, T0, T1
   750  	PXOR T0, T1
   751  	PCLMULQDQ $0x00, T0, ACC0
   752  	PCLMULQDQ $0x11, T0, ACC1
   753  	PCLMULQDQ $0x00, T1, ACCM
   754  
   755  	mulRound(1)
   756  	mulRound(2)
   757  	mulRound(3)
   758  	mulRound(4)
   759  	mulRound(5)
   760  	mulRound(6)
   761  	mulRound(7)
   762  
   763  	PXOR ACC0, ACCM
   764  	PXOR ACC1, ACCM
   765  	MOVOU ACCM, T0
   766  	PSRLDQ $8, ACCM
   767  	PSLLDQ $8, T0
   768  	PXOR ACCM, ACC1
   769  	PXOR T0, ACC0
   770  
   771  	reduceRound(ACC0)
   772  	reduceRound(ACC0)
   773  	PXOR ACC1, ACC0
   774  
   775  	TESTQ ptxLen, ptxLen
   776  	JE gcmAesEncDone
   777  
   778  	SUBQ $7, aluCTR
   779  
   780  gcmAesEncSingles:
   781  
   782  	MOVOU (16*1)(ks), B1
   783  	MOVOU (16*2)(ks), B2
   784  	MOVOU (16*3)(ks), B3
   785  	MOVOU (16*4)(ks), B4
   786  	MOVOU (16*5)(ks), B5
   787  	MOVOU (16*6)(ks), B6
   788  	MOVOU (16*7)(ks), B7
   789  
   790  	MOVOU (16*14)(pTbl), T2
   791  
   792  gcmAesEncSinglesLoop:
   793  
   794  		CMPQ ptxLen, $16
   795  		JB gcmAesEncTail
   796  		SUBQ $16, ptxLen
   797  
   798  		MOVOU (8*16 + 0*16)(SP), B0
   799  		increment(0)
   800  
   801  		AESENC B1, B0
   802  		AESENC B2, B0
   803  		AESENC B3, B0
   804  		AESENC B4, B0
   805  		AESENC B5, B0
   806  		AESENC B6, B0
   807  		AESENC B7, B0
   808  		MOVOU (16*8)(ks), T0
   809  		AESENC T0, B0
   810  		MOVOU (16*9)(ks), T0
   811  		AESENC T0, B0
   812  		MOVOU (16*10)(ks), T0
   813  		CMPQ NR, $12
   814  		JB encLast3
   815  		AESENC T0, B0
   816  		MOVOU (16*11)(ks), T0
   817  		AESENC T0, B0
   818  		MOVOU (16*12)(ks), T0
   819  		JE encLast3
   820  		AESENC T0, B0
   821  		MOVOU (16*13)(ks), T0
   822  		AESENC T0, B0
   823  		MOVOU (16*14)(ks), T0
   824  encLast3:
   825  		AESENCLAST T0, B0
   826  
   827  		MOVOU (ptx), T0
   828  		PXOR T0, B0
   829  		MOVOU B0, (ctx)
   830  
   831  		PSHUFB BSWAP, B0
   832  		PXOR ACC0, B0
   833  
   834  		MOVOU T2, ACC0
   835  		MOVOU T2, ACC1
   836  		MOVOU (16*15)(pTbl), ACCM
   837  
   838  		PSHUFD $78, B0, T0
   839  		PXOR B0, T0
   840  		PCLMULQDQ $0x00, B0, ACC0
   841  		PCLMULQDQ $0x11, B0, ACC1
   842  		PCLMULQDQ $0x00, T0, ACCM
   843  
   844  		PXOR ACC0, ACCM
   845  		PXOR ACC1, ACCM
   846  		MOVOU ACCM, T0
   847  		PSRLDQ $8, ACCM
   848  		PSLLDQ $8, T0
   849  		PXOR ACCM, ACC1
   850  		PXOR T0, ACC0
   851  
   852  		reduceRound(ACC0)
   853  		reduceRound(ACC0)
   854  		PXOR ACC1, ACC0
   855  
   856  		LEAQ (16*1)(ptx), ptx
   857  		LEAQ (16*1)(ctx), ctx
   858  
   859  	JMP gcmAesEncSinglesLoop
   860  
   861  gcmAesEncTail:
   862  	TESTQ ptxLen, ptxLen
   863  	JE gcmAesEncDone
   864  
   865  	MOVOU (8*16 + 0*16)(SP), B0
   866  	AESENC B1, B0
   867  	AESENC B2, B0
   868  	AESENC B3, B0
   869  	AESENC B4, B0
   870  	AESENC B5, B0
   871  	AESENC B6, B0
   872  	AESENC B7, B0
   873  	MOVOU (16*8)(ks), T0
   874  	AESENC T0, B0
   875  	MOVOU (16*9)(ks), T0
   876  	AESENC T0, B0
   877  	MOVOU (16*10)(ks), T0
   878  	CMPQ NR, $12
   879  	JB encLast4
   880  	AESENC T0, B0
   881  	MOVOU (16*11)(ks), T0
   882  	AESENC T0, B0
   883  	MOVOU (16*12)(ks), T0
   884  	JE encLast4
   885  	AESENC T0, B0
   886  	MOVOU (16*13)(ks), T0
   887  	AESENC T0, B0
   888  	MOVOU (16*14)(ks), T0
   889  encLast4:
   890  	AESENCLAST T0, B0
   891  	MOVOU B0, T0
   892  
   893  	LEAQ -1(ptx)(ptxLen*1), ptx
   894  
   895  	MOVQ ptxLen, aluTMP
   896  	SHLQ $4, aluTMP
   897  
   898  	LEAQ andMask<>(SB), aluCTR
   899  	MOVOU -16(aluCTR)(aluTMP*1), T1
   900  
   901  	PXOR B0, B0
   902  ptxLoadLoop:
   903  		PSLLDQ $1, B0
   904  		PINSRB $0, (ptx), B0
   905  		LEAQ -1(ptx), ptx
   906  		DECQ ptxLen
   907  	JNE ptxLoadLoop
   908  
   909  	PXOR T0, B0
   910  	PAND T1, B0
   911  	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   912  
   913  	PSHUFB BSWAP, B0
   914  	PXOR ACC0, B0
   915  
   916  	MOVOU T2, ACC0
   917  	MOVOU T2, ACC1
   918  	MOVOU (16*15)(pTbl), ACCM
   919  
   920  	PSHUFD $78, B0, T0
   921  	PXOR B0, T0
   922  	PCLMULQDQ $0x00, B0, ACC0
   923  	PCLMULQDQ $0x11, B0, ACC1
   924  	PCLMULQDQ $0x00, T0, ACCM
   925  
   926  	PXOR ACC0, ACCM
   927  	PXOR ACC1, ACCM
   928  	MOVOU ACCM, T0
   929  	PSRLDQ $8, ACCM
   930  	PSLLDQ $8, T0
   931  	PXOR ACCM, ACC1
   932  	PXOR T0, ACC0
   933  
   934  	reduceRound(ACC0)
   935  	reduceRound(ACC0)
   936  	PXOR ACC1, ACC0
   937  
   938  gcmAesEncDone:
   939  	MOVOU ACC0, (tPtr)
   940  	RET
   941  #undef increment
   942  
   943  // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   944  TEXT ·gcmAesDec(SB),0,$128-96
   945  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
   946  #define combinedDecRound(i) \
   947  	MOVOU (16*i)(ks), T0;\
   948  	AESENC T0, B0;\
   949  	AESENC T0, B1;\
   950  	AESENC T0, B2;\
   951  	AESENC T0, B3;\
   952  	MOVOU (16*(i*2))(pTbl), T1;\
   953  	MOVOU T1, T2;\
   954  	AESENC T0, B4;\
   955  	AESENC T0, B5;\
   956  	AESENC T0, B6;\
   957  	AESENC T0, B7;\
   958  	MOVOU (16*i)(ctx), T0;\
   959  	PSHUFB BSWAP, T0;\
   960  	PCLMULQDQ $0x00, T0, T1;\
   961  	PXOR T1, ACC0;\
   962  	PSHUFD $78, T0, T1;\
   963  	PCLMULQDQ $0x11, T0, T2;\
   964  	PXOR T1, T0;\
   965  	PXOR T2, ACC1;\
   966  	MOVOU (16*(i*2+1))(pTbl), T2;\
   967  	PCLMULQDQ $0x00, T2, T0;\
   968  	PXOR T0, ACCM
   969  
   970  	MOVQ productTable+0(FP), pTbl
   971  	MOVQ dst+8(FP), ptx
   972  	MOVQ src_base+32(FP), ctx
   973  	MOVQ src_len+40(FP), ptxLen
   974  	MOVQ ctr+56(FP), ctrPtr
   975  	MOVQ T+64(FP), tPtr
   976  	MOVQ ks_base+72(FP), ks
   977  	MOVQ ks_len+80(FP), NR
   978  
   979  	SHRQ $2, NR
   980  	DECQ NR
   981  
   982  	MOVOU bswapMask<>(SB), BSWAP
   983  	MOVOU gcmPoly<>(SB), POLY
   984  
   985  	MOVOU (tPtr), ACC0
   986  	PXOR ACC1, ACC1
   987  	PXOR ACCM, ACCM
   988  	MOVOU (ctrPtr), B0
   989  	MOVL (3*4)(ctrPtr), aluCTR
   990  	MOVOU (ks), T0
   991  	MOVL (3*4)(ks), aluK
   992  	BSWAPL aluCTR
   993  	BSWAPL aluK
   994  
   995  	PXOR B0, T0
   996  	MOVOU T0, (0*16)(SP)
   997  	increment(0)
   998  
   999  	CMPQ ptxLen, $128
  1000  	JB gcmAesDecSingles
  1001  
  1002  	MOVOU T0, (1*16)(SP)
  1003  	increment(1)
  1004  	MOVOU T0, (2*16)(SP)
  1005  	increment(2)
  1006  	MOVOU T0, (3*16)(SP)
  1007  	increment(3)
  1008  	MOVOU T0, (4*16)(SP)
  1009  	increment(4)
  1010  	MOVOU T0, (5*16)(SP)
  1011  	increment(5)
  1012  	MOVOU T0, (6*16)(SP)
  1013  	increment(6)
  1014  	MOVOU T0, (7*16)(SP)
  1015  	increment(7)
  1016  
  1017  gcmAesDecOctetsLoop:
  1018  
  1019  		CMPQ ptxLen, $128
  1020  		JB gcmAesDecEndOctets
  1021  		SUBQ $128, ptxLen
  1022  
  1023  		MOVOU (0*16)(SP), B0
  1024  		MOVOU (1*16)(SP), B1
  1025  		MOVOU (2*16)(SP), B2
  1026  		MOVOU (3*16)(SP), B3
  1027  		MOVOU (4*16)(SP), B4
  1028  		MOVOU (5*16)(SP), B5
  1029  		MOVOU (6*16)(SP), B6
  1030  		MOVOU (7*16)(SP), B7
  1031  
  1032  		MOVOU (16*0)(ctx), T0
  1033  		PSHUFB BSWAP, T0
  1034  		PXOR ACC0, T0
  1035  		PSHUFD $78, T0, T1
  1036  		PXOR T0, T1
  1037  
  1038  		MOVOU (16*0)(pTbl), ACC0
  1039  		MOVOU (16*1)(pTbl), ACCM
  1040  		MOVOU ACC0, ACC1
  1041  
  1042  		PCLMULQDQ $0x00, T1, ACCM
  1043  		PCLMULQDQ $0x00, T0, ACC0
  1044  		PCLMULQDQ $0x11, T0, ACC1
  1045  
  1046  		combinedDecRound(1)
  1047  		increment(0)
  1048  		combinedDecRound(2)
  1049  		increment(1)
  1050  		combinedDecRound(3)
  1051  		increment(2)
  1052  		combinedDecRound(4)
  1053  		increment(3)
  1054  		combinedDecRound(5)
  1055  		increment(4)
  1056  		combinedDecRound(6)
  1057  		increment(5)
  1058  		combinedDecRound(7)
  1059  		increment(6)
  1060  
  1061  		aesRound(8)
  1062  		increment(7)
  1063  
  1064  		PXOR ACC0, ACCM
  1065  		PXOR ACC1, ACCM
  1066  		MOVOU ACCM, T0
  1067  		PSRLDQ $8, ACCM
  1068  		PSLLDQ $8, T0
  1069  		PXOR ACCM, ACC1
  1070  		PXOR T0, ACC0
  1071  
  1072  		reduceRound(ACC0)
  1073  		aesRound(9)
  1074  
  1075  		reduceRound(ACC0)
  1076  		PXOR ACC1, ACC0
  1077  
  1078  		MOVOU (16*10)(ks), T0
  1079  		CMPQ NR, $12
  1080  		JB decLast1
  1081  		aesRnd(T0)
  1082  		aesRound(11)
  1083  		MOVOU (16*12)(ks), T0
  1084  		JE decLast1
  1085  		aesRnd(T0)
  1086  		aesRound(13)
  1087  		MOVOU (16*14)(ks), T0
  1088  decLast1:
  1089  		aesRndLast(T0)
  1090  
  1091  		MOVOU (16*0)(ctx), T0
  1092  		PXOR T0, B0
  1093  		MOVOU (16*1)(ctx), T0
  1094  		PXOR T0, B1
  1095  		MOVOU (16*2)(ctx), T0
  1096  		PXOR T0, B2
  1097  		MOVOU (16*3)(ctx), T0
  1098  		PXOR T0, B3
  1099  		MOVOU (16*4)(ctx), T0
  1100  		PXOR T0, B4
  1101  		MOVOU (16*5)(ctx), T0
  1102  		PXOR T0, B5
  1103  		MOVOU (16*6)(ctx), T0
  1104  		PXOR T0, B6
  1105  		MOVOU (16*7)(ctx), T0
  1106  		PXOR T0, B7
  1107  
  1108  		MOVOU B0, (16*0)(ptx)
  1109  		MOVOU B1, (16*1)(ptx)
  1110  		MOVOU B2, (16*2)(ptx)
  1111  		MOVOU B3, (16*3)(ptx)
  1112  		MOVOU B4, (16*4)(ptx)
  1113  		MOVOU B5, (16*5)(ptx)
  1114  		MOVOU B6, (16*6)(ptx)
  1115  		MOVOU B7, (16*7)(ptx)
  1116  
  1117  		LEAQ 128(ptx), ptx
  1118  		LEAQ 128(ctx), ctx
  1119  
  1120  		JMP gcmAesDecOctetsLoop
  1121  
  1122  gcmAesDecEndOctets:
  1123  
  1124  	SUBQ $7, aluCTR
  1125  
  1126  gcmAesDecSingles:
  1127  
  1128  	MOVOU (16*1)(ks), B1
  1129  	MOVOU (16*2)(ks), B2
  1130  	MOVOU (16*3)(ks), B3
  1131  	MOVOU (16*4)(ks), B4
  1132  	MOVOU (16*5)(ks), B5
  1133  	MOVOU (16*6)(ks), B6
  1134  	MOVOU (16*7)(ks), B7
  1135  
  1136  	MOVOU (16*14)(pTbl), T2
  1137  
  1138  gcmAesDecSinglesLoop:
  1139  
  1140  		CMPQ ptxLen, $16
  1141  		JB gcmAesDecTail
  1142  		SUBQ $16, ptxLen
  1143  
  1144  		MOVOU (ctx), B0
  1145  		MOVOU B0, T1
  1146  		PSHUFB BSWAP, B0
  1147  		PXOR ACC0, B0
  1148  
  1149  		MOVOU T2, ACC0
  1150  		MOVOU T2, ACC1
  1151  		MOVOU (16*15)(pTbl), ACCM
  1152  
  1153  		PCLMULQDQ $0x00, B0, ACC0
  1154  		PCLMULQDQ $0x11, B0, ACC1
  1155  		PSHUFD $78, B0, T0
  1156  		PXOR B0, T0
  1157  		PCLMULQDQ $0x00, T0, ACCM
  1158  
  1159  		PXOR ACC0, ACCM
  1160  		PXOR ACC1, ACCM
  1161  		MOVOU ACCM, T0
  1162  		PSRLDQ $8, ACCM
  1163  		PSLLDQ $8, T0
  1164  		PXOR ACCM, ACC1
  1165  		PXOR T0, ACC0
  1166  
  1167  		reduceRound(ACC0)
  1168  		reduceRound(ACC0)
  1169  		PXOR ACC1, ACC0
  1170  
  1171  		MOVOU (0*16)(SP), B0
  1172  		increment(0)
  1173  		AESENC B1, B0
  1174  		AESENC B2, B0
  1175  		AESENC B3, B0
  1176  		AESENC B4, B0
  1177  		AESENC B5, B0
  1178  		AESENC B6, B0
  1179  		AESENC B7, B0
  1180  		MOVOU (16*8)(ks), T0
  1181  		AESENC T0, B0
  1182  		MOVOU (16*9)(ks), T0
  1183  		AESENC T0, B0
  1184  		MOVOU (16*10)(ks), T0
  1185  		CMPQ NR, $12
  1186  		JB decLast2
  1187  		AESENC T0, B0
  1188  		MOVOU (16*11)(ks), T0
  1189  		AESENC T0, B0
  1190  		MOVOU (16*12)(ks), T0
  1191  		JE decLast2
  1192  		AESENC T0, B0
  1193  		MOVOU (16*13)(ks), T0
  1194  		AESENC T0, B0
  1195  		MOVOU (16*14)(ks), T0
  1196  decLast2:
  1197  		AESENCLAST T0, B0
  1198  
  1199  		PXOR T1, B0
  1200  		MOVOU B0, (ptx)
  1201  
  1202  		LEAQ (16*1)(ptx), ptx
  1203  		LEAQ (16*1)(ctx), ctx
  1204  
  1205  	JMP gcmAesDecSinglesLoop
  1206  
  1207  gcmAesDecTail:
  1208  
  1209  	TESTQ ptxLen, ptxLen
  1210  	JE gcmAesDecDone
  1211  
  1212  	MOVQ ptxLen, aluTMP
  1213  	SHLQ $4, aluTMP
  1214  	LEAQ andMask<>(SB), aluCTR
  1215  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1216  
  1217  	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1218  	PAND T1, B0
  1219  
  1220  	MOVOU B0, T1
  1221  	PSHUFB BSWAP, B0
  1222  	PXOR ACC0, B0
  1223  
  1224  	MOVOU (16*14)(pTbl), ACC0
  1225  	MOVOU (16*15)(pTbl), ACCM
  1226  	MOVOU ACC0, ACC1
  1227  
  1228  	PCLMULQDQ $0x00, B0, ACC0
  1229  	PCLMULQDQ $0x11, B0, ACC1
  1230  	PSHUFD $78, B0, T0
  1231  	PXOR B0, T0
  1232  	PCLMULQDQ $0x00, T0, ACCM
  1233  
  1234  	PXOR ACC0, ACCM
  1235  	PXOR ACC1, ACCM
  1236  	MOVOU ACCM, T0
  1237  	PSRLDQ $8, ACCM
  1238  	PSLLDQ $8, T0
  1239  	PXOR ACCM, ACC1
  1240  	PXOR T0, ACC0
  1241  
  1242  	reduceRound(ACC0)
  1243  	reduceRound(ACC0)
  1244  	PXOR ACC1, ACC0
  1245  
  1246  	MOVOU (0*16)(SP), B0
  1247  	increment(0)
  1248  	AESENC B1, B0
  1249  	AESENC B2, B0
  1250  	AESENC B3, B0
  1251  	AESENC B4, B0
  1252  	AESENC B5, B0
  1253  	AESENC B6, B0
  1254  	AESENC B7, B0
  1255  	MOVOU (16*8)(ks), T0
  1256  	AESENC T0, B0
  1257  	MOVOU (16*9)(ks), T0
  1258  	AESENC T0, B0
  1259  	MOVOU (16*10)(ks), T0
  1260  	CMPQ NR, $12
  1261  	JB decLast3
  1262  	AESENC T0, B0
  1263  	MOVOU (16*11)(ks), T0
  1264  	AESENC T0, B0
  1265  	MOVOU (16*12)(ks), T0
  1266  	JE decLast3
  1267  	AESENC T0, B0
  1268  	MOVOU (16*13)(ks), T0
  1269  	AESENC T0, B0
  1270  	MOVOU (16*14)(ks), T0
  1271  decLast3:
  1272  	AESENCLAST T0, B0
  1273  	PXOR T1, B0
  1274  
  1275  ptxStoreLoop:
  1276  		PEXTRB $0, B0, (ptx)
  1277  		PSRLDQ $1, B0
  1278  		LEAQ 1(ptx), ptx
  1279  		DECQ ptxLen
  1280  
  1281  	JNE ptxStoreLoop
  1282  
  1283  gcmAesDecDone:
  1284  
  1285  	MOVOU ACC0, (tPtr)
  1286  	RET
  1287  

View as plain text