Text file src/crypto/aes/gcm_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

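// reduce() folds the 256-bit Karatsuba product held in ACC1:ACC0, with the
// middle term in ACCM, back to 128 bits modulo the GHASH polynomial
// x^128 + x^7 + x^2 + x + 1. GHASH operates on bit-reflected values, which
// is why the polynomial appears as the constant 0xC2<<56 in POLY.D[0]; the
// two VPMULLs by POLY perform the two folding steps of the reduction.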
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

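// gcmAesFinish folds the bit lengths of the additional data (dLen) and the
// plaintext (pLen) into the GHASH state at *T, byte-reverses the result and
// XORs in tagMask (the encrypted pre-counter block) to produce the final tag.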
// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

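// gcmAesInit derives the hash key H = AES-K(0^128), doubles it in GF(2^128),
// and precomputes the first eight powers of H for the 8-blocks-at-a-time
// GHASH loops below. len(ks) is 44, 52 or 60 uint32s for AES-128/192/256.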
// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

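	// Doubling in GF(2^128): T1 is a mask formed from the bit that shifts
	// out (ASR $63 broadcast), ANDed with POLY to give the conditional
	// reduction term; the value is shifted left by one with the carry
	// between the two 64-bit halves moved via VUSHR/VEXT, then both terms
	// are XORed in.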
	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

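	// The table stores H^i alongside its Karatsuba precomputation (the XOR
	// of its two halves) for i = 1..8: H^1 lands at the highest offset
	// (14*16) and H^8 at offset 0, matching the order in which the 8-block
	// loops below consume them.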
	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

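// gcmAesData hashes the additional authenticated data into the tag at *T.
// It processes 8 blocks at a time where possible, has a fast path for the
// 13-byte AAD used by TLS records, and zero-pads the final partial block.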
// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

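// mulRound multiplies the block X by the next power of H from the table and
// accumulates the three Karatsuba terms into ACC0/ACC1/ACCM without
// reducing; reduce() is applied once per group of eight blocks.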
#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

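// Fast path for 13 bytes of AAD, the length used by TLS records: load the
// 13 bytes directly into B0 (as 8+4+1 bytes) instead of going through the
// byte-by-byte padding loop at dataLoadLoop.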
dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
		CMP	$128, autLen
		BLT	startSinglesLoop
		SUB	$128, autLen

		VLD1.P	32(aut), [B0.B16, B1.B16]

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		VLD1.P  32(aut), [B2.B16, B3.B16]
		mulRound(B2)
		mulRound(B3)
		VLD1.P  32(aut), [B4.B16, B5.B16]
		mulRound(B4)
		mulRound(B5)
		VLD1.P  32(aut), [B6.B16, B7.B16]
		mulRound(B6)
		mulRound(B7)

		MOVD	pTblSave, pTbl
		reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

		CMP	$16, autLen
		BLT	dataEnd
		SUB	$16, autLen

		VLD1.P	16(aut), [B0.B16]
dataMul:
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

dataLoadLoop:
		MOVB.W	-1(aut), H0
		VEXT	$15, B0.B16, ZERO.B16, B0.B16
		VMOV	H0, B0.B[0]
		SUBS	$1, autLen
		BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

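// gcmAesEnc encrypts src into dst in counter mode while folding the
// ciphertext into the GHASH state. The counter and the running tag are read
// from and written back through ctr and T; len(ks) selects AES-128/192/256.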
// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

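// aesrndx8 applies one AES round (AESE+AESMC) with round key K to the eight
// blocks B0..B7; aesrndlastx8 applies the final AESE, which has no
// MixColumns, with the last round key XORed in separately via KLAST.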
#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks) (44, 52 or 60), which determines the round count
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	// Compare now; the branch to the <8 blocks loop comes after the key load
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128, round keys are stored in K0 .. K10 and KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

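	// Bit 4 of len(ks) is set only for 52 and 60 (AES-192/256); among
	// those, bit 3 is set only for 60 (AES-256). The TBZs on NR below and
	// in the loops dispatch between the three key schedules.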
	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192, round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256, round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, B0.B16
		VEOR	KLAST.B16, B1.B16, B1.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P  [B6.B16, B7.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

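// In the single-block path the block registers B1..B4 are free, so the
// extra round keys for AES-192/256 are kept there. Note that the plaintext
// is pre-XORed with KLAST, so the single VEOR at singlesLast applies both
// the last round key and the keystream.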
singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16
encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

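	// Gather the srcPtrLen tail bytes into T0 (built up from the end of
	// src) and an all-ones mask of the same width into T3 (H1 = -1). After
	// encryption the mask zeroes the keystream bytes beyond the message so
	// the partial block is hashed with correct zero padding; the full
	// 16-byte store at encReduce relies on dst having room for the 16-byte
	// tag, mirroring the decrypt-side assumption about srcPtr.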
	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

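// gcmAesDec mirrors gcmAesEnc, but since GHASH runs over the ciphertext,
// each block is folded into the hash state as it is loaded rather than
// after encryption. It reuses the register defines and macros above.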
// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks) (44, 52 or 60), which determines the round count
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128, round keys are stored in K0 .. K10 and KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192, round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256, round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, T1.B16
		VEOR	KLAST.B16, B1.B16, T2.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16

		VST1.P	[B0.B16], 16(dstPtr)

		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

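	// Store the srcPtrLen plaintext tail bytes from B0 and build the
	// matching all-ones mask in T3 (H1 = -1); the mask strips the tag
	// bytes that were loaded past the ciphertext before the final partial
	// block is hashed.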
	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET

