// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"
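
// rotInvSRows and invSRows are permutation masks for the TBL instruction,
// used by expandKeyAsm below. rotInvSRows composes RotWord with the inverse
// of ShiftRows; invSRows is the inverse of ShiftRows alone. Permuting the
// input this way before an AESE with an all-zero round key cancels the
// ShiftRows step inside AESE, so only the S-box lookup (SubWord) remains.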
DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9
	MOVD	xk+8(FP), R10
	MOVD	dst+16(FP), R11
	MOVD	src+24(FP), R12

	VLD1	(R12), [V0.B16]

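	// nr is 10, 12, or 14 for AES-128, AES-192, and AES-256. The three
	// cases share a tail: AES-256 runs two extra rounds and falls through
	// to the AES-192 rounds, which fall through to the final ten rounds.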
	CMP	$12, R9
	BLT	enc128
	BEQ	enc192
enc256:
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESE	V1.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V2.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc192:
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESE	V3.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V4.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc128:
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
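	// Each AESE computes SubBytes(ShiftRows(state ^ roundkey)) and each
	// AESMC applies MixColumns. The final round has no MixColumns, so the
	// last round key (V15) is applied with a plain VEOR instead.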
	AESE	V5.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V6.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V7.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V8.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V9.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V10.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V11.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V12.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V13.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V14.B16, V0.B16
	VEOR	V0.B16, V15.B16, V0.B16
	VST1	[V0.B16], (R11)
	RET

// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9
	MOVD	xk+8(FP), R10
	MOVD	dst+16(FP), R11
	MOVD	src+24(FP), R12

	VLD1	(R12), [V0.B16]

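	// Same round-count dispatch as in encryptBlockAsm above.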
	CMP	$12, R9
	BLT	dec128
	BEQ	dec192
dec256:
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESD	V1.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V2.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec192:
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESD	V3.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V4.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec128:
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
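	// Each AESD computes InvSubBytes(InvShiftRows(state ^ roundkey)) and
	// each AESIMC applies InvMixColumns. This is the equivalent inverse
	// cipher, so it expects the InvMixColumns-transformed round keys that
	// expandKeyAsm stores in the decryption schedule.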
	AESD	V5.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V6.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V7.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V8.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V9.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V10.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V11.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V12.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V13.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V14.B16, V0.B16
	VEOR	V0.B16, V15.B16, V0.B16
	VST1	[V0.B16], (R11)
	RET

// func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
// Note that round keys are stored in uint128 format, not uint32
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R8
	MOVD	key+8(FP), R9
	MOVD	enc+16(FP), R10
	MOVD	dec+24(FP), R11
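	// R8 = round count, R9 = input key bytes, R10 = encryption schedule
	// out, R11 = decryption schedule out (may be nil).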
	LDP	rotInvSRows<>(SB), (R0, R1)
	VMOV	R0, V3.D[0]
	VMOV	R1, V3.D[1]
	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
	MOVW	$1, R13
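	// Pick the key size from the round count: nr is 10 (0b1010),
	// 12 (0b1100), or 14 (0b1110), so bit 1 is clear only for AES-192 and
	// bit 2 is set only for AES-256; AES-128 falls through.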
	TBZ	$1, R8, ks192
	TBNZ	$2, R8, ks256
	LDPW	(R9), (R4, R5)
	LDPW	8(R9), (R6, R7)
	STPW.P	(R4, R5), 8(R10)
	STPW.P	(R6, R7), 8(R10)
	MOVW	$0x1b, R14
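	// Each iteration derives four more schedule words:
	//	w[i]   = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ Rcon
	//	w[i+j] = w[i+j-4] ^ w[i+j-1]   for j = 1, 2, 3
	// R13 holds Rcon, doubled in GF(2^8) each time around: when the shift
	// carries into bit 8, the CSELW below substitutes 0x1b, the AES
	// reduction polynomial.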
ks128Loop:
		VMOV	R7, V2.S[0]
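		// The TBL permutation (rotInvSRows, in V3) applies RotWord and
		// pre-undoes the ShiftRows inside AESE; with the all-zero key
		// in V0, the AESE leaves SubWord(RotWord(w)) in V2.S[0].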
		WORD	$0x4E030042       // TBL V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // Use AES to compute the SBOX
		EORW	R13, R4
		LSLW	$1, R13           // Compute next Rcon
		ANDSW	$0x100, R13, ZR
		CSELW	NE, R14, R13, R13 // Fake modulo
		SUBS	$1, R8
		VMOV	V2.S[0], R0
		EORW	R0, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R4, R5), 8(R10)
		STPW.P	(R6, R7), 8(R10)
	BNE	ks128Loop
	CBZ	R11, ksDone       // If dec is nil we are done
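	// Rewind R10 to the start of the 11 round keys (11 * 16 = 176 bytes).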
	SUB	$176, R10
	// Decryption keys are encryption keys with InverseMixColumns applied
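	// They are stored in reverse order; the first and last round keys are
	// only ever XORed in, so they are left untransformed.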
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V14.B16
	AESIMC	V1.B16, V13.B16
	VMOV	V2.B16, V12.B16
	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone
ks192:
	LDPW	(R9), (R2, R3)
	LDPW	8(R9), (R4, R5)
	LDPW	16(R9), (R6, R7)
	STPW.P	(R2, R3), 8(R10)
	STPW.P	(R4, R5), 8(R10)
	SUB	$4, R8
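	// Eight iterations produce the remaining 48 of the 52 words in the
	// AES-192 schedule, six at a time. Rcon only reaches 0x80 here, so no
	// reduction step is needed.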
ks192Loop:
		STPW.P	(R6, R7), 8(R10)
		VMOV	R7, V2.S[0]
		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16
		EORW	R13, R2
		LSLW	$1, R13
		SUBS	$1, R8
		VMOV	V2.S[0], R0
		EORW	R0, R2
		EORW	R2, R3
		EORW	R3, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R2, R3), 8(R10)
		STPW.P	(R4, R5), 8(R10)
	BNE	ks192Loop
	CBZ	R11, ksDone
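	// Rewind R10 to the start of the 13 round keys (13 * 16 = 208 bytes).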
	SUB	$208, R10
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16]
	VST1.P	[V0.B16], 16(R11)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone
ks256:
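	// AES-256 additionally applies SubWord, without RotWord, once per
	// eight schedule words; invSRows, loaded into V4, is the matching TBL
	// permutation for that case.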
	LDP	invSRows<>(SB), (R0, R1)
	VMOV	R0, V4.D[0]
	VMOV	R1, V4.D[1]
	LDPW	(R9), (R0, R1)
	LDPW	8(R9), (R2, R3)
	LDPW	16(R9), (R4, R5)
	LDPW	24(R9), (R6, R7)
	STPW.P	(R0, R1), 8(R10)
	STPW.P	(R2, R3), 8(R10)
	SUB	$7, R8
ks256Loop:
		STPW.P	(R4, R5), 8(R10)
		STPW.P	(R6, R7), 8(R10)
		VMOV	R7, V2.S[0]
		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16
		EORW	R13, R0
		LSLW	$1, R13
		SUBS	$1, R8
		VMOV	V2.S[0], R9
		EORW	R9, R0
		EORW	R0, R1
		EORW	R1, R2
		EORW	R2, R3
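		// SubWord without RotWord, using the invSRows permutation in V4.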
		VMOV	R3, V2.S[0]
		WORD	$0x4E040042 //TBL	V4.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16
		VMOV	V2.S[0], R9
		EORW	R9, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R0, R1), 8(R10)
		STPW.P	(R2, R3), 8(R10)
	BNE	ks256Loop
	CBZ	R11, ksDone
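	// Rewind R10 to the start of the 15 round keys (15 * 16 = 240 bytes).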
	SUB	$240, R10
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V18.B16
	AESIMC	V1.B16, V17.B16
	VMOV	V2.B16, V16.B16
	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
ksDone:
	RET
   282  
