Text file src/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build go1.11 && gc && !purego
     6  // +build go1.11,gc,!purego
     7  
     8  #include "textflag.h"
     9  
    10  #define NUM_ROUNDS 10 // iterations of the chacha loop; each iteration is a column round plus a diagonal round
    11  
    12  // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
        //
        // xorKeyStreamVX XORs src with ChaCha20 keystream and stores the result in dst,
        // handling four 64-byte blocks (256 bytes) per pass of the outer loop.
        //
        // Vector layout while the rounds run: Vi (i = 0..15) holds state word i of four
        // blocks at once, one block per 32-bit lane.
        //   V0..V3    constant words
        //   V4..V11   key words
        //   V12       counter + {0,1,2,3} (one counter value per lane)
        //   V13..V15  nonce words
        //   V16..V19  scratch for the 12-bit and 7-bit rotates
        //   V30       lane offsets {0,1,2,3}
        //   V31       byte-shuffle table used for the 8-bit rotate
        //
        // General registers:
        //   R1  dst cursor          R2  src cursor        R3  len(src)
        //   R4  key pointer         R6  nonce pointer     R7  counter pointer
        //   R10 address of ·constants
        //   R11 address of ·incRotMatrix
        //   R12 end of input: src + (len(src) rounded down to a multiple of 256)
        //   R20 running block counter
        //   R21 loop iterations left in the current pass
        //
        // The end-of-input test sits at the bottom of the loop, so 256 bytes are always
        // processed before it is first evaluated.
        // NOTE(review): this presumably relies on the caller passing a non-zero length
        // that is a multiple of 256 — confirm against the Go call site.
        // *counter is advanced by 4 after every pass.
    13  TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
    14  	MOVD	dst+0(FP), R1
    15  	MOVD	src+24(FP), R2
    16  	MOVD	src_len+32(FP), R3
    17  	MOVD	key+48(FP), R4
    18  	MOVD	nonce+56(FP), R6
    19  	MOVD	counter+64(FP), R7
    20  
    21  	MOVD	$·constants(SB), R10
    22  	MOVD	$·incRotMatrix(SB), R11
    23  
    24  	MOVW	(R7), R20 // R20 = current 32-bit block counter
    25  
    26  	AND	$~255, R3, R13 // R13 = len(src) with the low 8 bits cleared
    27  	ADD	R2, R13, R12 // R12 = src + R13: end of the last whole 256-byte group
    28  	AND	$255, R3, R13 // R13 = len(src) mod 256; NOTE(review): R13 is not read again below
    29  loop:
    30  	MOVD	$NUM_ROUNDS, R21
        	// V30 = first 16 bytes of incRotMatrix (lane offsets {0,1,2,3}),
        	// V31 = second 16 bytes (VTBL index table for the rotate by 8).
    31  	VLD1	(R11), [V30.S4, V31.S4]
    32  
        	// The WORD directives below are hand-encoded instructions; the comment
        	// above each one names the instruction it encodes. The load-and-replicate
        	// forms read consecutive 32-bit words and copy word k into every lane of
        	// the k-th listed register.
    33  	// load constants
    34  	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
    35  	WORD	$0x4D60E940
    36  
    37  	// load keys
        	// The 16(R4) form is post-indexed: R4 advances by 16 after each load.
    38  	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
    39  	WORD	$0x4DFFE884
    40  	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
    41  	WORD	$0x4DFFE888
    42  	SUB	$32, R4 // undo the two 16-byte post-increments
    43  
    44  	// load counter + nonce
    45  	// VLD1R (R7), [V12.S4]
    46  	WORD	$0x4D40C8EC
    47  
    48  	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
    49  	WORD	$0x4D40E8CD
    50  
    51  	// give each lane its own block counter: V12 = counter + {0,1,2,3}
    52  	VADD	V30.S4, V12.S4, V12.S4
    53  
    54  chacha:
        	// Column round: quarter rounds on (V0,V4,V8,V12), (V1,V5,V9,V13),
        	// (V2,V6,V10,V14) and (V3,V7,V11,V15), interleaved.
    55  	// V0..V3 += V4..V7
    56  	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
    57  	VADD	V0.S4, V4.S4, V0.S4
    58  	VADD	V1.S4, V5.S4, V1.S4
    59  	VADD	V2.S4, V6.S4, V2.S4
    60  	VADD	V3.S4, V7.S4, V3.S4
    61  	VEOR	V12.B16, V0.B16, V12.B16
    62  	VEOR	V13.B16, V1.B16, V13.B16
    63  	VEOR	V14.B16, V2.B16, V14.B16
    64  	VEOR	V15.B16, V3.B16, V15.B16
        	// Swapping the 16-bit halves of every 32-bit word is a rotate by 16.
    65  	VREV32	V12.H8, V12.H8
    66  	VREV32	V13.H8, V13.H8
    67  	VREV32	V14.H8, V14.H8
    68  	VREV32	V15.H8, V15.H8
    69  	// V8..V11 += V12..V15
    70  	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
    71  	VADD	V8.S4, V12.S4, V8.S4
    72  	VADD	V9.S4, V13.S4, V9.S4
    73  	VADD	V10.S4, V14.S4, V10.S4
    74  	VADD	V11.S4, V15.S4, V11.S4
        	// The XOR goes to scratch V16..V19 so the rotate can be assembled in
        	// V4..V7: VSHL writes x<<12, then VSRI inserts x>>20 into the low bits.
    75  	VEOR	V8.B16, V4.B16, V16.B16
    76  	VEOR	V9.B16, V5.B16, V17.B16
    77  	VEOR	V10.B16, V6.B16, V18.B16
    78  	VEOR	V11.B16, V7.B16, V19.B16
    79  	VSHL	$12, V16.S4, V4.S4
    80  	VSHL	$12, V17.S4, V5.S4
    81  	VSHL	$12, V18.S4, V6.S4
    82  	VSHL	$12, V19.S4, V7.S4
    83  	VSRI	$20, V16.S4, V4.S4
    84  	VSRI	$20, V17.S4, V5.S4
    85  	VSRI	$20, V18.S4, V6.S4
    86  	VSRI	$20, V19.S4, V7.S4
    87  
    88  	// V0..V3 += V4..V7
    89  	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
    90  	VADD	V0.S4, V4.S4, V0.S4
    91  	VADD	V1.S4, V5.S4, V1.S4
    92  	VADD	V2.S4, V6.S4, V2.S4
    93  	VADD	V3.S4, V7.S4, V3.S4
    94  	VEOR	V12.B16, V0.B16, V12.B16
    95  	VEOR	V13.B16, V1.B16, V13.B16
    96  	VEOR	V14.B16, V2.B16, V14.B16
    97  	VEOR	V15.B16, V3.B16, V15.B16
        	// Byte permutation through the V31 table performs the rotate by 8.
    98  	VTBL	V31.B16, [V12.B16], V12.B16
    99  	VTBL	V31.B16, [V13.B16], V13.B16
   100  	VTBL	V31.B16, [V14.B16], V14.B16
   101  	VTBL	V31.B16, [V15.B16], V15.B16
   102  
   103  	// V8..V11 += V12..V15
   104  	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
   105  	VADD	V12.S4, V8.S4, V8.S4
   106  	VADD	V13.S4, V9.S4, V9.S4
   107  	VADD	V14.S4, V10.S4, V10.S4
   108  	VADD	V15.S4, V11.S4, V11.S4
   109  	VEOR	V8.B16, V4.B16, V16.B16
   110  	VEOR	V9.B16, V5.B16, V17.B16
   111  	VEOR	V10.B16, V6.B16, V18.B16
   112  	VEOR	V11.B16, V7.B16, V19.B16
   113  	VSHL	$7, V16.S4, V4.S4
   114  	VSHL	$7, V17.S4, V5.S4
   115  	VSHL	$7, V18.S4, V6.S4
   116  	VSHL	$7, V19.S4, V7.S4
   117  	VSRI	$25, V16.S4, V4.S4
   118  	VSRI	$25, V17.S4, V5.S4
   119  	VSRI	$25, V18.S4, V6.S4
   120  	VSRI	$25, V19.S4, V7.S4
   121  
        	// Diagonal round: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
        	// (V2,V7,V8,V13) and (V3,V4,V9,V14). The diagonals are selected by the
        	// choice of operand registers; no data is moved between registers.
   122  	// V0..V3 += V5..V7, V4
   123  	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
   124  	VADD	V0.S4, V5.S4, V0.S4
   125  	VADD	V1.S4, V6.S4, V1.S4
   126  	VADD	V2.S4, V7.S4, V2.S4
   127  	VADD	V3.S4, V4.S4, V3.S4
   128  	VEOR	V15.B16, V0.B16, V15.B16
   129  	VEOR	V12.B16, V1.B16, V12.B16
   130  	VEOR	V13.B16, V2.B16, V13.B16
   131  	VEOR	V14.B16, V3.B16, V14.B16
   132  	VREV32	V12.H8, V12.H8
   133  	VREV32	V13.H8, V13.H8
   134  	VREV32	V14.H8, V14.H8
   135  	VREV32	V15.H8, V15.H8
   136  
   137  	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
   138  	// ... and likewise for (V11,V12,V6), (V8,V13,V7), (V9,V14,V4)
   139  	VADD	V15.S4, V10.S4, V10.S4
   140  	VADD	V12.S4, V11.S4, V11.S4
   141  	VADD	V13.S4, V8.S4, V8.S4
   142  	VADD	V14.S4, V9.S4, V9.S4
   143  	VEOR	V10.B16, V5.B16, V16.B16
   144  	VEOR	V11.B16, V6.B16, V17.B16
   145  	VEOR	V8.B16, V7.B16, V18.B16
   146  	VEOR	V9.B16, V4.B16, V19.B16
   147  	VSHL	$12, V16.S4, V5.S4
   148  	VSHL	$12, V17.S4, V6.S4
   149  	VSHL	$12, V18.S4, V7.S4
   150  	VSHL	$12, V19.S4, V4.S4
   151  	VSRI	$20, V16.S4, V5.S4
   152  	VSRI	$20, V17.S4, V6.S4
   153  	VSRI	$20, V18.S4, V7.S4
   154  	VSRI	$20, V19.S4, V4.S4
   155  
   156  	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
   157  	// ... and likewise for (V1,V6,V12), (V2,V7,V13), (V3,V4,V14)
   158  	VADD	V5.S4, V0.S4, V0.S4
   159  	VADD	V6.S4, V1.S4, V1.S4
   160  	VADD	V7.S4, V2.S4, V2.S4
   161  	VADD	V4.S4, V3.S4, V3.S4
   162  	VEOR	V0.B16, V15.B16, V15.B16
   163  	VEOR	V1.B16, V12.B16, V12.B16
   164  	VEOR	V2.B16, V13.B16, V13.B16
   165  	VEOR	V3.B16, V14.B16, V14.B16
   166  	VTBL	V31.B16, [V12.B16], V12.B16
   167  	VTBL	V31.B16, [V13.B16], V13.B16
   168  	VTBL	V31.B16, [V14.B16], V14.B16
   169  	VTBL	V31.B16, [V15.B16], V15.B16
   170  
   171  	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
   172  	// ... and likewise for (V11,V12,V6), (V8,V13,V7), (V9,V14,V4)
   173  	VADD	V15.S4, V10.S4, V10.S4
   174  	VADD	V12.S4, V11.S4, V11.S4
   175  	VADD	V13.S4, V8.S4, V8.S4
   176  	VADD	V14.S4, V9.S4, V9.S4
   177  	VEOR	V10.B16, V5.B16, V16.B16
   178  	VEOR	V11.B16, V6.B16, V17.B16
   179  	VEOR	V8.B16, V7.B16, V18.B16
   180  	VEOR	V9.B16, V4.B16, V19.B16
   181  	VSHL	$7, V16.S4, V5.S4
   182  	VSHL	$7, V17.S4, V6.S4
   183  	VSHL	$7, V18.S4, V7.S4
   184  	VSHL	$7, V19.S4, V4.S4
   185  	VSRI	$25, V16.S4, V5.S4
   186  	VSRI	$25, V17.S4, V6.S4
   187  	VSRI	$25, V18.S4, V7.S4
   188  	VSRI	$25, V19.S4, V4.S4
   189  
   190  	SUB	$1, R21
   191  	CBNZ	R21, chacha
   192  
        	// Feed-forward: reload the input state (constants, key, counter, nonce)
        	// into V16..V31 and add it word-by-word to the permuted state in V0..V15.
   193  	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
   194  	WORD	$0x4D60E950
   195  
   196  	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
   197  	WORD	$0x4DFFE894
        	// V30 still holds the lane offsets {0,1,2,3} here (it is overwritten by
        	// the nonce load below). Adding them now, and the broadcast counter in
        	// V28 later, together add back the per-lane counter used as input.
   198  	VADD	V30.S4, V12.S4, V12.S4
   199  	VADD	V16.S4, V0.S4, V0.S4
   200  	VADD	V17.S4, V1.S4, V1.S4
   201  	VADD	V18.S4, V2.S4, V2.S4
   202  	VADD	V19.S4, V3.S4, V3.S4
   203  	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
   204  	WORD	$0x4DFFE898
   205  	// restore R4 (undo the two 16-byte post-increments)
   206  	SUB	$32, R4
   207  
   208  	// load counter + nonce
   209  	// VLD1R (R7), [V28.S4]
   210  	WORD	$0x4D40C8FC
   211  	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
   212  	WORD	$0x4D40E8DD
   213  
   214  	VADD	V20.S4, V4.S4, V4.S4
   215  	VADD	V21.S4, V5.S4, V5.S4
   216  	VADD	V22.S4, V6.S4, V6.S4
   217  	VADD	V23.S4, V7.S4, V7.S4
   218  	VADD	V24.S4, V8.S4, V8.S4
   219  	VADD	V25.S4, V9.S4, V9.S4
   220  	VADD	V26.S4, V10.S4, V10.S4
   221  	VADD	V27.S4, V11.S4, V11.S4
   222  	VADD	V28.S4, V12.S4, V12.S4
   223  	VADD	V29.S4, V13.S4, V13.S4
   224  	VADD	V30.S4, V14.S4, V14.S4
   225  	VADD	V31.S4, V15.S4, V15.S4
   226  
        	// Transpose from "one state word per register" to "16 consecutive
        	// keystream bytes per register": first interleave 32-bit lanes of
        	// register pairs into V16..V31, then interleave 64-bit halves back into
        	// V0..V15. Afterwards V0..V3, V4..V7, V8..V11 and V12..V15 are XORed
        	// with the 1st, 2nd, 3rd and 4th 64-byte chunk of src respectively.
        	// The src loads into V16..V31 are interleaved with the second pass as
        	// each group of temporaries becomes free.
   227  	VZIP1	V1.S4, V0.S4, V16.S4
   228  	VZIP2	V1.S4, V0.S4, V17.S4
   229  	VZIP1	V3.S4, V2.S4, V18.S4
   230  	VZIP2	V3.S4, V2.S4, V19.S4
   231  	VZIP1	V5.S4, V4.S4, V20.S4
   232  	VZIP2	V5.S4, V4.S4, V21.S4
   233  	VZIP1	V7.S4, V6.S4, V22.S4
   234  	VZIP2	V7.S4, V6.S4, V23.S4
   235  	VZIP1	V9.S4, V8.S4, V24.S4
   236  	VZIP2	V9.S4, V8.S4, V25.S4
   237  	VZIP1	V11.S4, V10.S4, V26.S4
   238  	VZIP2	V11.S4, V10.S4, V27.S4
   239  	VZIP1	V13.S4, V12.S4, V28.S4
   240  	VZIP2	V13.S4, V12.S4, V29.S4
   241  	VZIP1	V15.S4, V14.S4, V30.S4
   242  	VZIP2	V15.S4, V14.S4, V31.S4
   243  	VZIP1	V18.D2, V16.D2, V0.D2
   244  	VZIP2	V18.D2, V16.D2, V4.D2
   245  	VZIP1	V19.D2, V17.D2, V8.D2
   246  	VZIP2	V19.D2, V17.D2, V12.D2
   247  	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] // src bytes 0..63; R2 += 64
   248  
   249  	VZIP1	V22.D2, V20.D2, V1.D2
   250  	VZIP2	V22.D2, V20.D2, V5.D2
   251  	VZIP1	V23.D2, V21.D2, V9.D2
   252  	VZIP2	V23.D2, V21.D2, V13.D2
   253  	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] // src bytes 64..127; R2 += 64
   254  	VZIP1	V26.D2, V24.D2, V2.D2
   255  	VZIP2	V26.D2, V24.D2, V6.D2
   256  	VZIP1	V27.D2, V25.D2, V10.D2
   257  	VZIP2	V27.D2, V25.D2, V14.D2
   258  	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] // src bytes 128..191; R2 += 64
   259  	VZIP1	V30.D2, V28.D2, V3.D2
   260  	VZIP2	V30.D2, V28.D2, V7.D2
   261  	VZIP1	V31.D2, V29.D2, V11.D2
   262  	VZIP2	V31.D2, V29.D2, V15.D2
   263  	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] // src bytes 192..255; R2 += 64
        	// dst = src XOR keystream, written 64 bytes at a time; R1 += 64 per store.
   264  	VEOR	V0.B16, V16.B16, V16.B16
   265  	VEOR	V1.B16, V17.B16, V17.B16
   266  	VEOR	V2.B16, V18.B16, V18.B16
   267  	VEOR	V3.B16, V19.B16, V19.B16
   268  	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
   269  	VEOR	V4.B16, V20.B16, V20.B16
   270  	VEOR	V5.B16, V21.B16, V21.B16
   271  	VEOR	V6.B16, V22.B16, V22.B16
   272  	VEOR	V7.B16, V23.B16, V23.B16
   273  	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
   274  	VEOR	V8.B16, V24.B16, V24.B16
   275  	VEOR	V9.B16, V25.B16, V25.B16
   276  	VEOR	V10.B16, V26.B16, V26.B16
   277  	VEOR	V11.B16, V27.B16, V27.B16
   278  	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
   279  	VEOR	V12.B16, V28.B16, V28.B16
   280  	VEOR	V13.B16, V29.B16, V29.B16
   281  	VEOR	V14.B16, V30.B16, V30.B16
   282  	VEOR	V15.B16, V31.B16, V31.B16
   283  	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
   284  
   285  	ADD	$4, R20 // four blocks were consumed in this pass
   286  	MOVW	R20, (R7) // update counter in memory; the next pass reloads it from (R7)
   287  
        	// Continue while the src cursor (R2) is still below the end pointer (R12).
        	// NOTE(review): BGT is a signed comparison applied to addresses.
   288  	CMP	R2, R12
   289  	BGT	loop
   290  
   291  	RET
   292  
   293  
   294  DATA	·constants+0x00(SB)/4, $0x61707865 // "expa" (bytes in little-endian memory order)
   295  DATA	·constants+0x04(SB)/4, $0x3320646e // "nd 3"
   296  DATA	·constants+0x08(SB)/4, $0x79622d32 // "2-by"
   297  DATA	·constants+0x0c(SB)/4, $0x6b206574 // "te k"
   298  GLOBL	·constants(SB), NOPTR|RODATA, $32 // read-only, pointer-free; 32 bytes declared, only the first 16 have DATA initializers here
   299  
   300  DATA	·incRotMatrix+0x00(SB)/4, $0x00000000 // first 16 bytes: per-lane counter offsets {0,1,2,3}
   301  DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
   302  DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
   303  DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
   304  DATA	·incRotMatrix+0x10(SB)/4, $0x02010003 // second 16 bytes: VTBL byte indices 3,0,1,2 / 7,4,5,6 / ...
   305  DATA	·incRotMatrix+0x14(SB)/4, $0x06050407 // output byte i of each 32-bit word takes input byte (i+3) mod 4,
   306  DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B // which rotates every little-endian word left by 8 bits
   307  DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
   308  GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32 // read-only, pointer-free; loaded as one 32-byte unit into V30/V31 by xorKeyStreamVX
   309  

View as plain text