Text file src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Based on CRYPTOGAMS code with the following comment:
     6  // # ====================================================================
     7  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     8  // # project. The module is, however, dual licensed under OpenSSL and
     9  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    10  // # details see http://www.openssl.org/~appro/cryptogams/.
    11  // # ====================================================================
    12  
    13  // Code for the perl script that generates the ppc64 assembler
    14  // can be found in the cryptogams repository at the link below. It is based on
    15  // the original from openssl.
    16  
    17  // https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
    18  
    19  // The differences in this and the original implementation are
    20  // due to the calling conventions and initialization of constants.
    21  
    22  //go:build gc && !purego
    23  // +build gc,!purego
    24  
    25  #include "textflag.h"
    26  
    27  #define OUT  R3
    28  #define INP  R4
    29  #define LEN  R5
    30  #define KEY  R6
    31  #define CNT  R7
    32  #define TMP  R15
    33  
    34  #define CONSTBASE  R16
    35  #define BLOCKS R17
    36  
     37  DATA consts<>+0x00(SB)/8, $0x3320646e61707865	// "expand 3" — ChaCha sigma, little-endian
     38  DATA consts<>+0x08(SB)/8, $0x6b20657479622d32	// "2-byte k"
     39  DATA consts<>+0x10(SB)/8, $0x0000000000000001	// words {1,0,0,0} — not referenced by the VSX path below; presumably for a scalar/1-block path — TODO confirm
     40  DATA consts<>+0x18(SB)/8, $0x0000000000000000
     41  DATA consts<>+0x20(SB)/8, $0x0000000000000004	// words {4,0,0,0} — likewise unreferenced in this file
     42  DATA consts<>+0x28(SB)/8, $0x0000000000000000
     43  DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d	// byte-shuffle tables; look like VPERM rotate masks from the original CRYPTOGAMS code — unreferenced here
     44  DATA consts<>+0x38(SB)/8, $0x0203000106070405
     45  DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
     46  DATA consts<>+0x48(SB)/8, $0x0102030005060704
     47  DATA consts<>+0x50(SB)/8, $0x6170786561707865	// sigma word 0 "expa" splatted across all four lanes
     48  DATA consts<>+0x58(SB)/8, $0x6170786561707865
     49  DATA consts<>+0x60(SB)/8, $0x3320646e3320646e	// sigma word 1 "nd 3" splatted
     50  DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
     51  DATA consts<>+0x70(SB)/8, $0x79622d3279622d32	// sigma word 2 "2-by" splatted
     52  DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
     53  DATA consts<>+0x80(SB)/8, $0x6b2065746b206574	// sigma word 3 "te k" splatted
     54  DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
     55  DATA consts<>+0x90(SB)/8, $0x0000000100000000	// words {0,1,2,3} — per-lane counter offsets for 4-block parallelism
     56  DATA consts<>+0x98(SB)/8, $0x0000000300000002
     57  GLOBL consts<>(SB), RODATA, $0xa0	// 0xa0 = 160 bytes of read-only constants
    58  
    59  //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
    60  TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
    61  	MOVD out+0(FP), OUT
    62  	MOVD inp+8(FP), INP
    63  	MOVD len+16(FP), LEN
    64  	MOVD key+24(FP), KEY
    65  	MOVD counter+32(FP), CNT
    66  
    67  	// Addressing for constants
    68  	MOVD $consts<>+0x00(SB), CONSTBASE
    69  	MOVD $16, R8
    70  	MOVD $32, R9
    71  	MOVD $48, R10
    72  	MOVD $64, R11
    73  	SRD $6, LEN, BLOCKS
    74  	// V16
    75  	LXVW4X (CONSTBASE)(R0), VS48
    76  	ADD $80,CONSTBASE
    77  
    78  	// Load key into V17,V18
    79  	LXVW4X (KEY)(R0), VS49
    80  	LXVW4X (KEY)(R8), VS50
    81  
    82  	// Load CNT, NONCE into V19
    83  	LXVW4X (CNT)(R0), VS51
    84  
    85  	// Clear V27
    86  	VXOR V27, V27, V27
    87  
    88  	// V28
    89  	LXVW4X (CONSTBASE)(R11), VS60
    90  
    91  	// splat slot from V19 -> V26
    92  	VSPLTW $0, V19, V26
    93  
    94  	VSLDOI $4, V19, V27, V19
    95  	VSLDOI $12, V27, V19, V19
    96  
    97  	VADDUWM V26, V28, V26
    98  
    99  	MOVD $10, R14
   100  	MOVD R14, CTR
   101  
   102  loop_outer_vsx:
   103  	// V0, V1, V2, V3
   104  	LXVW4X (R0)(CONSTBASE), VS32
   105  	LXVW4X (R8)(CONSTBASE), VS33
   106  	LXVW4X (R9)(CONSTBASE), VS34
   107  	LXVW4X (R10)(CONSTBASE), VS35
   108  
   109  	// splat values from V17, V18 into V4-V11
   110  	VSPLTW $0, V17, V4
   111  	VSPLTW $1, V17, V5
   112  	VSPLTW $2, V17, V6
   113  	VSPLTW $3, V17, V7
   114  	VSPLTW $0, V18, V8
   115  	VSPLTW $1, V18, V9
   116  	VSPLTW $2, V18, V10
   117  	VSPLTW $3, V18, V11
   118  
   119  	// VOR
   120  	VOR V26, V26, V12
   121  
   122  	// splat values from V19 -> V13, V14, V15
   123  	VSPLTW $1, V19, V13
   124  	VSPLTW $2, V19, V14
   125  	VSPLTW $3, V19, V15
   126  
   127  	// splat   const values
   128  	VSPLTISW $-16, V27
   129  	VSPLTISW $12, V28
   130  	VSPLTISW $8, V29
   131  	VSPLTISW $7, V30
   132  
   133  loop_vsx:
   134  	VADDUWM V0, V4, V0
   135  	VADDUWM V1, V5, V1
   136  	VADDUWM V2, V6, V2
   137  	VADDUWM V3, V7, V3
   138  
   139  	VXOR V12, V0, V12
   140  	VXOR V13, V1, V13
   141  	VXOR V14, V2, V14
   142  	VXOR V15, V3, V15
   143  
   144  	VRLW V12, V27, V12
   145  	VRLW V13, V27, V13
   146  	VRLW V14, V27, V14
   147  	VRLW V15, V27, V15
   148  
   149  	VADDUWM V8, V12, V8
   150  	VADDUWM V9, V13, V9
   151  	VADDUWM V10, V14, V10
   152  	VADDUWM V11, V15, V11
   153  
   154  	VXOR V4, V8, V4
   155  	VXOR V5, V9, V5
   156  	VXOR V6, V10, V6
   157  	VXOR V7, V11, V7
   158  
   159  	VRLW V4, V28, V4
   160  	VRLW V5, V28, V5
   161  	VRLW V6, V28, V6
   162  	VRLW V7, V28, V7
   163  
   164  	VADDUWM V0, V4, V0
   165  	VADDUWM V1, V5, V1
   166  	VADDUWM V2, V6, V2
   167  	VADDUWM V3, V7, V3
   168  
   169  	VXOR V12, V0, V12
   170  	VXOR V13, V1, V13
   171  	VXOR V14, V2, V14
   172  	VXOR V15, V3, V15
   173  
   174  	VRLW V12, V29, V12
   175  	VRLW V13, V29, V13
   176  	VRLW V14, V29, V14
   177  	VRLW V15, V29, V15
   178  
   179  	VADDUWM V8, V12, V8
   180  	VADDUWM V9, V13, V9
   181  	VADDUWM V10, V14, V10
   182  	VADDUWM V11, V15, V11
   183  
   184  	VXOR V4, V8, V4
   185  	VXOR V5, V9, V5
   186  	VXOR V6, V10, V6
   187  	VXOR V7, V11, V7
   188  
   189  	VRLW V4, V30, V4
   190  	VRLW V5, V30, V5
   191  	VRLW V6, V30, V6
   192  	VRLW V7, V30, V7
   193  
   194  	VADDUWM V0, V5, V0
   195  	VADDUWM V1, V6, V1
   196  	VADDUWM V2, V7, V2
   197  	VADDUWM V3, V4, V3
   198  
   199  	VXOR V15, V0, V15
   200  	VXOR V12, V1, V12
   201  	VXOR V13, V2, V13
   202  	VXOR V14, V3, V14
   203  
   204  	VRLW V15, V27, V15
   205  	VRLW V12, V27, V12
   206  	VRLW V13, V27, V13
   207  	VRLW V14, V27, V14
   208  
   209  	VADDUWM V10, V15, V10
   210  	VADDUWM V11, V12, V11
   211  	VADDUWM V8, V13, V8
   212  	VADDUWM V9, V14, V9
   213  
   214  	VXOR V5, V10, V5
   215  	VXOR V6, V11, V6
   216  	VXOR V7, V8, V7
   217  	VXOR V4, V9, V4
   218  
   219  	VRLW V5, V28, V5
   220  	VRLW V6, V28, V6
   221  	VRLW V7, V28, V7
   222  	VRLW V4, V28, V4
   223  
   224  	VADDUWM V0, V5, V0
   225  	VADDUWM V1, V6, V1
   226  	VADDUWM V2, V7, V2
   227  	VADDUWM V3, V4, V3
   228  
   229  	VXOR V15, V0, V15
   230  	VXOR V12, V1, V12
   231  	VXOR V13, V2, V13
   232  	VXOR V14, V3, V14
   233  
   234  	VRLW V15, V29, V15
   235  	VRLW V12, V29, V12
   236  	VRLW V13, V29, V13
   237  	VRLW V14, V29, V14
   238  
   239  	VADDUWM V10, V15, V10
   240  	VADDUWM V11, V12, V11
   241  	VADDUWM V8, V13, V8
   242  	VADDUWM V9, V14, V9
   243  
   244  	VXOR V5, V10, V5
   245  	VXOR V6, V11, V6
   246  	VXOR V7, V8, V7
   247  	VXOR V4, V9, V4
   248  
   249  	VRLW V5, V30, V5
   250  	VRLW V6, V30, V6
   251  	VRLW V7, V30, V7
   252  	VRLW V4, V30, V4
   253  	BC   16, LT, loop_vsx
   254  
   255  	VADDUWM V12, V26, V12
   256  
   257  	WORD $0x13600F8C		// VMRGEW V0, V1, V27
   258  	WORD $0x13821F8C		// VMRGEW V2, V3, V28
   259  
   260  	WORD $0x10000E8C		// VMRGOW V0, V1, V0
   261  	WORD $0x10421E8C		// VMRGOW V2, V3, V2
   262  
   263  	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
   264  	WORD $0x13C63F8C		// VMRGEW V6, V7, V30
   265  
   266  	XXPERMDI VS32, VS34, $0, VS33
   267  	XXPERMDI VS32, VS34, $3, VS35
   268  	XXPERMDI VS59, VS60, $0, VS32
   269  	XXPERMDI VS59, VS60, $3, VS34
   270  
   271  	WORD $0x10842E8C		// VMRGOW V4, V5, V4
   272  	WORD $0x10C63E8C		// VMRGOW V6, V7, V6
   273  
   274  	WORD $0x13684F8C		// VMRGEW V8, V9, V27
   275  	WORD $0x138A5F8C		// VMRGEW V10, V11, V28
   276  
   277  	XXPERMDI VS36, VS38, $0, VS37
   278  	XXPERMDI VS36, VS38, $3, VS39
   279  	XXPERMDI VS61, VS62, $0, VS36
   280  	XXPERMDI VS61, VS62, $3, VS38
   281  
   282  	WORD $0x11084E8C		// VMRGOW V8, V9, V8
   283  	WORD $0x114A5E8C		// VMRGOW V10, V11, V10
   284  
   285  	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
   286  	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30
   287  
   288  	XXPERMDI VS40, VS42, $0, VS41
   289  	XXPERMDI VS40, VS42, $3, VS43
   290  	XXPERMDI VS59, VS60, $0, VS40
   291  	XXPERMDI VS59, VS60, $3, VS42
   292  
   293  	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
   294  	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14
   295  
   296  	VSPLTISW $4, V27
   297  	VADDUWM V26, V27, V26
   298  
   299  	XXPERMDI VS44, VS46, $0, VS45
   300  	XXPERMDI VS44, VS46, $3, VS47
   301  	XXPERMDI VS61, VS62, $0, VS44
   302  	XXPERMDI VS61, VS62, $3, VS46
   303  
   304  	VADDUWM V0, V16, V0
   305  	VADDUWM V4, V17, V4
   306  	VADDUWM V8, V18, V8
   307  	VADDUWM V12, V19, V12
   308  
   309  	CMPU LEN, $64
   310  	BLT tail_vsx
   311  
   312  	// Bottom of loop
   313  	LXVW4X (INP)(R0), VS59
   314  	LXVW4X (INP)(R8), VS60
   315  	LXVW4X (INP)(R9), VS61
   316  	LXVW4X (INP)(R10), VS62
   317  
   318  	VXOR V27, V0, V27
   319  	VXOR V28, V4, V28
   320  	VXOR V29, V8, V29
   321  	VXOR V30, V12, V30
   322  
   323  	STXVW4X VS59, (OUT)(R0)
   324  	STXVW4X VS60, (OUT)(R8)
   325  	ADD     $64, INP
   326  	STXVW4X VS61, (OUT)(R9)
   327  	ADD     $-64, LEN
   328  	STXVW4X VS62, (OUT)(R10)
   329  	ADD     $64, OUT
   330  	BEQ     done_vsx
   331  
   332  	VADDUWM V1, V16, V0
   333  	VADDUWM V5, V17, V4
   334  	VADDUWM V9, V18, V8
   335  	VADDUWM V13, V19, V12
   336  
   337  	CMPU  LEN, $64
   338  	BLT   tail_vsx
   339  
   340  	LXVW4X (INP)(R0), VS59
   341  	LXVW4X (INP)(R8), VS60
   342  	LXVW4X (INP)(R9), VS61
   343  	LXVW4X (INP)(R10), VS62
   344  	VXOR   V27, V0, V27
   345  
   346  	VXOR V28, V4, V28
   347  	VXOR V29, V8, V29
   348  	VXOR V30, V12, V30
   349  
   350  	STXVW4X VS59, (OUT)(R0)
   351  	STXVW4X VS60, (OUT)(R8)
   352  	ADD     $64, INP
   353  	STXVW4X VS61, (OUT)(R9)
   354  	ADD     $-64, LEN
   355  	STXVW4X VS62, (OUT)(V10)
   356  	ADD     $64, OUT
   357  	BEQ     done_vsx
   358  
   359  	VADDUWM V2, V16, V0
   360  	VADDUWM V6, V17, V4
   361  	VADDUWM V10, V18, V8
   362  	VADDUWM V14, V19, V12
   363  
   364  	CMPU LEN, $64
   365  	BLT  tail_vsx
   366  
   367  	LXVW4X (INP)(R0), VS59
   368  	LXVW4X (INP)(R8), VS60
   369  	LXVW4X (INP)(R9), VS61
   370  	LXVW4X (INP)(R10), VS62
   371  
   372  	VXOR V27, V0, V27
   373  	VXOR V28, V4, V28
   374  	VXOR V29, V8, V29
   375  	VXOR V30, V12, V30
   376  
   377  	STXVW4X VS59, (OUT)(R0)
   378  	STXVW4X VS60, (OUT)(R8)
   379  	ADD     $64, INP
   380  	STXVW4X VS61, (OUT)(R9)
   381  	ADD     $-64, LEN
   382  	STXVW4X VS62, (OUT)(R10)
   383  	ADD     $64, OUT
   384  	BEQ     done_vsx
   385  
   386  	VADDUWM V3, V16, V0
   387  	VADDUWM V7, V17, V4
   388  	VADDUWM V11, V18, V8
   389  	VADDUWM V15, V19, V12
   390  
   391  	CMPU  LEN, $64
   392  	BLT   tail_vsx
   393  
   394  	LXVW4X (INP)(R0), VS59
   395  	LXVW4X (INP)(R8), VS60
   396  	LXVW4X (INP)(R9), VS61
   397  	LXVW4X (INP)(R10), VS62
   398  
   399  	VXOR V27, V0, V27
   400  	VXOR V28, V4, V28
   401  	VXOR V29, V8, V29
   402  	VXOR V30, V12, V30
   403  
   404  	STXVW4X VS59, (OUT)(R0)
   405  	STXVW4X VS60, (OUT)(R8)
   406  	ADD     $64, INP
   407  	STXVW4X VS61, (OUT)(R9)
   408  	ADD     $-64, LEN
   409  	STXVW4X VS62, (OUT)(R10)
   410  	ADD     $64, OUT
   411  
   412  	MOVD $10, R14
   413  	MOVD R14, CTR
   414  	BNE  loop_outer_vsx
   415  
   416  done_vsx:
   417  	// Increment counter by number of 64 byte blocks
   418  	MOVD (CNT), R14
   419  	ADD  BLOCKS, R14
   420  	MOVD R14, (CNT)
   421  	RET
   422  
   423  tail_vsx:
   424  	ADD  $32, R1, R11
   425  	MOVD LEN, CTR
   426  
   427  	// Save values on stack to copy from
   428  	STXVW4X VS32, (R11)(R0)
   429  	STXVW4X VS36, (R11)(R8)
   430  	STXVW4X VS40, (R11)(R9)
   431  	STXVW4X VS44, (R11)(R10)
   432  	ADD $-1, R11, R12
   433  	ADD $-1, INP
   434  	ADD $-1, OUT
   435  
   436  looptail_vsx:
   437  	// Copying the result to OUT
   438  	// in bytes.
   439  	MOVBZU 1(R12), KEY
   440  	MOVBZU 1(INP), TMP
   441  	XOR    KEY, TMP, KEY
   442  	MOVBU  KEY, 1(OUT)
   443  	BC     16, LT, looptail_vsx
   444  
   445  	// Clear the stack values
   446  	STXVW4X VS48, (R11)(R0)
   447  	STXVW4X VS48, (R11)(R8)
   448  	STXVW4X VS48, (R11)(R9)
   449  	STXVW4X VS48, (R11)(R10)
   450  	BR      done_vsx
   451  

View as plain text