// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions of the elementary
// arithmetic operations on vectors implemented in arith.go.

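// func mulWW(x, y Word) (z1, z0 Word)
// z1 is the high-order and z0 the low-order word of x*y.
// As a rough Go sketch of the semantics only (not the code in arith.go),
// using math/bits:
//
//	func mulWW(x, y Word) (z1, z0 Word) {
//		hi, lo := bits.Mul64(uint64(x), uint64(y))
//		return Word(hi), Word(lo)
//	}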
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R3
	MOVD   y+8(FP), R4
	MULHDU R3, R4
	MOVD   R10, z1+16(FP)
	MOVD   R11, z0+24(FP)
	RET


// Register mapping from the amd64 version: DI = R3, CX = R4, SI = R10, R8 = R8, R9 = R9, R10 = R2, R11 = R5, R12 = R6, R13 = R7, R14 = R1 (R0 is set to 0); R11 is also used as a scratch register.
// func addVV(z, x, y []Word) (c Word)

TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // hasVX == 1, vector facility supported
	MOVD   $addvectorfacility+0x00(SB), R1
	MOVD   $·addVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD	$·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD	$·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)

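// addvectorfacility starts out pointing at addVV_check. The first call to
// addVV therefore runs the check, which stores either addVV_vec or addVV_novec
// into addvectorfacility; later calls branch straight to the chosen
// implementation. A rough Go analogue of this one-time dispatch (illustrative
// only, the names are made up):
//
//	var addVVImpl func(z, x, y []Word) (c Word) = addVVCheck
//
//	func addVVCheck(z, x, y []Word) (c Word) {
//		if hasVX {
//			addVVImpl = addVVVec
//		} else {
//			addVVImpl = addVVNovec
//		}
//		return addVVImpl(z, x, y)
//	}
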
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 12 (n -= 16 in total)
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

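	// Each iteration of UU1 processes 16 words (128 bytes) per operand. VACQ
	// adds two 128-bit quantities plus a carry-in; VACCCQ computes only the
	// carry-out of that same addition. The carry is chained through
	// V0 -> V25 -> V26 -> ... -> V31 and finally back into V0, so it survives
	// into the next iteration. The VPDI shuffles swap the two doublewords in
	// each register: the Word vectors store the least significant word first,
	// while the 128-bit vector operations treat doubleword 0 as the most
	// significant half.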
UU1:
	VLM  0(R5), V1, V4    // 64 bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64 bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25
	VACQ   V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32 bytes into V5..V6
	VLM 0(R6), V13, V14 // 32 bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32 bytes into V7..V8
	VLM 0(R6), V15, V16 // 32 bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0  // V0 has carry-over
	VACQ   V8, V16, V31, V24

	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	NEG   R4, R4              // save cf: R4 = 0 (no carry) or -1 (carry)

A1:
	ADD $12, R3 // n += 12

	// s/BLT/BR/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

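	// Between iterations the carry is parked in R4 as 0 (no carry) or -1
	// (carry): ADDC R4, R4 recreates the carry flag from that value, the ADDE
	// chain consumes and regenerates the flag, and the trailing ADDE/NEG pair
	// turns the flag back into 0 or -1.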
U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n:  // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // hasVX == 1, vector facility supported
	MOVD   $subvectorfacility+0x00(SB), R1
	MOVD   $·subVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD	$·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD	$·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// Register mapping from the amd64 version: DI = R3, CX = R4, SI = R10, R8 = R8, R9 = R9, R10 = R2, R11 = R5, R12 = R6, R13 = R7, R14 = R1 (R0 is set to 0); R11 is also used as a scratch register.
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
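// In the scalar tail the borrow follows the s390x convention: after SUBC/SUBE
// the carry flag is 1 when no borrow occurred and 0 when it did, and between
// iterations it is parked in R4 as 0 (no borrow) or -1 (borrow). As a rough Go
// sketch of what a single word step computes (illustration only, using
// math/bits, where the borrow is 0 or 1):
//
//	func subStep(xi, yi Word, borrowIn uint64) (zi Word, borrowOut uint64) {
//		d, b := bits.Sub64(uint64(xi), uint64(yi), borrowIn)
//		return Word(d), b
//	}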
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4          // c = 0
	MOVD $0, R0          // make sure it's zero
	MOVD $0, R10         // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 12 (n -= 16 in total)
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // on s390x the carry flag for subtraction starts at 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

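	// Same structure as the UU1 loop in addVV_vec above, but with VSBIQ
	// (128-bit subtract with borrow-in) and VSBCBIQ (borrow-out of that same
	// subtraction); the borrow is chained through V25..V31 and back into V0.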
UU1:
	VLM  0(R5), V1, V4    // 64 bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64 bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25
	VSBIQ   V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32 bytes into V5..V6
	VLM 0(R6), V13, V14 // 32 bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32 bytes into V7..V8
	VLM 0(R6), V15, V16 // 32 bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	SUB   $1, R4              // save cf: R4 = 0 (no borrow) or -1 (borrow)

A1:
	ADD $12, R3 // n += 12
	BLT v1      // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// Register mapping from the amd64 version: DI = R3, CX = R4, SI = R10, R8 = R8, R9 = R9, R10 = R2, R11 = R5, R12 = R6, R13 = R7, R14 = R1 (R0 is set to 0); R11 is also used as a scratch register.
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

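// func addVW(z, x []Word, y Word) (c Word)
//
// The strategy below: add y into x[0], keep adding the carry into the
// following words, and once the carry dies out switch to a plain copy of the
// rest of x into z. A rough Go sketch of that strategy (illustration only,
// not the code in arith.go):
//
//	func addVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i, xi := range x {
//			if c == 0 {
//				copy(z[i:], x[i:])
//				return 0
//			}
//			s, carry := bits.Add64(uint64(xi), uint64(c), 0)
//			z[i] = Word(s)
//			c = Word(carry)
//		}
//		return c
//	}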
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, return c early

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7
	MOVD   R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally the carry flag generated in the previous iteration was used
	// here (i.e. ADDE could do the addition). However, since we already know
	// the carry is 1 (otherwise we would have branched to the copy section),
	// we can use ADDC so that this iteration does not depend on the carry flag
	// produced by the previous one, which helps when the branch is well predicted.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

// Update the positions of x (R6) and z (R8) based on the current counter value and perform the copy.
// Assuming that x and z either do not overlap or point to the same memory region,
// we can use a faster copy that needs only MVC.
// The implementation below uses three copy loops, copying one word, 4 words, and
// 32 words at a time. Benchmarks show this to be faster than calling runtime·memmove.
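// A rough Go sketch of the tiers below (illustration only; the assembly uses
// MVC with 256-, 32-, and 8-byte blocks, i.e. 32, 4, and 1 words, and the
// helper name here is made up):
//
//	func copyTail(z, x []Word) {
//		for len(x) >= 32 {
//			copy(z[:32], x[:32])
//			z, x = z[32:], x[32:]
//		}
//		for len(x) >= 4 {
//			copy(z[:4], x[:4])
//			z, x = z[4:], x[4:]
//		}
//		copy(z, x) // 0 to 3 remaining words
//	}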
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop:  // unrolled copy of the remaining words (n < 4)
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:  // Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:  // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

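// func subVW(z, x []Word, y Word) (c Word)
//
// Mirror image of addVW above: subtract y from x[0], keep propagating the
// borrow, and switch to a plain copy once the borrow dies out. A rough Go
// sketch of that strategy (illustration only, not the code in arith.go):
//
//	func subVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i, xi := range x {
//			if c == 0 {
//				copy(z[i:], x[i:])
//				return 0
//			}
//			d, borrow := bits.Sub64(uint64(xi), uint64(c), 0)
//			z[i] = Word(d)
//			c = Word(borrow)
//		}
//		return c
//	}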
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // y, the Word to subtract from x
	MOVD z+0(FP), R8
	MOVD $0, R0          // R0 is used as a constant 0 below; make sure it is zero

	CMPBEQ R5, $0, returnC // if len(z) == 0, return c early

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD   0(R6), R9
	SUBC   R7, R9
	MOVD   R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD   8(R6), R9
	SUBE   R0, R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup    // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally the borrow flag generated in the previous iteration was used
	// here (i.e. SUBE could do the subtraction). However, since we already know
	// the borrow is 1 (otherwise we would have branched to the copy section),
	// we can use SUBC so that this iteration does not depend on the borrow flag
	// produced by the previous one, which helps when the branch is well predicted.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// return the current borrow value
returnResult:
	SUBE R0, R0
	NEG  R0, R0
	MOVD R0, c+56(FP)
	RET

// Update the positions of x (R6) and z (R8) based on the current counter value and perform the copy.
// Assuming that x and z either do not overlap or point to the same memory region,
// we can use a faster copy that needs only MVC.
// The implementation below uses three copy loops, copying one word, 4 words, and
// 32 words at a time. Benchmarks show this to be faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop:  // unrolled copy of the remaining words (n < 4)
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:  // Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:  // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// Register mapping from the amd64 version: CX = R4, R8 = R8, R9 = R9, R10 = R2, R11 = R5, DX = R3, AX = R6, BX = R1 (R0 is set to 0); R11 is also used as a scratch register and R7 holds i.
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
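// mulAddVWW computes z = x*y + r and returns the final carry word. A rough Go
// sketch of what the loop below computes (illustration only; arith.go holds
// the actual pure Go version):
//
//	func mulAddVWW(z, x []Word, y, r Word) (c Word) {
//		c = r
//		for i := range z {
//			hi, lo := bits.Mul64(uint64(x[i]), uint64(y))
//			lo, cc := bits.Add64(lo, uint64(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + cc)
//		}
//		return c
//	}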
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4    // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1          // i*8 = 0
	MOVD $0, R7          // i = 0
	MOVD $0, R0          // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	ADDC   R4, R11         // add to low order bits
	ADDE   R0, R6
	MOVD   R11, (R2)(R1*1)
	MOVD   R6, R4
	ADD    $8, R1          // i*8 + 8
	ADD    $1, R7          // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// Register mapping from the amd64 version: CX = R4, R8 = R8, R9 = R9, R10 = R2, R11 = R5, AX = R11, DX = R6, R12 = R12, BX = R1 (R0 is set to 0); R11 is also used as a scratch register and R7 holds i.
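// addMulVVW computes z += x*y and returns the final carry word. The main loop
// below (A6) is unrolled 2x; L6 handles a trailing odd word. A rough Go sketch
// of the per-word step (illustration only, not the code in arith.go):
//
//	func addMulVVW(z, x []Word, y Word) (c Word) {
//		for i := range z {
//			hi, lo := bits.Mul64(uint64(x[i]), uint64(y))
//			lo, cc := bits.Add64(lo, uint64(z[i]), 0)
//			hi += cc
//			lo, cc = bits.Add64(lo, uint64(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + cc)
//		}
//		return c
//	}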
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12  // R12 = n
	AND    $-2, R12 // R12 = n &^ 1 (even part, for the 2x-unrolled loop)
	CMPBGE R5, $2, A6
	BR     E6

A6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11           // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET