Text file src/math/big/arith_ppc64x.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go && (ppc64 || ppc64le)
     6  // +build !math_big_pure_go
     7  // +build ppc64 ppc64le
     8  
     9  #include "textflag.h"
    10  
    11  // This file provides fast assembly versions for the elementary
    12  // arithmetic operations on vectors implemented in arith.go.
    13  
    14  // func mulWW(x, y Word) (z1, z0 Word)
        // mulWW computes the full double-word product x*y: z1 receives the
        // high-order word and z0 the low-order word.
    15  TEXT ·mulWW(SB), NOSPLIT, $0
    16  	MOVD   y+8(FP), R4    // R4 = y
    17  	MOVD   x+0(FP), R3    // R3 = x
    18  	MULLD  R3, R4, R6     // R6 = low-order word of x*y
    19  	MULHDU R3, R4, R5     // R5 = high-order word of x*y (unsigned)
    20  	MOVD   R6, z0+24(FP)
    21  	MOVD   R5, z1+16(FP)
    22  	RET
    23  
    24  // func addVV(z, x, y []Word) (c Word)
    25  // z[i] = x[i] + y[i] for all i, carrying
        //
        // The carry between words lives in the CA bit for the whole routine:
        // ADDC starts the chain on the first word, ADDE continues it, and the
        // final ADDZE copies CA into the result c. The ADD, CMP, SRD, load and
        // store instructions in between do not modify CA.
        // Only z_len is consulted for the element count.
    26  TEXT ·addVV(SB), NOSPLIT, $0
    27  	MOVD  z_len+8(FP), R7   // R7 = z_len
    28  	MOVD  x+24(FP), R8      // R8 = x[]
    29  	MOVD  y+48(FP), R9      // R9 = y[]
    30  	MOVD  z+0(FP), R10      // R10 = z[]
    31  
    32  	// If z_len = 0, we are done
    33  	CMP   R0, R7
    34  	MOVD  R0, R4            // R4 = c = 0
    35  	BEQ   done
    36  
    37  	// Process the first iteration out of the loop so we can
    38  	// use MOVDU and avoid 3 index registers updates.
    39  	MOVD  0(R8), R11      // R11 = x[i]
    40  	MOVD  0(R9), R12      // R12 = y[i]
    41  	ADD   $-1, R7         // R7 = z_len - 1
    42  	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
    43  	CMP   R0, R7
    44  	MOVD  R15, 0(R10)     // z[i]
    45  	BEQ   final          // If z_len was 1, we are done
    46  
    47  	SRD   $2, R7, R5      // R5 = (z_len-1)/4, number of 4-element iterations
    48  	CMP   R0, R5
    49  	MOVD  R5, CTR         // Set up loop counter
    50  	BEQ   tail            // If R5 = 0, we can't use the loop
    51  
    52  	// Process 4 elements per iteration. Unrolling this loop
    53  	// means a performance trade-off: we will lose performance
    54  	// for small values of z_len (0.90x in the worst case), but
    55  	// gain significant performance as z_len increases (up to
    56  	// 1.45x).
        	// R8, R9 and R10 point at the last element already processed,
        	// so the next four elements sit at offsets 8..32. The MOVDU on
        	// the fourth access advances each pointer by 32.
    57  loop:
    58  	MOVD  8(R8), R11      // R11 = x[i]
    59  	MOVD  16(R8), R12     // R12 = x[i+1]
    60  	MOVD  24(R8), R14     // R14 = x[i+2]
    61  	MOVDU 32(R8), R15     // R15 = x[i+3]
    62  	MOVD  8(R9), R16      // R16 = y[i]
    63  	MOVD  16(R9), R17     // R17 = y[i+1]
    64  	MOVD  24(R9), R18     // R18 = y[i+2]
    65  	MOVDU 32(R9), R19     // R19 = y[i+3]
    66  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    67  	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    68  	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    69  	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    70  	MOVD  R20, 8(R10)     // z[i]
    71  	MOVD  R21, 16(R10)    // z[i+1]
    72  	MOVD  R22, 24(R10)    // z[i+2]
    73  	MOVDU R23, 32(R10)    // z[i+3]
    74  	ADD   $-4, R7         // R7 = elements still to process
    75  	BC  16, 0, loop       // bdnz
    76  
    77  	// We may have more elements to read
    78  	CMP   R0, R7
    79  	BEQ   final
    80  
    81  	// Process the remaining elements, one at a time
        	// Between 1 and 3 elements are left when execution reaches tail.
    82  tail:
    83  	MOVDU 8(R8), R11      // R11 = x[i]
    84  	MOVDU 8(R9), R16      // R16 = y[i]
    85  	ADD   $-1, R7         // R7 = z_len - 1
    86  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    87  	CMP   R0, R7
    88  	MOVDU R20, 8(R10)     // z[i]
    89  	BEQ   final           // If R7 = 0, we are done
    90  
    91  	MOVDU 8(R8), R11      // second leftover element: R11 = x[i]
    92  	MOVDU 8(R9), R16      // R16 = y[i]
    93  	ADD   $-1, R7
    94  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    95  	CMP   R0, R7
    96  	MOVDU R20, 8(R10)     // z[i]
    97  	BEQ   final
    98  
    99  	MOVD  8(R8), R11      // third and last leftover element
   100  	MOVD  8(R9), R16
   101  	ADDE  R11, R16, R20
   102  	MOVD  R20, 8(R10)
   103  
   104  final:
   105  	ADDZE R4              // Capture CA
   106  
   107  done:
   108  	MOVD  R4, c+72(FP)
   109  	RET
   110  
   111  // func subVV(z, x, y []Word) (c Word)
   112  // z[i] = x[i] - y[i] for all i, carrying
        //
        // The borrow between words travels in the CA bit: SUBC starts the chain
        // on the first word and SUBE continues it. For subtraction CA holds the
        // inverse of the borrow, which is why the result is flipped with XOR at
        // the end. The ADD, CMP, SRD, load and store instructions in between do
        // not modify CA. Only z_len is consulted for the element count.
   113  TEXT ·subVV(SB), NOSPLIT, $0
   114  	MOVD  z_len+8(FP), R7 // R7 = z_len
   115  	MOVD  x+24(FP), R8    // R8 = x[]
   116  	MOVD  y+48(FP), R9    // R9 = y[]
   117  	MOVD  z+0(FP), R10    // R10 = z[]
   118  
   119  	// If z_len = 0, we are done
   120  	CMP   R0, R7
   121  	MOVD  R0, R4          // R4 = c = 0
   122  	BEQ   done
   123  
   124  	// Process the first iteration out of the loop so we can
   125  	// use MOVDU and avoid 3 index registers updates.
   126  	MOVD  0(R8), R11      // R11 = x[i]
   127  	MOVD  0(R9), R12      // R12 = y[i]
   128  	ADD   $-1, R7         // R7 = z_len - 1
   129  	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA
   130  	CMP   R0, R7
   131  	MOVD  R15, 0(R10)     // z[i]
   132  	BEQ   final           // If z_len was 1, we are done
   133  
   134  	SRD   $2, R7, R5      // R5 = (z_len-1)/4, number of 4-element iterations
   135  	CMP   R0, R5
   136  	MOVD  R5, CTR         // Set up loop counter
   137  	BEQ   tail            // If R5 = 0, we can't use the loop
   138  
   139  	// Process 4 elements per iteration. Unrolling this loop
   140  	// means a performance trade-off: we will lose performance
   141  	// for small values of z_len (0.92x in the worst case), but
   142  	// gain significant performance as z_len increases (up to
   143  	// 1.45x).
        	// R8, R9 and R10 point at the last element already processed,
        	// so the next four elements sit at offsets 8..32. The MOVDU on
        	// the fourth access advances each pointer by 32.
   144  loop:
   145  	MOVD  8(R8), R11      // R11 = x[i]
   146  	MOVD  16(R8), R12     // R12 = x[i+1]
   147  	MOVD  24(R8), R14     // R14 = x[i+2]
   148  	MOVDU 32(R8), R15     // R15 = x[i+3]
   149  	MOVD  8(R9), R16      // R16 = y[i]
   150  	MOVD  16(R9), R17     // R17 = y[i+1]
   151  	MOVD  24(R9), R18     // R18 = y[i+2]
   152  	MOVDU 32(R9), R19     // R19 = y[i+3]
   153  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   154  	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] + CA
   155  	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] + CA
   156  	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] + CA
   157  	MOVD  R20, 8(R10)     // z[i]
   158  	MOVD  R21, 16(R10)    // z[i+1]
   159  	MOVD  R22, 24(R10)    // z[i+2]
   160  	MOVDU R23, 32(R10)    // z[i+3]
   161  	ADD   $-4, R7         // R7 = elements still to process
   162  	BC  16, 0, loop       // bdnz
   163  
   164  	// We may have more elements to read
   165  	CMP   R0, R7
   166  	BEQ   final
   167  
   168  	// Process the remaining elements, one at a time
        	// Between 1 and 3 elements are left when execution reaches tail.
   169  tail:
   170  	MOVDU 8(R8), R11      // R11 = x[i]
   171  	MOVDU 8(R9), R16      // R16 = y[i]
   172  	ADD   $-1, R7         // R7 = z_len - 1
   173  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   174  	CMP   R0, R7
   175  	MOVDU R20, 8(R10)     // z[i]
   176  	BEQ   final           // If R7 = 0, we are done
   177  
   178  	MOVDU 8(R8), R11      // second leftover element: R11 = x[i]
   179  	MOVDU 8(R9), R16      // R16 = y[i]
   180  	ADD   $-1, R7
   181  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   182  	CMP   R0, R7
   183  	MOVDU R20, 8(R10)     // z[i]
   184  	BEQ   final
   185  
   186  	MOVD  8(R8), R11      // third and last leftover element
   187  	MOVD  8(R9), R16
   188  	SUBE  R16, R11, R20
   189  	MOVD  R20, 8(R10)
   190  
   191  final:
   192  	ADDZE R4              // R4 = 0 + CA
   193  	XOR   $1, R4          // c = borrow = CA ^ 1
   194  
   195  done:
   196  	MOVD  R4, c+72(FP)
   197  	RET
   198  
   199  // func addVW(z, x []Word, y Word) (c Word)
        // z[i] = x[i] + carry for all i, where the carry into the first word
        // is y; the carry out of the last word is returned in c.
        // If z_len is zero nothing is written and c = y is returned.
        // After the first word the carry is kept in the CA bit and folded into
        // each following word with ADDZE.
   200  TEXT ·addVW(SB), NOSPLIT, $0
   201  	MOVD z+0(FP), R10	// R10 = z[]
   202  	MOVD x+24(FP), R8	// R8 = x[]
   203  	MOVD y+48(FP), R4	// R4 = y = c
   204  	MOVD z_len+8(FP), R11	// R11 = z_len
   205  
   206  	CMP   R0, R11		// If z_len is zero, return
   207  	BEQ   done
   208  
   209  	// We will process the first iteration out of the loop so we capture
   210  	// the value of c. In the subsequent iterations, we will rely on the
   211  	// value of CA set here.
   212  	MOVD  0(R8), R20	// R20 = x[i]
   213  	ADD   $-1, R11		// R11 = z_len - 1
   214  	ADDC  R20, R4, R6	// R6 = x[i] + c
   215  	CMP   R0, R11		// If z_len was 1, we are done
   216  	MOVD  R6, 0(R10)	// z[i]
   217  	BEQ   final
   218  
   219  	// We will read 4 elements per iteration
   220  	SRD   $2, R11, R9	// R9 = (z_len-1)/4
   221  	DCBT  (R8)		// cache-touch hint for x[]
   222  	CMP   R0, R9
   223  	MOVD  R9, CTR		// Set up the loop counter
   224  	BEQ   tail		// If R9 = 0, we can't use the loop
   225  
   226  loop:
   227  	MOVD  8(R8), R20	// R20 = x[i]
   228  	MOVD  16(R8), R21	// R21 = x[i+1]
   229  	MOVD  24(R8), R22	// R22 = x[i+2]
   230  	MOVDU 32(R8), R23	// R23 = x[i+3]
   231  	ADDZE R20, R24		// R24 = x[i] + CA
   232  	ADDZE R21, R25		// R25 = x[i+1] + CA
   233  	ADDZE R22, R26		// R26 = x[i+2] + CA
   234  	ADDZE R23, R27		// R27 = x[i+3] + CA
   235  	MOVD  R24, 8(R10)	// z[i]
   236  	MOVD  R25, 16(R10)	// z[i+1]
   237  	MOVD  R26, 24(R10)	// z[i+2]
   238  	MOVDU R27, 32(R10)	// z[i+3]
   239  	ADD   $-4, R11		// R11 = elements still to process
   240  	BC    16, 0, loop	// bdnz
   241  
   242  	// We may have some elements to read
   243  	CMP R0, R11
   244  	BEQ final
   245  
        	// Process the remaining 1 to 3 elements, one at a time
   246  tail:
   247  	MOVDU 8(R8), R20	// R20 = x[i]
   248  	ADDZE R20, R24		// R24 = x[i] + CA
   249  	ADD $-1, R11
   250  	MOVDU R24, 8(R10)	// z[i]
   251  	CMP R0, R11
   252  	BEQ final
   253  
   254  	MOVDU 8(R8), R20	// second leftover element
   255  	ADDZE R20, R24
   256  	ADD $-1, R11
   257  	MOVDU R24, 8(R10)
   258  	CMP R0, R11
   259  	BEQ final
   260  
   261  	MOVD 8(R8), R20		// third and last leftover element
   262  	ADDZE R20, R24
   263  	MOVD R24, 8(R10)
   264  
   265  final:
   266  	ADDZE R0, R4		// c = CA
   267  done:
   268  	MOVD  R4, c+56(FP)
   269  	RET
   270  
   271  // func subVW(z, x []Word, y Word) (c Word)
        // z[i] = x[i] - borrow for all i, where the borrow into the first word
        // is y; the borrow out of the last word is returned in c.
        // If z_len is zero nothing is written and c = y is returned.
        // After the first word the borrow is kept (inverted) in the CA bit and
        // applied to each following word by subtracting R0 with SUBE.
   272  TEXT ·subVW(SB), NOSPLIT, $0
   273  	MOVD  z+0(FP), R10	// R10 = z[]
   274  	MOVD  x+24(FP), R8	// R8 = x[]
   275  	MOVD  y+48(FP), R4	// R4 = y = c
   276  	MOVD  z_len+8(FP), R11	// R11 = z_len
   277  
   278  	CMP   R0, R11		// If z_len is zero, return
   279  	BEQ   done
   280  
   281  	// We will process the first iteration out of the loop so we capture
   282  	// the value of c. In the subsequent iterations, we will rely on the
   283  	// value of CA set here.
   284  	MOVD  0(R8), R20	// R20 = x[i]
   285  	ADD   $-1, R11		// R11 = z_len - 1
   286  	SUBC  R4, R20, R6	// R6 = x[i] - c
   287  	CMP   R0, R11		// If z_len was 1, we are done
   288  	MOVD  R6, 0(R10)	// z[i]
   289  	BEQ   final
   290  
   291  	// We will read 4 elements per iteration
   292  	SRD   $2, R11, R9	// R9 = (z_len-1)/4
   293  	DCBT  (R8)		// cache-touch hint for x[]
   294  	CMP   R0, R9
   295  	MOVD  R9, CTR		// Set up the loop counter
   296  	BEQ   tail		// If R9 = 0, we can't use the loop
   297  
   298  	// The loop here is almost the same as the one used in s390x, but
   299  	// we don't need to capture CA every iteration because we've already
   300  	// done that above.
   301  loop:
   302  	MOVD  8(R8), R20	// R20 = x[i]
   303  	MOVD  16(R8), R21	// R21 = x[i+1]
   304  	MOVD  24(R8), R22	// R22 = x[i+2]
   305  	MOVDU 32(R8), R23	// R23 = x[i+3]
   306  	SUBE  R0, R20		// R20 = x[i] - 0 - borrow
   307  	SUBE  R0, R21		// R21 = x[i+1] - 0 - borrow
   308  	SUBE  R0, R22		// R22 = x[i+2] - 0 - borrow
   309  	SUBE  R0, R23		// R23 = x[i+3] - 0 - borrow
   310  	MOVD  R20, 8(R10)	// z[i]
   311  	MOVD  R21, 16(R10)	// z[i+1]
   312  	MOVD  R22, 24(R10)	// z[i+2]
   313  	MOVDU R23, 32(R10)	// z[i+3]
   314  	ADD   $-4, R11		// R11 = elements still to process
   315  	BC    16, 0, loop	// bdnz
   316  
   317  	// We may have some elements to read
   318  	CMP   R0, R11
   319  	BEQ   final
   320  
        	// Process the remaining 1 to 3 elements, one at a time
   321  tail:
   322  	MOVDU 8(R8), R20	// R20 = x[i]
   323  	SUBE  R0, R20		// R20 = x[i] - 0 - borrow
   324  	ADD   $-1, R11
   325  	MOVDU R20, 8(R10)	// z[i]
   326  	CMP   R0, R11
   327  	BEQ   final
   328  
   329  	MOVDU 8(R8), R20	// second leftover element
   330  	SUBE  R0, R20
   331  	ADD   $-1, R11
   332  	MOVDU R20, 8(R10)
   333  	CMP   R0, R11
   334  	BEQ   final
   335  
   336  	MOVD  8(R8), R20	// third and last leftover element
   337  	SUBE  R0, R20
   338  	MOVD  R20, 8(R10)
   339  
   340  final:
   341  	// Capture CA
        	// SUBE of a register from itself leaves CA - 1, i.e. 0 when there
        	// was no borrow and -1 when there was; NEG turns that into 0 or 1.
   342  	SUBE  R4, R4
   343  	NEG   R4, R4
   344  
   345  done:
   346  	MOVD  R4, c+56(FP)
   347  	RET
   348  
   349  TEXT ·shlVU(SB), NOSPLIT, $0
        	// No assembly body here: branch straight to shlVU_g.
        	// NOTE(review): shlVU_g is presumably the generic Go version
        	// alongside arith.go — confirm.
   350  	BR ·shlVU_g(SB)
   351  
   352  TEXT ·shrVU(SB), NOSPLIT, $0
        	// No assembly body here: branch straight to shrVU_g.
        	// NOTE(review): shrVU_g is presumably the generic Go version
        	// alongside arith.go — confirm.
   353  	BR ·shrVU_g(SB)
   354  
   355  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // z[i] = low word of (x[i]*y + carry) for all i, where the carry into
        // the first word is r and the carry into each later word is the high
        // word of the previous step. The final carry is returned in c.
        // If z_len is zero nothing is written and c = r is returned.
        // Unlike the add/sub routines, the running carry is held in a general
        // register (R4) between elements; CA is only used inside each
        // ADDC/ADDZE pair.
   356  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   357  	MOVD    z+0(FP), R10      // R10 = z[]
   358  	MOVD    x+24(FP), R8      // R8 = x[]
   359  	MOVD    y+48(FP), R9      // R9 = y
   360  	MOVD    r+56(FP), R4      // R4 = r = c
   361  	MOVD    z_len+8(FP), R11  // R11 = z_len
   362  
   363  	CMP     R0, R11           // If z_len is zero, return
   364  	BEQ     done
   365  
        	// Process the first element outside the loop so the loop and
        	// tail can address memory relative to the previous element.
   366  	MOVD    0(R8), R20        // R20 = x[i]
   367  	ADD     $-1, R11          // R11 = z_len - 1
   368  	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   369  	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   370  	ADDC    R4, R6            // R6 = z0 + r
   371  	ADDZE   R7                // R7 = z1 + CA
   372  	CMP     R0, R11
   373  	MOVD    R7, R4            // R4 = c
   374  	MOVD    R6, 0(R10)        // z[i]
   375  	BEQ     done              // If z_len was 1, we are done
   376  
   377  	// We will read 4 elements per iteration
   378  	SRD     $2, R11, R14      // R14 = (z_len-1)/4
   379  	DCBT    (R8)              // cache-touch hint for x[]
   380  	CMP     R0, R14
   381  	MOVD    R14, CTR          // Set up the loop counter
   382  	BEQ     tail              // If R14 = 0, we can't use the loop
   383  
        	// Within one iteration the carry flows R4 -> R20 -> R21 -> R22
        	// and back into R4: each high word absorbs CA from adding the
        	// previous carry to the matching low word.
   384  loop:
   385  	MOVD    8(R8), R20        // R20 = x[i]
   386  	MOVD    16(R8), R21       // R21 = x[i+1]
   387  	MOVD    24(R8), R22       // R22 = x[i+2]
   388  	MOVDU   32(R8), R23       // R23 = x[i+3]
   389  	MULLD   R9, R20, R24      // R24 = z0[i]
   390  	MULHDU  R9, R20, R20      // R20 = z1[i]
   391  	ADDC    R4, R24           // R24 = z0[i] + c
   392  	ADDZE   R20               // R20 = z1[i] + CA
   393  	MULLD   R9, R21, R25      // R25 = z0[i+1]
   394  	MULHDU  R9, R21, R21      // R21 = z1[i+1]
   395  	ADDC    R20, R25          // R25 = z0[i+1] + carry from element i
   396  	ADDZE   R21               // R21 = z1[i+1] + CA
   397  	MULLD   R9, R22, R26      // R26 = z0[i+2]
   398  	MULHDU  R9, R22, R22      // R22 = z1[i+2]
   399  	MULLD   R9, R23, R27      // R27 = z0[i+3]
   400  	MULHDU  R9, R23, R23      // R23 = z1[i+3]
   401  	ADDC    R21, R26          // R26 = z0[i+2] + carry from element i+1
   402  	ADDZE   R22               // R22 = z1[i+2] + CA
   403  	MOVD    R24, 8(R10)       // z[i]
   404  	MOVD    R25, 16(R10)      // z[i+1]
   405  	ADDC    R22, R27          // R27 = z0[i+3] + carry from element i+2
   406  	ADDZE   R23,R4		  // update carry
   407  	MOVD    R26, 24(R10)      // z[i+2]
   408  	MOVDU   R27, 32(R10)      // z[i+3]
   409  	ADD     $-4, R11          // R11 = elements still to process
   410  	BC      16, 0, loop       // bdnz
   411  
   412  	// We may have some elements to read
   413  	CMP   R0, R11
   414  	BEQ   done
   415  
   416  	// Process the remaining elements, one at a time
        	// Between 1 and 3 elements are left when execution reaches tail.
   417  tail:
   418  	MOVDU   8(R8), R20        // R20 = x[i]
   419  	MULLD   R9, R20, R24      // R24 = z0[i]
   420  	MULHDU  R9, R20, R25      // R25 = z1[i]
   421  	ADD     $-1, R11          // R11 = z_len - 1
   422  	ADDC    R4, R24           // R24 = z0[i] + c
   423  	ADDZE   R25               // R25 = z1[i] + CA
   424  	MOVDU   R24, 8(R10)       // z[i]
   425  	CMP     R0, R11
   426  	MOVD    R25, R4           // R4 = c
   427  	BEQ     done              // If R11 = 0, we are done
   428  
   429  	MOVDU   8(R8), R20        // second leftover element
   430  	MULLD   R9, R20, R24
   431  	MULHDU  R9, R20, R25
   432  	ADD     $-1, R11
   433  	ADDC    R4, R24
   434  	ADDZE   R25
   435  	MOVDU   R24, 8(R10)
   436  	CMP     R0, R11
   437  	MOVD    R25, R4
   438  	BEQ     done
   439  
   440  	MOVD    8(R8), R20        // third and last leftover element
   441  	MULLD   R9, R20, R24
   442  	MULHDU  R9, R20, R25
   443  	ADD     $-1, R11          // R11 is not read again after this point
   444  	ADDC    R4, R24
   445  	ADDZE   R25
   446  	MOVD    R24, 8(R10)
   447  	MOVD    R25, R4
   448  
   449  done:
   450  	MOVD    R4, c+64(FP)
   451  	RET
   452  
   453  // func addMulVVW(z, x []Word, y Word) (c Word)
        // z[i] = low word of (z[i] + x[i]*y + carry) for all i, starting with a
        // carry of zero; the final carry is returned in c.
        // One element is processed per iteration, with CTR counting z_len
        // iterations and R3 serving as the byte offset into both x and z.
   454  TEXT ·addMulVVW(SB), NOSPLIT, $0
   455  	MOVD z+0(FP), R10	// R10 = z[]
   456  	MOVD x+24(FP), R8	// R8 = x[]
   457  	MOVD y+48(FP), R9	// R9 = y
   458  	MOVD z_len+8(FP), R22	// R22 = z_len
   459  
   460  	MOVD R0, R3		// R3 will be the index register
   461  	CMP  R0, R22
   462  	MOVD R0, R4		// R4 = c = 0
   463  	MOVD R22, CTR		// Initialize loop counter
   464  	BEQ  done		// If z_len is zero, return c = 0
   465  
   466  loop:
   467  	MOVD  (R8)(R3), R20	// Load x[i]
   468  	MOVD  (R10)(R3), R21	// Load z[i]
   469  	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
   470  	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
   471  	ADDC   R21, R6		// R6 = z0
   472  	ADDZE  R7		// R7 = z1
   473  	ADDC   R4, R6		// R6 = z0 + c + 0
   474  	ADDZE  R7, R4           // R4 = c = z1 + CA
   475  	MOVD   R6, (R10)(R3)	// Store z[i]
   476  	ADD    $8, R3		// Advance the byte offset by one Word
   477  	BC  16, 0, loop		// bdnz
   478  
   479  done:
   480  	MOVD R4, c+56(FP)
   481  	RET
   482  
   483  
   484  

View as plain text