Text file src/internal/bytealg/index_ppc64x.s

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an implementation based on the s390x
     6  // implementation.
     7  
     8  // Find a separator with 2 <= len <= 32 within a string.
     9  // Separators with lengths of 2, 3 or 4 are handled
    10  // specially.
    11  
    12  // This works on power8 and above. The loads and
    13  // compares are done in big endian order
    14  // since that allows the use of VCLZD, and allows
    15  // the same implementation to work on big and little
    16  // endian platforms with minimal conditional changes.
    17  
    18  // NOTE: There is a power9 implementation that
    19  // improves performance by 10-15% on little
    20  // endian for some of the benchmarks, but
    21  // work is still needed for a big endian
    22  // implementation on power9.
    23  
    24  //go:build ppc64 || ppc64le
    25  
    26  #include "go_asm.h"
    27  #include "textflag.h"
    28  
    29  // byteswap<> is the VPERM control vector (loaded into SWAP) that VLOADSWAP
    30  // applies after an LXVD2X load to put the bytes in the correct order on POWER8.
    31  
    32  #ifdef GOARCH_ppc64
    33  DATA byteswap<>+0(SB)/8, $0x0001020304050607
    34  DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
    35  #else
    36  DATA byteswap<>+0(SB)/8, $0x0706050403020100
    37  DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    38  #endif
    39  
    40  // Load 16 bytes at base+index in big endian order. Address alignment does not
    41  // need checking. vreg and vsreg must name the same register (see V0_..V9_); SWAP must hold byteswap<>.
    42  #define VLOADSWAP(base, index, vreg, vsreg) \
    43  	LXVD2X (base)(index), vsreg;  \
    44  	VPERM  vreg, vreg, SWAP, vreg
    45  
    46  GLOBL byteswap<>+0(SB), RODATA, $16
    47  
    48  TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
    49  #ifdef GOEXPERIMENT_regabiargs
    50  // Entry for Index: put the arguments where indexbody expects them, then branch to it.
    51  // R3 = byte array pointer, R4 = length (used as they are on entry)
    52          MOVD R6,R5             // R5 = separator pointer (arrives in R6)
    53          MOVD R7,R6             // R6 = separator length (arrives in R7)
    54  #else
    55  	MOVD a_base+0(FP), R3  // R3 = byte array pointer
    56  	MOVD a_len+8(FP), R4   // R4 = length
    57  	MOVD b_base+24(FP), R5 // R5 = separator pointer
    58  	MOVD b_len+32(FP), R6  // R6 = separator length
    59  	MOVD $ret+48(FP), R14  // R14 = &ret, address of the result slot
    60  #endif
    61  
    62  	// Choose the search body; the POWER9 check is only built for ppc64le.
    63  #ifdef GOARCH_ppc64le
    64  	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag byte
    65  	CMP   R7, $1                                                // Flag set?
    66  	BNE   power8                                                // No: use the POWER8 body
    67  	BR    indexbodyp9<>(SB)                                     // Yes: use the POWER9 body
    68  
    69  #endif
    70  power8:
    71  	BR indexbody<>(SB) // POWER8 body; the only path when GOARCH_ppc64le is not defined
    72  
    73  TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    74  #ifndef GOEXPERIMENT_regabiargs
    75  	MOVD a_base+0(FP), R3  // R3 = string
    76  	MOVD a_len+8(FP), R4   // R4 = length
    77  	MOVD b_base+16(FP), R5 // R5 = separator pointer
    78  	MOVD b_len+24(FP), R6  // R6 = separator length
    79  	MOVD $ret+32(FP), R14  // R14 = &ret, address of the result slot
    80  #endif
    81  	// With GOEXPERIMENT_regabiargs nothing is moved: R3..R6 are used as they arrive.
    82  	// Choose the search body; the POWER9 check is only built for ppc64le.
    83  #ifdef GOARCH_ppc64le
    84  	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag byte
    85  	CMP   R7, $1                                                // Flag set?
    86  	BNE   power8                                                // No: use the POWER8 body
    87  	BR    indexbodyp9<>(SB)                                     // Yes: use the POWER9 body
    88  
    89  #endif
    90  power8:
    91  	BR indexbody<>(SB) // POWER8 body; the only path when GOARCH_ppc64le is not defined
    92  
    93  	// s: string we are searching
    94  	// sep: string to search for
    95  	// R3=&s[0], R4=len(s)
    96  	// R5=&sep[0], R6=len(sep)
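    97  	// R14=&ret (index where sep found); set only without GOEXPERIMENT_regabiargs, otherwise the result is returned in R3
    98  	// R7=working addr of string
    99  	// R16=index value 16
   100  	// R17=index value 17
   101  	// R18=index value 18
   102  	// R19=index value 1
   103  	// R26=LASTBYTE, address one past the last byte of string (R3+R4)
   104  	// R27=LASTSTR, last start address to compare with sep (LASTBYTE-R6)
   105  	// R8, R9, R10, R11, R21, R25 scratch
   106  	// V0=sep left justified zero fill
   107  	// CR4=result of comparing sep length with 16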
   108  
   109  #define SEPMASK V17
   110  #define LASTBYTE R26
   111  #define LASTSTR R27
   112  #define ONES V20
   113  #define SWAP V21
   114  #define V0_ VS32
   115  #define V1_ VS33
   116  #define V2_ VS34
   117  #define V3_ VS35
   118  #define V4_ VS36
   119  #define V5_ VS37
   120  #define V6_ VS38
   121  #define V7_ VS39
   122  #define V8_ VS40
   123  #define V9_ VS41
   124  #define SWAP_ VS53
   125  TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0 // POWER8 search body, entered by BR from Index/IndexString. In: R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep). Out: index of first match or -1.
   126  	CMP      R6, R4                 // Compare len(sep) with len(s)
   127  	BGT      notfound               // If sep len is > string, notfound
   128  	ADD      R4, R3, LASTBYTE       // LASTBYTE=&s[len(s)], one past the last byte
   129  	SUB      R6, LASTBYTE, LASTSTR  // LAST=&s[len(s)-len(sep)] (last valid start index)
   130  	CMP      R6, $0                 // Check sep len
   131  	BEQ      notfound               // sep len 0 -- not found
   132  	MOVD     R3, R7                 // Copy of string addr
   133  	MOVD     $16, R16               // Index value 16
   134  	MOVD     $17, R17               // Index value 17 (R17 is not read again in this routine)
   135  	MOVD     $18, R18               // Index value 18 (R18 is not read again in this routine)
   136  	MOVD     $1, R19                // Index value 1
   137  	MOVD     $byteswap<>+00(SB), R8 // Address of the byte swap control vector
   138  	VSPLTISB $0xFF, ONES            // splat all 1s
   139  	LXVD2X   (R8)(R0), SWAP_        // Set up swap string used by VLOADSWAP
   140  
   141  	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   142  	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   143  	BGE    CR4, loadge16       // Load for len(sep) >= 16
   144  	SUB    R6, R16, R9         // 16-len of sep
   145  	SLD    $3, R9              // Bytes to bits (x8), set up for VSLO
   146  	MTVSRD R9, V9_             // Move count to V9, set up for VSLO
   147  	VSLDOI $8, V9, V9, V9      // Rotate V9 by 8 bytes, set up for VSLO
   148  	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   149  
   150  loadge16:
   151  	ANDCC $15, R5, R9 // Find byte offset of sep within its 16 byte container
   152  	ADD   R9, R6, R10 // Add sep len
   153  	CMP   R10, $16    // Check if sep len+offset > 16
   154  	BGT   sepcross16  // Sep crosses 16 byte boundary
   155  
   156  	RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
   157  	VLOADSWAP(R8, R0, V0, V0_)// Load 16 bytes @R8 into V0
   158  	SLD    $3, R9          // Set up shift count for VSLO
   159  	MTVSRD R9, V8_         // Set up shift count for VSLO
   160  	VSLDOI $8, V8, V8, V8  // Set up shift count for VSLO
   161  	VSLO   V0, V8, V0      // Shift by start byte
   162  
   163  	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   164  	BR   index2plus      // Pick the loop for this sep length
   165  
   166  sepcross16:
   167  	VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0
   168  
   169  	VAND V0, SEPMASK, V0 // mask out separator
   170  	BLE  CR4, index2to16 // len(sep) <= 16: generic compare loop
   171  	BR   index17plus     // Handle sep > 16
   172  
   173  index2plus:
   174  	CMP      R6, $2       // Check length of sep
   175  	BNE      index3plus   // If not 2, check for 3
   176  	ADD      $16, R7, R9  // Check if next 16 bytes past last
   177  	CMP      R9, LASTBYTE // compare with last
   178  	BGE      index2to16   // 2 <= len(string) <= 16
   179  	MOVD     $0xff00, R21 // Mask for later
   180  	MTVSRD   R21, V25     // Move to Vreg
   181  	VSPLTH   $3, V25, V31 // Splat mask
   182  	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   183  	VSPLTISB $0, V10      // Clear V10
   184  
   185  	// First case: 2 byte separator
   186  	// V1: 2 byte separator splatted
   187  	// V2: 16 bytes at addr
   188  	// V3: 16 bytes at addr+1
   189  	// Compare 2 byte separator at start
   190  	// and at start+1. Use VSEL to combine
   191  	// those results to find the first
   192  	// matching start byte, returning
   193  	// that value when found. Loop as
   194  	// long as len(string) > 16
   195  index2loop2:
   196  	VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3
   197  
   198  index2loop:
   199  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   200  	VCMPEQUH V1, V2, V5        // Search for sep
   201  	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
   202  	VSEL     V6, V5, V31, V7   // merge even and odd indices
   203  	VCLZD    V7, V18           // find bit index of first match in each doubleword
   204  	MFVSRD   V18, R25          // get first value
   205  	CMP      R25, $64          // Found if < 64
   206  	BLT      foundR25          // Return byte index where found
   207  	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
   208  	MFVSRD   V18, R25          // get second value
   209  	CMP      R25, $64          // Found if < 64
   210  	ADD      $64, R25          // Update byte offset (the BLT below still tests the CMP above)
   211  	BLT      foundR25          // Return value
   212  	ADD      $16, R7           // R7+=16 Update string pointer
   213  	ADD      $17, R7, R9       // R9=R7+17 since loop unrolled
   214  	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
   215  	BLT      index2loop2       // If < last, continue loop
   216  	CMP      R7, LASTBYTE      // Compare addr+16 against last byte
   217  	BLT      index2to16        // If < 16 handle specially
   218  	VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
   219  	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
   220  	BR       index2loop        // Continue without reloading @R7+1
   221  
   222  index3plus:
   223  	CMP    R6, $3       // Check if sep == 3
   224  	BNE    index4plus   // If not check larger
   225  	ADD    $19, R7, R9  // Find bytes for use in this loop
   226  	CMP    R9, LASTBYTE // Compare against last byte
   227  	BGE    index2to16   // Remaining string len <= 19
   228  	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   229  	MTVSRD R21, V25     // Move mask to Vreg
   230  	VSPLTH $3, V25, V31 // Splat mask
   231  	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   232  	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   233  
   234  	// Loop to process 3 byte separator.
   235  	// string[0:16] is in V2
   236  	// string[16:18] is left justified in V3
   237  	// sep[0:2] splatted in V1
   238  	// sep[2] splatted in V8
   239  	// Load vectors at string, string+1
   240  	// and string+2. Compare string, string+1
   241  	// against first 2 bytes of separator
   242  	// splatted, and string+2 against 3rd
   243  	// byte splatted. Merge the results with
   244  	// VSEL to find the first byte of a match.
   245  
   246  	// Special handling for last 16 bytes if the
   247  	// string fits in 16 byte multiple.
   248  index3loop2:
   249  	MOVD     $2, R21          // Set up index for 2
   250  	VSPLTISB $0, V10          // Clear V10
   251  	VLOADSWAP(R7, R21, V3, V3_)// Load 16 bytes @R7+2 into V3
   252  	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   253  
   254  index3loop:
   255  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   256  	VSLDOI   $1, V2, V3, V4    // string[1:17]
   257  	VSLDOI   $2, V2, V3, V9    // string[2:18]
   258  	VCMPEQUH V1, V2, V5        // compare hw even indices
   259  	VCMPEQUH V1, V4, V6        // compare hw odd indices
   260  	VCMPEQUB V8, V9, V10       // compare string[2:18] with 3rd byte of sep
   261  	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
   262  	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
   263  	VCLZD    V7, V18           // Find first nonzero indexes
   264  	MFVSRD   V18, R25          // Move 1st doubleword
   265  	CMP      R25, $64          // If < 64 found
   266  	BLT      foundR25          // Return matching index
   267  	VSLDOI   $8, V18, V18, V18 // Move value
   268  	MFVSRD   V18, R25          // Move 2nd doubleword
   269  	CMP      R25, $64          // If < 64 found
   270  	ADD      $64, R25          // Update byte index (the BLT below still tests the CMP above)
   271  	BLT      foundR25          // Return matching index
   272  	ADD      $16, R7           // R7+=16 string ptr
   273  	ADD      $19, R7, R9       // Number of string bytes for loop
   274  	CMP      R9, LASTBYTE      // Compare against last byte of string
   275  	BLT      index3loop2       // If within, continue this loop
   276  	CMP      R7, LASTSTR       // Compare against last start byte
   277  	BLT      index2to16        // Process remainder
   278  	VSPLTISB $0, V3            // Special case for last 16 bytes
   279  	BR       index3loop        // Continue this loop
   280  
   281  	// Loop to process 4 byte separator
   282  	// string[0:16] in V2
   283  	// string[16:19] left justified in V3
   284  	// sep[0:4] splatted in V1
   285  	// Set up vectors with strings at offsets
   286  	// 0, 1, 2, 3 and compare against the 4 byte
   287  	// separator also splatted. Use VSEL with the
   288  	// compare results to find the first byte where
   289  	// a separator match is found.
   290  index4plus:
   291  	CMP  R6, $4       // Check if 4 byte separator
   292  	BNE  index5plus   // If not next higher
   293  	ADD  $20, R7, R9  // Check string size to load
   294  	CMP  R9, LASTBYTE // Verify string length
   295  	BGE  index2to16   // If not large enough, process remaining
   296  	MOVD $2, R15      // Set up index (R15 is not read again in this routine)
   297  
   298  	// Set up masks for use with VSEL
   299  	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
   300  	SLD    $24, R21          // R21 = 0xff000000
   301  	MTVSRD R21, V10          // Move to Vreg
   302  	VSPLTW $1, V10, V29      // Splat word 1 of V10 into V29
   303  	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   304  	MOVD   $0xffff, R21      // Set up mask 0xffff0000ffff0000...
   305  	SLD    $16, R21          // R21 = 0xffff0000
   306  	MTVSRD R21, V10          // Move to Vreg
   307  	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
   308  	VSPLTW $0, V0, V1        // Splat 1st word of separator
   309  
   310  index4loop:
   311  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   312  
   313  next4:
   314  	VSPLTISB $0, V10            // Clear
   315  	MOVD     $3, R9             // Number of bytes beyond 16
   316  	VLOADSWAP(R7, R9, V3, V3_)  // Load 16 bytes @R7+3 into V3
   317  	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   318  	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   319  	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   320  	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
   321  	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   322  	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   323  	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   324  	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   325  	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   326  	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   327  	VSEL     V14, V13, V31, V7  // final merge
   328  	VCLZD    V7, V18            // Find first index for each half
   329  	MFVSRD   V18, R25           // Isolate value
   330  	CMP      R25, $64           // If < 64, found
   331  	BLT      foundR25           // Return found index
   332  	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
   333  	MFVSRD   V18, R25           // Isolate other value
   334  	CMP      R25, $64           // If < 64, found
   335  	ADD      $64, R25           // Update index for high doubleword (the BLT below still tests the CMP above)
   336  	BLT      foundR25           // Return found index
   337  	ADD      $16, R7            // R7+=16 for next string
   338  	ADD      $20, R7, R9        // R7+20 for all bytes to load
   339  	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
   340  	BLT      index4loop         // If not, continue loop
   341  	CMP      R7, LASTSTR        // Check remainder
   342  	BLE      index2to16         // Process remainder
   343  	BR       notfound           // Not found
   344  
   345  index5plus:
   346  	CMP R6, $16     // Check for sep > 16
   347  	BGT index17plus // Handle large sep
   348  
   349  	// Assumption is that the separator is not longer than the string at this point
   350  index2to16:
   351  	CMP R7, LASTSTR // Compare last start byte
   352  	BGT notfound    // last takes len(sep) into account
   353  
   354  	ADD $16, R7, R9    // Check for last byte of string
   355  	CMP R9, LASTBYTE   // Is R7+16 past the end of the string?
   356  	BGT index2to16tail // Fewer than 16 bytes left
   357  
   358  	// At least 16 bytes of string left
   359  	// Mask the number of bytes in sep
   360  index2to16loop:
   361  	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
   362  
   363  compare:
   364  	VAND       V1, SEPMASK, V2 // Mask out sep size
   365  	VCMPEQUBCC V0, V2, V3      // Compare masked string
   366  	BLT        CR6, found      // All equal
   367  	ADD        $1, R7          // Update ptr to next byte
   368  	CMP        R7, LASTSTR     // Still less than last start byte
   369  	BGT        notfound        // Not found
   370  	ADD        $16, R7, R9     // Verify remaining bytes
   371  	CMP        R9, LASTBYTE    // More than 16 left?
   372  	BLT        index2to16loop  // Try again
   373  
   374  	// 16 or fewer bytes remaining in string
   375  	// Separator >= 2
   376  index2to16tail:
   377  	ADD   R3, R4, R9     // End of string
   378  	SUB   R7, R9, R9     // Number of bytes left
   379  	ANDCC $15, R7, R10   // 16 byte offset
   380  	ADD   R10, R9, R11   // offset + len
   381  	CMP   R11, $16       // > 16?
   382  	BLE   short          // Does not cross 16 bytes
   383  	VLOADSWAP(R7, R0, V1, V1_)// Load 16 bytes @R7 into V1
   384  	BR    index2to16next // Continue on
   385  
   386  short:
   387  	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
   388  	VLOADSWAP(R9, R0, V1, V1_)// Load 16 bytes @R9 into V1, staying inside that container
   389  	SLD      $3, R10         // Set up shift
   390  	MTVSRD   R10, V8_        // Set up shift
   391  	VSLDOI   $8, V8, V8, V8  // Set up shift
   392  	VSLO     V1, V8, V1      // Shift by start byte
   393  	VSPLTISB $0, V25         // Clear for later use
   394  
   395  index2to16next:
   396  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   397  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   398  	BLT        CR6, found      // Found
   399  	ADD        $1, R7          // Not found, try next partial string
   400  	CMP        R7, LASTSTR     // Check for end of string
   401  	BGT        notfound        // If at end, then not found
   402  	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte, low byte filled from V25 (cleared only on the short path)
   403  	BR         index2to16next  // Check the next partial string
   404  
   405  index17plus:
   406  	CMP      R6, $32      // Check if 16 < len(sep) <= 32
   407  	BGT      index33plus  // Longer separators are not implemented
   408  	SUB      $16, R6, R9  // Extra > 16
   409  	SLD      $56, R9, R10 // Shift to use in VSLO
   410  	MTVSRD   R10, V9_     // Set up for VSLO
   411  	VLOADSWAP(R5, R9, V1, V1_)// Load 16 bytes @R5+R9 into V1
   412  	VSLO     V1, V9, V1   // Shift left
   413  	VSPLTISB $0xff, V7    // Splat 1s
   414  	VSPLTISB $0, V27      // Splat 0 (V27 is not read again in this routine)
   415  
   416  index17to32loop:
   417  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   418  
   419  next17:
   420  	VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
   421  	VSLO       V3, V9, V3      // Shift left, same as applied to V1
   422  	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   423  	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   424  	VAND       V4, V5, V6      // Check if both equal
   425  	VCMPEQUBCC V6, V7, V8      // All equal?
   426  	BLT        CR6, found      // Yes
   427  	ADD        $1, R7          // On to next byte
   428  	CMP        R7, LASTSTR     // Check if last start byte
   429  	BGT        notfound        // If too high, not found
   430  	BR         index17to32loop // Continue
   431  
   432  notfound:
   433  #ifdef GOEXPERIMENT_regabiargs
   434          MOVD $-1, R3   // Return -1 if not found
   435  #else
   436  	MOVD $-1, R8   // Return -1 if not found
   437  	MOVD R8, (R14) // Store -1 through &ret
   438  #endif
   439  	RET
   440  
   441  index33plus:
   442  	MOVD $0, (R0) // Case not implemented
   443  	RET           // Crash before return
   444  
   445  foundR25:
   446  	SRD  $3, R25   // Convert from bits to bytes
   447  	ADD  R25, R7   // Add to current string address
   448  	SUB  R3, R7    // Subtract from start of string
   449  #ifdef GOEXPERIMENT_regabiargs
   450          MOVD R7, R3    // Return byte where found
   451  #else
   452  	MOVD R7, (R14) // Return byte where found
   453  #endif
   454  	RET
   455  
   456  found:
   457  	SUB  R3, R7    // Return byte where found
   458  #ifdef GOEXPERIMENT_regabiargs
   459          MOVD R7, R3    // Result in R3
   460  #else
   461  	MOVD R7, (R14) // Result stored through &ret
   462  #endif
   463  	RET
   464  
   465  TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0 // POWER9 search body, same register contract as indexbody; uses LXVB16X, MFVSRLD and MTVSRWS instead of the swap/rotate sequences
   466  	CMP      R6, R4                // Compare len(sep) with len(s)
   467  	BGT      notfound              // If sep len is > string, notfound
   468  	ADD      R4, R3, LASTBYTE      // LASTBYTE=&s[len(s)], one past the last byte
   469  	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
   470  	CMP      R6, $0                // Check sep len
   471  	BEQ      notfound              // sep len 0 -- not found
   472  	MOVD     R3, R7                // Copy of string addr
   473  	MOVD     $16, R16              // Index value 16
   474  	MOVD     $17, R17              // Index value 17 (R17 is not read again in this routine)
   475  	MOVD     $18, R18              // Index value 18 (R18 is not read again in this routine)
   476  	MOVD     $1, R19               // Index value 1
   477  	VSPLTISB $0xFF, ONES           // splat all 1s
   478  
   479  	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   480  	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   481  	BGE    CR4, loadge16       // Load for len(sep) >= 16
   482  	SUB    R6, R16, R9         // 16-len of sep
   483  	SLD    $3, R9              // Bytes to bits (x8), set up for VSLO
   484  	MTVSRD R9, V9_             // Move count to V9, set up for VSLO
   485  	VSLDOI $8, V9, V9, V9      // Rotate V9 by 8 bytes, set up for VSLO
   486  	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   487  
   488  loadge16:
   489  	ANDCC $15, R5, R9 // Find byte offset of sep within its 16 byte container
   490  	ADD   R9, R6, R10 // Add sep len
   491  	CMP   R10, $16    // Check if sep len+offset > 16
   492  	BGT   sepcross16  // Sep crosses 16 byte boundary
   493  
   494  	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
   495  	LXVB16X (R8)(R0), V0_   // Load 16 bytes @R8 into V0
   496  	SLD     $3, R9          // Set up shift count for VSLO
   497  	MTVSRD  R9, V8_         // Set up shift count for VSLO
   498  	VSLDOI  $8, V8, V8, V8  // Set up shift count for VSLO
   499  	VSLO    V0, V8, V0      // Shift by start byte
   500  
   501  	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   502  	BR   index2plus      // Pick the loop for this sep length
   503  
   504  sepcross16:
   505  	LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
   506  
   507  	VAND V0, SEPMASK, V0 // mask out separator
   508  	BLE  CR4, index2to16 // len(sep) <= 16: generic compare loop
   509  	BR   index17plus     // Handle sep > 16
   510  
   511  index2plus:
   512  	CMP      R6, $2       // Check length of sep
   513  	BNE      index3plus   // If not 2, check for 3
   514  	ADD      $16, R7, R9  // Check if next 16 bytes past last
   515  	CMP      R9, LASTBYTE // compare with last
   516  	BGE      index2to16   // 2 <= len(string) <= 16
   517  	MOVD     $0xff00, R21 // Mask for later
   518  	MTVSRD   R21, V25     // Move to Vreg
   519  	VSPLTH   $3, V25, V31 // Splat mask
   520  	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   521  	VSPLTISB $0, V10      // Clear V10
   522  
   523  	// First case: 2 byte separator
   524  	// V1: 2 byte separator splatted
   525  	// V2: 16 bytes at addr
   526  	// V3: 16 bytes at addr+1
   527  	// Compare 2 byte separator at start
   528  	// and at start+1. Use VSEL to combine
   529  	// those results to find the first
   530  	// matching start byte, returning
   531  	// that value when found. Loop as
   532  	// long as len(string) > 16
   533  index2loop2:
   534  	LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
   535  
   536  index2loop:
   537  	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7 into V2
   538  	VCMPEQUH V1, V2, V5      // Search for sep
   539  	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
   540  	VSEL     V6, V5, V31, V7 // merge even and odd indices
   541  	VCLZD    V7, V18         // find bit index of first match in each doubleword
   542  	MFVSRD   V18, R25        // get first value
   543  	CMP      R25, $64        // Found if < 64
   544  	BLT      foundR25        // Return byte index where found
   545  
   546  	MFVSRLD V18, R25        // get second value
   547  	CMP     R25, $64        // Found if < 64
   548  	ADD     $64, R25        // Update byte offset (the BLT below still tests the CMP above)
   549  	BLT     foundR25        // Return value
   550  	ADD     $16, R7         // R7+=16 Update string pointer
   551  	ADD     $17, R7, R9     // R9=R7+17 since loop unrolled
   552  	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
   553  	BLT     index2loop2     // If < last, continue loop
   554  	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
   555  	BLT     index2to16      // If < 16 handle specially
   556  	LXVB16X (R7)(R0), V3_   // Load 16 bytes @R7 into V3
   557  	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
   558  	BR      index2loop      // Continue without reloading @R7+1
   559  
   560  index3plus:
   561  	CMP    R6, $3       // Check if sep == 3
   562  	BNE    index4plus   // If not check larger
   563  	ADD    $19, R7, R9  // Find bytes for use in this loop
   564  	CMP    R9, LASTBYTE // Compare against last byte
   565  	BGE    index2to16   // Remaining string len <= 19
   566  	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   567  	MTVSRD R21, V25     // Move mask to Vreg
   568  	VSPLTH $3, V25, V31 // Splat mask
   569  	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   570  	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   571  
   572  	// Loop to process 3 byte separator.
   573  	// string[0:16] is in V2
   574  	// string[16:18] is left justified in V3
   575  	// sep[0:2] splatted in V1
   576  	// sep[2] splatted in V8
   577  	// Load vectors at string, string+1
   578  	// and string+2. Compare string, string+1
   579  	// against first 2 bytes of separator
   580  	// splatted, and string+2 against 3rd
   581  	// byte splatted. Merge the results with
   582  	// VSEL to find the first byte of a match.
   583  
   584  	// Special handling for last 16 bytes if the
   585  	// string fits in 16 byte multiple.
   586  index3loop2:
   587  	MOVD     $2, R21          // Set up index for 2
   588  	VSPLTISB $0, V10          // Clear V10
   589  	LXVB16X  (R7)(R21), V3_   // Load 16 bytes @R7+2 into V3
   590  	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   591  
   592  index3loop:
   593  	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7 into V2
   594  	VSLDOI   $1, V2, V3, V4  // string[1:17]
   595  	VSLDOI   $2, V2, V3, V9  // string[2:18]
   596  	VCMPEQUH V1, V2, V5      // compare hw even indices
   597  	VCMPEQUH V1, V4, V6      // compare hw odd indices
   598  	VCMPEQUB V8, V9, V10     // compare string[2:18] with 3rd byte of sep
   599  	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
   600  	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
   601  	VCLZD    V7, V18         // Find first nonzero indexes
   602  	MFVSRD   V18, R25        // Move 1st doubleword
   603  	CMP      R25, $64        // If < 64 found
   604  	BLT      foundR25        // Return matching index
   605  
   606  	MFVSRLD  V18, R25     // Move 2nd doubleword
   607  	CMP      R25, $64     // If < 64 found
   608  	ADD      $64, R25     // Update byte index (the BLT below still tests the CMP above)
   609  	BLT      foundR25     // Return matching index
   610  	ADD      $16, R7      // R7+=16 string ptr
   611  	ADD      $19, R7, R9  // Number of string bytes for loop
   612  	CMP      R9, LASTBYTE // Compare against last byte of string
   613  	BLT      index3loop2  // If within, continue this loop
   614  	CMP      R7, LASTSTR  // Compare against last start byte
   615  	BLT      index2to16   // Process remainder
   616  	VSPLTISB $0, V3       // Special case for last 16 bytes
   617  	BR       index3loop   // Continue this loop
   618  
   619  	// Loop to process 4 byte separator
   620  	// string[0:16] in V2
   621  	// string[16:19] left justified in V3
   622  	// sep[0:4] splatted in V1
   623  	// Set up vectors with strings at offsets
   624  	// 0, 1, 2, 3 and compare against the 4 byte
   625  	// separator also splatted. Use VSEL with the
   626  	// compare results to find the first byte where
   627  	// a separator match is found.
   628  index4plus:
   629  	CMP  R6, $4       // Check if 4 byte separator
   630  	BNE  index5plus   // If not next higher
   631  	ADD  $20, R7, R9  // Check string size to load
   632  	CMP  R9, LASTBYTE // Verify string length
   633  	BGE  index2to16   // If not large enough, process remaining
   634  	MOVD $2, R15      // Set up index (R15 is not read again in this routine)
   635  
   636  	// Set up masks for use with VSEL
   637  	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
   638  	SLD     $24, R21   // R21 = 0xff000000
   639  	MTVSRWS R21, V29   // Splat low word of R21 into V29
   640  
   641  	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   642  	MOVD    $0xffff, R21      // Set up mask 0xffff0000ffff0000...
   643  	SLD     $16, R21          // R21 = 0xffff0000
   644  	MTVSRWS R21, V31          // Splat low word of R21 into V31
   645  
   646  	VSPLTW $0, V0, V1 // Splat 1st word of separator
   647  
   648  index4loop:
   649  	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
   650  
   651  next4:
   652  	VSPLTISB $0, V10            // Clear
   653  	MOVD     $3, R9             // Number of bytes beyond 16
   654  	LXVB16X  (R7)(R9), V3_      // Load 16 bytes @R7+3 into V3
   655  	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   656  	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   657  	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   658  	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
   659  	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   660  	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   661  	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   662  	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   663  	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   664  	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   665  	VSEL     V14, V13, V31, V7  // final merge
   666  	VCLZD    V7, V18            // Find first index for each half
   667  	MFVSRD   V18, R25           // Isolate value
   668  	CMP      R25, $64           // If < 64, found
   669  	BLT      foundR25           // Return found index
   670  
   671  	MFVSRLD V18, R25     // Isolate other value
   672  	CMP     R25, $64     // If < 64, found
   673  	ADD     $64, R25     // Update index for high doubleword (the BLT below still tests the CMP above)
   674  	BLT     foundR25     // Return found index
   675  	ADD     $16, R7      // R7+=16 for next string
   676  	ADD     $20, R7, R9  // R7+20 for all bytes to load
   677  	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
   678  	BLT     index4loop   // If not, continue loop
   679  	CMP     R7, LASTSTR  // Check remainder
   680  	BLE     index2to16   // Process remainder
   681  	BR      notfound     // Not found
   682  
   683  index5plus:
   684  	CMP R6, $16     // Check for sep > 16
   685  	BGT index17plus // Handle large sep
   686  
   687  	// Assumption is that the separator is not longer than the string at this point
   688  index2to16:
   689  	CMP R7, LASTSTR // Compare last start byte
   690  	BGT notfound    // last takes len(sep) into account
   691  
   692  	ADD $16, R7, R9    // Check for last byte of string
   693  	CMP R9, LASTBYTE   // Is R7+16 past the end of the string?
   694  	BGT index2to16tail // Fewer than 16 bytes left
   695  
   696  	// At least 16 bytes of string left
   697  	// Mask the number of bytes in sep
   698  index2to16loop:
   699  	LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
   700  
   701  compare:
   702  	VAND       V1, SEPMASK, V2 // Mask out sep size
   703  	VCMPEQUBCC V0, V2, V3      // Compare masked string
   704  	BLT        CR6, found      // All equal
   705  	ADD        $1, R7          // Update ptr to next byte
   706  	CMP        R7, LASTSTR     // Still less than last start byte
   707  	BGT        notfound        // Not found
   708  	ADD        $16, R7, R9     // Verify remaining bytes
   709  	CMP        R9, LASTBYTE    // More than 16 left?
   710  	BLT        index2to16loop  // Try again
   711  
   712  	// 16 or fewer bytes remaining in string
   713  	// Separator >= 2
   714  index2to16tail:
   715  	ADD     R3, R4, R9     // End of string
   716  	SUB     R7, R9, R9     // Number of bytes left
   717  	ANDCC   $15, R7, R10   // 16 byte offset
   718  	ADD     R10, R9, R11   // offset + len
   719  	CMP     R11, $16       // > 16?
   720  	BLE     short          // Does not cross 16 bytes
   721  	LXVB16X (R7)(R0), V1_  // Load 16 bytes @R7 into V1
   722  	BR      index2to16next // Continue on
   723  
   724  short:
   725  	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
   726  	LXVB16X  (R9)(R0), V1_   // Load 16 bytes @R9 into V1, staying inside that container
   727  	SLD      $3, R10         // Set up shift
   728  	MTVSRD   R10, V8_        // Set up shift
   729  	VSLDOI   $8, V8, V8, V8  // Set up shift
   730  	VSLO     V1, V8, V1      // Shift by start byte
   731  	VSPLTISB $0, V25         // Clear for later use
   732  
   733  index2to16next:
   734  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   735  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   736  	BLT        CR6, found      // Found
   737  	ADD        $1, R7          // Not found, try next partial string
   738  	CMP        R7, LASTSTR     // Check for end of string
   739  	BGT        notfound        // If at end, then not found
   740  	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte, low byte filled from V25 (cleared only on the short path)
   741  	BR         index2to16next  // Check the next partial string
   742  
   743  index17plus:
   744  	CMP      R6, $32       // Check if 16 < len(sep) <= 32
   745  	BGT      index33plus   // Longer separators are not implemented
   746  	SUB      $16, R6, R9   // Extra > 16
   747  	SLD      $56, R9, R10  // Shift to use in VSLO
   748  	MTVSRD   R10, V9_      // Set up for VSLO
   749  	LXVB16X  (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
   750  	VSLO     V1, V9, V1    // Shift left
   751  	VSPLTISB $0xff, V7     // Splat 1s
   752  	VSPLTISB $0, V27       // Splat 0 (V27 is not read again in this routine)
   753  
   754  index17to32loop:
   755  	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
   756  
   757  next17:
   758  	LXVB16X    (R7)(R9), V3_   // Load 16 bytes @R7+R9 into V3
   759  	VSLO       V3, V9, V3      // Shift left, same as applied to V1
   760  	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   761  	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   762  	VAND       V4, V5, V6      // Check if both equal
   763  	VCMPEQUBCC V6, V7, V8      // All equal?
   764  	BLT        CR6, found      // Yes
   765  	ADD        $1, R7          // On to next byte
   766  	CMP        R7, LASTSTR     // Check if last start byte
   767  	BGT        notfound        // If too high, not found
   768  	BR         index17to32loop // Continue
   769  
   770  notfound:
   771  #ifdef GOEXPERIMENT_regabiargs
   772          MOVD $-1, R3   // Return -1 if not found
   773  #else
   774  	MOVD $-1, R8   // Return -1 if not found
   775  	MOVD R8, (R14) // Store -1 through &ret
   776  #endif
   777  	RET
   778  
   779  index33plus:
   780  	MOVD $0, (R0) // Case not implemented
   781  	RET           // Crash before return
   782  
   783  foundR25:
   784  	SRD  $3, R25   // Convert from bits to bytes
   785  	ADD  R25, R7   // Add to current string address
   786  	SUB  R3, R7    // Subtract from start of string
   787  #ifdef GOEXPERIMENT_regabiargs
   788          MOVD R7, R3    // Return byte where found
   789  #else
   790  	MOVD R7, (R14) // Return byte where found
   791  #endif
   792  	RET
   793  
   794  found:
   795  	SUB  R3, R7    // Return byte where found
   796  #ifdef GOEXPERIMENT_regabiargs
   797          MOVD R7, R3    // Result in R3
   798  #else
   799  	MOVD R7, (R14) // Result stored through &ret
   800  #endif
   801  	RET
   802  
   803  

View as plain text